Repository: facebookresearch/fairseq Branch: main Commit: 3d262bb25690 Files: 1626 Total size: 9.2 MB Directory structure: gitextract_ldfkme3g/ ├── .github/ │ ├── CODEOWNERS │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ ├── documentation.md │ │ ├── feature_request.md │ │ └── how-to-question.md │ ├── ISSUE_TEMPLATE.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── stale.yml │ └── workflows/ │ ├── build.yml │ ├── depreview.yml │ └── release.yml ├── .gitignore ├── .gitmodules ├── .pre-commit-config.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── RELEASE.md ├── docs/ │ ├── Makefile │ ├── command_line_tools.rst │ ├── conf.py │ ├── criterions.rst │ ├── data.rst │ ├── docutils.conf │ ├── getting_started.rst │ ├── hydra_integration.md │ ├── index.rst │ ├── lr_scheduler.rst │ ├── make.bat │ ├── models.rst │ ├── modules.rst │ ├── optim.rst │ ├── overview.rst │ ├── tasks.rst │ ├── tutorial_classifying_names.rst │ └── tutorial_simple_lstm.rst ├── examples/ │ ├── .gitignore │ ├── MMPT/ │ │ ├── .gitignore │ │ ├── CONFIG.md │ │ ├── DATASET.md │ │ ├── README.md │ │ ├── endtask.md │ │ ├── locallaunch.py │ │ ├── mmpt/ │ │ │ ├── __init__.py │ │ │ ├── datasets/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fairseqmmdataset.py │ │ │ │ └── mmdataset.py │ │ │ ├── evaluators/ │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluator.py │ │ │ │ ├── metric.py │ │ │ │ └── predictor.py │ │ │ ├── losses/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fairseqmmloss.py │ │ │ │ ├── loss.py │ │ │ │ └── nce.py │ │ │ ├── models/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fairseqmmmodel.py │ │ │ │ ├── mmfusion.py │ │ │ │ ├── mmfusionnlg.py │ │ │ │ └── transformermodel.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── mm.py │ │ │ │ ├── retri.py │ │ │ │ └── vectorpool.py │ │ │ ├── processors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── dedupprocessor.py │ │ │ │ ├── dsprocessor.py │ │ │ │ ├── how2processor.py │ │ │ │ ├── how2retriprocessor.py │ │ │ │ ├── models/ │ │ │ │ │ └── s3dg.py │ │ │ │ └── 
processor.py │ │ │ ├── tasks/ │ │ │ │ ├── __init__.py │ │ │ │ ├── fairseqmmtask.py │ │ │ │ ├── milncetask.py │ │ │ │ ├── retritask.py │ │ │ │ ├── task.py │ │ │ │ └── vlmtask.py │ │ │ └── utils/ │ │ │ ├── __init__.py │ │ │ ├── load_config.py │ │ │ └── shardedtensor.py │ │ ├── mmpt_cli/ │ │ │ ├── localjob.py │ │ │ └── predict.py │ │ ├── pretraining.md │ │ ├── projects/ │ │ │ ├── mfmmlm.yaml │ │ │ ├── mtm/ │ │ │ │ ├── mmfusionmtm.yaml │ │ │ │ ├── vlm/ │ │ │ │ │ ├── coin.yaml │ │ │ │ │ ├── crosstask.yaml │ │ │ │ │ ├── how2.yaml │ │ │ │ │ ├── test_coin.yaml │ │ │ │ │ ├── test_crosstask.yaml │ │ │ │ │ ├── test_crosstask_zs.yaml │ │ │ │ │ ├── test_vtt.yaml │ │ │ │ │ ├── test_vttqa.yaml │ │ │ │ │ ├── test_youcook.yaml │ │ │ │ │ ├── test_youcookcap.yaml │ │ │ │ │ ├── vtt.yaml │ │ │ │ │ ├── vttqa.yaml │ │ │ │ │ ├── youcook.yaml │ │ │ │ │ └── youcookcap.yaml │ │ │ │ └── vlm.yaml │ │ │ ├── retri/ │ │ │ │ ├── videoclip/ │ │ │ │ │ ├── coin_videoclip.yaml │ │ │ │ │ ├── crosstask_videoclip.yaml │ │ │ │ │ ├── how2.yaml │ │ │ │ │ ├── test_coin_videoclip.yaml │ │ │ │ │ ├── test_coin_zs.yaml │ │ │ │ │ ├── test_crosstask_videoclip.yaml │ │ │ │ │ ├── test_crosstask_zs_videoclip.yaml │ │ │ │ │ ├── test_didemo_zs.yaml │ │ │ │ │ ├── test_vtt_videoclip.yaml │ │ │ │ │ ├── test_vtt_zs.yaml │ │ │ │ │ ├── test_vttqa_videoclip.yaml │ │ │ │ │ ├── test_vttqa_zs.yaml │ │ │ │ │ ├── test_youcook_videoclip.yaml │ │ │ │ │ ├── test_youcook_zs.yaml │ │ │ │ │ ├── vtt_videoclip.yaml │ │ │ │ │ ├── vttqa_videoclip.yaml │ │ │ │ │ └── youcook_videoclip.yaml │ │ │ │ ├── videoclip.yaml │ │ │ │ └── videoretri.yaml │ │ │ └── task/ │ │ │ ├── coin.yaml │ │ │ ├── coin_videoclip.yaml │ │ │ ├── crosstask.yaml │ │ │ ├── crosstask_videoclip.yaml │ │ │ ├── default.yaml │ │ │ ├── ft.yaml │ │ │ ├── how2.yaml │ │ │ ├── test.yaml │ │ │ ├── test_coin.yaml │ │ │ ├── test_coin_videoclip.yaml │ │ │ ├── test_coin_zs.yaml │ │ │ ├── test_crosstask.yaml │ │ │ ├── test_crosstask_videoclip.yaml │ │ │ ├── test_crosstask_zs.yaml │ │ │ 
├── test_crosstask_zs_videoclip.yaml │ │ │ ├── test_didemo_zs.yaml │ │ │ ├── test_vtt.yaml │ │ │ ├── test_vtt_videoclip.yaml │ │ │ ├── test_vtt_zs.yaml │ │ │ ├── test_vttqa.yaml │ │ │ ├── test_vttqa_videoclip.yaml │ │ │ ├── test_vttqa_zs.yaml │ │ │ ├── test_youcook.yaml │ │ │ ├── test_youcook_videoclip.yaml │ │ │ ├── test_youcook_zs.yaml │ │ │ ├── test_youcookcap.yaml │ │ │ ├── vtt.yaml │ │ │ ├── vtt_videoclip.yaml │ │ │ ├── vttqa.yaml │ │ │ ├── vttqa_videoclip.yaml │ │ │ ├── youcook.yaml │ │ │ ├── youcook_videoclip.yaml │ │ │ └── youcookcap.yaml │ │ ├── scripts/ │ │ │ ├── text_token_extractor/ │ │ │ │ ├── configs/ │ │ │ │ │ └── bert-base-uncased.yaml │ │ │ │ └── pretokenization.py │ │ │ └── video_feature_extractor/ │ │ │ ├── extract.py │ │ │ ├── how2/ │ │ │ │ └── s3d.sh │ │ │ ├── model.py │ │ │ ├── pathbuilder.py │ │ │ ├── preprocessing.py │ │ │ ├── random_sequence_shuffler.py │ │ │ ├── shard_feature.py │ │ │ └── videoreader.py │ │ └── setup.py │ ├── __init__.py │ ├── adaptive_span/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── adagrad_with_grad_clip.py │ │ ├── adaptive_span_attention.py │ │ ├── adaptive_span_loss.py │ │ ├── adaptive_span_model.py │ │ └── adaptive_span_model_wrapper.py │ ├── attention_head_selection/ │ │ ├── README.md │ │ └── src/ │ │ ├── __init__.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ └── speech_to_text_dataset_with_domain.py │ │ ├── loss/ │ │ │ ├── __init__.py │ │ │ └── attention_head_selection.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── head_selection_s2t_transformer.py │ │ │ └── head_selection_transformer.py │ │ ├── modules/ │ │ │ ├── __init__.py │ │ │ ├── attn_head_selector.py │ │ │ ├── head_selection_transformer_layer.py │ │ │ ├── multihead_attention_selection.py │ │ │ └── multihead_functional.py │ │ └── speech_to_text_head_selection.py │ ├── audio_nlp/ │ │ └── nlu/ │ │ ├── README.md │ │ ├── configs/ │ │ │ └── nlu_finetuning.yaml │ │ ├── create_dict_stop.sh │ │ └── generate_manifests.py │ ├── backtranslation/ │ │ ├── README.md │ 
│ ├── deduplicate_lines.py │ │ ├── extract_bt_data.py │ │ ├── prepare-de-monolingual.sh │ │ ├── prepare-wmt18en2de.sh │ │ ├── sacrebleu.sh │ │ └── tokenized_bleu.sh │ ├── bart/ │ │ ├── README.glue.md │ │ ├── README.md │ │ ├── README.summarization.md │ │ └── summarize.py │ ├── byte_level_bpe/ │ │ ├── README.md │ │ ├── get_bitext.py │ │ ├── get_data.sh │ │ └── gru_transformer.py │ ├── camembert/ │ │ └── README.md │ ├── constrained_decoding/ │ │ ├── README.md │ │ ├── normalize.py │ │ └── tok.py │ ├── conv_seq2seq/ │ │ └── README.md │ ├── criss/ │ │ ├── README.md │ │ ├── download_and_preprocess_flores_test.sh │ │ ├── download_and_preprocess_tatoeba.sh │ │ ├── mining/ │ │ │ ├── mine.py │ │ │ └── mine_example.sh │ │ ├── save_encoder.py │ │ ├── sentence_retrieval/ │ │ │ ├── encoder_analysis.py │ │ │ └── sentence_retrieval_tatoeba.sh │ │ └── unsupervised_mt/ │ │ └── eval.sh │ ├── cross_lingual_language_model/ │ │ └── README.md │ ├── data2vec/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── audio/ │ │ │ │ ├── classification/ │ │ │ │ │ ├── base_classification.yaml │ │ │ │ │ └── run_config/ │ │ │ │ │ ├── slurm_1.yaml │ │ │ │ │ ├── slurm_1g.yaml │ │ │ │ │ └── slurm_2.yaml │ │ │ │ └── pretraining/ │ │ │ │ ├── audioset.yaml │ │ │ │ ├── base_librispeech.yaml │ │ │ │ └── run_config/ │ │ │ │ ├── local.yaml │ │ │ │ ├── slurm_1.yaml │ │ │ │ ├── slurm_1_aws.yaml │ │ │ │ ├── slurm_2.yaml │ │ │ │ ├── slurm_2_aws.yaml │ │ │ │ ├── slurm_3.yaml │ │ │ │ ├── slurm_4.yaml │ │ │ │ ├── slurm_4_aws.yaml │ │ │ │ ├── slurm_6_aws.yaml │ │ │ │ └── slurm_8_aws.yaml │ │ │ ├── text/ │ │ │ │ └── pretraining/ │ │ │ │ ├── base.yaml │ │ │ │ └── run_config/ │ │ │ │ ├── local.yaml │ │ │ │ ├── slurm_1_aws.yaml │ │ │ │ ├── slurm_2.yaml │ │ │ │ ├── slurm_2_aws.yaml │ │ │ │ ├── slurm_3.yaml │ │ │ │ ├── slurm_4.yaml │ │ │ │ ├── slurm_4_aws.yaml │ │ │ │ └── slurm_8_aws.yaml │ │ │ ├── v2/ │ │ │ │ ├── base_audio_only_task.yaml │ │ │ │ ├── base_images_only_task.yaml │ │ │ │ ├── 
base_text_only_task.yaml │ │ │ │ ├── huge_images14_only_task.yaml │ │ │ │ ├── huge_images_only_task.yaml │ │ │ │ ├── large_audio_only_task.yaml │ │ │ │ ├── large_images_only_task.yaml │ │ │ │ ├── large_text_only_task.yaml │ │ │ │ ├── large_text_only_task_pgrp_1M.yaml │ │ │ │ ├── run_config/ │ │ │ │ │ ├── local.yaml │ │ │ │ │ ├── slurm_1.yaml │ │ │ │ │ ├── slurm_1_aws.yaml │ │ │ │ │ ├── slurm_2.yaml │ │ │ │ │ ├── slurm_2_aws.yaml │ │ │ │ │ ├── slurm_3.yaml │ │ │ │ │ ├── slurm_4.yaml │ │ │ │ │ ├── slurm_4_aws.yaml │ │ │ │ │ ├── slurm_6_aws.yaml │ │ │ │ │ ├── slurm_8.yaml │ │ │ │ │ └── slurm_8_aws.yaml │ │ │ │ └── text_finetuning/ │ │ │ │ ├── cola.yaml │ │ │ │ ├── mnli.yaml │ │ │ │ ├── mrpc.yaml │ │ │ │ ├── qnli.yaml │ │ │ │ ├── qqp.yaml │ │ │ │ ├── rte.yaml │ │ │ │ ├── run_config/ │ │ │ │ │ └── local.yaml │ │ │ │ ├── sst_2.yaml │ │ │ │ └── sts_b.yaml │ │ │ └── vision/ │ │ │ ├── finetuning/ │ │ │ │ ├── imagenet.yaml │ │ │ │ ├── mae_imagenet_clean.yaml │ │ │ │ ├── mae_imagenet_huge_clean.yaml │ │ │ │ ├── mae_imagenet_large_clean.yaml │ │ │ │ └── run_config/ │ │ │ │ ├── local.yaml │ │ │ │ ├── slurm_1.yaml │ │ │ │ ├── slurm_1_aws.yaml │ │ │ │ ├── slurm_2.yaml │ │ │ │ ├── slurm_2_aws.yaml │ │ │ │ ├── slurm_3.yaml │ │ │ │ ├── slurm_4.yaml │ │ │ │ ├── slurm_4_aws.yaml │ │ │ │ ├── slurm_6_aws.yaml │ │ │ │ └── slurm_8_aws.yaml │ │ │ └── pretraining/ │ │ │ ├── base_imagenet.yaml │ │ │ ├── base_imagenet_d2v1.yaml │ │ │ ├── base_mae_imagenet.yaml │ │ │ └── run_config/ │ │ │ ├── local.yaml │ │ │ ├── slurm_1.yaml │ │ │ ├── slurm_1_aws.yaml │ │ │ ├── slurm_2.yaml │ │ │ ├── slurm_2_aws.yaml │ │ │ ├── slurm_3.yaml │ │ │ ├── slurm_4.yaml │ │ │ ├── slurm_4_aws.yaml │ │ │ ├── slurm_6_aws.yaml │ │ │ └── slurm_8_aws.yaml │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── add_class_target_dataset.py │ │ │ ├── image_dataset.py │ │ │ ├── mae_finetuning_image_dataset.py │ │ │ ├── mae_image_dataset.py │ │ │ ├── modality.py │ │ │ └── path_dataset.py │ │ ├── fb_convert_beit_cp.py │ │ ├── models/ │ │ 
│ ├── __init__.py │ │ │ ├── audio_classification.py │ │ │ ├── data2vec2.py │ │ │ ├── data2vec_audio.py │ │ │ ├── data2vec_image_classification.py │ │ │ ├── data2vec_text.py │ │ │ ├── data2vec_text_classification.py │ │ │ ├── data2vec_vision.py │ │ │ ├── mae.py │ │ │ ├── mae_image_classification.py │ │ │ ├── modalities/ │ │ │ │ ├── __init__.py │ │ │ │ ├── audio.py │ │ │ │ ├── base.py │ │ │ │ ├── images.py │ │ │ │ ├── modules.py │ │ │ │ └── text.py │ │ │ └── utils.py │ │ ├── scripts/ │ │ │ ├── convert_audioset_labels.py │ │ │ ├── multi/ │ │ │ │ ├── finetune_all_fair_aws_local_lr.sh │ │ │ │ ├── finetune_all_fair_aws_local_lr_nodep.sh │ │ │ │ └── finetune_all_fair_local_lr.sh │ │ │ └── text/ │ │ │ ├── finetune_all_char_fair_aws_local_lr.sh │ │ │ ├── finetune_all_fair.sh │ │ │ ├── finetune_all_fair_aws.sh │ │ │ ├── finetune_all_fair_aws_local_lr.sh │ │ │ ├── finetune_all_fair_aws_lr.sh │ │ │ ├── finetune_all_fair_local_lr.sh │ │ │ ├── finetune_all_fair_nodep.sh │ │ │ ├── finetune_all_fair_nodep_aws.sh │ │ │ ├── finetune_all_fair_nodep_aws_local_lr.sh │ │ │ ├── finetune_all_fair_nodep_aws_lr.sh │ │ │ ├── finetune_all_fair_nodep_aws_lr_nopos.sh │ │ │ ├── finetune_all_large_fair_aws_local_lr.sh │ │ │ ├── finetune_all_large_fair_local_lr.sh │ │ │ ├── finetune_all_large_fair_nodep_aws_local_lr.sh │ │ │ ├── finetune_sst2_qnli_sweep_fair_nodep.sh │ │ │ ├── glue.py │ │ │ ├── glue_lr.py │ │ │ ├── unprocess_data.py │ │ │ └── valids.py │ │ └── tasks/ │ │ ├── __init__.py │ │ ├── audio_classification.py │ │ ├── image_classification.py │ │ ├── image_pretraining.py │ │ ├── mae_image_classification.py │ │ ├── mae_image_pretraining.py │ │ └── multimodal.py │ ├── discriminative_reranking_nmt/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config/ │ │ │ └── deen.yaml │ │ ├── criterions/ │ │ │ ├── __init__.py │ │ │ └── discriminative_reranking_criterion.py │ │ ├── drnmt_rerank.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ └── discriminative_reranking_model.py │ │ ├── scripts/ │ │ │ └── 
prep_data.py │ │ └── tasks/ │ │ ├── __init__.py │ │ └── discriminative_reranking_task.py │ ├── emotion_conversion/ │ │ ├── README.md │ │ ├── emotion_models/ │ │ │ ├── __init__.py │ │ │ ├── duration_predictor.py │ │ │ ├── duration_predictor.yaml │ │ │ ├── pitch_predictor.py │ │ │ ├── pitch_predictor.yaml │ │ │ └── utils.py │ │ ├── fairseq_models/ │ │ │ └── __init__.py │ │ ├── preprocess/ │ │ │ ├── __init__.py │ │ │ ├── build_hifigan_manifest.py │ │ │ ├── build_translation_manifests.py │ │ │ ├── create_core_manifest.py │ │ │ ├── extract_f0.py │ │ │ ├── process_km.py │ │ │ ├── split_emov_km_tsv_by_uttid.py │ │ │ ├── split_km.py │ │ │ └── split_km_tsv.py │ │ ├── requirements.txt │ │ └── synthesize.py │ ├── fast_noisy_channel/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── noisy_channel_beam_search.py │ │ ├── noisy_channel_sequence_generator.py │ │ └── noisy_channel_translation.py │ ├── flores101/ │ │ └── README.md │ ├── fully_sharded_data_parallel/ │ │ └── README.md │ ├── gottbert/ │ │ └── README.md │ ├── hubert/ │ │ ├── README.md │ │ ├── config/ │ │ │ ├── decode/ │ │ │ │ ├── ax_sweep/ │ │ │ │ │ ├── ngram.yaml │ │ │ │ │ └── transformer.yaml │ │ │ │ ├── infer_fsqlm.yaml │ │ │ │ ├── infer_kenlm.yaml │ │ │ │ ├── infer_viterbi.yaml │ │ │ │ └── run/ │ │ │ │ ├── submitit_slurm.yaml │ │ │ │ └── submitit_slurm_8gpu.yaml │ │ │ ├── finetune/ │ │ │ │ ├── base_10h.yaml │ │ │ │ ├── ckpt/ │ │ │ │ │ └── it1.yaml │ │ │ │ ├── lm/ │ │ │ │ │ └── ls_4gram.yaml │ │ │ │ └── run/ │ │ │ │ └── submitit_reg.yaml │ │ │ └── pretrain/ │ │ │ ├── data/ │ │ │ │ ├── iter1.yaml │ │ │ │ └── iter2.yaml │ │ │ ├── hubert_base_librispeech.yaml │ │ │ ├── hubert_large_librivox.yaml │ │ │ ├── hubert_xlarge_librivox.yaml │ │ │ └── run/ │ │ │ └── submitit_reg.yaml │ │ ├── measure_teacher_quality.py │ │ ├── simple_kmeans/ │ │ │ ├── README.md │ │ │ ├── dump_hubert_feature.py │ │ │ ├── dump_hubert_feature_s2t.py │ │ │ ├── dump_km_label.py │ │ │ ├── dump_mfcc_feature.py │ │ │ ├── dump_w2v2_feature.py │ │ │ ├── 
feature_utils.py │ │ │ └── learn_kmeans.py │ │ ├── tests/ │ │ │ ├── 6313-76958-0021.flac │ │ │ ├── sample.base.L9.km500.km │ │ │ ├── sample.base.L9.len │ │ │ ├── sample.base.L9.npy │ │ │ ├── sample.large.L20.len │ │ │ ├── sample.large.L20.npy │ │ │ ├── sample.large.hypo.word │ │ │ ├── sample.xlarge.L30.len │ │ │ ├── sample.xlarge.L30.npy │ │ │ ├── sample.xlarge.hypo.word │ │ │ ├── test_feature_and_unit.sh │ │ │ └── test_finetuned_asr.sh │ │ └── update_ckpt.py │ ├── joint_alignment_translation/ │ │ ├── README.md │ │ └── prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh │ ├── language_model/ │ │ ├── README.adaptive_inputs.md │ │ ├── README.conv.md │ │ ├── README.md │ │ └── prepare-wikitext-103.sh │ ├── laser/ │ │ ├── README.md │ │ └── laser_src/ │ │ ├── __init__.py │ │ ├── laser_lstm.py │ │ ├── laser_task.py │ │ ├── laser_transformer.py │ │ └── multitask_data_utils.py │ ├── latent_depth/ │ │ ├── README.md │ │ └── latent_depth_src/ │ │ ├── __init__.py │ │ ├── loss/ │ │ │ ├── __init__.py │ │ │ └── latent_depth.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── latent_multilingual_transformer.py │ │ │ └── latent_transformer.py │ │ ├── modules/ │ │ │ ├── __init__.py │ │ │ └── latent_layers.py │ │ └── multilingual_translation_latent_depth.py │ ├── layerdrop/ │ │ └── README.md │ ├── linformer/ │ │ ├── README.md │ │ └── linformer_src/ │ │ ├── __init__.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ └── linformer_roberta.py │ │ └── modules/ │ │ ├── __init__.py │ │ ├── linformer_sentence_encoder.py │ │ ├── linformer_sentence_encoder_layer.py │ │ └── multihead_linear_attention.py │ ├── m2m_100/ │ │ ├── README.md │ │ ├── install_dependecies.sh │ │ ├── process_data/ │ │ │ ├── clean_histogram.py │ │ │ ├── dedup_data.py │ │ │ └── remove_too_much_punc.py │ │ ├── tok.sh │ │ └── tokenizers/ │ │ ├── README.md │ │ ├── seg_ja.sh │ │ ├── seg_ko.sh │ │ ├── thirdparty/ │ │ │ └── .gitignore │ │ ├── tokenize_indic.py │ │ ├── tokenize_thai.py │ │ ├── tokenize_zh.py │ │ └── tokenizer_ar.sh │ 
├── mbart/ │ │ └── README.md │ ├── megatron_11b/ │ │ ├── README.md │ │ └── detok.py │ ├── mms/ │ │ ├── MODEL_CARD.md │ │ ├── README.md │ │ ├── asr/ │ │ │ ├── config/ │ │ │ │ └── infer_common.yaml │ │ │ ├── infer/ │ │ │ │ ├── example_infer_adapter.sh │ │ │ │ └── mms_infer.py │ │ │ └── tutorial/ │ │ │ └── MMS_ASR_Inference_Colab.ipynb │ │ ├── data_prep/ │ │ │ ├── README.md │ │ │ ├── align_and_segment.py │ │ │ ├── align_utils.py │ │ │ ├── norm_config.py │ │ │ ├── punctuations.lst │ │ │ └── text_normalization.py │ │ ├── lid/ │ │ │ ├── infer.py │ │ │ └── tutorial/ │ │ │ └── MMS_LID_Inference_Colab.ipynb │ │ ├── lid_rerank/ │ │ │ ├── README.md │ │ │ ├── cer_langs.txt │ │ │ ├── mala/ │ │ │ │ └── infer.py │ │ │ ├── mms/ │ │ │ │ ├── make_parallel_single_runs.py │ │ │ │ ├── merge_by_lang.py │ │ │ │ ├── prep_wav_list.py │ │ │ │ ├── run_single_lang.py │ │ │ │ └── split_by_lang.py │ │ │ ├── mms-zs/ │ │ │ │ ├── falign.py │ │ │ │ ├── lib.py │ │ │ │ └── uromanize.py │ │ │ ├── nllb/ │ │ │ │ └── infer.py │ │ │ ├── requirements.txt │ │ │ ├── rerank/ │ │ │ │ ├── rerank.py │ │ │ │ └── tune_coefficients.py │ │ │ └── whisper/ │ │ │ ├── infer_asr.py │ │ │ ├── infer_lid.py │ │ │ └── lid_mapping.txt │ │ ├── misc/ │ │ │ └── get_sample_size.py │ │ ├── tts/ │ │ │ ├── infer.py │ │ │ └── tutorial/ │ │ │ └── MMS_TTS_Inference_Colab.ipynb │ │ └── zero_shot/ │ │ └── README.md │ ├── moe_lm/ │ │ ├── README.md │ │ ├── data_card.md │ │ └── model_card.md │ ├── mr_hubert/ │ │ ├── README.md │ │ ├── config/ │ │ │ ├── decode/ │ │ │ │ ├── infer.yaml │ │ │ │ ├── infer_lm.yaml │ │ │ │ └── run/ │ │ │ │ ├── submitit_slurm.yaml │ │ │ │ └── submitit_slurm_8gpu.yaml │ │ │ ├── finetune/ │ │ │ │ ├── base_100h.yaml │ │ │ │ ├── base_100h_large.yaml │ │ │ │ ├── base_10h.yaml │ │ │ │ ├── base_10h_large.yaml │ │ │ │ ├── base_1h.yaml │ │ │ │ └── base_1h_large.yaml │ │ │ └── pretrain/ │ │ │ ├── mrhubert_base_librispeech.yaml │ │ │ ├── mrhubert_large_librilight.yaml │ │ │ └── run/ │ │ │ └── submitit_reg.yaml │ │ ├── 
decode.sh │ │ ├── finetune.sh │ │ └── train.sh │ ├── multilingual/ │ │ ├── ML50_langs.txt │ │ ├── README.md │ │ ├── data_scripts/ │ │ │ ├── README.md │ │ │ ├── binarize.py │ │ │ ├── check_iswlt_test_data.py │ │ │ ├── check_self_overlaps.py │ │ │ ├── check_valid_test_overlaps.py │ │ │ ├── dedup_all.py │ │ │ ├── download_ML50_v1.sh │ │ │ ├── download_af_xh.sh │ │ │ ├── download_flores_data.sh │ │ │ ├── download_iitb.sh │ │ │ ├── download_iwslt_and_extract.sh │ │ │ ├── download_lotus.sh │ │ │ ├── download_ted_and_extract.py │ │ │ ├── download_wat19_my.sh │ │ │ ├── download_wmt19_and_before.py │ │ │ ├── download_wmt20.sh │ │ │ ├── preprocess_ML50_v1.sh │ │ │ ├── remove_valid_test_in_train.py │ │ │ ├── requirement.txt │ │ │ └── utils/ │ │ │ ├── dedup.py │ │ │ ├── fasttext_multi_filter.py │ │ │ └── strip_sgm.sh │ │ ├── finetune_multilingual_model.sh │ │ ├── multilingual_fairseq_gen.sh │ │ └── train_multilingual_model.sh │ ├── noisychannel/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── rerank.py │ │ ├── rerank_generate.py │ │ ├── rerank_options.py │ │ ├── rerank_score_bw.py │ │ ├── rerank_score_lm.py │ │ ├── rerank_tune.py │ │ └── rerank_utils.py │ ├── nonautoregressive_translation/ │ │ ├── README.md │ │ └── scripts.md │ ├── normformer/ │ │ ├── README.md │ │ └── train_lm.sh │ ├── operators/ │ │ ├── alignment_train_cpu.cpp │ │ ├── alignment_train_cuda.cpp │ │ ├── alignment_train_cuda.h │ │ ├── alignment_train_kernel.cu │ │ └── utils.h │ ├── paraphraser/ │ │ ├── README.md │ │ └── paraphrase.py │ ├── pay_less_attention_paper/ │ │ └── README.md │ ├── pointer_generator/ │ │ ├── README.md │ │ ├── README.xsum.md │ │ ├── pointer_generator_src/ │ │ │ ├── __init__.py │ │ │ └── transformer_pg.py │ │ ├── postprocess.py │ │ └── preprocess.py │ ├── quant_noise/ │ │ ├── README.md │ │ └── transformer_quantization_config.yaml │ ├── roberta/ │ │ ├── README.custom_classification.md │ │ ├── README.glue.md │ │ ├── README.md │ │ ├── README.pretraining.md │ │ ├── README.race.md │ │ ├── 
commonsense_qa/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── commonsense_qa_task.py │ │ │ └── download_cqa_data.sh │ │ ├── config/ │ │ │ ├── finetuning/ │ │ │ │ ├── cola.yaml │ │ │ │ ├── mnli.yaml │ │ │ │ ├── mrpc.yaml │ │ │ │ ├── qnli.yaml │ │ │ │ ├── qqp.yaml │ │ │ │ ├── rte.yaml │ │ │ │ ├── run_config/ │ │ │ │ │ ├── local.yaml │ │ │ │ │ ├── slurm_1g.yaml │ │ │ │ │ └── slurm_1g_aws.yaml │ │ │ │ ├── sst_2.yaml │ │ │ │ └── sts_b.yaml │ │ │ └── pretraining/ │ │ │ ├── base.yaml │ │ │ └── run_config/ │ │ │ ├── local.yaml │ │ │ ├── slurm_2.yaml │ │ │ ├── slurm_2_aws.yaml │ │ │ ├── slurm_3.yaml │ │ │ └── slurm_4.yaml │ │ ├── fb_multilingual/ │ │ │ └── README.multilingual.pretraining.md │ │ ├── multiprocessing_bpe_encoder.py │ │ ├── preprocess_GLUE_tasks.sh │ │ ├── preprocess_RACE.py │ │ ├── preprocess_RACE.sh │ │ └── wsc/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── wsc_criterion.py │ │ ├── wsc_task.py │ │ └── wsc_utils.py │ ├── rxf/ │ │ ├── README.md │ │ ├── __init__.py │ │ └── rxf_src/ │ │ ├── __init__.py │ │ ├── label_smoothed_cross_entropy_r3f.py │ │ └── sentence_prediction_r3f.py │ ├── scaling_nmt/ │ │ └── README.md │ ├── shuffled_word_order/ │ │ ├── README.finetuning.md │ │ └── README.md │ ├── simultaneous_translation/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── docs/ │ │ │ ├── ende-mma.md │ │ │ └── enja-waitk.md │ │ ├── eval/ │ │ │ └── agents/ │ │ │ └── simul_t2t_enja.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── convtransformer_simul_trans.py │ │ │ └── transformer_monotonic_attention.py │ │ ├── modules/ │ │ │ ├── __init__.py │ │ │ ├── fixed_pre_decision.py │ │ │ ├── monotonic_multihead_attention.py │ │ │ └── monotonic_transformer_layer.py │ │ ├── tests/ │ │ │ ├── test_alignment_train.py │ │ │ └── test_text_models.py │ │ └── utils/ │ │ ├── __init__.py │ │ ├── functions.py │ │ ├── monotonic_attention.py │ │ └── p_choose_strategy.py │ ├── speech_recognition/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── criterions/ │ │ │ ├── ASG_loss.py │ │ │ ├── 
__init__.py │ │ │ └── cross_entropy_acc.py │ │ ├── data/ │ │ │ ├── __init__.py │ │ │ ├── asr_dataset.py │ │ │ ├── collaters.py │ │ │ ├── data_utils.py │ │ │ └── replabels.py │ │ ├── datasets/ │ │ │ ├── asr_prep_json.py │ │ │ └── prepare-librispeech.sh │ │ ├── infer.py │ │ ├── kaldi/ │ │ │ ├── __init__.py │ │ │ ├── add-self-loop-simple.cc │ │ │ ├── config/ │ │ │ │ └── kaldi_initializer.yaml │ │ │ ├── kaldi_decoder.py │ │ │ └── kaldi_initializer.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── vggtransformer.py │ │ │ └── w2l_conv_glu_enc.py │ │ ├── new/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── conf/ │ │ │ │ ├── hydra/ │ │ │ │ │ └── sweeper/ │ │ │ │ │ ├── ax.yaml │ │ │ │ │ └── ax_sil.yaml │ │ │ │ ├── infer.yaml │ │ │ │ └── run_config/ │ │ │ │ ├── fb_slurm_1.yaml │ │ │ │ └── fb_slurm_2g.yaml │ │ │ ├── decoders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_decoder.py │ │ │ │ ├── decoder.py │ │ │ │ ├── decoder_config.py │ │ │ │ ├── flashlight_decoder.py │ │ │ │ └── viterbi_decoder.py │ │ │ └── infer.py │ │ ├── tasks/ │ │ │ ├── __init__.py │ │ │ └── speech_recognition.py │ │ ├── utils/ │ │ │ └── wer_utils.py │ │ └── w2l_decoder.py │ ├── speech_synthesis/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data_utils.py │ │ ├── docs/ │ │ │ ├── common_voice_example.md │ │ │ ├── ljspeech_example.md │ │ │ └── vctk_example.md │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ ├── eval_asr.py │ │ │ ├── eval_f0.py │ │ │ ├── eval_sp.py │ │ │ └── get_eval_manifest.py │ │ ├── generate_waveform.py │ │ ├── preprocessing/ │ │ │ ├── __init__.py │ │ │ ├── denoise_and_vad_audio.py │ │ │ ├── denoiser/ │ │ │ │ ├── __init__.py │ │ │ │ ├── demucs.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── resample.py │ │ │ │ └── utils.py │ │ │ ├── get_common_voice_audio_manifest.py │ │ │ ├── get_feature_manifest.py │ │ │ ├── get_ljspeech_audio_manifest.py │ │ │ ├── get_speaker_embedding.py │ │ │ ├── get_vctk_audio_manifest.py │ │ │ ├── speaker_embedder/ │ │ │ │ └── __init__.py │ │ │ └── vad/ │ │ │ └── 
__init__.py │ │ └── utils.py │ ├── speech_text_joint_to_text/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── configs/ │ │ │ └── mustc_noise.list │ │ ├── criterions/ │ │ │ ├── __init__.py │ │ │ ├── multi_modality_compound.py │ │ │ ├── multi_modality_cross_entropy.py │ │ │ └── text_guide_cross_entropy_acc.py │ │ ├── data/ │ │ │ └── pair_denoising_dataset.py │ │ ├── docs/ │ │ │ ├── ende-mustc.md │ │ │ ├── iwslt2021.md │ │ │ └── pre-training.md │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── joint_speech_text_pretrain_transformer.py │ │ │ ├── s2t_dualinputtransformer.py │ │ │ ├── s2t_dualinputwavtransformer.py │ │ │ └── s2t_dualinputxmtransformer.py │ │ ├── scripts/ │ │ │ ├── convert_model.py │ │ │ └── g2p_encode.py │ │ └── tasks/ │ │ ├── __init__.py │ │ ├── pair_denoising.py │ │ ├── speech_text_denoise_pretrain.py │ │ └── speech_text_joint.py │ ├── speech_to_speech/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── asr_bleu/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── asr_model_cfgs.json │ │ │ ├── compute_asr_bleu.py │ │ │ ├── requirements.txt │ │ │ └── utils.py │ │ ├── benchmarking/ │ │ │ ├── README.md │ │ │ ├── configs/ │ │ │ │ ├── 2StageS2ST.yaml │ │ │ │ ├── 3StageS2ST.yaml │ │ │ │ ├── DirectS2U.yaml │ │ │ │ └── S2T.yaml │ │ │ ├── core.py │ │ │ ├── data_utils.py │ │ │ └── get_metrics.py │ │ ├── docs/ │ │ │ ├── data_augmentation.md │ │ │ ├── direct_s2st_discrete_units.md │ │ │ ├── enhanced_direct_s2st_discrete_units.md │ │ │ └── textless_s2st_real_data.md │ │ ├── generate_waveform_from_code.py │ │ ├── preprocessing/ │ │ │ ├── __init__.py │ │ │ ├── data_utils.py │ │ │ ├── prep_s2spect_data.py │ │ │ ├── prep_s2ut_data.py │ │ │ ├── prep_sn_data.py │ │ │ └── prep_sn_output_data.py │ │ └── unity/ │ │ ├── __init__.py │ │ ├── sequence_generator.py │ │ └── sequence_generator_multi_decoder.py │ ├── speech_to_text/ │ │ ├── README.md │ │ ├── data_utils.py │ │ ├── docs/ │ │ │ ├── covost_example.md │ │ │ ├── librispeech_example.md │ │ │ ├── mtedx_example.md │ │ │ ├── 
mustc_example.md │ │ │ └── simulst_mustc_example.md │ │ ├── prep_covost_data.py │ │ ├── prep_librispeech_data.py │ │ ├── prep_mtedx_data.py │ │ ├── prep_mustc_data.py │ │ ├── seg_mustc_data.py │ │ └── simultaneous_translation/ │ │ └── agents/ │ │ └── fairseq_simul_st_agent.py │ ├── stories/ │ │ └── README.md │ ├── textless_nlp/ │ │ ├── dgslm/ │ │ │ ├── README.md │ │ │ ├── create_code_file.py │ │ │ ├── dgslm_utils.py │ │ │ ├── hubert_fisher/ │ │ │ │ └── README.md │ │ │ ├── sample_speech_dlm.py │ │ │ └── vocoder_hifigan/ │ │ │ ├── README.md │ │ │ └── generate_stereo_waveform.py │ │ ├── gslm/ │ │ │ ├── README.md │ │ │ ├── metrics/ │ │ │ │ ├── README.md │ │ │ │ ├── abx_metrics/ │ │ │ │ │ ├── README.md │ │ │ │ │ └── dump_abx_feats.py │ │ │ │ └── asr_metrics/ │ │ │ │ ├── README.md │ │ │ │ ├── continuation_eval.py │ │ │ │ ├── misc/ │ │ │ │ │ ├── bleu_utils.py │ │ │ │ │ ├── cut_as.py │ │ │ │ │ └── dict.ltr.txt │ │ │ │ ├── ppx.py │ │ │ │ └── self_auto_bleu.py │ │ │ ├── speech2unit/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── clustering/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cluster_kmeans.py │ │ │ │ │ ├── dump_feats.py │ │ │ │ │ ├── quantize_with_kmeans.py │ │ │ │ │ └── utils.py │ │ │ │ └── pretrained/ │ │ │ │ ├── cpc_feature_reader.py │ │ │ │ ├── hubert_feature_reader.py │ │ │ │ ├── logmel_feature_reader.py │ │ │ │ ├── utils.py │ │ │ │ └── w2v2_feature_reader.py │ │ │ ├── tools/ │ │ │ │ ├── README.md │ │ │ │ └── resynthesize_speech.py │ │ │ ├── ulm/ │ │ │ │ ├── README.md │ │ │ │ └── sample.py │ │ │ └── unit2speech/ │ │ │ ├── README.md │ │ │ ├── convert_to_16k.py │ │ │ ├── glow.py │ │ │ ├── multiproc.py │ │ │ ├── synthesize_audio_from_units.py │ │ │ ├── tacotron2/ │ │ │ │ ├── __init__.py │ │ │ │ ├── audio_processing.py │ │ │ │ ├── cleaners.py │ │ │ │ ├── cmudict.py │ │ │ │ ├── layers.py │ │ │ │ ├── model.py │ │ │ │ ├── numbers.py │ │ │ │ ├── stft.py │ │ │ │ ├── symbols.py │ │ │ │ ├── text.py │ │ │ │ ├── utils.py │ │ │ │ └── waveglow_denoiser.py │ │ │ ├── 
tts_data.py │ │ │ └── utils.py │ │ ├── pgslm/ │ │ │ ├── README.md │ │ │ ├── data_utils.py │ │ │ ├── eval/ │ │ │ │ ├── __init__.py │ │ │ │ └── cont_metrics.py │ │ │ ├── generate_waveform.py │ │ │ ├── inference_dataset.py │ │ │ ├── naive_decoder.py │ │ │ ├── prepare_dataset.py │ │ │ ├── preprocess_f0.py │ │ │ ├── quantize_f0.py │ │ │ ├── sample/ │ │ │ │ ├── __init__.py │ │ │ │ └── sample.py │ │ │ ├── scripts/ │ │ │ │ ├── join_units_manifest.py │ │ │ │ ├── prepare_data.sh │ │ │ │ └── prepare_f0_quantization.sh │ │ │ └── truncated_laplace.py │ │ └── speech-resynth/ │ │ └── README.md │ ├── translation/ │ │ ├── README.md │ │ ├── prepare-iwslt14.sh │ │ ├── prepare-iwslt17-multilingual.sh │ │ ├── prepare-wmt14en2de.sh │ │ └── prepare-wmt14en2fr.sh │ ├── translation_moe/ │ │ ├── README.md │ │ ├── score.py │ │ └── translation_moe_src/ │ │ ├── __init__.py │ │ ├── logsumexp_moe.py │ │ ├── mean_pool_gating_network.py │ │ └── translation_moe.py │ ├── truncated_bptt/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── transformer_xl_model.py │ │ └── truncated_bptt_lm_task.py │ ├── unsupervised_quality_estimation/ │ │ ├── README.md │ │ ├── aggregate_scores.py │ │ ├── meteor.py │ │ └── repeat_lines.py │ ├── wav2vec/ │ │ ├── README.md │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── finetuning/ │ │ │ │ ├── base_100h.yaml │ │ │ │ ├── base_10h.yaml │ │ │ │ ├── base_10m.yaml │ │ │ │ ├── base_1h.yaml │ │ │ │ ├── base_960h.yaml │ │ │ │ ├── run_config/ │ │ │ │ │ ├── slurm_1.yaml │ │ │ │ │ ├── slurm_16.yaml │ │ │ │ │ ├── slurm_1_aws.yaml │ │ │ │ │ ├── slurm_1_old.yaml │ │ │ │ │ ├── slurm_2.yaml │ │ │ │ │ ├── slurm_2_aws.yaml │ │ │ │ │ ├── slurm_2g.yaml │ │ │ │ │ ├── slurm_3.yaml │ │ │ │ │ ├── slurm_4g.yaml │ │ │ │ │ ├── slurm_4g_aws.yaml │ │ │ │ │ └── slurm_8.yaml │ │ │ │ ├── vox_100h.yaml │ │ │ │ ├── vox_100h_2.yaml │ │ │ │ ├── vox_100h_2_aws.yaml │ │ │ │ ├── vox_100h_3.yaml │ │ │ │ ├── vox_10h.yaml │ │ │ │ ├── vox_10h_2.yaml │ │ │ │ ├── vox_10h_2_aws.yaml │ │ │ │ ├── vox_10h_aws.yaml │ │ │ │ ├── 
vox_10h_aws_v100.yaml │ │ │ │ ├── vox_10m.yaml │ │ │ │ ├── vox_10m_2.yaml │ │ │ │ ├── vox_10m_2_aws.yaml │ │ │ │ ├── vox_10m_3.yaml │ │ │ │ ├── vox_1h.yaml │ │ │ │ ├── vox_1h_2.yaml │ │ │ │ ├── vox_1h_2_aws.yaml │ │ │ │ ├── vox_1h_3.yaml │ │ │ │ ├── vox_1h_4.yaml │ │ │ │ ├── vox_1h_aws.yaml │ │ │ │ ├── vox_960h.yaml │ │ │ │ ├── vox_960h_2.yaml │ │ │ │ ├── vox_960h_2_aws.yaml │ │ │ │ └── vox_960h_3.yaml │ │ │ └── pretraining/ │ │ │ ├── wav2vec2_base_librispeech.yaml │ │ │ ├── wav2vec2_conformer_base_librispeech.yaml │ │ │ ├── wav2vec2_conformer_large_librivox.yaml │ │ │ ├── wav2vec2_large_librivox.yaml │ │ │ ├── wav2vec2_large_librivox_tpu-pod.yaml │ │ │ └── wav2vec2_large_librivox_tpu.yaml │ │ ├── libri_labels.py │ │ ├── scripts/ │ │ │ └── binarize_manifest.sh │ │ ├── unsupervised/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── config/ │ │ │ │ ├── finetuning/ │ │ │ │ │ └── w2v_finetune.yaml │ │ │ │ ├── gan/ │ │ │ │ │ ├── w2vu.yaml │ │ │ │ │ └── w2vu2.yaml │ │ │ │ ├── generate/ │ │ │ │ │ └── viterbi.yaml │ │ │ │ ├── timit_matched/ │ │ │ │ │ ├── test.uid │ │ │ │ │ ├── train.uid │ │ │ │ │ ├── train_text.uid │ │ │ │ │ └── valid.uid │ │ │ │ └── timit_unmatched/ │ │ │ │ ├── test.uid │ │ │ │ ├── train.uid │ │ │ │ ├── train_text.uid │ │ │ │ └── valid.uid │ │ │ ├── data/ │ │ │ │ ├── __init__.py │ │ │ │ ├── extracted_features_dataset.py │ │ │ │ └── random_input_dataset.py │ │ │ ├── kaldi_self_train/ │ │ │ │ ├── README.md │ │ │ │ └── st/ │ │ │ │ ├── cmd.sh │ │ │ │ ├── decode_phone.sh │ │ │ │ ├── decode_word_step1.sh │ │ │ │ ├── decode_word_step2.sh │ │ │ │ ├── local/ │ │ │ │ │ ├── copy_aligned_text.py │ │ │ │ │ ├── decode.sh │ │ │ │ │ ├── prepare_data_from_w2v.py │ │ │ │ │ ├── prepare_lang.sh │ │ │ │ │ ├── prepare_lang_word.sh │ │ │ │ │ ├── prepare_lm.sh │ │ │ │ │ ├── score.sh │ │ │ │ │ ├── show_wer.sh │ │ │ │ │ ├── train_subset_lgbeam.sh │ │ │ │ │ ├── unsup_select.py │ │ │ │ │ ├── unsup_select_decode.sh │ │ │ │ │ └── unsup_select_decode_word.sh │ │ │ │ ├── path.sh │ │ │ 
│ ├── steps_gan/ │ │ │ │ │ ├── train_deltas.sh │ │ │ │ │ ├── train_lda_mllt.sh │ │ │ │ │ └── train_sat.sh │ │ │ │ └── train.sh │ │ │ ├── models/ │ │ │ │ ├── __init__.py │ │ │ │ └── wav2vec_u.py │ │ │ ├── scripts/ │ │ │ │ ├── apply_pca.py │ │ │ │ ├── copy_labels.py │ │ │ │ ├── filter_lexicon.py │ │ │ │ ├── filter_tsv.py │ │ │ │ ├── g2p_wrd_to_phn.py │ │ │ │ ├── ltr_to_wrd.py │ │ │ │ ├── mean_pool.py │ │ │ │ ├── merge_clusters.py │ │ │ │ ├── normalize_and_filter_text.py │ │ │ │ ├── normalize_text.py │ │ │ │ ├── pca.py │ │ │ │ ├── phonemize_with_sil.py │ │ │ │ ├── prepare_audio.sh │ │ │ │ ├── prepare_audio_v2.sh │ │ │ │ ├── prepare_text.sh │ │ │ │ ├── prepare_timit.sh │ │ │ │ ├── remove_silence.py │ │ │ │ ├── vads.py │ │ │ │ ├── wav2vec_apply_cluster_faiss.py │ │ │ │ ├── wav2vec_cluster_faiss.py │ │ │ │ ├── wav2vec_extract_features.py │ │ │ │ ├── wer.py │ │ │ │ └── wrd_to_ltr.py │ │ │ ├── tasks/ │ │ │ │ ├── __init__.py │ │ │ │ └── unpaired_audio_text.py │ │ │ └── w2vu_generate.py │ │ ├── vq-wav2vec_featurize.py │ │ ├── wav2vec_featurize.py │ │ ├── wav2vec_manifest.py │ │ └── xlsr/ │ │ ├── README.md │ │ ├── config/ │ │ │ └── finetune.yaml │ │ └── scripts/ │ │ ├── eval_speaker_clf_task.py │ │ └── gen_audio_embedding.py │ ├── wmt19/ │ │ └── README.md │ ├── wmt20/ │ │ └── README.md │ ├── wmt21/ │ │ ├── README.md │ │ ├── eval.sh │ │ └── scripts/ │ │ ├── normalize-punctuation.perl │ │ └── replace-unicode-punctuation.perl │ ├── womens_bios/ │ │ ├── README.md │ │ └── query_occupations_from_wikidata.py │ ├── xformers/ │ │ └── README.md │ ├── xglm/ │ │ ├── README.md │ │ ├── XStoryCloze.md │ │ └── model_card.md │ ├── xlmr/ │ │ └── README.md │ └── xmod/ │ ├── README.md │ └── preprocess_nli.py ├── fairseq/ │ ├── __init__.py │ ├── benchmark/ │ │ ├── __init__.py │ │ ├── benchmark_multihead_attention.py │ │ ├── dummy_dataset.py │ │ ├── dummy_lm.py │ │ ├── dummy_masked_lm.py │ │ ├── dummy_model.py │ │ └── dummy_mt.py │ ├── binarizer.py │ ├── checkpoint_utils.py │ ├── clib/ │ │ ├── 
cuda/ │ │ │ ├── ngram_repeat_block_cuda.cpp │ │ │ └── ngram_repeat_block_cuda_kernel.cu │ │ ├── libbase/ │ │ │ └── balanced_assignment.cpp │ │ ├── libbleu/ │ │ │ ├── libbleu.cpp │ │ │ └── module.cpp │ │ ├── libnat/ │ │ │ └── edit_dist.cpp │ │ └── libnat_cuda/ │ │ ├── binding.cpp │ │ ├── edit_dist.cu │ │ └── edit_dist.h │ ├── config/ │ │ ├── __init__.py │ │ ├── config.yaml │ │ ├── fb_run_config/ │ │ │ └── slurm.yaml │ │ └── model/ │ │ ├── transformer_lm/ │ │ │ ├── transformer_lm_baevski_gbw.yaml │ │ │ ├── transformer_lm_baevski_wiki103.yaml │ │ │ ├── transformer_lm_big.yaml │ │ │ ├── transformer_lm_gbw.yaml │ │ │ ├── transformer_lm_gpt.yaml │ │ │ ├── transformer_lm_gpt2_big.yaml │ │ │ ├── transformer_lm_gpt2_medium.yaml │ │ │ ├── transformer_lm_gpt2_small.yaml │ │ │ └── transformer_lm_wiki103.yaml │ │ ├── wav2vec/ │ │ │ └── vq_wav2vec_gumbel.yaml │ │ └── wav2vec2/ │ │ ├── wav2vec2_base.yaml │ │ └── wav2vec2_large.yaml │ ├── criterions/ │ │ ├── __init__.py │ │ ├── adaptive_loss.py │ │ ├── composite_loss.py │ │ ├── cross_entropy.py │ │ ├── ctc.py │ │ ├── fairseq_criterion.py │ │ ├── fastspeech2_loss.py │ │ ├── hubert_criterion.py │ │ ├── label_smoothed_cross_entropy.py │ │ ├── label_smoothed_cross_entropy_latency_augmented.py │ │ ├── label_smoothed_cross_entropy_with_alignment.py │ │ ├── label_smoothed_cross_entropy_with_ctc.py │ │ ├── label_smoothed_cross_entropy_with_rdrop.py │ │ ├── legacy_masked_lm.py │ │ ├── masked_lm.py │ │ ├── model_criterion.py │ │ ├── nat_loss.py │ │ ├── sentence_prediction.py │ │ ├── sentence_prediction_adapters.py │ │ ├── sentence_ranking.py │ │ ├── speech_dlm_criterion.py │ │ ├── speech_to_speech_criterion.py │ │ ├── speech_ulm_criterion.py │ │ ├── tacotron2_loss.py │ │ └── wav2vec_criterion.py │ ├── data/ │ │ ├── __init__.py │ │ ├── add_class_target_dataset.py │ │ ├── add_target_dataset.py │ │ ├── append_token_dataset.py │ │ ├── audio/ │ │ │ ├── __init__.py │ │ │ ├── audio_utils.py │ │ │ ├── data_cfg.py │ │ │ ├── dataset_transforms/ │ │ │ 
│ ├── __init__.py │ │ │ │ ├── concataugment.py │ │ │ │ └── noisyoverlapaugment.py │ │ │ ├── feature_transforms/ │ │ │ │ ├── __init__.py │ │ │ │ ├── delta_deltas.py │ │ │ │ ├── global_cmvn.py │ │ │ │ ├── specaugment.py │ │ │ │ └── utterance_cmvn.py │ │ │ ├── frm_text_to_speech_dataset.py │ │ │ ├── hubert_dataset.py │ │ │ ├── multi_modality_dataset.py │ │ │ ├── raw_audio_dataset.py │ │ │ ├── speech_to_speech_dataset.py │ │ │ ├── speech_to_text_dataset.py │ │ │ ├── speech_to_text_joint_dataset.py │ │ │ ├── text_to_speech_dataset.py │ │ │ └── waveform_transforms/ │ │ │ ├── __init__.py │ │ │ └── noiseaugment.py │ │ ├── backtranslation_dataset.py │ │ ├── base_wrapper_dataset.py │ │ ├── bucket_pad_length_dataset.py │ │ ├── codedataset.py │ │ ├── colorize_dataset.py │ │ ├── concat_dataset.py │ │ ├── concat_sentences_dataset.py │ │ ├── data_utils.py │ │ ├── data_utils_fast.pyx │ │ ├── denoising_dataset.py │ │ ├── dictionary.py │ │ ├── encoders/ │ │ │ ├── __init__.py │ │ │ ├── byte_bpe.py │ │ │ ├── byte_utils.py │ │ │ ├── bytes.py │ │ │ ├── characters.py │ │ │ ├── fastbpe.py │ │ │ ├── gpt2_bpe.py │ │ │ ├── gpt2_bpe_utils.py │ │ │ ├── hf_bert_bpe.py │ │ │ ├── hf_byte_bpe.py │ │ │ ├── moses_tokenizer.py │ │ │ ├── nltk_tokenizer.py │ │ │ ├── sentencepiece_bpe.py │ │ │ ├── space_tokenizer.py │ │ │ ├── subword_nmt_bpe.py │ │ │ └── utils.py │ │ ├── fairseq_dataset.py │ │ ├── fasta_dataset.py │ │ ├── huffman/ │ │ │ ├── __init__.py │ │ │ ├── huffman_coder.py │ │ │ └── huffman_mmap_indexed_dataset.py │ │ ├── id_dataset.py │ │ ├── indexed_dataset.py │ │ ├── iterators.py │ │ ├── language_pair_dataset.py │ │ ├── legacy/ │ │ │ ├── __init__.py │ │ │ ├── block_pair_dataset.py │ │ │ ├── masked_lm_dataset.py │ │ │ └── masked_lm_dictionary.py │ │ ├── list_dataset.py │ │ ├── lm_context_window_dataset.py │ │ ├── lru_cache_dataset.py │ │ ├── mask_tokens_dataset.py │ │ ├── monolingual_dataset.py │ │ ├── multi_corpus_dataset.py │ │ ├── multi_corpus_sampled_dataset.py │ │ ├── multilingual/ │ │ │ 
├── __init__.py │ │ │ ├── multilingual_data_manager.py │ │ │ ├── multilingual_utils.py │ │ │ ├── sampled_multi_dataset.py │ │ │ ├── sampled_multi_epoch_dataset.py │ │ │ └── sampling_method.py │ │ ├── nested_dictionary_dataset.py │ │ ├── noising.py │ │ ├── num_samples_dataset.py │ │ ├── numel_dataset.py │ │ ├── offset_tokens_dataset.py │ │ ├── pad_dataset.py │ │ ├── padding_mask_dataset.py │ │ ├── plasma_utils.py │ │ ├── prepend_dataset.py │ │ ├── prepend_token_dataset.py │ │ ├── raw_label_dataset.py │ │ ├── replace_dataset.py │ │ ├── resampling_dataset.py │ │ ├── roll_dataset.py │ │ ├── round_robin_zip_datasets.py │ │ ├── shorten_dataset.py │ │ ├── sort_dataset.py │ │ ├── span_mask_tokens_dataset.py │ │ ├── speech_dlm_dataset.py │ │ ├── strip_token_dataset.py │ │ ├── subsample_dataset.py │ │ ├── text_compressor.py │ │ ├── token_block_dataset.py │ │ ├── token_block_utils_fast.pyx │ │ ├── transform_eos_concat_langpair_dataset.py │ │ ├── transform_eos_dataset.py │ │ └── transform_eos_lang_pair_dataset.py │ ├── dataclass/ │ │ ├── __init__.py │ │ ├── configs.py │ │ ├── constants.py │ │ ├── initialize.py │ │ └── utils.py │ ├── distributed/ │ │ ├── __init__.py │ │ ├── distributed_timeout_wrapper.py │ │ ├── fully_sharded_data_parallel.py │ │ ├── legacy_distributed_data_parallel.py │ │ ├── module_proxy_wrapper.py │ │ ├── tpu_distributed_data_parallel.py │ │ └── utils.py │ ├── file_chunker_utils.py │ ├── file_io.py │ ├── file_utils.py │ ├── hub_utils.py │ ├── incremental_decoding_utils.py │ ├── iterative_refinement_generator.py │ ├── logging/ │ │ ├── __init__.py │ │ ├── meters.py │ │ ├── metrics.py │ │ └── progress_bar.py │ ├── model_parallel/ │ │ ├── __init__.py │ │ ├── criterions/ │ │ │ ├── __init__.py │ │ │ └── vocab_parallel_cross_entropy.py │ │ ├── megatron_trainer.py │ │ ├── models/ │ │ │ ├── __init__.py │ │ │ ├── pipeline_parallel_transformer/ │ │ │ │ ├── __init__.py │ │ │ │ ├── layers.py │ │ │ │ └── model.py │ │ │ ├── roberta/ │ │ │ │ ├── __init__.py │ │ │ │ └── 
model.py │ │ │ ├── transformer.py │ │ │ └── transformer_lm.py │ │ └── modules/ │ │ ├── __init__.py │ │ ├── multihead_attention.py │ │ └── transformer_layer.py │ ├── models/ │ │ ├── __init__.py │ │ ├── bart/ │ │ │ ├── __init__.py │ │ │ ├── hub_interface.py │ │ │ └── model.py │ │ ├── composite_encoder.py │ │ ├── distributed_fairseq_model.py │ │ ├── ema/ │ │ │ ├── __init__.py │ │ │ └── ema.py │ │ ├── fairseq_decoder.py │ │ ├── fairseq_encoder.py │ │ ├── fairseq_incremental_decoder.py │ │ ├── fairseq_model.py │ │ ├── fconv.py │ │ ├── fconv_lm.py │ │ ├── fconv_self_att.py │ │ ├── hubert/ │ │ │ ├── __init__.py │ │ │ ├── hubert.py │ │ │ └── hubert_asr.py │ │ ├── huggingface/ │ │ │ ├── __init__.py │ │ │ └── hf_gpt2.py │ │ ├── lightconv.py │ │ ├── lightconv_lm.py │ │ ├── lstm.py │ │ ├── lstm_lm.py │ │ ├── masked_lm.py │ │ ├── model_utils.py │ │ ├── multilingual_transformer.py │ │ ├── multires_hubert/ │ │ │ ├── __init__.py │ │ │ ├── multires_hubert.py │ │ │ └── multires_hubert_asr.py │ │ ├── nat/ │ │ │ ├── __init__.py │ │ │ ├── cmlm_transformer.py │ │ │ ├── fairseq_nat_model.py │ │ │ ├── insertion_transformer.py │ │ │ ├── iterative_nonautoregressive_transformer.py │ │ │ ├── levenshtein_transformer.py │ │ │ ├── levenshtein_utils.py │ │ │ ├── nat_crf_transformer.py │ │ │ ├── nonautoregressive_ensembles.py │ │ │ └── nonautoregressive_transformer.py │ │ ├── roberta/ │ │ │ ├── __init__.py │ │ │ ├── alignment_utils.py │ │ │ ├── enc_dec.py │ │ │ ├── hub_interface.py │ │ │ ├── model.py │ │ │ ├── model_camembert.py │ │ │ ├── model_gottbert.py │ │ │ └── model_xlmr.py │ │ ├── speech_dlm/ │ │ │ ├── __init__.py │ │ │ ├── hub_interface.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── speech_dlm_decoder.py │ │ │ │ └── speech_dlm_decoder_layer.py │ │ │ ├── sequence_generator/ │ │ │ │ ├── __init__.py │ │ │ │ ├── multichannel_search.py │ │ │ │ └── multichannel_sequence_generator.py │ │ │ └── speech_dlm.py │ │ ├── speech_to_speech/ │ │ │ ├── __init__.py │ │ │ ├── modules/ │ │ │ │ ├── 
__init__.py │ │ │ │ ├── ctc_decoder.py │ │ │ │ ├── stacked_embedding.py │ │ │ │ ├── transformer_decoder_aug.py │ │ │ │ └── transformer_encoder.py │ │ │ ├── s2s_conformer.py │ │ │ ├── s2s_conformer_translatotron2.py │ │ │ ├── s2s_conformer_unity.py │ │ │ └── s2s_transformer.py │ │ ├── speech_to_text/ │ │ │ ├── __init__.py │ │ │ ├── berard.py │ │ │ ├── convtransformer.py │ │ │ ├── hub_interface.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── augmented_memory_attention.py │ │ │ │ ├── convolution.py │ │ │ │ └── emformer.py │ │ │ ├── multi_modality_model.py │ │ │ ├── s2t_conformer.py │ │ │ ├── s2t_transformer.py │ │ │ ├── s2t_wav_transformer.py │ │ │ ├── utils.py │ │ │ ├── xm_transformer.py │ │ │ └── xm_transformer_unity.py │ │ ├── text_to_speech/ │ │ │ ├── __init__.py │ │ │ ├── codehifigan.py │ │ │ ├── fastspeech2.py │ │ │ ├── hifigan.py │ │ │ ├── hub_interface.py │ │ │ ├── tacotron2.py │ │ │ ├── tts_transformer.py │ │ │ └── vocoder.py │ │ ├── transformer/ │ │ │ ├── __init__.py │ │ │ ├── transformer_base.py │ │ │ ├── transformer_config.py │ │ │ ├── transformer_decoder.py │ │ │ ├── transformer_decoder_aug.py │ │ │ ├── transformer_encoder.py │ │ │ └── transformer_legacy.py │ │ ├── transformer_align.py │ │ ├── transformer_from_pretrained_xlm.py │ │ ├── transformer_lm.py │ │ ├── transformer_ulm.py │ │ ├── wav2vec/ │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ ├── wav2vec.py │ │ │ ├── wav2vec2.py │ │ │ ├── wav2vec2_asr.py │ │ │ ├── wav2vec2_classification.py │ │ │ └── wav2vec2_laser.py │ │ └── xmod/ │ │ ├── __init__.py │ │ ├── hub_interface.py │ │ ├── model.py │ │ └── transformer_layer_xmod.py │ ├── modules/ │ │ ├── __init__.py │ │ ├── adaptive_input.py │ │ ├── adaptive_softmax.py │ │ ├── base_layer.py │ │ ├── beamable_mm.py │ │ ├── character_token_embedder.py │ │ ├── checkpoint_activations.py │ │ ├── conformer_layer.py │ │ ├── conv_tbc.py │ │ ├── cross_entropy.py │ │ ├── cuda_utils.cu │ │ ├── downsampled_multihead_attention.py │ │ ├── dynamic_convolution.py │ │ 
├── dynamic_crf_layer.py │ │ ├── dynamicconv_layer/ │ │ │ ├── __init__.py │ │ │ ├── cuda_function_gen.py │ │ │ ├── dynamicconv_cuda.cpp │ │ │ ├── dynamicconv_cuda.cuh │ │ │ ├── dynamicconv_cuda_kernel.cu │ │ │ ├── dynamicconv_layer.py │ │ │ ├── dynamiconv_cpu.cpp │ │ │ └── setup.py │ │ ├── ema_module.py │ │ ├── espnet_multihead_attention.py │ │ ├── fairseq_dropout.py │ │ ├── fp32_batch_norm.py │ │ ├── fp32_group_norm.py │ │ ├── fp32_instance_norm.py │ │ ├── gelu.py │ │ ├── grad_multiply.py │ │ ├── gumbel_vector_quantizer.py │ │ ├── kmeans_attention.py │ │ ├── kmeans_vector_quantizer.py │ │ ├── layer_drop.py │ │ ├── layer_norm.py │ │ ├── learned_positional_embedding.py │ │ ├── lightconv_layer/ │ │ │ ├── __init__.py │ │ │ ├── cuda_function_gen.py │ │ │ ├── lightconv_cuda.cpp │ │ │ ├── lightconv_cuda.cuh │ │ │ ├── lightconv_cuda_kernel.cu │ │ │ ├── lightconv_layer.py │ │ │ └── setup.py │ │ ├── lightweight_convolution.py │ │ ├── linearized_convolution.py │ │ ├── location_attention.py │ │ ├── lstm_cell_with_zoneout.py │ │ ├── multihead_attention.py │ │ ├── positional_embedding.py │ │ ├── positional_encoding.py │ │ ├── quant_noise.py │ │ ├── quantization/ │ │ │ ├── __init__.py │ │ │ ├── pq/ │ │ │ │ ├── __init__.py │ │ │ │ ├── em.py │ │ │ │ ├── modules/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── qconv.py │ │ │ │ │ ├── qemb.py │ │ │ │ │ └── qlinear.py │ │ │ │ ├── pq.py │ │ │ │ └── utils.py │ │ │ ├── quantization_options.py │ │ │ └── scalar/ │ │ │ ├── __init__.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── qact.py │ │ │ │ ├── qconv.py │ │ │ │ ├── qemb.py │ │ │ │ └── qlinear.py │ │ │ ├── ops.py │ │ │ └── utils.py │ │ ├── rotary_positional_embedding.py │ │ ├── same_pad.py │ │ ├── scalar_bias.py │ │ ├── sinusoidal_positional_embedding.py │ │ ├── sparse_multihead_attention.py │ │ ├── sparse_transformer_sentence_encoder.py │ │ ├── sparse_transformer_sentence_encoder_layer.py │ │ ├── transformer_layer.py │ │ ├── transformer_layer_aug.py │ │ ├── 
transformer_sentence_encoder.py │ │ ├── transformer_sentence_encoder_layer.py │ │ ├── transpose_last.py │ │ ├── unfold.py │ │ └── vggblock.py │ ├── nan_detector.py │ ├── ngram_repeat_block.py │ ├── optim/ │ │ ├── __init__.py │ │ ├── adadelta.py │ │ ├── adafactor.py │ │ ├── adagrad.py │ │ ├── adam.py │ │ ├── adamax.py │ │ ├── amp_optimizer.py │ │ ├── bmuf.py │ │ ├── composite.py │ │ ├── cpu_adam.py │ │ ├── dynamic_loss_scaler.py │ │ ├── fairseq_optimizer.py │ │ ├── fp16_optimizer.py │ │ ├── fused_adam.py │ │ ├── fused_lamb.py │ │ ├── lr_scheduler/ │ │ │ ├── __init__.py │ │ │ ├── cosine_lr_scheduler.py │ │ │ ├── fairseq_lr_scheduler.py │ │ │ ├── fixed_schedule.py │ │ │ ├── inverse_square_root_schedule.py │ │ │ ├── manual_lr_scheduler.py │ │ │ ├── pass_through.py │ │ │ ├── polynomial_decay_schedule.py │ │ │ ├── reduce_lr_on_plateau.py │ │ │ ├── step_lr_scheduler.py │ │ │ ├── tri_stage_lr_scheduler.py │ │ │ └── triangular_lr_scheduler.py │ │ ├── nag.py │ │ ├── sgd.py │ │ └── shard.py │ ├── options.py │ ├── pdb.py │ ├── quantization_utils.py │ ├── registry.py │ ├── scoring/ │ │ ├── __init__.py │ │ ├── bertscore.py │ │ ├── bleu.py │ │ ├── chrf.py │ │ ├── meteor.py │ │ ├── tokenizer.py │ │ └── wer.py │ ├── search.py │ ├── sequence_generator.py │ ├── sequence_scorer.py │ ├── speech_generator.py │ ├── tasks/ │ │ ├── __init__.py │ │ ├── audio_classification.py │ │ ├── audio_finetuning.py │ │ ├── audio_pretraining.py │ │ ├── cross_lingual_lm.py │ │ ├── denoising.py │ │ ├── fairseq_task.py │ │ ├── frm_text_to_speech.py │ │ ├── hubert_pretraining.py │ │ ├── language_modeling.py │ │ ├── legacy_masked_lm.py │ │ ├── masked_lm.py │ │ ├── multilingual_denoising.py │ │ ├── multilingual_language_modeling.py │ │ ├── multilingual_masked_lm.py │ │ ├── multilingual_translation.py │ │ ├── multires_hubert_pretraining.py │ │ ├── nlu_finetuning.py │ │ ├── online_backtranslation.py │ │ ├── semisupervised_translation.py │ │ ├── sentence_prediction.py │ │ ├── sentence_prediction_adapters.py │ │ 
├── sentence_ranking.py │ │ ├── simultaneous_translation.py │ │ ├── span_masked_lm.py │ │ ├── speech_dlm_task.py │ │ ├── speech_to_speech.py │ │ ├── speech_to_text.py │ │ ├── speech_ulm_task.py │ │ ├── text_to_speech.py │ │ ├── translation.py │ │ ├── translation_from_pretrained_bart.py │ │ ├── translation_from_pretrained_xlm.py │ │ ├── translation_lev.py │ │ └── translation_multi_simple_epoch.py │ ├── token_generation_constraints.py │ ├── tokenizer.py │ ├── trainer.py │ ├── utils.py │ └── version.txt ├── fairseq_cli/ │ ├── __init__.py │ ├── eval_lm.py │ ├── generate.py │ ├── hydra_train.py │ ├── hydra_validate.py │ ├── interactive.py │ ├── preprocess.py │ ├── score.py │ ├── train.py │ └── validate.py ├── hubconf.py ├── hydra_plugins/ │ └── dependency_submitit_launcher/ │ ├── hydra_plugins/ │ │ └── dependency_submitit_launcher/ │ │ ├── __init__.py │ │ ├── config.py │ │ └── launcher.py │ └── setup.py ├── pyproject.toml ├── release_utils.py ├── scripts/ │ ├── __init__.py │ ├── average_checkpoints.py │ ├── build_sym_alignment.py │ ├── check_installation.py │ ├── compare_namespaces.py │ ├── compound_split_bleu.sh │ ├── constraints/ │ │ ├── extract.py │ │ └── validate.py │ ├── convert_dictionary.lua │ ├── convert_model.lua │ ├── count_docs.py │ ├── read_binarized.py │ ├── rm_pt.py │ ├── sacrebleu.sh │ ├── shard_docs.py │ ├── split_train_valid_docs.py │ ├── spm_decode.py │ ├── spm_encode.py │ ├── spm_train.py │ └── test_fsdp.sh ├── setup.cfg ├── setup.py ├── tests/ │ ├── __init__.py │ ├── distributed/ │ │ ├── __init__.py │ │ ├── test_bmuf.py │ │ ├── test_distributed_timeout_wrapper.py │ │ ├── test_module_proxy_wrapper.py │ │ ├── test_utils.py │ │ └── utils.py │ ├── gpu/ │ │ ├── __init__.py │ │ ├── test_binaries_gpu.py │ │ ├── test_ema_gpu.py │ │ └── transformer_quantization_config.yaml │ ├── speech/ │ │ ├── __init__.py │ │ ├── test_convtransformer_simul_trans.py │ │ ├── test_dual_input_wav_transformer.py │ │ ├── test_dualinput_s2t_transformer.py │ │ ├── 
test_fastspeech2.py │ │ ├── test_s2s_transformer.py │ │ ├── test_s2t_conformer.py │ │ ├── test_s2t_transformer.py │ │ ├── test_tts_transformer.py │ │ ├── test_wav2vec2.py │ │ └── test_xm_transformer.py │ ├── speech_recognition/ │ │ ├── __init__.py │ │ ├── asr_test_base.py │ │ ├── test_collaters.py │ │ ├── test_cross_entropy.py │ │ ├── test_data_utils.py │ │ └── test_vggtransformer.py │ ├── tasks/ │ │ ├── test_denoising.py │ │ ├── test_masked_lm.py │ │ ├── test_multilingual_denoising.py │ │ └── test_span_masked_lm.py │ ├── test_activation_checkpointing.py │ ├── test_amp_optimizer.py │ ├── test_average_checkpoints.py │ ├── test_backtranslation_dataset.py │ ├── test_binaries.py │ ├── test_binarizer.py │ ├── test_character_token_embedder.py │ ├── test_checkpoint_utils.py │ ├── test_checkpoint_utils_for_task_level_attributes.py │ ├── test_concat_dataset.py │ ├── test_constraints.py │ ├── test_convtbc.py │ ├── test_data_utils.py │ ├── test_dataclass_utils.py │ ├── test_dataset.py │ ├── test_dictionary.py │ ├── test_ema.py │ ├── test_espnet_multihead_attention.py │ ├── test_export.py │ ├── test_file_chunker_utils.py │ ├── test_file_io.py │ ├── test_fp16_optimizer.py │ ├── test_hf_hub.py │ ├── test_huffman.py │ ├── test_inference_dropout.py │ ├── test_iopath.py │ ├── test_iterators.py │ ├── test_label_smoothing.py │ ├── test_lm_context_window.py │ ├── test_lstm_jitable.py │ ├── test_memory_efficient_fp16.py │ ├── test_metrics.py │ ├── test_multi_corpus_dataset.py │ ├── test_multi_corpus_sampled_dataset.py │ ├── test_multihead_attention.py │ ├── test_noising.py │ ├── test_online_backtranslation.py │ ├── test_plasma_utils.py │ ├── test_positional_encoding.py │ ├── test_reproducibility.py │ ├── test_resampling_dataset.py │ ├── test_roberta.py │ ├── test_rotary_positional_embedding.py │ ├── test_sequence_generator.py │ ├── test_sequence_scorer.py │ ├── test_sparse_multihead_attention.py │ ├── test_token_block_dataset.py │ ├── test_train.py │ ├── test_transformer.py │ ├── 
test_utils.py │ ├── test_valid_subset_checks.py │ └── utils.py └── train.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/CODEOWNERS ================================================ # Setting up CODEOWNERS for UST related codebase # Documentation for open sourced models relevant to UST examples/speech_to_text @kahne @sravyapopuri388 @jmp84 examples/speech_to_speech @an918tw @sravyapopuri388 @jmp84 examples/speech_synthesis @kahne @jmp84 examples/simultaneous_translation @kahne @jmp84 examples/speech_text_joint_to_text @yuntang @jmp84 # Speech related models relevant to UST fairseq/models/speech_to_speech @sravyapopuri388 @jmp84 fairseq/models/speech_to_text @kahne @sravyapopuri388 @jmp84 fairseq/models/text_to_speech @kahne @jmp84 # CONFORMER IMPLEMENTATION fairseq/modules/conformer_layer.py @sravyapopuri388 @jmp84 fairseq/modules/espnet_multihead_attention.py @sravyapopuri388 @jmp84 fairseq/modules/rotary_positional_embedding.py @sravyapopuri388 @jmp84 fairseq/modules/positional_encoding.py @sravyapopuri388 @jmp84 # Machine Translation/NLLB fairseq/tasks/translation.py @gwenzek ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: 🐛 Bug Report about: Submit a bug report to help us improve labels: 'bug, needs triage' --- ## 🐛 Bug ### To Reproduce Steps to reproduce the behavior (**always include the command you ran**): 1. Run cmd '....' 2. 
See error #### Code sample ### Expected behavior ### Environment - fairseq Version (e.g., 1.0 or main): - PyTorch Version (e.g., 1.0) - OS (e.g., Linux): - How you installed fairseq (`pip`, source): - Build command you used (if compiling from source): - Python version: - CUDA/cuDNN version: - GPU models and configuration: - Any other relevant information: ### Additional context ================================================ FILE: .github/ISSUE_TEMPLATE/documentation.md ================================================ --- name: 📚 Documentation/Typos about: Report an issue related to documentation or a typo labels: 'documentation, needs triage' --- ## 📚 Documentation For typos and doc fixes, please go ahead and: 1. Create an issue. 2. Fix the typo. 3. Submit a PR. Thanks! ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: 🚀 Feature Request about: Submit a proposal/request for a new feature labels: 'enhancement, help wanted, needs triage' --- ## 🚀 Feature Request ### Motivation ### Pitch ### Alternatives ### Additional context ================================================ FILE: .github/ISSUE_TEMPLATE/how-to-question.md ================================================ --- name: ❓ Questions/Help about: If you have questions, please first search existing issues and docs labels: 'question, needs triage' --- ## ❓ Questions and Help ### Before asking: 1. search the issues. 2. search the docs. #### What is your question? #### Code #### What have you tried? #### What's your environment? 
- fairseq Version (e.g., 1.0 or main): - PyTorch Version (e.g., 1.0) - OS (e.g., Linux): - How you installed fairseq (`pip`, source): - Build command you used (if compiling from source): - Python version: - CUDA/cuDNN version: - GPU models and configuration: - Any other relevant information: ================================================ FILE: .github/ISSUE_TEMPLATE.md ================================================ ## 👉 [Please follow one of these issue templates](https://github.com/pytorch/fairseq/issues/new/choose) 👈 Note: to keep the backlog clean and actionable, issues may be immediately closed if they do not follow one of the above issue templates. ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ # Before submitting - [ ] Was this discussed/approved via a Github issue? (no need for typos, doc improvements) - [ ] Did you read the [contributor guideline](https://github.com/pytorch/fairseq/blob/main/CONTRIBUTING.md)? - [ ] Did you make sure to update the docs? - [ ] Did you write any new necessary tests? ## What does this PR do? Fixes # (issue). ## PR review Anyone in the community is free to review the PR once the tests have passed. If we didn't discuss your PR in Github issues there's a high chance it will not be merged. ## Did you have fun? 
Make sure you had fun coding 🙃 ================================================ FILE: .github/stale.yml ================================================ # Configuration for probot-stale - https://github.com/probot/stale # Mostly copied from github.com/facebook/react/blob/master/.github/stale.yml # Number of days of inactivity before an issue becomes stale daysUntilStale: 90 # Number of days of inactivity before a stale issue is closed daysUntilClose: 7 # Issues with these labels will never be considered stale exemptLabels: - bug # Label to use when marking an issue as stale staleLabel: stale issues: # Comment to post when marking an issue as stale. markComment: > This issue has been automatically marked as stale. **If this issue is still affecting you, please leave any comment** (for example, "bump"), and we'll keep it open. We are sorry that we haven't been able to prioritize it yet. If you have any new additional information, please include it with your comment! # Comment to post when closing a stale issue. closeComment: > Closing this issue after a prolonged period of inactivity. If this issue is still present in the latest release, please create a new issue with up-to-date information. Thank you! pulls: # Comment to post when marking a pull request as stale. markComment: > This pull request has been automatically marked as stale. **If this pull request is still relevant, please leave any comment** (for example, "bump"), and we'll keep it open. We are sorry that we haven't been able to prioritize reviewing it yet. Your contribution is very much appreciated. # Comment to post when closing a stale pull request. closeComment: > Closing this pull request after a prolonged period of inactivity. If this issue is still present in the latest release, please ask for this pull request to be reopened. Thank you! 
================================================ FILE: .github/workflows/build.yml ================================================ name: build on: # Trigger the workflow on push to main or any pull request push: branches: - main pull_request: jobs: build: strategy: max-parallel: 4 matrix: platform: [ubuntu-latest, macos-latest] python-version: [3.8, 3.9] runs-on: ${{ matrix.platform }} steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Conditionally install pytorch if: matrix.platform == 'windows-latest' run: pip3 install torch -f https://download.pytorch.org/whl/torch_stable.html - name: Install locally run: | python -m pip install --upgrade pip git submodule update --init --recursive python -m pip install . - name: Check installation working-directory: /tmp run: python $GITHUB_WORKSPACE/scripts/check_installation.py - name: Install optional test requirements run: | python -m pip install '.[dev,docs]' python -m pip install iopath transformers pyarrow python -m pip install git+https://github.com/facebookresearch/fairscale.git@main python -m pip install pygit2 pgzip - name: Install xformers for Macos if: matrix.platform == 'macos-latest' run: | brew install llvm libomp CC=/usr/local/opt/llvm/bin/clang CXX=clang++ pip install git+https://github.com/facebookresearch/xformers.git@main - name: Install xformers for non-MacOS if: matrix.platform != 'macos-latest' run: | python -m pip install --progress-bar off git+https://github.com/facebookresearch/xformers.git@main - name: Lint with black run: black --check --diff . - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - name: Build doc run: make singlehtml working-directory: docs/ - name: Run tests # When installing in non-editable mode, the .so files will be generated in 'site-packages/fairseq'. # But by default, pytest import machinery will load local fairseq, and won't see the .so. # Use --import-mode=append to favor the copy installed in 'site-packages/fairseq'. # https://docs.pytest.org/en/7.1.x/explanation/pythonpath.html run: pytest --import-mode=append -vvv tests/ ================================================ FILE: .github/workflows/depreview.yml ================================================ name: 'Dependency Review' on: [pull_request] permissions: contents: read jobs: dependency-review: runs-on: ubuntu-latest steps: - name: 'Checkout Repository' uses: actions/checkout@v4 - name: Dependency Review uses: actions/dependency-review-action@v4 ================================================ FILE: .github/workflows/release.yml ================================================ name: Fairseq Release on: workflow_dispatch: inputs: name: description: 'Release Type' default: 'patch' required: true jobs: get_next_version: runs-on: ubuntu-latest steps: - name: checkout-repo-content uses: actions/checkout@v2 - name: setup-python uses: actions/setup-python@v2 with: python-version: 3.8 - name: get next version and tag id: get-next-version-and-tag run: | output=$(python3 release_utils.py --release-type ${{ github.event.inputs.name }}) echo $output new_version=$(echo $output | awk '{print $1}') new_tag=$(echo $output | awk '{print $2}') echo "new version is $new_version" echo "new tag is $new_tag" echo ::set-output name=version::$new_version echo ::set-output name=tag::$new_tag echo ::set-output name=branch_name::$new_version-release echo "NEW_TAG=$new_tag" >> $GITHUB_ENV echo "NEW_BRANCH=$new_version-release" >> $GITHUB_ENV # update the version number in version.txt - name: update version id: update-version run : | echo
"current folder = $PWD" echo "current branch = $(git branch --show-current)" output=$(python3 release_utils.py --release-type ${{ github.event.inputs.name }} --update-version) - name: add and commit uses: EndBug/add-and-commit@v9 with: author_name: ${{ secrets.AUTHOR_NAME }} author_email: ${{ secrets.AUTHOR_EMAIL }} # TODO: change this to main once shipit is disabled. new_branch: '${{ env.NEW_BRANCH }}' default_author: github_actor message: '${{ env.NEW_TAG }} release' pathspec_error_handling: exitAtEnd # Arguments for the git pull command. Use NO-PULL to avoid the action pulling at all. # pull: 'NO-PULL' tag: '${{ env.NEW_TAG }}' outputs: new_version: ${{ steps.get-next-version-and-tag.outputs.version }} new_tag: ${{ steps.get-next-version-and-tag.outputs.tag }} branch_name: ${{ steps.get-next-version-and-tag.outputs.branch_name }} create_sdist: runs-on: ubuntu-latest name: Create Source Distribution needs: get_next_version steps: - uses: actions/checkout@v3 with: ref: ${{ needs.get_next_version.outputs.branch_name }} - name: Install Python uses: actions/setup-python@v2 with: python-version: '3.8' - name: Upgrade pip run: | python3 -m pip install --upgrade pip - name: Create Source Distribution run: | python3 -m pip install setuptools wheel twine torch python3 setup.py sdist - uses: actions/upload-artifact@v2 with: path: dist/*.tar.gz build_wheels: name: Build wheels on ${{ matrix.os }} runs-on: ${{ matrix.os }} needs: get_next_version strategy: matrix: os: [ubuntu-latest, macos-latest] steps: - uses: actions/checkout@v3 with: ref: ${{ needs.get_next_version.outputs.branch_name }} - name: Install Python uses: actions/setup-python@v2 with: python-version: '3.8' - name: Upgrade pip run: | python3 -m pip install --upgrade pip - name: Install cibuildwheel run: | python3 -m pip install cibuildwheel - name: Build wheels for CPython run: | python3 -m cibuildwheel --output-dir dist env: CIBW_BUILD: "cp38-*64" CIBW_MANYLINUX_X86_64_IMAGE: manylinux1 CIBW_BEFORE_BUILD: git 
submodule update --init --recursive && pip install . # Install system library CIBW_BEFORE_BUILD_LINUX: (yum install -y libffi-devel || apt-get install -y libffi-devel || apk add --update --no-cache libffi-devel || true) && (yum install -y libc6 || apt-get install -y libc6 || apk add --update --no-cache libc6 || true) CIBW_ENVIRONMENT: "PIP_ONLY_BINARY=numpy" CIBW_SKIP: "*musllinux*" - uses: actions/upload-artifact@v2 with: path: dist upload: name: Upload to PyPi and create release runs-on: ubuntu-latest needs: [build_wheels, create_sdist, get_next_version] steps: - uses: actions/download-artifact@v2 with: name: artifact path: dist # build the PyPI package and upload it - name: upload env: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | pip install setuptools wheel twine python3 -m twine upload --repository pypi dist/* # create the release on github - name: create release on github uses: ncipollo/release-action@v1 with: tag: '${{ needs.get_next_version.outputs.new_tag }}' ================================================ FILE: .gitignore ================================================ # JetBrains PyCharm IDE .idea/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # macOS dir files .DS_Store # Distribution / packaging .Python env/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg # Checkpoints checkpoints # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # dotenv .env # virtualenv .venv venv/ ENV/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # Generated files /fairseq/temporal_convolution_tbc /fairseq/modules/*_layer/*_forward.cu /fairseq/modules/*_layer/*_backward.cu /fairseq/version.py # data data-bin/ # reranking /examples/reranking/rerank_data # Cython-generated C++ source files /fairseq/data/data_utils_fast.cpp /fairseq/data/token_block_utils_fast.cpp # VSCODE .vscode/ftp-sync.json .vscode/settings.json # Experimental Folder experimental/* # Weights and Biases logs wandb/ # Hydra artifacts nohup.out multirun outputs ================================================ FILE: .gitmodules ================================================ [submodule "fairseq/model_parallel/megatron"] path = fairseq/model_parallel/megatron url = https://github.com/ngoyal2707/Megatron-LM branch = fairseq ================================================ FILE: .pre-commit-config.yaml ================================================ exclude: 'build|stubs' default_language_version: python: python3 repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.1.0 hooks: - id: trailing-whitespace - id: check-ast - id: check-merge-conflict - id: no-commit-to-branch args: ['--branch=master'] - id: check-added-large-files args: ['--maxkb=500'] - id: end-of-file-fixer - repo: https://github.com/ambv/black rev: 22.3.0 hooks: - 
id: black language_version: python3.8 - repo: https://gitlab.com/pycqa/flake8 rev: 3.9.2 hooks: - id: flake8 args: [ # only error for syntax errors and undefined names "--select=E9,F63,F7,F82", ] - repo: https://github.com/pycqa/isort rev: 5.10.1 hooks: - id: isort exclude: README.md additional_dependencies: [toml] args: ["--profile", "black"] ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. ## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies within all project spaces, and it also applies when an individual is representing the project or its community in public spaces. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at . All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq) We want to make contributing to this project as easy and transparent as possible. ## Pull Requests We actively welcome your pull requests. 1. Fork the repo and create your branch from `main`. 2. If you've added code that should be tested, add tests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. 5. Make sure your code lints. 6. If you haven't already, complete the Contributor License Agreement ("CLA"). ## Contributor License Agreement ("CLA") In order to accept your pull request, we need you to submit a CLA. You only need to do this once to work on any of Facebook's open source projects. Complete your CLA here: ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. ## License By contributing to Facebook AI Research Sequence-to-Sequence Toolkit (fairseq), you agree that your contributions will be licensed under the LICENSE file in the root directory of this source tree. ## Pre-commit hooks In order to ensure your code lints, there are pre-commit hooks configured in the repository which you can install. After installation, they will automatically run each time you commit. An abbreviated guide is given below; for more information, refer to [the official pre-commit documentation](https://pre-commit.com/).
### Installation ``` pip install pre-commit pre-commit install ``` ### Usage Just commit your changes: ``` git commit -m "My informative commit message" ``` If there was a failure, you will get feedback ``` [INFO] Initializing environment for https://github.com/PyCQA/flake8. [INFO] Installing environment for https://github.com/pre-commit/pre-commit-hooks. [INFO] Once installed this environment will be reused. [INFO] This may take a few minutes... [INFO] Installing environment for https://github.com/PyCQA/flake8. [INFO] Once installed this environment will be reused. [INFO] This may take a few minutes... Trim Trailing Whitespace.................................................Failed - hook id: trailing-whitespace - exit code: 1 - files were modified by this hook Fixing examples/nllb/modeling/wmt15_benchmark/eval_langs2.sh Fix End of Files.........................................................Failed - hook id: end-of-file-fixer - exit code: 1 - files were modified by this hook Fixing examples/few_shot/scripts/schedule_jobs_few_shot.py flake8...................................................................Passed ``` Certain hooks modify your files to comply. To include these modifications, you will need to add them (i.e. `git add ...`) and commit again. If all is well, you should see something like: ``` Trim Trailing Whitespace.................................................Passed Fix End of Files.........................................................Passed flake8...................................................................Passed [gshard-fix-ci 8698644e1] Fix lint, add pre-commit hooks 10 files changed, 148 insertions(+), 110 deletions(-) create mode 100644 .flake8 create mode 100644 .pre-commit-config.yaml rename examples/nllb/modeling/wmt15_benchmark/{eval_langs2.py => eval_langs2.sh} (99%) ``` ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) Facebook, Inc. 
and its affiliates. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: MANIFEST.in ================================================ include fairseq/version.txt ================================================ FILE: README.md ================================================



Support Ukraine MIT License Latest Release Build Status Documentation Status CircleCI Status

-------------------------------------------------------------------------------- Fairseq(-py) is a sequence modeling toolkit that allows researchers and developers to train custom models for translation, summarization, language modeling and other text generation tasks. We provide reference implementations of various sequence modeling papers:
List of implemented papers

* **Convolutional Neural Networks (CNN)** + [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/conv_lm/README.md) + [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) + [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) + [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) + [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) * **LightConv and DynamicConv models** + [Pay Less Attention with Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) * **Long Short-Term Memory (LSTM) networks** + Effective Approaches to Attention-based Neural Machine Translation (Luong et al., 2015) * **Transformer (self-attention) networks** + Attention Is All You Need (Vaswani et al., 2017) + [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) + [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) + [Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018)](examples/language_model/README.adaptive_inputs.md) + [Lexically constrained decoding with dynamic beam allocation (Post & Vilar, 2018)](examples/constrained_decoding/README.md) + [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context (Dai et al., 2019)](examples/truncated_bptt/README.md) + [Adaptive Attention Span in Transformers (Sukhbaatar et al., 2019)](examples/adaptive_span/README.md) + [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) + [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) + [Facebook FAIR's WMT19 News Translation Task 
Submission (Ng et al., 2019)](examples/wmt19/README.md) + [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md) + [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et al., 2020)](examples/mbart/README.md) + [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md) + [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md) + [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md) + [Generating Medical Reports from Patient-Doctor Conversations Using Sequence-to-Sequence Models (Enarvi et al., 2020)](examples/pointer_generator/README.md) + [Linformer: Self-Attention with Linear Complexity (Wang et al., 2020)](examples/linformer/README.md) + [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md) + [Deep Transformers with Latent Depth (Li et al., 2020)](examples/latent_depth/README.md) + [Unsupervised Cross-lingual Representation Learning for Speech Recognition (Conneau et al., 2020)](https://arxiv.org/abs/2006.13979) + [Self-training and Pre-training are Complementary for Speech Recognition (Xu et al., 2020)](https://arxiv.org/abs/2010.11430) + [Robust wav2vec 2.0: Analyzing Domain Shift in Self-Supervised Pre-Training (Hsu, et al., 2021)](https://arxiv.org/abs/2104.01027) + [Unsupervised Speech Recognition (Baevski, et al., 2021)](https://arxiv.org/abs/2105.11084) + [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition (Xu et al., 2021)](https://arxiv.org/abs/2109.11680) + [VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding (Xu et. al., 2021)](https://arxiv.org/pdf/2109.14084.pdf) + [VLM: Task-agnostic Video-Language Model Pre-training for Video Understanding (Xu et.
al., 2021)](https://aclanthology.org/2021.findings-acl.370.pdf) + [NormFormer: Improved Transformer Pretraining with Extra Normalization (Shleifer et. al, 2021)](examples/normformer/README.md) * **Non-autoregressive Transformers** + Non-Autoregressive Neural Machine Translation (Gu et al., 2017) + Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al. 2018) + Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al. 2019) + Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019) + [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) * **Finetuning** + [Better Fine-Tuning by Reducing Representational Collapse (Aghajanyan et al. 2020)](examples/rxf/README.md)

### What's New: * May 2023 [Released models for Scaling Speech Technology to 1,000+ Languages (Pratap, et al., 2023)](examples/mms/README.md) * June 2022 [Released code for wav2vec-U 2.0 from Towards End-to-end Unsupervised Speech Recognition (Liu, et al., 2022)](examples/wav2vec/unsupervised/README.md) * May 2022 [Integration with xFormers](https://github.com/facebookresearch/xformers) * December 2021 [Released Direct speech-to-speech translation code](examples/speech_to_speech/README.md) * October 2021 [Released VideoCLIP and VLM models](examples/MMPT/README.md) * October 2021 [Released multilingual finetuned XLSR-53 model](examples/wav2vec/README.md) * September 2021 [`master` branch renamed to `main`](https://github.com/github/renaming). * July 2021 [Released DrNMT code](examples/discriminative_reranking_nmt/README.md) * July 2021 [Released Robust wav2vec 2.0 model](examples/wav2vec/README.md) * June 2021 [Released XLMR-XL and XLMR-XXL models](examples/xlmr/README.md) * May 2021 [Released Unsupervised Speech Recognition code](examples/wav2vec/unsupervised/README.md) * March 2021 [Added full parameter and optimizer state sharding + CPU offloading](examples/fully_sharded_data_parallel/README.md) * February 2021 [Added LASER training code](examples/laser/README.md) * December 2020: [Added Adaptive Attention Span code](examples/adaptive_span/README.md) * December 2020: [GottBERT model and code released](examples/gottbert/README.md) * November 2020: Adopted the [Hydra](https://github.com/facebookresearch/hydra) configuration framework * [see documentation explaining how to use it for new and existing projects](docs/hydra_integration.md) * November 2020: [fairseq 0.10.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.10.0) * October 2020: [Added R3F/R4F (Better Fine-Tuning) code](examples/rxf/README.md) * October 2020: [Deep Transformer with Latent Depth code released](examples/latent_depth/README.md) * October 2020: [Added CRISS models and 
code](examples/criss/README.md)
Previous updates

* September 2020: [Added Linformer code](examples/linformer/README.md) * September 2020: [Added pointer-generator networks](examples/pointer_generator/README.md) * August 2020: [Added lexically constrained decoding](examples/constrained_decoding/README.md) * August 2020: [wav2vec2 models and code released](examples/wav2vec/README.md) * July 2020: [Unsupervised Quality Estimation code released](examples/unsupervised_quality_estimation/README.md) * May 2020: [Follow fairseq on Twitter](https://twitter.com/fairseq) * April 2020: [Monotonic Multihead Attention code released](examples/simultaneous_translation/README.md) * April 2020: [Quant-Noise code released](examples/quant_noise/README.md) * April 2020: [Initial model parallel support and 11B parameters unidirectional LM released](examples/megatron_11b/README.md) * March 2020: [Byte-level BPE code released](examples/byte_level_bpe/README.md) * February 2020: [mBART model and code released](examples/mbart/README.md) * February 2020: [Added tutorial for back-translation](https://github.com/pytorch/fairseq/tree/main/examples/backtranslation#training-your-own-model-wmt18-english-german) * December 2019: [fairseq 0.9.0 released](https://github.com/pytorch/fairseq/releases/tag/v0.9.0) * November 2019: [VizSeq released (a visual analysis toolkit for evaluating fairseq models)](https://facebookresearch.github.io/vizseq/docs/getting_started/fairseq_example) * November 2019: [CamemBERT model and code released](examples/camembert/README.md) * November 2019: [BART model and code released](examples/bart/README.md) * November 2019: [XLM-R models and code released](examples/xlmr/README.md) * September 2019: [Nonautoregressive translation code released](examples/nonautoregressive_translation/README.md) * August 2019: [WMT'19 models released](examples/wmt19/README.md) * July 2019: fairseq relicensed under MIT license * July 2019: [RoBERTa models and code released](examples/roberta/README.md) * June 2019: [wav2vec models and code 
released](examples/wav2vec/README.md)

### Features: * multi-GPU training on one machine or across multiple machines (data and model parallel) * fast generation on both CPU and GPU with multiple search algorithms implemented: + beam search + Diverse Beam Search ([Vijayakumar et al., 2016](https://arxiv.org/abs/1610.02424)) + sampling (unconstrained, top-k and top-p/nucleus) + [lexically constrained decoding](examples/constrained_decoding/README.md) (Post & Vilar, 2018) * [gradient accumulation](https://fairseq.readthedocs.io/en/latest/getting_started.html#large-mini-batch-training-with-delayed-updates) enables training with large mini-batches even on a single GPU * [mixed precision training](https://fairseq.readthedocs.io/en/latest/getting_started.html#training-with-half-precision-floating-point-fp16) (trains faster with less GPU memory on [NVIDIA tensor cores](https://developer.nvidia.com/tensor-cores)) * [extensible](https://fairseq.readthedocs.io/en/latest/overview.html): easily register new models, criterions, tasks, optimizers and learning rate schedulers * [flexible configuration](docs/hydra_integration.md) based on [Hydra](https://github.com/facebookresearch/hydra) allowing a combination of code, command-line and file based configuration * [full parameter and optimizer state sharding](examples/fully_sharded_data_parallel/README.md) * [offloading parameters to CPU](examples/fully_sharded_data_parallel/README.md) We also provide [pre-trained models for translation and language modeling](#pre-trained-models-and-examples) with a convenient `torch.hub` interface: ``` python en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model') en2de.translate('Hello world', beam=5) # 'Hallo Welt' ``` See the PyTorch Hub tutorials for [translation](https://pytorch.org/hub/pytorch_fairseq_translation/) and [RoBERTa](https://pytorch.org/hub/pytorch_fairseq_roberta/) for more examples. 
# Requirements and Installation * [PyTorch](http://pytorch.org/) version >= 1.10.0 * Python version >= 3.8 * For training new models, you'll also need an NVIDIA GPU and [NCCL](https://github.com/NVIDIA/nccl) * **To install fairseq** and develop locally: ``` bash git clone https://github.com/pytorch/fairseq cd fairseq pip install --editable ./ # on MacOS: # CFLAGS="-stdlib=libc++" pip install --editable ./ # to install the latest stable release (0.10.x) # pip install fairseq ``` * **For faster training** install NVIDIA's [apex](https://github.com/NVIDIA/apex) library: ``` bash git clone https://github.com/NVIDIA/apex cd apex pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \ --global-option="--deprecated_fused_adam" --global-option="--xentropy" \ --global-option="--fast_multihead_attn" ./ ``` * **For large datasets** install [PyArrow](https://arrow.apache.org/docs/python/install.html#using-pip): `pip install pyarrow` * If you use Docker make sure to increase the shared memory size either with `--ipc=host` or `--shm-size` as command line options to `nvidia-docker run` . # Getting Started The [full documentation](https://fairseq.readthedocs.io/) contains instructions for getting started, training new models and extending fairseq with new model types and tasks. # Pre-trained models and examples We provide pre-trained models and pre-processed, binarized test sets for several tasks listed below, as well as example training and evaluation commands. 
* [Translation](examples/translation/README.md): convolutional and transformer models are available * [Language Modeling](examples/language_model/README.md): convolutional and transformer models are available We also have more detailed READMEs to reproduce results from specific papers: * [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale (Babu et al., 2021)](examples/wav2vec/xlsr/README.md) * [Cross-lingual Retrieval for Iterative Self-Supervised Training (Tran et al., 2020)](examples/criss/README.md) * [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations (Baevski et al., 2020)](examples/wav2vec/README.md) * [Unsupervised Quality Estimation for Neural Machine Translation (Fomicheva et al., 2020)](examples/unsupervised_quality_estimation/README.md) * [Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)](examples/quant_noise/README.md) * [Neural Machine Translation with Byte-Level Subwords (Wang et al., 2020)](examples/byte_level_bpe/README.md) * [Multilingual Denoising Pre-training for Neural Machine Translation (Liu et al., 2020)](examples/mbart/README.md) * [Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019)](examples/layerdrop/README.md) * [Jointly Learning to Align and Translate with Transformer Models (Garg et al., 2019)](examples/joint_alignment_translation/README.md) * [Levenshtein Transformer (Gu et al., 2019)](examples/nonautoregressive_translation/README.md) * [Facebook FAIR's WMT19 News Translation Task Submission (Ng et al., 2019)](examples/wmt19/README.md) * [RoBERTa: A Robustly Optimized BERT Pretraining Approach (Liu et al., 2019)](examples/roberta/README.md) * [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](examples/wav2vec/README.md) * [Mixture Models for Diverse Machine Translation: Tricks of the Trade (Shen et al., 2019)](examples/translation_moe/README.md) * [Pay Less Attention with
Lightweight and Dynamic Convolutions (Wu et al., 2019)](examples/pay_less_attention_paper/README.md) * [Understanding Back-Translation at Scale (Edunov et al., 2018)](examples/backtranslation/README.md) * [Classical Structured Prediction Losses for Sequence to Sequence Learning (Edunov et al., 2018)](https://github.com/pytorch/fairseq/tree/classic_seqlevel) * [Hierarchical Neural Story Generation (Fan et al., 2018)](examples/stories/README.md) * [Scaling Neural Machine Translation (Ott et al., 2018)](examples/scaling_nmt/README.md) * [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](examples/conv_seq2seq/README.md) * [Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017)](examples/language_model/README.conv.md) # Join the fairseq community * Twitter: https://twitter.com/fairseq * Facebook page: https://www.facebook.com/groups/fairseq.users * Google group: https://groups.google.com/forum/#!forum/fairseq-users # License fairseq(-py) is MIT-licensed. The license applies to the pre-trained models as well. # Citation Please cite as: ``` bibtex @inproceedings{ott2019fairseq, title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, year = {2019}, } ``` ================================================ FILE: RELEASE.md ================================================ # Creating a New Release In order to create a new release: 1. Navigate to the [Fairseq Workflows](https://github.com/facebookresearch/fairseq/actions) and find the one named _Fairseq Release_. 2. Under _Run Workflow_ choose the branch `main` and for _Release Type_ enter either `major`, `minor`, or `patch`. 3. A branch named `$new_version-release` will be created where the `version.txt` file is updated. Merge those changes into `main`. 4. 
Make sure that a [new PYPI package](https://pypi.org/project/fairseq/) has been uploaded. 5. Make sure that a [new github release](https://github.com/facebookresearch/fairseq/releases) has been created. ================================================ FILE: docs/Makefile ================================================ # Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = python -msphinx SPHINXPROJ = fairseq SOURCEDIR = . BUILDDIR = _build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/command_line_tools.rst ================================================ .. _Command-line Tools: Command-line Tools ================== Fairseq provides several command-line tools for training and evaluating models: - :ref:`fairseq-preprocess`: Data pre-processing: build vocabularies and binarize training data - :ref:`fairseq-train`: Train a new model on one or multiple GPUs - :ref:`fairseq-generate`: Translate pre-processed data with a trained model - :ref:`fairseq-interactive`: Translate raw text with a trained model - :ref:`fairseq-score`: BLEU scoring of generated translations against reference translations - :ref:`fairseq-eval-lm`: Language model evaluation .. _fairseq-preprocess: fairseq-preprocess ~~~~~~~~~~~~~~~~~~ .. automodule:: fairseq_cli.preprocess .. argparse:: :module: fairseq.options :func: get_preprocessing_parser :prog: fairseq-preprocess .. _fairseq-train: fairseq-train ~~~~~~~~~~~~~ .. automodule:: fairseq_cli.train .. 
argparse:: :module: fairseq.options :func: get_training_parser :prog: fairseq-train .. _fairseq-generate: fairseq-generate ~~~~~~~~~~~~~~~~ .. automodule:: fairseq_cli.generate .. argparse:: :module: fairseq.options :func: get_generation_parser :prog: fairseq-generate .. _fairseq-interactive: fairseq-interactive ~~~~~~~~~~~~~~~~~~~ .. automodule:: fairseq_cli.interactive .. argparse:: :module: fairseq.options :func: get_interactive_generation_parser :prog: fairseq-interactive .. _fairseq-score: fairseq-score ~~~~~~~~~~~~~ .. automodule:: fairseq_cli.score .. argparse:: :module: fairseq_cli.score :func: get_parser :prog: fairseq-score .. _fairseq-eval-lm: fairseq-eval-lm ~~~~~~~~~~~~~~~ .. automodule:: fairseq_cli.eval_lm .. argparse:: :module: fairseq.options :func: get_eval_lm_parser :prog: fairseq-eval-lm ================================================ FILE: docs/conf.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # # fairseq documentation build configuration file, created by # sphinx-quickstart on Fri Aug 17 21:45:30 2018. # # This file is execfile()d with the current directory set to its # containing dir. # # Note that not all possible configuration values are present in this # autogenerated file. # # All configuration values have a default; values that are commented out # serve to show the default. # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. import os import sys from fairseq import __version__ # source code directory, relative to this file, for sphinx-autobuild sys.path.insert(0, os.path.abspath("..")) source_suffix = [".rst"] # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. 
# # needs_sphinx = '1.0' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. extensions = [ "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.viewcode", "sphinx.ext.napoleon", "sphinxarg.ext", ] # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] # The master toctree document. master_doc = "index" # General information about the project. project = "fairseq" copyright = "Facebook AI Research (FAIR)" author = "Facebook AI Research (FAIR)" github_doc_root = "https://github.com/pytorch/fairseq/tree/main/docs/" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. version = __version__ # The full version, including alpha/beta/rc tags. release = __version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. language = None # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # These patterns also affect html_static_path and html_extra_path exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" highlight_language = "python" # If true, `todo` and `todoList` produce output, else they produce nothing. todo_include_todos = False # -- Options for HTML output ---------------------------------------------- html_theme = "classic" # Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = { "numpy": ("http://docs.scipy.org/doc/numpy/", None), "python": ("https://docs.python.org/", None), "torch": ("https://pytorch.org/docs/master/", None), } ================================================ FILE: docs/criterions.rst ================================================ .. role:: hidden :class: hidden-section .. _Criterions: Criterions ========== Criterions compute the loss function given the model and batch, roughly:: loss = criterion(model, batch) .. automodule:: fairseq.criterions :members: .. autoclass:: fairseq.criterions.FairseqCriterion :members: :undoc-members: .. autoclass:: fairseq.criterions.adaptive_loss.AdaptiveLoss :members: :undoc-members: .. autoclass:: fairseq.criterions.composite_loss.CompositeLoss :members: :undoc-members: .. autoclass:: fairseq.criterions.cross_entropy.CrossEntropyCriterion :members: :undoc-members: .. autoclass:: fairseq.criterions.label_smoothed_cross_entropy.LabelSmoothedCrossEntropyCriterion :members: :undoc-members: ================================================ FILE: docs/data.rst ================================================ .. role:: hidden :class: hidden-section .. module:: fairseq.data Data Loading and Utilities ========================== .. _datasets: Datasets -------- **Datasets** define the data format and provide helpers for creating mini-batches. .. autoclass:: fairseq.data.FairseqDataset :members: .. autoclass:: fairseq.data.LanguagePairDataset :members: .. autoclass:: fairseq.data.MonolingualDataset :members: **Helper Datasets** These datasets wrap other :class:`fairseq.data.FairseqDataset` instances and provide additional functionality: .. autoclass:: fairseq.data.BacktranslationDataset :members: .. autoclass:: fairseq.data.ConcatDataset :members: .. autoclass:: fairseq.data.ResamplingDataset :members: .. autoclass:: fairseq.data.RoundRobinZipDatasets :members: .. autoclass:: fairseq.data.TransformEosDataset :members: Dictionary ---------- .. 
autoclass:: fairseq.data.Dictionary :members: Iterators --------- .. autoclass:: fairseq.data.CountingIterator :members: .. autoclass:: fairseq.data.EpochBatchIterator :members: .. autoclass:: fairseq.data.GroupedIterator :members: .. autoclass:: fairseq.data.ShardedIterator :members: ================================================ FILE: docs/docutils.conf ================================================ [writers] option-limit=0 ================================================ FILE: docs/getting_started.rst ================================================ Evaluating Pre-trained Models ============================= First, download a pre-trained model along with its vocabularies: .. code-block:: console > curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf - This model uses a `Byte Pair Encoding (BPE) vocabulary <https://arxiv.org/abs/1508.07909>`__, so we'll have to apply the encoding to the source text before it can be translated. This can be done with the `apply\_bpe.py <https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/apply_bpe.py>`__ script using the ``wmt14.en-fr.fconv-py/bpecodes`` file. ``@@`` is used as a continuation marker and the original text can be easily recovered with e.g. ``sed 's/@@ //g'`` or by passing the ``--remove-bpe`` flag to :ref:`fairseq-generate`. Prior to BPE, input text needs to be tokenized using ``tokenizer.perl`` from `mosesdecoder <https://github.com/moses-smt/mosesdecoder>`__. Let's use :ref:`fairseq-interactive` to generate translations interactively. Here, we use a beam size of 5 and preprocess the input with the Moses tokenizer and the given Byte-Pair Encoding vocabulary. It will automatically remove the BPE continuation markers and detokenize the output. ..
code-block:: console > MODEL_DIR=wmt14.en-fr.fconv-py > fairseq-interactive \ --path $MODEL_DIR/model.pt $MODEL_DIR \ --beam 5 --source-lang en --target-lang fr \ --tokenizer moses \ --bpe subword_nmt --bpe-codes $MODEL_DIR/bpecodes | loading model(s) from wmt14.en-fr.fconv-py/model.pt | [en] dictionary: 44206 types | [fr] dictionary: 44463 types | Type the input sentence and press return: Why is it rare to discover new marine mammal species? S-0 Why is it rare to discover new marine mam@@ mal species ? H-0 -0.0643349438905716 Pourquoi est-il rare de découvrir de nouvelles espèces de mammifères marins? P-0 -0.0763 -0.1849 -0.0956 -0.0946 -0.0735 -0.1150 -0.1301 -0.0042 -0.0321 -0.0171 -0.0052 -0.0062 -0.0015 This generation script produces three types of outputs: a line prefixed with *S* is the source sentence as passed to the model (after tokenization and BPE); *H* is the hypothesis along with an average log-likelihood; and *P* is the positional score per token position, including the end-of-sentence marker which is omitted from the text. Other types of output lines you might see are *D*, the detokenized hypothesis, *T*, the reference target, *A*, alignment info, and *E*, the history of generation steps. See the `README <https://github.com/pytorch/fairseq#pre-trained-models>`__ for a full list of pre-trained models available. Training a New Model ==================== The following tutorial is for machine translation. For an example of how to use Fairseq for other tasks, such as :ref:`language modeling`, please see the ``examples/`` directory. Data Pre-processing ------------------- Fairseq contains example pre-processing scripts for several translation datasets: IWSLT 2014 (German-English), WMT 2014 (English-French) and WMT 2014 (English-German). To pre-process and binarize the IWSLT dataset: .. code-block:: console > cd examples/translation/ > bash prepare-iwslt14.sh > cd ../..
> TEXT=examples/translation/iwslt14.tokenized.de-en > fairseq-preprocess --source-lang de --target-lang en \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --destdir data-bin/iwslt14.tokenized.de-en This will write binarized data that can be used for model training to ``data-bin/iwslt14.tokenized.de-en``. Training -------- Use :ref:`fairseq-train` to train a new model. Here are a few example settings that work well for the IWSLT 2014 dataset: .. code-block:: console > mkdir -p checkpoints/fconv > CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt14.tokenized.de-en \ --optimizer nag --lr 0.25 --clip-norm 0.1 --dropout 0.2 --max-tokens 4000 \ --arch fconv_iwslt_de_en --save-dir checkpoints/fconv By default, :ref:`fairseq-train` will use all available GPUs on your machine. Use the ``CUDA_VISIBLE_DEVICES`` environment variable to select specific GPUs and/or to change the number of GPU devices that will be used. Also note that the batch size is specified in terms of the maximum number of tokens per batch (``--max-tokens``). You may need to use a smaller value depending on the available GPU memory on your system. Generation ---------- Once your model is trained, you can generate translations using :ref:`fairseq-generate` **(for binarized data)** or :ref:`fairseq-interactive` **(for raw text)**: .. code-block:: console > fairseq-generate data-bin/iwslt14.tokenized.de-en \ --path checkpoints/fconv/checkpoint_best.pt \ --batch-size 128 --beam 5 | [de] dictionary: 35475 types | [en] dictionary: 24739 types | data-bin/iwslt14.tokenized.de-en test 6750 examples | model fconv | loaded checkpoint checkpoints/fconv/checkpoint_best.pt S-721 danke . T-721 thank you . ... To generate translations with only a CPU, use the ``--cpu`` flag. BPE continuation markers can be removed with the ``--remove-bpe`` flag.
Advanced Training Options ========================= Large mini-batch training with delayed updates ---------------------------------------------- The ``--update-freq`` option can be used to accumulate gradients from multiple mini-batches and delay updating, creating a larger effective batch size. Delayed updates can also improve training speed by reducing inter-GPU communication costs and by saving idle time caused by variance in workload across GPUs. See `Ott et al. (2018) <https://arxiv.org/abs/1806.00187>`__ for more details. To train on a single GPU with an effective batch size that is equivalent to training on 8 GPUs: .. code-block:: console > CUDA_VISIBLE_DEVICES=0 fairseq-train --update-freq 8 (...) Training with half precision floating point (FP16) -------------------------------------------------- .. note:: FP16 training requires a Volta GPU and CUDA 9.1 or greater. Recent GPUs enable efficient half precision floating point computation, e.g., using `Nvidia Tensor Cores <https://www.nvidia.com/en-us/data-center/tensorcore/>`__. Fairseq supports FP16 training with the ``--fp16`` flag: .. code-block:: console > fairseq-train --fp16 (...) Distributed training -------------------- Distributed training in fairseq is implemented on top of ``torch.distributed``. The easiest way to launch jobs is with the `torch.distributed.launch <https://pytorch.org/docs/stable/distributed.html#launch-utility>`__ tool. For example, to train a large English-German Transformer model on 2 nodes each with 8 GPUs (in total 16 GPUs), run the following command on each node, replacing ``node_rank=0`` with ``node_rank=1`` on the second node and making sure to update ``--master_addr`` to the IP address of the first node: ..
code-block:: console > python -m torch.distributed.launch --nproc_per_node=8 \ --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" \ --master_port=12345 \ $(which fairseq-train) data-bin/wmt16_en_de_bpe32k \ --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ --lr 0.0005 \ --dropout 0.3 --weight-decay 0.0 --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --max-tokens 3584 \ --max-epoch 70 \ --fp16 On SLURM clusters, fairseq will automatically detect the number of nodes and GPUs, but a port number must be provided: .. code-block:: console > salloc --gpus=16 --nodes 2 (...) > srun fairseq-train --distributed-port 12345 (...). .. warning:: PyTorch Distributed features used in fairseq are intended for internal communication only. They are not built for use in untrusted environments or networks. For performance reasons, none of the PyTorch Distributed primitives include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a distributed fairseq job on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running the job. Sharding very large datasets ---------------------------- It can be challenging to train over very large datasets, particularly if your machine does not have much system RAM. Most tasks in fairseq support training over "sharded" datasets, in which the original dataset has been preprocessed into non-overlapping chunks (or "shards"). For example, instead of preprocessing all your data into a single "data-bin" directory, you can split the data and create "data-bin1", "data-bin2", etc. Then you can adapt your training command like so: .. code-block:: console > fairseq-train data-bin1:data-bin2:data-bin3 (...) 
Training will now iterate over each shard, one by one, with each shard corresponding to an "epoch", thus reducing system memory usage. ================================================ FILE: docs/hydra_integration.md ================================================ ## Hydra [Hydra](https://github.com/facebookresearch/hydra) is an open-source Python framework that simplifies the development of research and other complex applications. The key feature is the ability to dynamically create a hierarchical configuration by composition and override it through config files and the command line. The name Hydra comes from its ability to run multiple similar jobs - much like a Hydra with multiple heads. ## Motivation Until recently, all components in fairseq were configured through a shared `args` namespace that was created at application startup. Components declared their own `add_args` method to update the argparse parser, hoping that the names would not clash with arguments from other components. While this model works for smaller applications, as fairseq grew and became integrated into other applications, this became problematic. In order to determine how to configure each component, one needed to a) examine what args were added by this component, and b) read the code to figure out what shared arguments it is using that were added in other places. Reproducing models involved sharing commands that often contained dozens of command line switches. The model described above is still supported by fairseq for backward compatibility, but will be deprecated some time in the future. New components in fairseq should now create a dataclass that encapsulates all parameters required to configure this component. The dataclass is registered along with the component, and fairseq takes care of constructing and providing this configuration object to the component's constructor. 
Note that sharing parameters can optionally still work, but one has to explicitly point to the "source of truth" (see inheritance example below). These changes make components in fairseq more independent and re-usable by other applications: all that is needed to create a component is to initialize its dataclass and overwrite some of the defaults. While configuring fairseq through the command line (using either the legacy argparse-based or the new Hydra-based entry points) is still fully supported, you can now take advantage of configuring fairseq completely or piece-by-piece through hierarchical YAML configuration files. These files can also be shipped as examples that others can use to run an identically configured job. Additionally, Hydra has a rich and growing [library of plugins](https://github.com/facebookresearch/hydra/tree/master/plugins) that provide functionality such as hyperparameter sweeping (including using Bayesian optimization through the [Ax](https://github.com/facebook/Ax) library), job launching across various platforms, and more. ## Creating or migrating components In general, each new (or updated) component should provide a companion [dataclass](https://www.python.org/dev/peps/pep-0557/). These dataclasses are typically located in the same file as the component and are passed as arguments to the `register_*()` functions. Top-level configs that should be present in every fairseq application are placed in the [global](fairseq/dataclass/configs.py) config file and added to the `FairseqConfig` object. Each dataclass is a plain-old-data object, similar to a `NamedTuple`. These classes are decorated with a `@dataclass` decorator, and typically inherit from `FairseqDataclass` (which adds some functionality for backward compatibility). Each field must have a type, and generally has metadata (such as a help string) and a default value. Only primitive types or other config objects are allowed as data types for each field.
#### Example: ```python from dataclasses import dataclass, field from fairseq.dataclass import FairseqDataclass @dataclass class InteractiveConfig(FairseqDataclass): buffer_size: int = field( default=0, metadata={ "help": "read this many sentences into a buffer before processing them" }, ) input: str = field( default="-", metadata={"help": "file to read from; use - for stdin"}, ) ``` ### Inheriting values Some components require sharing a value. For example, a learning rate scheduler and an optimizer may both need to know the initial learning rate value. One can declare a field that, by default, will inherit its value from another config node in the same hierarchy: ```python @dataclass class FairseqAdamConfig(FairseqDataclass): ... lr: List[float] = II("optimization.lr") ... ``` `II("optimization.lr")` is syntactic sugar for `"${optimization.lr}"`, which is the value one can use in a YAML config file or through the command line to achieve the same effect. Note that this assumes that there is an "optimization" config object in the root config and it has a field called "lr". ### Tasks and Models Creating Tasks and Models works the same as before, except that legacy implementations now inherit from `LegacyFairseq*` base classes, while new components inherit from `FairseqTask` and `FairseqModel` and provide a dataclass to the `register_*()` functions. #### Task example: ```python @dataclass class LanguageModelingConfig(FairseqDataclass): data: Optional[str] = field( default=None, metadata={"help": "path to data directory"} ) ... @register_task("language_modeling", dataclass=LanguageModelingConfig) class LanguageModelingTask(FairseqTask): ... @classmethod def setup_task(cls, cfg: LanguageModelingConfig): ...
``` #### Model example: ```python @dataclass class TransformerLanguageModelConfig(FairseqDataclass): activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="relu", metadata={"help": "activation function to use"} ) dropout: float = field(default=0.1, metadata={"help": "dropout probability"}) ... @register_model("transformer_lm", dataclass=TransformerLanguageModelConfig) class TransformerLanguageModel(FairseqLanguageModel): ... @classmethod def build_model(cls, cfg: TransformerLanguageModelConfig, task: FairseqTask): ... ``` ### Other components Other components work as before, but they now take their configuration dataclass as the only constructor argument: ```python @dataclass class MosesTokenizerConfig(FairseqDataclass): source_lang: str = field(default="en", metadata={"help": "source language"}) ... @register_tokenizer("moses", dataclass=MosesTokenizerConfig) class MosesTokenizer(object): def __init__(self, cfg: MosesTokenizerConfig): ... ``` Note that if you are adding a new registry for a new set of components, you need to add it to the `FairseqConfig` object in `fairseq/dataclass/configs.py`: ```python @dataclass class FairseqConfig(object): ... my_new_registry: Any = None ``` ## Training with `fairseq-hydra-train` To fully take advantage of configuration flexibility offered by Hydra, you may want to train new models using the `fairseq-hydra-train` entry point. Legacy CLI tools such as `fairseq-train` will remain supported for the foreseeable future but will be deprecated eventually. On startup, Hydra will create a configuration object that contains a hierarchy of all the necessary dataclasses populated with their default values in the code. The default values are overwritten by values found in YAML files in `fairseq/config` directory (which currently sets minimal defaults) and then further overwritten by values provided through command line arguments. Some of the most common use cases are shown below: ### 1. 
Override default values through command line: ```shell script $ fairseq-hydra-train \ distributed_training.distributed_world_size=1 \ dataset.batch_size=2 \ task.data=data-bin \ model=transformer_lm/transformer_lm_gpt \ task=language_modeling \ optimization.max_update=5000 ``` Note that along with explicitly providing values for parameters such as `dataset.batch_size`, this also tells Hydra to overlay configuration found in `fairseq/config/model/transformer_lm/transformer_lm_gpt.yaml` over the default values in the dataclass. If you want to train a model without specifying a particular architecture you can simply specify `model=transformer_lm`. This only works for migrated tasks and models. ### 2. Replace bundled configs with an external config: ```shell script $ fairseq-hydra-train \ --config-dir /path/to/external/configs \ --config-name wiki103 ``` where `/path/to/external/configs/wiki103.yaml` contains: ```yaml # @package _group_ model: _name: transformer_lm distributed_training: distributed_world_size: 1 dataset: batch_size: 2 task: _name: language_modeling data: /path/to/data add_bos_token: false max_target_positions: 1024 optimization: max_update: 50000 lr: [ 0.25 ] criterion: cross_entropy optimizer: adam lr_scheduler: _name: cosine ``` Note that here bundled configs from `fairseq/config` directory are not used, however the defaults from each dataclass will still be used (unless overwritten by your external config). Additionally you can choose to break up your configs by creating a directory structure in the same location as your main config file, with the names of the top-level fields (such as "model", "dataset", etc), and placing config files with meaningful names that would populate that specific section of your top-level config file (for example, you might have `model/small_transformer_lm.yaml`, `model/big_transformer_lm.yaml`, etc). 
You can then specify the correct configuration via command line, defaults in the main config, or even launch all of them as a sweep (see Hydra documentation on how to do this). ### 3. Add an external config directory to Hydra search path: This allows combining default configuration (including using any bundled config files), while specifying your own config files for some parts of the configuration. ```shell script $ fairseq-hydra-train \ distributed_training.distributed_world_size=1 \ dataset.batch_size=2 \ task.data=/path/to/data/ \ model=transformer_lm/2_layers \ task=language_modeling \ optimization.max_update=5000 \ --config-dir /path/to/external/configs ``` where `/path/to/external/configs` has the following structure: ``` . +-- model | +-- transformer_lm | | +-- 2_layers.yaml ``` and `2_layers.yaml` contains a copy of `transformer_lm_gpt.yaml` but with `decoder_layers` set to 2. You can add other configs to configure other components as well. ================================================ FILE: docs/index.rst ================================================ .. fairseq documentation master file, created by sphinx-quickstart on Fri Aug 17 21:45:30 2018. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. :github_url: https://github.com/pytorch/fairseq fairseq documentation ===================== Fairseq is a sequence modeling toolkit written in `PyTorch `_ that allows researchers and developers to train custom models for translation, summarization, language modeling and other text generation tasks. .. toctree:: :maxdepth: 1 :caption: Getting Started getting_started command_line_tools .. toctree:: :maxdepth: 1 :caption: Extending Fairseq overview tutorial_simple_lstm tutorial_classifying_names .. 
toctree:: :maxdepth: 2 :caption: Library Reference tasks models criterions optim lr_scheduler data modules Indices and tables ================== * :ref:`genindex` * :ref:`search` ================================================ FILE: docs/lr_scheduler.rst ================================================ .. role:: hidden :class: hidden-section .. _Learning Rate Schedulers: Learning Rate Schedulers ======================== Learning Rate Schedulers update the learning rate over the course of training. Learning rates can be updated after each update via :func:`step_update` or at epoch boundaries via :func:`step`. .. automodule:: fairseq.optim.lr_scheduler :members: .. autoclass:: fairseq.optim.lr_scheduler.FairseqLRScheduler :members: :undoc-members: .. autoclass:: fairseq.optim.lr_scheduler.cosine_lr_scheduler.CosineSchedule :members: :undoc-members: .. autoclass:: fairseq.optim.lr_scheduler.fixed_schedule.FixedSchedule :members: :undoc-members: .. autoclass:: fairseq.optim.lr_scheduler.inverse_square_root_schedule.InverseSquareRootSchedule :members: :undoc-members: .. autoclass:: fairseq.optim.lr_scheduler.reduce_lr_on_plateau.ReduceLROnPlateau :members: :undoc-members: .. autoclass:: fairseq.optim.lr_scheduler.triangular_lr_scheduler.TriangularSchedule :members: :undoc-members: ================================================ FILE: docs/make.bat ================================================ @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=python -msphinx ) set SOURCEDIR=. set BUILDDIR=_build set SPHINXPROJ=fairseq if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The Sphinx module was not found. Make sure you have Sphinx installed, echo.then set the SPHINXBUILD environment variable to point to the full echo.path of the 'sphinx-build' executable. Alternatively you may add the echo.Sphinx directory to PATH. echo. 
echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% :end popd ================================================ FILE: docs/models.rst ================================================ .. role:: hidden :class: hidden-section .. module:: fairseq.models .. _Models: Models ====== A Model defines the neural network's ``forward()`` method and encapsulates all of the learnable parameters in the network. Each model also provides a set of named *architectures* that define the precise network configuration (e.g., embedding dimension, number of layers, etc.). Both the model type and architecture are selected via the ``--arch`` command-line argument. Once selected, a model may expose additional command-line arguments for further configuration. .. note:: All fairseq Models extend :class:`BaseFairseqModel`, which in turn extends :class:`torch.nn.Module`. Thus any fairseq Model can be used as a stand-alone Module in other PyTorch code. Convolutional Neural Networks (CNN) ----------------------------------- .. module:: fairseq.models.fconv .. autoclass:: fairseq.models.fconv.FConvModel :members: .. autoclass:: fairseq.models.fconv.FConvEncoder :members: :undoc-members: .. autoclass:: fairseq.models.fconv.FConvDecoder :members: Long Short-Term Memory (LSTM) networks -------------------------------------- .. module:: fairseq.models.lstm .. autoclass:: fairseq.models.lstm.LSTMModel :members: .. autoclass:: fairseq.models.lstm.LSTMEncoder :members: .. autoclass:: fairseq.models.lstm.LSTMDecoder :members: Transformer (self-attention) networks ------------------------------------- .. module:: fairseq.models.transformer .. autoclass:: fairseq.models.transformer.TransformerModel :members: .. autoclass:: fairseq.models.transformer.TransformerEncoder :members: .. 
autoclass:: fairseq.models.transformer.TransformerEncoderLayer :members: .. autoclass:: fairseq.models.transformer.TransformerDecoder :members: .. autoclass:: fairseq.models.transformer.TransformerDecoderLayer :members: Adding new models ----------------- .. currentmodule:: fairseq.models .. autofunction:: fairseq.models.register_model .. autofunction:: fairseq.models.register_model_architecture .. autoclass:: fairseq.models.BaseFairseqModel :members: :undoc-members: .. autoclass:: fairseq.models.FairseqEncoderDecoderModel :members: :undoc-members: .. autoclass:: fairseq.models.FairseqEncoderModel :members: :undoc-members: .. autoclass:: fairseq.models.FairseqLanguageModel :members: :undoc-members: .. autoclass:: fairseq.models.FairseqMultiModel :members: :undoc-members: .. autoclass:: fairseq.models.FairseqEncoder :members: .. autoclass:: fairseq.models.CompositeEncoder :members: .. autoclass:: fairseq.models.FairseqDecoder :members: .. _Incremental decoding: Incremental decoding -------------------- .. autoclass:: fairseq.models.FairseqIncrementalDecoder :members: :undoc-members: ================================================ FILE: docs/modules.rst ================================================ Modules ======= Fairseq provides several stand-alone :class:`torch.nn.Module` classes that may be helpful when implementing a new :class:`~fairseq.models.BaseFairseqModel`. .. automodule:: fairseq.modules :members: :undoc-members: ================================================ FILE: docs/optim.rst ================================================ .. role:: hidden :class: hidden-section .. _optimizers: Optimizers ========== Optimizers update the Model parameters based on the gradients. .. automodule:: fairseq.optim :members: .. autoclass:: fairseq.optim.FairseqOptimizer :members: :undoc-members: .. autoclass:: fairseq.optim.adadelta.Adadelta :members: :undoc-members: .. autoclass:: fairseq.optim.adagrad.Adagrad :members: :undoc-members: .. 
autoclass:: fairseq.optim.adafactor.FairseqAdafactor :members: :undoc-members: .. autoclass:: fairseq.optim.adam.FairseqAdam :members: :undoc-members: .. autoclass:: fairseq.optim.fp16_optimizer.FP16Optimizer :members: :undoc-members: .. autoclass:: fairseq.optim.nag.FairseqNAG :members: :undoc-members: .. autoclass:: fairseq.optim.sgd.SGD :members: :undoc-members: ================================================ FILE: docs/overview.rst ================================================ Overview ======== Fairseq can be extended through user-supplied `plug-ins `_. We support five kinds of plug-ins: - :ref:`Models` define the neural network architecture and encapsulate all of the learnable parameters. - :ref:`Criterions` compute the loss function given the model outputs and targets. - :ref:`Tasks` store dictionaries and provide helpers for loading/iterating over Datasets, initializing the Model/Criterion and calculating the loss. - :ref:`Optimizers` update the Model parameters based on the gradients. - :ref:`Learning Rate Schedulers` update the learning rate over the course of training. **Training Flow** Given a ``model``, ``criterion``, ``task``, ``optimizer`` and ``lr_scheduler``, fairseq implements the following high-level training flow:: for epoch in range(num_epochs): itr = task.get_batch_iterator(task.dataset('train')) for num_updates, batch in enumerate(itr): task.train_step(batch, model, criterion, optimizer) average_and_clip_gradients() optimizer.step() lr_scheduler.step_update(num_updates) lr_scheduler.step(epoch) where the default implementation for ``task.train_step`` is roughly:: def train_step(self, batch, model, criterion, optimizer, **unused): loss = criterion(model, batch) optimizer.backward(loss) return loss **Registering new plug-ins** New plug-ins are *registered* through a set of ``@register`` function decorators, for example:: @register_model('my_lstm') class MyLSTM(FairseqEncoderDecoderModel): (...) 
Once registered, new plug-ins can be used with the existing :ref:`Command-line Tools`. See the Tutorial sections for more detailed walkthroughs of how to add new plug-ins. **Loading plug-ins from another directory** New plug-ins can be defined in a custom module stored in the user system. In order to import the module, and make the plugin available to *fairseq*, the command line supports the ``--user-dir`` flag that can be used to specify a custom location for additional modules to load into *fairseq*. For example, assuming this directory tree:: /home/user/my-module/ └── __init__.py with ``__init__.py``:: from fairseq.models import register_model_architecture from fairseq.models.transformer import transformer_vaswani_wmt_en_de_big @register_model_architecture('transformer', 'my_transformer') def transformer_mmt_big(args): transformer_vaswani_wmt_en_de_big(args) it is possible to invoke the :ref:`fairseq-train` script with the new architecture with:: fairseq-train ... --user-dir /home/user/my-module -a my_transformer --task translation ================================================ FILE: docs/tasks.rst ================================================ .. role:: hidden :class: hidden-section .. module:: fairseq.tasks .. _Tasks: Tasks ===== Tasks store dictionaries and provide helpers for loading/iterating over Datasets, initializing the Model/Criterion and calculating the loss. Tasks can be selected via the ``--task`` command-line argument. Once selected, a task may expose additional command-line arguments for further configuration. 
Example usage:: # setup the task (e.g., load dictionaries) task = fairseq.tasks.setup_task(args) # build model and criterion model = task.build_model(args) criterion = task.build_criterion(args) # load datasets task.load_dataset('train') task.load_dataset('valid') # iterate over mini-batches of data batch_itr = task.get_batch_iterator( task.dataset('train'), max_tokens=4096, ) for batch in batch_itr: # compute the loss loss, sample_size, logging_output = task.get_loss( model, criterion, batch, ) loss.backward() Translation ----------- .. autoclass:: fairseq.tasks.translation.TranslationTask .. _language modeling: Language Modeling ----------------- .. autoclass:: fairseq.tasks.language_modeling.LanguageModelingTask Adding new tasks ---------------- .. autofunction:: fairseq.tasks.register_task .. autoclass:: fairseq.tasks.FairseqTask :members: :undoc-members: ================================================ FILE: docs/tutorial_classifying_names.rst ================================================ Tutorial: Classifying Names with a Character-Level RNN ====================================================== In this tutorial we will extend fairseq to support *classification* tasks. In particular we will re-implement the PyTorch tutorial for `Classifying Names with a Character-Level RNN `_ in fairseq. It is recommended to quickly skim that tutorial before beginning this one. This tutorial covers: 1. **Preprocessing the data** to create dictionaries. 2. **Registering a new Model** that encodes an input sentence with a simple RNN and predicts the output label. 3. **Registering a new Task** that loads our dictionaries and dataset. 4. **Training the Model** using the existing command-line tools. 5. **Writing an evaluation script** that imports fairseq and allows us to interactively evaluate our model on new inputs. 1. 
Preprocessing the data ------------------------- The original tutorial provides raw data, but we'll work with a modified version of the data that is already tokenized into characters and split into separate train, valid and test sets. Download and extract the data from here: `tutorial_names.tar.gz `_ Once extracted, let's preprocess the data using the :ref:`fairseq-preprocess` command-line tool to create the dictionaries. While this tool is primarily intended for sequence-to-sequence problems, we're able to reuse it here by treating the label as a "target" sequence of length 1. We'll also output the preprocessed files in "raw" format using the ``--dataset-impl`` option to enhance readability: .. code-block:: console > fairseq-preprocess \ --trainpref names/train --validpref names/valid --testpref names/test \ --source-lang input --target-lang label \ --destdir names-bin --dataset-impl raw After running the above command you should see a new directory, :file:`names-bin/`, containing the dictionaries for *inputs* and *labels*. 2. Registering a new Model -------------------------- Next we'll register a new model in fairseq that will encode an input sentence with a simple RNN and predict the output label. Compared to the original PyTorch tutorial, our version will also work with batches of data and GPU Tensors. First let's copy the simple RNN module implemented in the `PyTorch tutorial `_. 
Create a new file named :file:`fairseq/models/rnn_classifier.py` with the following contents:: import torch import torch.nn as nn class RNN(nn.Module): def __init__(self, input_size, hidden_size, output_size): super(RNN, self).__init__() self.hidden_size = hidden_size self.i2h = nn.Linear(input_size + hidden_size, hidden_size) self.i2o = nn.Linear(input_size + hidden_size, output_size) self.softmax = nn.LogSoftmax(dim=1) def forward(self, input, hidden): combined = torch.cat((input, hidden), 1) hidden = self.i2h(combined) output = self.i2o(combined) output = self.softmax(output) return output, hidden def initHidden(self): return torch.zeros(1, self.hidden_size) We must also *register* this model with fairseq using the :func:`~fairseq.models.register_model` function decorator. Once the model is registered we'll be able to use it with the existing :ref:`Command-line Tools`. All registered models must implement the :class:`~fairseq.models.BaseFairseqModel` interface, so we'll create a small wrapper class in the same file and register it in fairseq with the name ``'rnn_classifier'``:: from fairseq.models import BaseFairseqModel, register_model # Note: the register_model "decorator" should immediately precede the # definition of the Model class. @register_model('rnn_classifier') class FairseqRNNClassifier(BaseFairseqModel): @staticmethod def add_args(parser): # Models can override this method to add new command-line arguments. # Here we'll add a new command-line argument to configure the # dimensionality of the hidden state. parser.add_argument( '--hidden-dim', type=int, metavar='N', help='dimensionality of the hidden state', ) @classmethod def build_model(cls, args, task): # Fairseq initializes models by calling the ``build_model()`` # function. This provides more flexibility, since the returned model # instance can be of a different type than the one that was called. # In this case we'll just return a FairseqRNNClassifier instance. 
# Initialize our RNN module rnn = RNN( # We'll define the Task in the next section, but for now just # notice that the task holds the dictionaries for the "source" # (i.e., the input sentence) and "target" (i.e., the label). input_size=len(task.source_dictionary), hidden_size=args.hidden_dim, output_size=len(task.target_dictionary), ) # Return the wrapped version of the module return FairseqRNNClassifier( rnn=rnn, input_vocab=task.source_dictionary, ) def __init__(self, rnn, input_vocab): super(FairseqRNNClassifier, self).__init__() self.rnn = rnn self.input_vocab = input_vocab # The RNN module in the tutorial expects one-hot inputs, so we can # precompute the identity matrix to help convert from indices to # one-hot vectors. We register it as a buffer so that it is moved to # the GPU when ``cuda()`` is called. self.register_buffer('one_hot_inputs', torch.eye(len(input_vocab))) def forward(self, src_tokens, src_lengths): # The inputs to the ``forward()`` function are determined by the # Task, and in particular the ``'net_input'`` key in each # mini-batch. We'll define the Task in the next section, but for # now just know that *src_tokens* has shape `(batch, src_len)` and # *src_lengths* has shape `(batch)`. bsz, max_src_len = src_tokens.size() # Initialize the RNN hidden state. Compared to the original PyTorch # tutorial we'll also handle batched inputs and work on the GPU. hidden = self.rnn.initHidden() hidden = hidden.repeat(bsz, 1) # expand for batched inputs hidden = hidden.to(src_tokens.device) # move to GPU for i in range(max_src_len): # WARNING: The inputs have padding, so we should mask those # elements here so that padding doesn't affect the results. # This is left as an exercise for the reader. The padding symbol # is given by ``self.input_vocab.pad()`` and the unpadded length # of each input is given by *src_lengths*. # One-hot encode a batch of input characters. input = self.one_hot_inputs[src_tokens[:, i].long()] # Feed the input to our RNN. 
output, hidden = self.rnn(input, hidden) # Return the final output state for making a prediction return output Finally let's define a *named architecture* with the configuration for our model. This is done with the :func:`~fairseq.models.register_model_architecture` function decorator. Thereafter this named architecture can be used with the ``--arch`` command-line argument, e.g., ``--arch pytorch_tutorial_rnn``:: from fairseq.models import register_model_architecture # The first argument to ``register_model_architecture()`` should be the name # of the model we registered above (i.e., 'rnn_classifier'). The function we # register here should take a single argument *args* and modify it in-place # to match the desired architecture. @register_model_architecture('rnn_classifier', 'pytorch_tutorial_rnn') def pytorch_tutorial_rnn(args): # We use ``getattr()`` to prioritize arguments that are explicitly given # on the command-line, so that the defaults defined below are only used # when no other value has been specified. args.hidden_dim = getattr(args, 'hidden_dim', 128) 3. Registering a new Task ------------------------- Now we'll register a new :class:`~fairseq.tasks.FairseqTask` that will load our dictionaries and dataset. Tasks can also control how the data is batched into mini-batches, but in this tutorial we'll reuse the batching provided by :class:`fairseq.data.LanguagePairDataset`. Create a new file named :file:`fairseq/tasks/simple_classification.py` with the following contents:: import os import torch from fairseq.data import Dictionary, LanguagePairDataset from fairseq.tasks import LegacyFairseqTask, register_task @register_task('simple_classification') class SimpleClassificationTask(LegacyFairseqTask): @staticmethod def add_args(parser): # Add some command-line arguments for specifying where the data is # located and the maximum supported input length. 
parser.add_argument('data', metavar='FILE', help='file prefix for data') parser.add_argument('--max-positions', default=1024, type=int, help='max input length') @classmethod def setup_task(cls, args, **kwargs): # Here we can perform any setup required for the task. This may include # loading Dictionaries, initializing shared Embedding layers, etc. # In this case we'll just load the Dictionaries. input_vocab = Dictionary.load(os.path.join(args.data, 'dict.input.txt')) label_vocab = Dictionary.load(os.path.join(args.data, 'dict.label.txt')) print('| [input] dictionary: {} types'.format(len(input_vocab))) print('| [label] dictionary: {} types'.format(len(label_vocab))) return SimpleClassificationTask(args, input_vocab, label_vocab) def __init__(self, args, input_vocab, label_vocab): super().__init__(args) self.input_vocab = input_vocab self.label_vocab = label_vocab def load_dataset(self, split, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" prefix = os.path.join(self.args.data, '{}.input-label'.format(split)) # Read input sentences. sentences, lengths = [], [] with open(prefix + '.input', encoding='utf-8') as file: for line in file: sentence = line.strip() # Tokenize the sentence, splitting on spaces tokens = self.input_vocab.encode_line( sentence, add_if_not_exist=False, ) sentences.append(tokens) lengths.append(tokens.numel()) # Read labels. labels = [] with open(prefix + '.label', encoding='utf-8') as file: for line in file: label = line.strip() labels.append( # Convert label to a numeric ID. torch.LongTensor([self.label_vocab.add_symbol(label)]) ) assert len(sentences) == len(labels) print('| {} {} {} examples'.format(self.args.data, split, len(sentences))) # We reuse LanguagePairDataset since classification can be modeled as a # sequence-to-sequence task where the target sequence has length 1. 
self.datasets[split] = LanguagePairDataset( src=sentences, src_sizes=lengths, src_dict=self.input_vocab, tgt=labels, tgt_sizes=torch.ones(len(labels)), # targets have length 1 tgt_dict=self.label_vocab, left_pad_source=False, # Since our target is a single class label, there's no need for # teacher forcing. If we set this to ``True`` then our Model's # ``forward()`` method would receive an additional argument called # *prev_output_tokens* that would contain a shifted version of the # target sequence. input_feeding=False, ) def max_positions(self): """Return the max input length allowed by the task.""" # The source should be less than *args.max_positions* and the "target" # has max length 1. return (self.args.max_positions, 1) @property def source_dictionary(self): """Return the source :class:`~fairseq.data.Dictionary`.""" return self.input_vocab @property def target_dictionary(self): """Return the target :class:`~fairseq.data.Dictionary`.""" return self.label_vocab # We could override this method if we wanted more control over how batches # are constructed, but it's not necessary for this tutorial since we can # reuse the batching provided by LanguagePairDataset. # # def get_batch_iterator( # self, dataset, max_tokens=None, max_sentences=None, max_positions=None, # ignore_invalid_inputs=False, required_batch_size_multiple=1, # seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1, # data_buffer_size=0, disable_iterator_cache=False, # ): # (...) 4. Training the Model --------------------- Now we're ready to train the model. We can use the existing :ref:`fairseq-train` command-line tool for this, making sure to specify our new Task (``--task simple_classification``) and Model architecture (``--arch pytorch_tutorial_rnn``): .. note:: You can also configure the dimensionality of the hidden state by passing the ``--hidden-dim`` argument to :ref:`fairseq-train`. .. 
code-block:: console > fairseq-train names-bin \ --task simple_classification \ --arch pytorch_tutorial_rnn \ --optimizer adam --lr 0.001 --lr-shrink 0.5 \ --max-tokens 1000 (...) | epoch 027 | loss 1.200 | ppl 2.30 | wps 15728 | ups 119.4 | wpb 116 | bsz 116 | num_updates 3726 | lr 1.5625e-05 | gnorm 1.290 | clip 0% | oom 0 | wall 32 | train_wall 21 | epoch 027 | valid on 'valid' subset | valid_loss 1.41304 | valid_ppl 2.66 | num_updates 3726 | best 1.41208 | done training in 31.6 seconds The model files should appear in the :file:`checkpoints/` directory. 5. Writing an evaluation script ------------------------------- Finally we can write a short script to evaluate our model on new inputs. Create a new file named :file:`eval_classifier.py` with the following contents:: from fairseq import checkpoint_utils, data, options, tasks # Parse command-line arguments for generation parser = options.get_generation_parser(default_task='simple_classification') args = options.parse_args_and_arch(parser) # Setup task task = tasks.setup_task(args) # Load model print('| loading model from {}'.format(args.path)) models, _model_args = checkpoint_utils.load_model_ensemble([args.path], task=task) model = models[0] while True: sentence = input('\nInput: ') # Tokenize into characters chars = ' '.join(list(sentence.strip())) tokens = task.source_dictionary.encode_line( chars, add_if_not_exist=False, ) # Build mini-batch to feed to the model batch = data.language_pair_dataset.collate( samples=[{'id': -1, 'source': tokens}], # bsz = 1 pad_idx=task.source_dictionary.pad(), eos_idx=task.source_dictionary.eos(), left_pad_source=False, input_feeding=False, ) # Feed batch to the model and get predictions preds = model(**batch['net_input']) # Print top 3 predictions and their log-probabilities top_scores, top_labels = preds[0].topk(k=3) for score, label_idx in zip(top_scores, top_labels): label_name = task.target_dictionary.string([label_idx]) print('({:.2f})\t{}'.format(score, label_name)) Now 
we can evaluate our model interactively. Note that we have included the original data path (:file:`names-bin/`) so that the dictionaries can be loaded: .. code-block:: console > python eval_classifier.py names-bin --path checkpoints/checkpoint_best.pt | [input] dictionary: 64 types | [label] dictionary: 24 types | loading model from checkpoints/checkpoint_best.pt Input: Satoshi (-0.61) Japanese (-1.20) Arabic (-2.86) Italian Input: Sinbad (-0.30) Arabic (-1.76) English (-4.08) Russian ================================================ FILE: docs/tutorial_simple_lstm.rst ================================================ Tutorial: Simple LSTM ===================== In this tutorial we will extend fairseq by adding a new :class:`~fairseq.models.FairseqEncoderDecoderModel` that encodes a source sentence with an LSTM and then passes the final hidden state to a second LSTM that decodes the target sentence (without attention). This tutorial covers: 1. **Writing an Encoder and Decoder** to encode/decode the source/target sentence, respectively. 2. **Registering a new Model** so that it can be used with the existing :ref:`Command-line tools`. 3. **Training the Model** using the existing command-line tools. 4. **Making generation faster** by modifying the Decoder to use :ref:`Incremental decoding`. 1. Building an Encoder and Decoder ---------------------------------- In this section we'll define a simple LSTM Encoder and Decoder. All Encoders should implement the :class:`~fairseq.models.FairseqEncoder` interface and Decoders should implement the :class:`~fairseq.models.FairseqDecoder` interface. These interfaces themselves extend :class:`torch.nn.Module`, so FairseqEncoders and FairseqDecoders can be written and used in the same ways as ordinary PyTorch Modules. Encoder ~~~~~~~ Our Encoder will embed the tokens in the source sentence, feed them to a :class:`torch.nn.LSTM` and return the final hidden state. 
To create our encoder save the following in a new file named :file:`fairseq/models/simple_lstm.py`:: import torch.nn as nn from fairseq import utils from fairseq.models import FairseqEncoder class SimpleLSTMEncoder(FairseqEncoder): def __init__( self, args, dictionary, embed_dim=128, hidden_dim=128, dropout=0.1, ): super().__init__(dictionary) self.args = args # Our encoder will embed the inputs before feeding them to the LSTM. self.embed_tokens = nn.Embedding( num_embeddings=len(dictionary), embedding_dim=embed_dim, padding_idx=dictionary.pad(), ) self.dropout = nn.Dropout(p=dropout) # We'll use a single-layer, unidirectional LSTM for simplicity. self.lstm = nn.LSTM( input_size=embed_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=False, batch_first=True, ) def forward(self, src_tokens, src_lengths): # The inputs to the ``forward()`` function are determined by the # Task, and in particular the ``'net_input'`` key in each # mini-batch. We discuss Tasks in the next tutorial, but for now just # know that *src_tokens* has shape `(batch, src_len)` and *src_lengths* # has shape `(batch)`. # Note that the source is typically padded on the left. This can be # configured by adding the `--left-pad-source "False"` command-line # argument, but here we'll make the Encoder handle either kind of # padding by converting everything to be right-padded. if self.args.left_pad_source: # Convert left-padding to right-padding. src_tokens = utils.convert_padding_direction( src_tokens, padding_idx=self.dictionary.pad(), left_to_right=True ) # Embed the source. x = self.embed_tokens(src_tokens) # Apply dropout. x = self.dropout(x) # Pack the sequence into a PackedSequence object to feed to the LSTM. x = nn.utils.rnn.pack_padded_sequence(x, src_lengths, batch_first=True) # Get the output from the LSTM. _outputs, (final_hidden, _final_cell) = self.lstm(x) # Return the Encoder's output. This can be any object and will be # passed directly to the Decoder. 
return { # this will have shape `(bsz, hidden_dim)` 'final_hidden': final_hidden.squeeze(0), } # Encoders are required to implement this method so that we can rearrange # the order of the batch elements during inference (e.g., beam search). def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to `new_order`. Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: `encoder_out` rearranged according to `new_order` """ final_hidden = encoder_out['final_hidden'] return { 'final_hidden': final_hidden.index_select(0, new_order), } Decoder ~~~~~~~ Our Decoder will predict the next word, conditioned on the Encoder's final hidden state and an embedded representation of the previous target word -- which is sometimes called *teacher forcing*. More specifically, we'll use a :class:`torch.nn.LSTM` to produce a sequence of hidden states that we'll project to the size of the output vocabulary to predict each target word. :: import torch from fairseq.models import FairseqDecoder class SimpleLSTMDecoder(FairseqDecoder): def __init__( self, dictionary, encoder_hidden_dim=128, embed_dim=128, hidden_dim=128, dropout=0.1, ): super().__init__(dictionary) # Our decoder will embed the inputs before feeding them to the LSTM. self.embed_tokens = nn.Embedding( num_embeddings=len(dictionary), embedding_dim=embed_dim, padding_idx=dictionary.pad(), ) self.dropout = nn.Dropout(p=dropout) # We'll use a single-layer, unidirectional LSTM for simplicity. self.lstm = nn.LSTM( # For the first layer we'll concatenate the Encoder's final hidden # state with the embedded target tokens. input_size=encoder_hidden_dim + embed_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=False, ) # Define the output projection. 
self.output_projection = nn.Linear(hidden_dim, len(dictionary)) # During training Decoders are expected to take the entire target sequence # (shifted right by one position) and produce logits over the vocabulary. # The *prev_output_tokens* tensor begins with the end-of-sentence symbol, # ``dictionary.eos()``, followed by the target sequence. def forward(self, prev_output_tokens, encoder_out): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention Returns: tuple: - the last decoder layer's output of shape `(batch, tgt_len, vocab)` - the last decoder layer's attention weights of shape `(batch, tgt_len, src_len)` """ bsz, tgt_len = prev_output_tokens.size() # Extract the final hidden state from the Encoder. final_encoder_hidden = encoder_out['final_hidden'] # Embed the target sequence, which has been shifted right by one # position and now starts with the end-of-sentence symbol. x = self.embed_tokens(prev_output_tokens) # Apply dropout. x = self.dropout(x) # Concatenate the Encoder's final hidden state to *every* embedded # target token. x = torch.cat( [x, final_encoder_hidden.unsqueeze(1).expand(bsz, tgt_len, -1)], dim=2, ) # Using PackedSequence objects in the Decoder is harder than in the # Encoder, since the targets are not sorted in descending length order, # which is a requirement of ``pack_padded_sequence()``. Instead we'll # feed nn.LSTM directly. initial_state = ( final_encoder_hidden.unsqueeze(0), # hidden torch.zeros_like(final_encoder_hidden).unsqueeze(0), # cell ) output, _ = self.lstm( x.transpose(0, 1), # convert to shape `(tgt_len, bsz, dim)` initial_state, ) x = output.transpose(0, 1) # convert to shape `(bsz, tgt_len, hidden)` # Project the outputs to the size of the vocabulary. x = self.output_projection(x) # Return the logits and ``None`` for the attention weights return x, None 2. 
Registering the Model ------------------------ Now that we've defined our Encoder and Decoder we must *register* our model with fairseq using the :func:`~fairseq.models.register_model` function decorator. Once the model is registered we'll be able to use it with the existing :ref:`Command-line Tools`. All registered models must implement the :class:`~fairseq.models.BaseFairseqModel` interface. For sequence-to-sequence models (i.e., any model with a single Encoder and Decoder), we can instead implement the :class:`~fairseq.models.FairseqEncoderDecoderModel` interface. Create a small wrapper class in the same file and register it in fairseq with the name ``'simple_lstm'``:: from fairseq.models import FairseqEncoderDecoderModel, register_model # Note: the register_model "decorator" should immediately precede the # definition of the Model class. @register_model('simple_lstm') class SimpleLSTMModel(FairseqEncoderDecoderModel): @staticmethod def add_args(parser): # Models can override this method to add new command-line arguments. # Here we'll add some new command-line arguments to configure dropout # and the dimensionality of the embeddings and hidden states. parser.add_argument( '--encoder-embed-dim', type=int, metavar='N', help='dimensionality of the encoder embeddings', ) parser.add_argument( '--encoder-hidden-dim', type=int, metavar='N', help='dimensionality of the encoder hidden state', ) parser.add_argument( '--encoder-dropout', type=float, default=0.1, help='encoder dropout probability', ) parser.add_argument( '--decoder-embed-dim', type=int, metavar='N', help='dimensionality of the decoder embeddings', ) parser.add_argument( '--decoder-hidden-dim', type=int, metavar='N', help='dimensionality of the decoder hidden state', ) parser.add_argument( '--decoder-dropout', type=float, default=0.1, help='decoder dropout probability', ) @classmethod def build_model(cls, args, task): # Fairseq initializes models by calling the ``build_model()`` # function. 
This provides more flexibility, since the returned model # instance can be of a different type than the one that was called. # In this case we'll just return a SimpleLSTMModel instance. # Initialize our Encoder and Decoder. encoder = SimpleLSTMEncoder( args=args, dictionary=task.source_dictionary, embed_dim=args.encoder_embed_dim, hidden_dim=args.encoder_hidden_dim, dropout=args.encoder_dropout, ) decoder = SimpleLSTMDecoder( dictionary=task.target_dictionary, encoder_hidden_dim=args.encoder_hidden_dim, embed_dim=args.decoder_embed_dim, hidden_dim=args.decoder_hidden_dim, dropout=args.decoder_dropout, ) model = SimpleLSTMModel(encoder, decoder) # Print the model architecture. print(model) return model # We could override the ``forward()`` if we wanted more control over how # the encoder and decoder interact, but it's not necessary for this # tutorial since we can inherit the default implementation provided by # the FairseqEncoderDecoderModel base class, which looks like: # # def forward(self, src_tokens, src_lengths, prev_output_tokens): # encoder_out = self.encoder(src_tokens, src_lengths) # decoder_out = self.decoder(prev_output_tokens, encoder_out) # return decoder_out Finally let's define a *named architecture* with the configuration for our model. This is done with the :func:`~fairseq.models.register_model_architecture` function decorator. Thereafter this named architecture can be used with the ``--arch`` command-line argument, e.g., ``--arch tutorial_simple_lstm``:: from fairseq.models import register_model_architecture # The first argument to ``register_model_architecture()`` should be the name # of the model we registered above (i.e., 'simple_lstm'). The function we # register here should take a single argument *args* and modify it in-place # to match the desired architecture. 
@register_model_architecture('simple_lstm', 'tutorial_simple_lstm') def tutorial_simple_lstm(args): # We use ``getattr()`` to prioritize arguments that are explicitly given # on the command-line, so that the defaults defined below are only used # when no other value has been specified. args.encoder_embed_dim = getattr(args, 'encoder_embed_dim', 256) args.encoder_hidden_dim = getattr(args, 'encoder_hidden_dim', 256) args.decoder_embed_dim = getattr(args, 'decoder_embed_dim', 256) args.decoder_hidden_dim = getattr(args, 'decoder_hidden_dim', 256) 3. Training the Model --------------------- Now we're ready to train the model. We can use the existing :ref:`fairseq-train` command-line tool for this, making sure to specify our new Model architecture (``--arch tutorial_simple_lstm``). .. note:: Make sure you've already preprocessed the data from the IWSLT example in the :file:`examples/translation/` directory. .. code-block:: console > fairseq-train data-bin/iwslt14.tokenized.de-en \ --arch tutorial_simple_lstm \ --encoder-dropout 0.2 --decoder-dropout 0.2 \ --optimizer adam --lr 0.005 --lr-shrink 0.5 \ --max-tokens 12000 (...) | epoch 052 | loss 4.027 | ppl 16.30 | wps 420805 | ups 39.7 | wpb 9841 | bsz 400 | num_updates 20852 | lr 1.95313e-05 | gnorm 0.218 | clip 0% | oom 0 | wall 529 | train_wall 396 | epoch 052 | valid on 'valid' subset | valid_loss 4.74989 | valid_ppl 26.91 | num_updates 20852 | best 4.74954 The model files should appear in the :file:`checkpoints/` directory. While this model architecture is not very good, we can use the :ref:`fairseq-generate` script to generate translations and compute our BLEU score over the test set: .. code-block:: console > fairseq-generate data-bin/iwslt14.tokenized.de-en \ --path checkpoints/checkpoint_best.pt \ --beam 5 \ --remove-bpe (...) 
| Translated 6750 sentences (153132 tokens) in 17.3s (389.12 sentences/s, 8827.68 tokens/s) | Generate test with beam=5: BLEU4 = 8.18, 38.8/12.1/4.7/2.0 (BP=1.000, ratio=1.066, syslen=139865, reflen=131146) 4. Making generation faster --------------------------- While autoregressive generation from sequence-to-sequence models is inherently slow, our implementation above is especially slow because it recomputes the entire sequence of Decoder hidden states for every output token (i.e., it is ``O(n^2)``). We can make this significantly faster by instead caching the previous hidden states. In fairseq this is called :ref:`Incremental decoding`. Incremental decoding is a special mode at inference time where the Model only receives a single timestep of input corresponding to the immediately previous output token (for teacher forcing) and must produce the next output incrementally. Thus the model must cache any long-term state that is needed about the sequence, e.g., hidden states, convolutional states, etc. To implement incremental decoding we will modify our model to implement the :class:`~fairseq.models.FairseqIncrementalDecoder` interface. Compared to the standard :class:`~fairseq.models.FairseqDecoder` interface, the incremental decoder interface allows ``forward()`` methods to take an extra keyword argument (*incremental_state*) that can be used to cache state across time-steps. Let's replace our ``SimpleLSTMDecoder`` with an incremental one:: import torch from fairseq.models import FairseqIncrementalDecoder class SimpleLSTMDecoder(FairseqIncrementalDecoder): def __init__( self, dictionary, encoder_hidden_dim=128, embed_dim=128, hidden_dim=128, dropout=0.1, ): # This remains the same as before. 
super().__init__(dictionary) self.embed_tokens = nn.Embedding( num_embeddings=len(dictionary), embedding_dim=embed_dim, padding_idx=dictionary.pad(), ) self.dropout = nn.Dropout(p=dropout) self.lstm = nn.LSTM( input_size=encoder_hidden_dim + embed_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=False, ) self.output_projection = nn.Linear(hidden_dim, len(dictionary)) # We now take an additional kwarg (*incremental_state*) for caching the # previous hidden and cell states. def forward(self, prev_output_tokens, encoder_out, incremental_state=None): if incremental_state is not None: # If the *incremental_state* argument is not ``None`` then we are # in incremental inference mode. While *prev_output_tokens* will # still contain the entire decoded prefix, we will only use the # last step and assume that the rest of the state is cached. prev_output_tokens = prev_output_tokens[:, -1:] # This remains the same as before. bsz, tgt_len = prev_output_tokens.size() final_encoder_hidden = encoder_out['final_hidden'] x = self.embed_tokens(prev_output_tokens) x = self.dropout(x) x = torch.cat( [x, final_encoder_hidden.unsqueeze(1).expand(bsz, tgt_len, -1)], dim=2, ) # We will now check the cache and load the cached previous hidden and # cell states, if they exist, otherwise we will initialize them to # zeros (as before). We will use the ``utils.get_incremental_state()`` # and ``utils.set_incremental_state()`` helpers. initial_state = utils.get_incremental_state( self, incremental_state, 'prev_state', ) if initial_state is None: # first time initialization, same as the original version initial_state = ( final_encoder_hidden.unsqueeze(0), # hidden torch.zeros_like(final_encoder_hidden).unsqueeze(0), # cell ) # Run one step of our LSTM. output, latest_state = self.lstm(x.transpose(0, 1), initial_state) # Update the cache with the latest hidden and cell states. 
utils.set_incremental_state( self, incremental_state, 'prev_state', latest_state, ) # This remains the same as before x = output.transpose(0, 1) x = self.output_projection(x) return x, None # The ``FairseqIncrementalDecoder`` interface also requires implementing a # ``reorder_incremental_state()`` method, which is used during beam search # to select and reorder the incremental state. def reorder_incremental_state(self, incremental_state, new_order): # Load the cached state. prev_state = utils.get_incremental_state( self, incremental_state, 'prev_state', ) # Reorder batches according to *new_order*. reordered_state = ( prev_state[0].index_select(1, new_order), # hidden prev_state[1].index_select(1, new_order), # cell ) # Update the cached state. utils.set_incremental_state( self, incremental_state, 'prev_state', reordered_state, ) Finally, we can rerun generation and observe the speedup: .. code-block:: console # Before > fairseq-generate data-bin/iwslt14.tokenized.de-en \ --path checkpoints/checkpoint_best.pt \ --beam 5 \ --remove-bpe (...) | Translated 6750 sentences (153132 tokens) in 17.3s (389.12 sentences/s, 8827.68 tokens/s) | Generate test with beam=5: BLEU4 = 8.18, 38.8/12.1/4.7/2.0 (BP=1.000, ratio=1.066, syslen=139865, reflen=131146) # After > fairseq-generate data-bin/iwslt14.tokenized.de-en \ --path checkpoints/checkpoint_best.pt \ --beam 5 \ --remove-bpe (...) 
| Translated 6750 sentences (153132 tokens) in 5.5s (1225.54 sentences/s, 27802.94 tokens/s) | Generate test with beam=5: BLEU4 = 8.18, 38.8/12.1/4.7/2.0 (BP=1.000, ratio=1.066, syslen=139865, reflen=131146) ================================================ FILE: examples/.gitignore ================================================ !*/*.sh !*/*.md ================================================ FILE: examples/MMPT/.gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ pip-wheel-metadata/ share/python-wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .nox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover *.py,cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 db.sqlite3-journal # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # IPython profile_default/ ipython_config.py # pyenv .python-version # pipenv # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. #Pipfile.lock # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow __pypackages__/ # Celery stuff celerybeat-schedule celerybeat.pid # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ .dmypy.json dmypy.json # Pyre type checker .pyre/ runs data pretrained_models projects/mmfusion_* log_test third-party python_log slurm_snapshot_code lightning_logs demos ================================================ FILE: examples/MMPT/CONFIG.md ================================================ ### Config Files Explained Taking `projects/mfmmlm.yaml` for example, which runs pretraining using masked frame model (MFM) and masked language model (MLM) on a single BERT: ```yaml project_dir: mfmmlm # specify the project dir for this baseline. run_task: - how2.yaml # run pretraining on how2 when launching `projects/taskmfmmlm.yaml` - [vtt.yaml, vttcap.yaml, vttqa.yaml, youcook.yaml, youcookcap.yaml, crosstask.yaml, coin.yaml] # run fine-tuning tasks. base_dir: task # a global template folder to specify each training task. task_group: pretrain: # section for pretraining. Most baselines differ in this section. task_list: - how2.yaml # reconfig `projects/task/how2.yaml` dataset: aligner: MFMMLMAligner # overwrite the aligner for MFMMLM training task. model: model_cls: MMFusionMFMMLM # overwrite the model, which constructs negative examples for MFM on-the-fly. loss: loss_cls: MFMMLM # overwrite the loss as MFMMLM, which combines MFM and MLM together. fairseq: # all fairseq args can be specified under this name. dataset: batch_size: 128 finetune: # section for fine-tuning tasks, we don't need to change anything here mostly since we want to see how pretraining can contribute to finetuning. task_list: # specify the list of downstream tasks, e.g., copy `projects/task/vtt.yaml` to `projects/mfmmlm`.
- vtt.yaml - vttqa.yaml - youcook.yaml - youcookcap.yaml - crosstask.yaml - coin.yaml test: # section for testing. task_list: - test_vtt.yaml - test_vttqa.yaml - test_youcook.yaml - test_youcookcap.yaml - test_crosstask.yaml - test_crosstask_zs.yaml - test_coin.yaml ``` ================================================ FILE: examples/MMPT/DATASET.md ================================================ # Dataset We understand video data are challenging to download and process. For videos, we provide our preprocessing scripts under `scripts/video_feature_extractor` (deeply adapted from `https://github.com/antoine77340/video_feature_extractor`); for text, we provide pre-tokenizing scripts under `scripts/text_token_extractor`. ### S3D Feature Extraction We use pre-trained [S3D](https://github.com/antoine77340/S3D_HowTo100M) for video feature extraction. Please place the models as `pretrained_models/s3d_dict.npy` and `pretrained_models/s3d_howto100m.pth`. We implement a `PathBuilder` to automatically track video ids, source video paths to their feature locations (you may need `conda install -c anaconda pandas`). Decoding may need `pip install ffmpeg-python`. ### Howto100M [Howto100M](https://www.di.ens.fr/willow/research/howto100m/) is a large-scale video pre-training dataset. You may download videos by yourself and run our preprocessing scripts. Several key differences of our preprocessing from existing papers: (1) we use `raw_caption.json` instead of `caption.json` to have pure self-supervision on text (`caption.json` has manual removal of stop words); (2) we remove partially duplicated texts that are originally designed for real-time readability (see `mmpt/processors/dedupprocessor.py`); (3) then we shard video/text features using `ShardedTensor` in `mmpt/utils/shardedtensor.py` for fast loading during training (faster than `h5py`). #### Steps ##### video To extract video features: edit and run `bash scripts/video_feature_extractor/how2/s3d.sh`.
(consider running this on multiple machines; by default, we store features in fp16 to save space and also for faster training). Split available video ids as `data/how2/how2_s3d_train.lst` and `data/how2/how2_s3d_val.lst`. Lastly, pack video features into `ShardedTensor` using `python scripts/video_feature_extractor/shard_feature.py`. ##### text Clean captions using `python -m mmpt.processors.dedupprocessor`. Tokenize dedupped captions `data/how2/raw_caption_dedup.pkl` into sharded numpy arrays: ``` python scripts/text_token_extractor/pretokenization.py scripts/text_token_extractor/configs/bert-base-uncased.yaml ``` ### Youcook, MSRVTT etc. We use the version of Youcook and MSRVTT that come with Howto100M and MILNCE. Please download the data to `data/youcook` and `data/msrvtt` accordingly; you can also check `projects/task/youcook.yaml` and `projects/task/vtt.yaml` etc. for details. We extract features for Youcook, MSRVTT similar to the first step of Howto100M but we read text from meta data directly and perform on-the-fly tokenization. ================================================ FILE: examples/MMPT/README.md ================================================ # VideoCLIP and VLM You just found this toolkit for multimodal video understanding! It contains implementations of two recent multi-modal video understanding papers [VideoCLIP](https://arxiv.org/pdf/2109.14084.pdf) (EMNLP, 2021) and [VLM](https://aclanthology.org/2021.findings-acl.370.pdf) (ACL Findings, 2021), along with high-performance toolkits that are typically lacking in existing codebases. The toolkit is designed to contain generic performance-tuned components that can be potentially adapted to other frameworks (we initially use fairseq). VideoCLIP is a contrastive learning model for zero-shot transfer to retrieval/classification/sequence labeling style tasks.
VLM is a masked language model style pre-training using only one encoder with masked modality model (MMM) for retrieval/generation/sequence labeling style tasks. ### News [Oct. 2021] Initial release of implementation for the following papers: [VideoCLIP: Contrastive Pre-training for Zero-shot Video-Text Understanding](https://arxiv.org/pdf/2109.14084.pdf) (Xu et. al., EMNLP 2021) [VLM: Task-agnostic Video-Language Model Pre-training for Video Understanding](https://aclanthology.org/2021.findings-acl.370.pdf) (Xu et. al., ACL Findings 2021) ### Installation We aim to minimize the dependency of this repo on other packages. We use fairseq as the main trainer (no models/datasets dependency on fairseq. We will support other trainer in future): ``` git clone https://github.com/pytorch/fairseq cd fairseq pip install -e . # also optionally follow fairseq README for apex installation for fp16 training. export MKL_THREADING_LAYER=GNU # fairseq may need this for numpy. ``` Then install this toolkit: ``` cd examples/MMPT # MMPT can be in any folder, not necessarily under fairseq/examples. pip install -e . ``` The code is developed under Python=3.8.8, Pytorch=1.8, cuda=11.0 with fairseq=1.0.0a0+af0389f and tested under Python=3.8.8 pytorch=1.9 cuda=11.0 fairseq=1.0.0a0+8e7bc73 during code release. Most models require `transformers==3.4` for API compatibility `pip install transformers==3.4`. In addition, some downstream tasks may need `conda install pandas`. ### Usage #### Download Checkpoints We use pre-trained [S3D](https://github.com/antoine77340/S3D_HowTo100M) for video feature extraction. Please place the models as `pretrained_models/s3d_dict.npy` and `pretrained_models/s3d_howto100m.pth`. Download VideoCLIP checkpoint `https://dl.fbaipublicfiles.com/MMPT/retri/videoclip/checkpoint_best.pt` to `runs/retri/videoclip` or VLM checkpoint `https://dl.fbaipublicfiles.com/MMPT/mtm/vlm/checkpoint_best.pt` to `runs/mtm/vlm`. 
#### Demo of Inference Run `python locallaunch.py projects/retri/videoclip.yaml --dryrun` to get all `.yaml`s for VideoCLIP. ```python import torch from mmpt.models import MMPTModel model, tokenizer, aligner = MMPTModel.from_pretrained( "projects/retri/videoclip/how2.yaml") model.eval() # B, T, FPS, H, W, C (VideoCLIP is trained on 30 fps of s3d) video_frames = torch.randn(1, 2, 30, 224, 224, 3) caps, cmasks = aligner._build_text_seq( tokenizer("some text", add_special_tokens=False)["input_ids"] ) caps, cmasks = caps[None, :], cmasks[None, :] # bsz=1 with torch.no_grad(): output = model(video_frames, caps, cmasks, return_score=True) print(output["score"]) # dot-product ``` #### Data Preparation See [dataset](DATASET.md) for each dataset. #### Global Config for Training Pipeline We organize a global config file for a training/testing pipeline under projects (see a detailed [explanation](CONFIG.md)). For example, VideoCLIP is in `projects/retri/videoclip.yaml` and VLM is in `projects/mtm/vlm.yaml`. We wrap all cmds into `locallaunch.py` and `mmpt_cli/localjob.py`. You can check concrete cmds by `--dryrun` and then drop it for actual run. First, running `python locallaunch.py projects/retri/videoclip.yaml --dryrun` will generate the configs for pre-training, zero-shot evaluation, fine-tuning and testing, for VideoCLIP under `projects/retri/videoclip`. Then each (either training or evaluation) process will be configured by a concrete config file (we save all complex arguments into the concrete config file for reproducibility, including fairseq args). For example, run zero-shot evaluation on youcook, ``` python locallaunch.py projects/retri/videoclip/test_youcook_zs.yaml --jobtype local_predict # zero-shot evaluation. python locallaunch.py projects/retri/videoclip/youcook_videoclip.yaml --jobtype local_single --dryrun # fine-tuning: use --dryrun to check cmds and drop it to make an actual run; local_small will run on two gpus (as in paper).
python locallaunch.py projects/retri/videoclip/test_youcook_videoclip.yaml --jobtype local_predict # testing on fine-tuned model. ``` Pretraining can be run as: ``` python locallaunch.py projects/retri/videoclip/how2.yaml --jobtype local_single --dryrun # check then drop dryrun; the paper was run on local_big with 8 gpus. ``` You may need to change `--jobtype`, check/extend `LocalJob` in `mmpt_cli/localjob.py` for multi-gpu/multi-node pre-training. The detailed instructions of pretraining and fine-tuning can be found at [pretraining instruction](pretraining.md) and [finetuning instruction](endtask.md). ### Development Several components of this toolkit can be re-used for future research (and also our ongoing research). #### Framework Wrapper We currently only support fairseq, but most components can be easily fit into other frameworks like huggingface. This repo is a `--user-dir` of fairseq with fairseq wrapper. For example, `mmpt/tasks` includes a `FairseqMMTask`, which manages `mmpt/datasets` with `FairseqDataset`, `mmpt/models` with `FairseqModel`, `mmpt/losses` with `FairseqCriterion`. #### Processors **Multi**modal research introduces the complexity on modality alignment from different input sources to losses. Inspired by [MMF](https://github.com/facebookresearch/mmf), this toolkit leverages `mmpt/processors` to handle various needs of data preprocessing and loading, **alleviating** the needs of multiple `torch.utils.data.Dataset` (that can be tricky for ablation study). Processors can also be decoupled from `torch.utils.data.Dataset` for offline preprocessing instead of on-the-fly data preprocessing. We decouple a `mmpt.MMDataset` into 4 types of processors: `MetaProcessor`, `VideoProcessor`, `TextProcessor` and `Aligner`. They can be configured in the `dataset` field of a config file (e.g., see `projects/task/how2.yaml`). `MetaProcessor` is used to load the meta data about a dataset, i.e., all video_ids of how2 dataset.
`VideoProcessor` is used to load the video features about a dataset. For example, S3D features for each second of a video. `TextProcessor` is used to load the text (feature). For example, BERT pre-tokenized text clips for how2 dataset (with `start`s, `end`s of timestamps and `cap` for `token_ids`). `Aligner` is the core class for different baselines that prepares the training data. For example, sampling a clip, masking tokens for MLM, etc. #### Performance-tuned Components To speed up pre-training, this toolkit uses sharded features stored in mmaped numpy, backed by `ShardedTensor` in `mmpt/utils/shardedtensor.py` (adopted from MARGE paper). This reduces the loads of IO for multi-GPU training without loading all features for a video into the memory each time and `ShardedTensor` ensure features are stored in continuous disk space for near random access. This is used for both How2 video features and texts in `mmpt/processors/how2processor.py`. ### Citation If this codebase is useful for your work, please cite the following papers: ```BibTeX @inproceedings{xu-etal-2021-videoclip, title = "{VideoCLIP}: Contrastive Pre-training for\\Zero-shot Video-Text Understanding", author = "Xu, Hu and Ghosh, Gargi and Huang, Po-Yao and Okhonko, Dmytro and Aghajanyan, Armen and Metze, Florian and Zettlemoyer, Luke and Feichtenhofer, Christoph", booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP)", month = nov, year = "2021", address = "Online", publisher = "Association for Computational Linguistics", } @inproceedings{xu-etal-2021-vlm, title = "{VLM}: Task-agnostic Video-Language Model Pre-training for Video Understanding", author = "Xu, Hu and Ghosh, Gargi and Huang, Po-Yao and Arora, Prahal and Aminzadeh, Masoumeh and Feichtenhofer, Christoph and Metze, Florian and Zettlemoyer, Luke", booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", month = aug, year = "2021", address = "Online", 
publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2021.findings-acl.370", doi = "10.18653/v1/2021.findings-acl.370", pages = "4227--4239", } ``` ### Bug Reports This repo is in its initial stage, welcome bug reports to huxu@fb.com ### Copyright The majority of Multimodal Pre-training (MMPT) is licensed under CC-BY-NC, however portions of the project are available under separate license terms: Evaluation Codes/Models: Howto100M and HuggingFace Transformers are licensed under the Apache2.0 license; COIN and NLG-eval are licensed under the MIT license; CrossTask is licensed under the BSD-3; DiDeMo is licensed under the BSD-2 license. ================================================ FILE: examples/MMPT/endtask.md ================================================ # Zero-shot Transfer and Finetuning (If you are new to the ideas of `mmpt.processors`, see [README](README.md) first.) All finetuning datasets (specifically `processors`) are defined in `mmpt.processors.dsprocessor`. Given the complexity of different types of finetuning tasks, each task may have their own meta/video/text/aligner processors and `mmpt/evaluators/{Predictor,Metric}`. ### Tasks Currently, we support 5 end datasets: `MSRVTT`, `Youcook`, `COIN`, `Crosstask` and `DiDeMo` with the following tasks: text-video retrieval: `MSRVTT`, `Youcook`, `DiDeMo`; video captioning: `Youcook`; Video Question and Answering: `MSRVTT-QA`. To add your own dataset, you can specify the corresponding processors and config them in the `dataset` field of a config file, such as `projects/task/vtt.yaml`. ### Zero-shot Transfer (no Training) Zero-shot transfer will run the pre-trained model (e.g., VideoCLIP) directly on testing data. Configs with pattern: `projects/task/*_zs_*.yaml` are dedicated for zero-shot transfer. 
### Fine-tuning The training of a downstream task is similar to pretraining, except you may need to specify the `restore_file` in `fairseq.checkpoint` and reset optimizers; see `projects/task/ft.yaml` that is included by `projects/task/vtt.yaml`. We typically do finetuning on 2 gpus (`local_small`). ### Testing For each finetuning dataset, you may need to specify a testing config, similar to `projects/task/test_vtt.yaml`. We define `mmpt.evaluators.Predictor` for different types of prediction. For example, `MSRVTT` and `Youcook` are video-retrieval tasks and are expected to use `RetrievalPredictor`. You may need to define your new type of predictors and specify that in the `predictor` field of a testing config. Each task may also have its own metric for evaluation. This can be created in `mmpt.evaluators.Metric` and specified in the `metric` field of a testing config. Launching a test is as simple as launching training: specify the path of a testing config: ```python locallaunch.py projects/mfmmlm/test_vtt.yaml``` Testing will be launched locally by default since prediction is computationally less expensive. ### Third-party Libraries We list the following finetuning tasks that require third-party libraries. Youcook captioning: `https://github.com/Maluuba/nlg-eval` CrossTask: `https://github.com/DmZhukov/CrossTask`'s `dp` under `third-party/CrossTask` (`python setup.py build_ext --inplace`) ================================================ FILE: examples/MMPT/locallaunch.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree.
import argparse import os from omegaconf import OmegaConf from mmpt.utils import recursive_config, overwrite_dir from mmpt_cli.localjob import LocalJob class JobLauncher(object): JOB_CONFIG = { "local": LocalJob, } def __init__(self, yaml_file): self.yaml_file = yaml_file job_key = "local" if yaml_file.endswith(".yaml"): config = recursive_config(yaml_file) if config.task_type is not None: job_key = config.task_type.split("_")[0] else: raise ValueError("unknown extension of job file:", yaml_file) self.job_key = job_key def __call__(self, job_type=None, dryrun=False): if job_type is not None: self.job_key = job_type.split("_")[0] print("[JobLauncher] job_key", self.job_key) job = JobLauncher.JOB_CONFIG[self.job_key]( self.yaml_file, job_type=job_type, dryrun=dryrun) return job.submit() class Pipeline(object): """a job that loads yaml config.""" def __init__(self, fn): """ load a yaml config of a job and save generated configs as yaml for each task. return: a list of files to run as specified by `run_task`. """ if fn.endswith(".py"): # a python command. self.backend = "python" self.run_yamls = [fn] return job_config = recursive_config(fn) if job_config.base_dir is None: # single file job config. self.run_yamls = [fn] return self.project_dir = os.path.join("projects", job_config.project_dir) self.run_dir = os.path.join("runs", job_config.project_dir) if job_config.run_task is not None: run_yamls = [] for stage in job_config.run_task: # each stage can have multiple tasks running in parallel. 
if OmegaConf.is_list(stage): stage_yamls = [] for task_file in stage: stage_yamls.append( os.path.join(self.project_dir, task_file)) run_yamls.append(stage_yamls) else: run_yamls.append(os.path.join(self.project_dir, stage)) self.run_yamls = run_yamls configs_to_save = self._overwrite_task(job_config) self._save_configs(configs_to_save) def __getitem__(self, idx): yaml_files = self.run_yamls[idx] if isinstance(yaml_files, list): return [JobLauncher(yaml_file) for yaml_file in yaml_files] return [JobLauncher(yaml_files)] def __len__(self): return len(self.run_yamls) def _save_configs(self, configs_to_save: dict): # save os.makedirs(self.project_dir, exist_ok=True) for config_file in configs_to_save: config = configs_to_save[config_file] print("saving", config_file) OmegaConf.save(config=config, f=config_file) def _overwrite_task(self, job_config): configs_to_save = {} self.base_project_dir = os.path.join("projects", job_config.base_dir) self.base_run_dir = os.path.join("runs", job_config.base_dir) for config_sets in job_config.task_group: overwrite_config = job_config.task_group[config_sets] if ( overwrite_config.task_list is None or len(overwrite_config.task_list) == 0 ): print( "[warning]", job_config.task_group, "has no task_list specified.") # we don't want this added to a final config. task_list = overwrite_config.pop("task_list", None) for config_file in task_list: config_file_path = os.path.join( self.base_project_dir, config_file) config = recursive_config(config_file_path) # overwrite it. if overwrite_config: config = OmegaConf.merge(config, overwrite_config) overwrite_dir(config, self.run_dir, basedir=self.base_run_dir) save_file_path = os.path.join(self.project_dir, config_file) configs_to_save[save_file_path] = config return configs_to_save def main(args): job_type = args.jobtype if args.jobtype else None # parse multiple pipelines. 
pipelines = [Pipeline(fn) for fn in args.yamls.split(",")] for pipe_id, pipeline in enumerate(pipelines): if not hasattr(pipeline, "project_dir"): for job in pipeline[0]: job(job_type=job_type, dryrun=args.dryrun) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("yamls", type=str) parser.add_argument( "--dryrun", action="store_true", help="run config and prepare to submit without launch the job.", ) parser.add_argument( "--jobtype", type=str, default="", help="force to run jobs as specified.") args = parser.parse_args() main(args) ================================================ FILE: examples/MMPT/mmpt/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. try: # fairseq user dir from .datasets import FairseqMMDataset from .losses import FairseqCriterion from .models import FairseqMMModel from .tasks import FairseqMMTask except ImportError: pass ================================================ FILE: examples/MMPT/mmpt/datasets/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .mmdataset import * try: from .fairseqmmdataset import * except ImportError: pass ================================================ FILE: examples/MMPT/mmpt/datasets/fairseqmmdataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ TODO (huxu): fairseq wrapper class for all dataset you defined: mostly MMDataset. 
""" from collections import OrderedDict from torch.utils.data import Dataset from torch.utils.data.dataloader import default_collate from fairseq.data import FairseqDataset, data_utils class FairseqMMDataset(FairseqDataset): """ A wrapper class for MMDataset for fairseq. """ def __init__(self, mmdataset): if not isinstance(mmdataset, Dataset): raise TypeError("mmdataset must be of type `torch.utils.data.dataset`.") self.mmdataset = mmdataset def set_epoch(self, epoch, **unused): super().set_epoch(epoch) self.epoch = epoch def __getitem__(self, idx): with data_utils.numpy_seed(43211, self.epoch, idx): return self.mmdataset[idx] def __len__(self): return len(self.mmdataset) def collater(self, samples): if hasattr(self.mmdataset, "collator"): return self.mmdataset.collator(samples) if len(samples) == 0: return {} if isinstance(samples[0], dict): batch = OrderedDict() for key in samples[0]: if samples[0][key] is not None: batch[key] = default_collate([sample[key] for sample in samples]) return batch else: return default_collate(samples) def size(self, index): """dummy implementation: we don't use --max-tokens""" return 1 def num_tokens(self, index): """dummy implementation: we don't use --max-tokens""" return 1 ================================================ FILE: examples/MMPT/mmpt/datasets/mmdataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from collections import OrderedDict from torch.utils.data import Dataset from torch.utils.data.dataloader import default_collate from ..utils import set_seed class MMDataset(Dataset): """ A generic multi-modal dataset. Args: `meta_processor`: a meta processor, handling loading meta data and return video_id and text_id. `video_processor`: a video processor, handling e.g., decoding, loading .np files. 
`text_processor`: a text processor, handling e.g., tokenization. `aligner`: combine the video and text feature as one training example. """ def __init__( self, meta_processor, video_processor, text_processor, align_processor, ): self.split = meta_processor.split self.meta_processor = meta_processor self.video_processor = video_processor self.text_processor = text_processor self.align_processor = align_processor def __len__(self): return len(self.meta_processor) def __getitem__(self, idx): if self.split == "test": set_seed(idx) video_id, text_id = self.meta_processor[idx] video_feature = self.video_processor(video_id) text_feature = self.text_processor(text_id) output = self.align_processor(video_id, video_feature, text_feature) # TODO (huxu): the following is for debug purpose. output.update({"idx": idx}) return output def collater(self, samples): """This collator is deprecated. set self.collator = MMDataset.collater. see collator in FairseqMMDataset. """ if len(samples) == 0: return {} if isinstance(samples[0], dict): batch = OrderedDict() for key in samples[0]: if samples[0][key] is not None: batch[key] = default_collate( [sample[key] for sample in samples]) # if torch.is_tensor(batch[key]): # print(key, batch[key].size()) # else: # print(key, len(batch[key])) return batch else: return default_collate(samples) def print_example(self, output): print("[one example]", output["video_id"]) if ( hasattr(self.align_processor, "subsampling") and self.align_processor.subsampling is not None and self.align_processor.subsampling > 1 ): for key in output: if torch.is_tensor(output[key]): output[key] = output[key][0] # search tokenizer to translate ids back. 
tokenizer = None if hasattr(self.text_processor, "tokenizer"): tokenizer = self.text_processor.tokenizer elif hasattr(self.align_processor, "tokenizer"): tokenizer = self.align_processor.tokenizer if tokenizer is not None: caps = output["caps"].tolist() if isinstance(caps[0], list): caps = caps[0] print("caps", tokenizer.decode(caps)) print("caps", tokenizer.convert_ids_to_tokens(caps)) for key, value in output.items(): if torch.is_tensor(value): if len(value.size()) >= 3: # attention_mask. print(key, value.size()) print(key, "first", value[0, :, :]) print(key, "last", value[-1, :, :]) else: print(key, value) print("[end of one example]") ================================================ FILE: examples/MMPT/mmpt/evaluators/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .metric import * from .evaluator import * # experimental. try: from .expmetric import * except ImportError: pass ================================================ FILE: examples/MMPT/mmpt/evaluators/evaluator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import glob import numpy as np from . import metric as metric_path from . import predictor as predictor_path class Evaluator(object): """ perform evaluation on a single (downstream) task. make this both offline and online. TODO(huxu) saving evaluation results. 
""" def __init__(self, config, eval_dataloader=None): if config.metric is None: raise ValueError("config.metric is", config.metric) metric_cls = getattr(metric_path, config.metric) self.metric = metric_cls(config) if config.predictor is None: raise ValueError("config.predictor is", config.predictor) predictor_cls = getattr(predictor_path, config.predictor) self.predictor = predictor_cls(config) self.eval_dataloader = eval_dataloader def __call__(self): try: print(self.predictor.pred_dir) for pred_file in glob.glob( self.predictor.pred_dir + "/*_merged.npy"): outputs = np.load(pred_file) results = self.metric.compute_metrics(outputs) self.metric.print_computed_metrics(results) outputs = np.load(os.path.join( self.predictor.pred_dir, "merged.npy")) results = self.metric.compute_metrics(outputs) return {"results": results, "metric": self.metric} except FileNotFoundError: print("\n[missing]", self.predictor.pred_dir) return {} def evaluate(self, model, eval_dataloader=None, output_file="merged"): if eval_dataloader is None: eval_dataloader = self.eval_dataloader outputs = self.predictor.predict_loop( model, eval_dataloader, output_file) results = self.metric.compute_metrics(**outputs) return results ================================================ FILE: examples/MMPT/mmpt/evaluators/metric.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import numpy as np import json class Metric(object): def __init__(self, config, metric_names): self.metric_names = metric_names def best_metric(self, metric): return metric[self.metric_names[0]] def save_metrics(self, fn, metrics): with open(fn, "w") as fw: json.dump(fw, metrics) def print_computed_metrics(self, metrics): raise NotImplementedError class RetrievalMetric(Metric): """ this is modified from `howto100m/metrics.py`. 
History of changes: refactor as a class. add metric_key in __init__ """ def __init__(self, config, metric_names=["R1", "R5", "R10", "MR"]): super().__init__(config, metric_names) self.error = False # TODO(huxu): add to config to print error. def compute_metrics(self, outputs, texts, **kwargs): x = outputs sx = np.sort(-x, axis=1) d = np.diag(-x) d = d[:, np.newaxis] ind = sx - d ind = np.where(ind == 0) ind = ind[1] metrics = {} metrics["R1"] = float(np.sum(ind == 0)) / len(ind) metrics["R5"] = float(np.sum(ind < 5)) / len(ind) metrics["R10"] = float(np.sum(ind < 10)) / len(ind) metrics["MR"] = np.median(ind) + 1 max_idx = np.argmax(outputs, axis=1) if self.error: # print top-20 errors. error = [] for ex_idx in range(20): error.append((texts[ex_idx], texts[max_idx[ex_idx]])) metrics["error"] = error return metrics def print_computed_metrics(self, metrics): r1 = metrics["R1"] r5 = metrics["R5"] r10 = metrics["R10"] mr = metrics["MR"] print( "R@1: {:.4f} - R@5: {:.4f} - R@10: {:.4f} - Median R: {}".format( r1, r5, r10, mr ) ) if "error" in metrics: print(metrics["error"]) class DiDeMoMetric(Metric): """ History of changes: python 2.x to python 3.x. merge utils.py into eval to save one file. reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py Code to evaluate your results on the DiDeMo dataset. 
""" def __init__(self, config, metric_names=["rank1", "rank5", "miou"]): super().__init__(config, metric_names) def compute_metrics(self, outputs, targets, **kwargs): assert len(outputs) == len(targets) rank1, rank5, miou = self._eval_predictions(outputs, targets) metrics = { "rank1": rank1, "rank5": rank5, "miou": miou } return metrics def print_computed_metrics(self, metrics): rank1 = metrics["rank1"] rank5 = metrics["rank5"] miou = metrics["miou"] # print("Average rank@1: %f" % rank1) # print("Average rank@5: %f" % rank5) # print("Average iou: %f" % miou) print( "Average rank@1: {:.4f} Average rank@5: {:.4f} Average iou: {:.4f}".format( rank1, rank5, miou ) ) def _iou(self, pred, gt): intersection = max(0, min(pred[1], gt[1]) + 1 - max(pred[0], gt[0])) union = max(pred[1], gt[1]) + 1 - min(pred[0], gt[0]) return float(intersection)/union def _rank(self, pred, gt): return pred.index(tuple(gt)) + 1 def _eval_predictions(self, segments, data): ''' Inputs: segments: For each item in the ground truth data, rank possible video segments given the description and video. In DiDeMo, there are 21 posible moments extracted for each video so the list of video segments will be of length 21. The first video segment should be the video segment that best corresponds to the text query. There are 4180 sentence in the validation data, so when evaluating a model on the val dataset, segments should be a list of lenght 4180, and each item in segments should be a list of length 21. data: ground truth data ''' average_ranks = [] average_iou = [] for s, d in zip(segments, data): pred = s[0] ious = [self._iou(pred, t) for t in d['times']] average_iou.append(np.mean(np.sort(ious)[-3:])) ranks = [self._rank(s, t) for t in d['times'] if tuple(t) in s] # if t in s] is added for s, e not in prediction. 
average_ranks.append(np.mean(np.sort(ranks)[:3])) rank1 = np.sum(np.array(average_ranks) <= 1)/float(len(average_ranks)) rank5 = np.sum(np.array(average_ranks) <= 5)/float(len(average_ranks)) miou = np.mean(average_iou) # print("Average rank@1: %f" % rank1) # print("Average rank@5: %f" % rank5) # print("Average iou: %f" % miou) return rank1, rank5, miou class NLGMetric(Metric): def __init__( self, config, metric_names=[ "Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4", "METEOR", "ROUGE_L", "CIDEr" ] ): super().__init__(config, metric_names) # please install NLGEval from `https://github.com/Maluuba/nlg-eval` from nlgeval import NLGEval self.nlg = NLGEval() def compute_metrics(self, outputs, targets, **kwargs): return self.nlg.compute_metrics( hyp_list=outputs, ref_list=targets) def print_computed_metrics(self, metrics): Bleu_1 = metrics["Bleu_1"] Bleu_2 = metrics["Bleu_2"] Bleu_3 = metrics["Bleu_3"] Bleu_4 = metrics["Bleu_4"] METEOR = metrics["METEOR"] ROUGE_L = metrics["ROUGE_L"] CIDEr = metrics["CIDEr"] print( "Bleu_1: {:.4f} - Bleu_2: {:.4f} - Bleu_3: {:.4f} - Bleu_4: {:.4f} - METEOR: {:.4f} - ROUGE_L: {:.4f} - CIDEr: {:.4f}".format( Bleu_1, Bleu_2, Bleu_3, Bleu_4, METEOR, ROUGE_L, CIDEr ) ) class QAMetric(Metric): def __init__( self, config, metric_names=["acc"] ): super().__init__(config, metric_names) def compute_metrics(self, outputs, targets, **kwargs): from sklearn.metrics import accuracy_score return {"acc": accuracy_score(targets, outputs)} def print_computed_metrics(self, metrics): print("acc: {:.4f}".format(metrics["acc"])) class COINActionSegmentationMetric(Metric): """ COIN dataset listed 3 repos for Action Segmentation. Action Sets, NeuralNetwork-Viterbi, TCFPN-ISBA. The first and second are the same. 
def compute_metrics(self, outputs, targets, **kwargs):
    """Compute the frame-level accuracy of predicted labels.

    Args:
        outputs: predicted label of every frame (array-like).
        targets: reference label of every frame (array-like) with the
            same length as ``outputs``.
        **kwargs: ignored; accepted so the signature agrees with the
            ``compute_metrics`` of the other metrics.

    Returns:
        A dict with the single key ``"frame_acc"``: one minus the fraction
        of frames whose prediction differs from the reference.

    Raises:
        ZeroDivisionError: if ``targets`` is empty.
    """
    # np.asarray makes plain Python lists work too; comparing two lists
    # with `!=` yields a single bool rather than a per-frame mask.
    outputs = np.asarray(outputs)
    targets = np.asarray(targets)
    n_frames = len(targets)
    n_errors = int(np.count_nonzero(outputs != targets))
    return {"frame_acc": 1.0 - float(n_errors) / n_frames}
def compute_metrics(self, outputs, targets, splits, **kwargs):
    """Compute top-1 accuracy and recall@{1,5,10} for every split.

    Args:
        outputs: 2-D array of per-class scores, one row per video.
        targets: 1-D array holding the label index of every video.
        splits: iterable of 1-D arrays aligned with ``outputs``. Rows
            whose entry equals 2 are the ones evaluated for that split
            (presumably the test partition; confirm against the data
            processor). Any number of splits is accepted.
        **kwargs: ignored.

    Returns:
        A dict with ``"acc"`` (accuracy of the first split) and the
        per-split lists ``"acc_splits"``, ``"r1_splits"``, ``"r5_splits"``
        and ``"r10_splits"``.

    Raises:
        IndexError: if ``splits`` is empty.
    """
    accs, r1s, r5s, r10s = [], [], [], []
    for split in splits:
        test_idx = np.where(split == 2)[0]
        logits = outputs[test_idx]
        label_test = targets[test_idx]

        top1 = np.argmax(logits, axis=1)
        accs.append(np.sum(top1 == label_test) / float(len(top1)))

        # compute recall: classes sorted by decreasing score, then check
        # whether the label shows up among the first k of them.
        sorted_pred = (-logits).argsort(axis=-1)
        label_column = label_test.reshape(-1, 1)
        for k, bucket in ((1, r1s), (5, r5s), (10, r10s)):
            hits = (sorted_pred[:, :k] == label_column).sum(axis=1)
            bucket.append(np.mean(hits, axis=0))
    return {"acc": accs[0], "acc_splits": accs,
            "r1_splits": r1s, "r5_splits": r5s, "r10_splits": r10s}
def to_ctx(self, data, ctx=0, dtype=None):
    """Move every tensor of a batch dict onto ``ctx``.

    Args:
        data: batch as a dict; it is updated in place. Values that are
            not tensors are left untouched.
        ctx: target passed to ``Tensor.to`` (device index 0 by default).
        dtype: when given, tensors that are ``torch.float32`` are cast to
            this dtype before being moved; other dtypes are kept.

    Returns:
        The same ``data`` dict.

    Raises:
        ValueError: if ``data`` is not a dict.
    """
    if not isinstance(data, dict):
        raise ValueError("non-dict type of batch is not supported yet.")
    for name, value in data.items():
        if not torch.is_tensor(value):
            continue
        if dtype is not None and value.dtype == torch.float32:
            value = value.to(dtype)
        # only existing keys are reassigned, so iterating is safe.
        data[name] = value.to(ctx)
    return data
def finalize(self, outputs, output_file=None):
    """Optionally dump the collected generations to JSON and return them.

    Args:
        outputs: the dict accumulated during prediction.
        output_file: file name without extension; when not None the dict
            is written to ``<self.pred_dir>/<output_file>.json``.

    Returns:
        ``outputs`` unchanged.
    """
    if output_file is None:
        return outputs
    target_path = os.path.join(self.pred_dir, output_file + ".json")
    with open(target_path, "w") as fw:
        json.dump(outputs, fw, indent=4)
    return outputs
def _aggregate_scores(self, scores):
    """Build the text-to-video similarity matrix from the collected chunks.

    Args:
        scores: two-element sequence; ``scores[0]`` holds the per-batch
            video arrays and ``scores[1]`` the per-batch text arrays.

    Returns:
        The matrix product of the concatenated text rows with the
        transposed concatenated video rows.
    """
    assert len(scores) == 2
    video_hidden, text_hidden = (
        np.concatenate(scores[part], axis=0) for part in (0, 1)
    )
    # clear up.
    self.full_scores = []
    return text_hidden @ video_hidden.T
def _aggregate_scores(self, scores):
    """Concatenate the per-batch predictions and answers.

    Args:
        scores: two-element sequence; ``scores[0]`` holds the per-batch
            prediction arrays and ``scores[1]`` the per-batch answer
            arrays.

    Returns:
        ``(outputs, targets)``, each concatenated along axis 0.
    """
    assert len(scores) == 2
    merged = tuple(np.concatenate(scores[part], axis=0) for part in (0, 1))
    # clear up.
    self.full_scores = []
    return merged
def _read_assignment(self, T, K, path):
    """Load a step annotation csv into a binary (T, K) assignment matrix.

    refactored from https://github.com/DmZhukov/CrossTask/blob/master/data.py

    Every csv row is ``step,start,end``. ``step`` is 1-based; ``start`` is
    rounded down and ``end`` rounded up, and rows ``start`` (inclusive) to
    ``end`` (exclusive) of column ``step - 1`` are set to 1.

    How to interpret the constraints on the loss of the reference
    implementation: ``lambd`` is a big number, so ``lambd * C`` is a big
    number for the positions the csv marks as invalid:

        def forward(self, O, Y, C):
            return (Y*(self.lambd * C - self.lsm(O))).mean(dim=0).sum()

    Args:
        T: number of rows of the result.
        K: number of columns of the result.
        path: path of the csv file.

    Returns:
        A ``np.uint8`` array of shape ``(T, K)``.
    """
    Y = np.zeros([T, K], dtype=np.uint8)
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank lines instead of failing on the unpack.
                continue
            step, start, end = line.split(',')
            start = int(math.floor(float(start)))
            end = int(math.ceil(float(end)))
            step = int(step) - 1
            Y[start:end, step] = 1
    return Y
def finalize(self, Y_pred, Y_true, output_file=None):
    """Merge the per-video results, print sample errors and optionally save.

    Args:
        Y_pred: list of 1-D tensors with the predicted labels.
        Y_true: list of 1-D tensors with the reference labels.
        output_file: file name without extension; when not None both
            arrays are pickled to ``<self.pred_dir>/<output_file>.pkl``.

    Returns:
        ``{"outputs": Y_pred, "targets": Y_true}`` as numpy arrays.
    """
    Y_pred = torch.cat(Y_pred, dim=0).numpy()
    Y_true = torch.cat(Y_true, dim=0).numpy()
    assert len(Y_pred) == len(Y_true)

    # show the first twenty mismatches, ten per line.
    mismatched = Y_pred != Y_true
    wrong_pred, wrong_true = Y_pred[mismatched], Y_true[mismatched]
    for window in (slice(None, 10), slice(10, 20)):
        print("sample error", wrong_pred[window], wrong_true[window])

    if output_file is not None:
        save_path = os.path.join(self.pred_dir, output_file + ".pkl")
        with open(save_path, "wb") as fw:
            pickle.dump(
                {"Y_pred": Y_pred, "Y_true": Y_true}, fw,
                protocol=pickle.HIGHEST_PROTOCOL)
    return {"outputs": Y_pred, "targets": Y_true}
""" def __init__(self, config): super().__init__(config) self.dataset_config = config.dataset def predict_loop(self, model, eval_dataloader, output_file="result.pkl"): """refactored from line 144: https://github.com/DmZhukov/CrossTask/blob/master/train.py """ ctx = 0 model.eval() model = model.to(ctx) with torch.no_grad(): outputs = eval_dataloader.dataset.meta_processor.meta_text_labels( self.dataset_config) outputs = self.to_ctx(outputs, ctx) label_hidden_states = model.forward_text(**outputs).cpu() label_sim = label_hidden_states @ label_hidden_states.t() num_labels = label_sim.size(0) eye_mask = ~torch.eye(num_labels, dtype=torch.bool) label_sim = label_sim.masked_select(eye_mask).view(num_labels, num_labels - 1) lbd = label_sim.max() # this is not a loss but just compute neg_log_prob. Y_pred = [] Y_true = [] with torch.no_grad(): for batch in eval_dataloader: self(batch, label_hidden_states, model, lbd, Y_pred, Y_true) return self.finalize(Y_pred, Y_true, output_file) def reshape_subsample(self, sample): for key in sample: if torch.is_tensor(sample[key]): sample[key] = self.flat_subsample(sample[key]) return sample def flat_subsample(self, tensor): if len(tensor.size()) > 1 and tensor.size(0) == 1: tensor = tensor.squeeze(0) return tensor def __call__(self, sample, label_hidden_states, model, lbd, Y_pred, Y_true): sample = self.reshape_subsample(sample) sample = self.to_ctx(sample) # compute the average logits over sliding windows. sample["output_hidden_states"] = True video_outputs = model.forward_video(**sample).cpu() output = {"logits": video_outputs[:, 1:sample["vmasks"].size(1)+1] @ label_hidden_states.t()} logits = self._merge_windows(sample, output) # logic of zero-shot for sequence labeling. logits_argmax = logits.argmax(dim=1) + 1 # 0 is "O" label. 
def finalize(self, Y_pred, Y_true, output_file=None):
    """Merge, report and optionally save the zero-shot predictions.

    The zero-shot predictor post-processes its results exactly like
    ``COINPredictor``, so the work is delegated to the parent class rather
    than keeping a verbatim copy of its body here.

    Args:
        Y_pred: list of 1-D tensors with the predicted labels.
        Y_true: list of 1-D tensors with the reference labels.
        output_file: file name without extension, or None to skip saving.

    Returns:
        ``{"outputs": Y_pred, "targets": Y_true}`` as numpy arrays.
    """
    return super().finalize(Y_pred, Y_true, output_file)
def __call__(self, sample):
    """Rank the candidate moments of every query and record the ranking.

    Reads ``hidden_video``, ``pooled_text`` and ``vmasks`` from ``sample``,
    scores every entry of ``self.possible_segments`` against the text and
    appends, per batch item, the list of ``(start, end)`` moments ordered
    by decreasing score to ``self.full_scores``. Moments whose mask slice
    has no true entry are left out.
    """
    # TODO: make an index select from self.possible_segments.
    pooled_text = sample["pooled_text"]
    vmasks = sample["vmasks"]
    # drop the first and last position (presumably special tokens; confirm).
    frames = sample["hidden_video"][:, 1:-1, :]

    # every segment index covers 5 consecutive positions
    # (presumably 5 feature steps per chunk; confirm).
    spans = [(int(s * 5), int((e + 1) * 5)) for s, e in self.possible_segments]
    pooled_video = torch.cat(
        [frames[:, lo:hi, :].mean(dim=1, keepdim=True) for lo, hi in spans],
        dim=1)
    scores = torch.bmm(
        pooled_video, pooled_text.unsqueeze(-1)).squeeze(-1).cpu()

    ranks = scores.argsort(dim=-1, descending=True)
    for batch_idx, rank in enumerate(ranks):
        rank_of_moment = []
        for moment in rank.tolist():
            lo, hi = spans[moment]
            if torch.any(vmasks[batch_idx, lo:hi]):
                start, end = self.possible_segments[moment]
                rank_of_moment.append((start, end))
        self.full_scores.append(rank_of_moment)
def forward(self, model, sample):
    """Compute the loss for the given sample.

    The wrapped ``self.mmtask`` is called with the model and the sample
    and must return a dict with the keys ``loss``, ``loss_scalar``,
    ``max_len``, ``batch_size`` and ``sample_size``.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
       (always 1 here)
    3) logging outputs to display while training
    """
    task_out = self.mmtask(model, sample)
    loss = task_out["loss"]
    loss_scalar = task_out["loss_scalar"]
    max_len = task_out["max_len"]
    batch_size = task_out["batch_size"]
    sample_size = task_out["sample_size"]

    logging_output = dict(
        loss=loss_scalar,
        ntokens=max_len * batch_size,  # dummy report.
        nsentences=batch_size,  # dummy report.
        sample_size=sample_size,
    )
    return loss, 1, logging_output
class Loss:
    """Base class of all losses: a callable that returns the loss value."""

    def __call__(self, *args, **kwargs):
        raise NotImplementedError


# Dummy Loss for testing.
class DummyLoss(Loss):
    """Plain cross entropy between ``logits`` and ``targets``."""

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, logits, targets, **kwargs):
        criterion = self.loss
        return criterion(logits, targets)


class DummyK400Loss(Loss):
    """dummy k400 loss for MViT: cross entropy against random labels."""

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, logits, targets, **kwargs):
        # `targets` is ignored; one random class in [0, 400) is drawn per row.
        random_labels = torch.randint(
            0, 400, (logits.size(0),), device=logits.device)
        return self.loss(logits, random_labels)


class CrossEntropy(Loss):
    """Cross entropy over all leading dimensions flattened together."""

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, logits, targets, **kwargs):
        num_classes = logits.size(-1)
        flat_logits = logits.reshape(-1, num_classes)
        flat_targets = targets.reshape(-1)
        return self.loss(flat_logits, flat_targets)


class ArgmaxCrossEntropy(Loss):
    """Cross entropy against the argmax (over dim 1) of ``targets``."""

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, logits, targets, **kwargs):
        hard_labels = targets.argmax(dim=1)
        return self.loss(logits, hard_labels)


class BCE(Loss):
    """Binary cross entropy with logits; dim 0 of ``targets`` is squeezed."""

    def __init__(self):
        self.loss = nn.BCEWithLogitsLoss()

    def __call__(self, logits, targets, **kwargs):
        return self.loss(logits, targets.squeeze(0))


class NLGLoss(Loss):
    """Cross entropy against the entries of ``text_label`` that are not -100."""

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, logits, text_label, **kwargs):
        keep = text_label != -100
        return self.loss(logits, text_label[keep])
class NCE(Loss):
    """Softmax NCE over alignment scores.

    The first half of the batch is read as the positive pairs and the
    second half as the negatives, which are shared by every positive.
    """

    def __init__(self):
        # TODO (huxu): define temperature.
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, align_scores, **kargs):
        # note: we reuse the same shape as cls head in BERT (batch_size, 2)
        # but NCE only needs one logits.
        # (so we drop all weights in the second neg logits.)
        first_logit = align_scores[:, :1]
        half = first_logit.size(0) // 2
        positives = first_logit[:half]
        # duplicate negative examples: each row sees all negatives.
        negatives = first_logit[half:].view(1, half).repeat(half, 1)
        candidates = torch.cat([positives, negatives], dim=1)
        # the positive always sits in column 0.
        gold = torch.zeros(
            (half,), dtype=torch.long, device=first_logit.device)
        return self.loss(candidates, gold)
class MMContraLoss(Loss):
    """Sum of the video-to-text and text-to-video contrastive losses.

    Row ``i`` of ``pooled_video`` and row ``i`` of ``pooled_text`` form the
    positive pair; all other rows of the batch serve as negatives.
    """

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(self, pooled_video, pooled_text, **kwargs):
        num_pairs = pooled_video.size(0)
        diagonal = torch.arange(
            num_pairs, dtype=torch.long, device=pooled_video.device)
        video_to_text = torch.matmul(pooled_video, pooled_text.t())
        text_to_video = torch.matmul(pooled_text, pooled_video.t())
        loss_video = self.loss(video_to_text, diagonal)
        loss_text = self.loss(text_to_video, diagonal)
        return loss_video + loss_text
class MFMMLM(Loss):
    """Combination of MFM and MLM: the two cross entropies are summed."""

    def __init__(self):
        self.loss = nn.CrossEntropyLoss()

    def __call__(
        self,
        video_logits,
        text_logits,
        video_label,
        text_label,
        **kwargs
    ):
        # loss for video: the incoming `video_label` is not used, the
        # target of every row is class 0.
        frame_targets = torch.zeros(
            (video_logits.size(0),),
            dtype=torch.long,
            device=video_logits.device
        )
        masked_frame_loss = self.loss(video_logits, frame_targets)

        # loss for text: only positions whose label is not -100 count.
        flat_label = text_label.reshape(-1)
        token_targets = flat_label[flat_label != -100]
        masked_lm_loss = self.loss(text_logits, token_targets)
        return masked_frame_loss + masked_lm_loss
from fairseq.models import ( BaseFairseqModel, register_model, register_model_architecture ) @register_model("mmmodel") class FairseqMMModel(BaseFairseqModel): """a fairseq wrapper of model built by `task`.""" @classmethod def build_model(cls, args, task): return FairseqMMModel(task.mmtask.model) def __init__(self, mmmodel): super().__init__() self.mmmodel = mmmodel def forward(self, *args, **kwargs): return self.mmmodel(*args, **kwargs) def upgrade_state_dict_named(self, state_dict, name): super().upgrade_state_dict_named(state_dict, name) keys_to_delete = [] for key in state_dict: if key not in self.state_dict(): keys_to_delete.append(key) for key in keys_to_delete: print("[INFO]", key, "not used anymore.") del state_dict[key] # copy any newly defined parameters. for key in self.state_dict(): if key not in state_dict: print("[INFO] adding", key) state_dict[key] = self.state_dict()[key] # a dummy arch, we config the model. @register_model_architecture("mmmodel", "mmarch") def mmarch(args): pass ================================================ FILE: examples/MMPT/mmpt/models/mmfusion.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Copyright (c) Facebook, Inc. 
All Rights Reserved import torch from torch import nn try: from transformers import AutoConfig, AutoTokenizer except ImportError: pass from . import transformermodel class MMPTModel(nn.Module): """An e2e wrapper of inference model. """ @classmethod def from_pretrained(cls, config, checkpoint="checkpoint_best.pt"): import os from ..utils import recursive_config from ..tasks import Task config = recursive_config(config) mmtask = Task.config_task(config) checkpoint_path = os.path.join(config.eval.save_path, checkpoint) mmtask.build_model(checkpoint=checkpoint_path) # TODO(huxu): make the video encoder configurable. from ..processors.models.s3dg import S3D video_encoder = S3D('pretrained_models/s3d_dict.npy', 512) video_encoder.load_state_dict( torch.load('pretrained_models/s3d_howto100m.pth')) from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained( config.dataset.bert_name, use_fast=config.dataset.use_fast ) from ..processors import Aligner aligner = Aligner(config.dataset) return ( MMPTModel(config, mmtask.model, video_encoder), tokenizer, aligner ) def __init__(self, config, model, video_encoder, **kwargs): super().__init__() self.max_video_len = config.dataset.max_video_len self.video_encoder = video_encoder self.model = model def forward(self, video_frames, caps, cmasks, return_score=False): bsz = video_frames.size(0) assert bsz == 1, "only bsz=1 is supported now." 
seq_len = video_frames.size(1) video_frames = video_frames.view(-1, *video_frames.size()[2:]) vfeats = self.video_encoder(video_frames.permute(0, 4, 1, 2, 3)) vfeats = vfeats['video_embedding'] vfeats = vfeats.view(bsz, seq_len, vfeats.size(-1)) padding = torch.zeros( bsz, self.max_video_len - seq_len, vfeats.size(-1)) vfeats = torch.cat([vfeats, padding], dim=1) vmasks = torch.cat([ torch.ones((bsz, seq_len), dtype=torch.bool), torch.zeros((bsz, self.max_video_len - seq_len), dtype=torch.bool) ], dim=1 ) output = self.model(caps, cmasks, vfeats, vmasks) if return_score: output = {"score": torch.bmm( output["pooled_video"][:, None, :], output["pooled_text"][:, :, None] ).squeeze(-1).squeeze(-1)} return output class MMFusion(nn.Module): """a MMPT wrapper class for MMBert style models. TODO: move isolated mask to a subclass. """ def __init__(self, config, **kwargs): super().__init__() transformer_config = AutoConfig.from_pretrained( config.dataset.bert_name) self.hidden_size = transformer_config.hidden_size self.is_train = False if config.dataset.train_path is not None: self.is_train = True # 0 means no iso; 1-12 means iso up to that layer. self.num_hidden_layers = transformer_config.num_hidden_layers self.last_iso_layer = 0 if config.dataset.num_iso_layer is not None: self.last_iso_layer = config.dataset.num_iso_layer - 1 + 1 if config.model.mm_encoder_cls is not None: mm_encoder_cls = getattr(transformermodel, config.model.mm_encoder_cls) model_config = AutoConfig.from_pretrained(config.dataset.bert_name) model_config.max_video_len = config.dataset.max_video_len # TODO: a general way to add parameter for a model. 
model_config.use_seg_emb = config.model.use_seg_emb self.mm_encoder = mm_encoder_cls.from_pretrained( config.dataset.bert_name, config=model_config) elif config.model.video_encoder_cls is not None\ and config.model.text_encoder_cls is not None: video_encoder_cls = getattr(transformermodel, config.model.video_encoder_cls) model_config = AutoConfig.from_pretrained(config.dataset.bert_name) model_config.max_video_len = config.dataset.max_video_len # TODO: make each model a set of config class. if hasattr(model_config, "num_layers"): model_config.num_layers = config.model.num_hidden_video_layers else: model_config.num_hidden_layers = config.model.num_hidden_video_layers self.video_encoder = video_encoder_cls.from_pretrained( config.dataset.bert_name, config=model_config) # exact same NLP model from Huggingface. text_encoder_cls = getattr(transformermodel, config.model.text_encoder_cls) self.text_encoder = text_encoder_cls.from_pretrained( config.dataset.bert_name) else: raise ValueError("the encoder must be either MM or two backbones.") def forward( self, caps, cmasks, vfeats, vmasks, **kwargs ): raise NotImplementedError( "Please derive MMFusion module." 
) def _mm_on_the_fly( self, cmasks, vmasks, attention_mask ): """helper function for mask, seg_ids and token_type_ids.""" if attention_mask is None: attention_mask = self._mm_attention_mask(cmasks, vmasks) """ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | """ token_type_ids = torch.cat( [ torch.zeros( (vmasks.size(0), vmasks.size(1) + 2), dtype=torch.long, device=vmasks.device, ), torch.ones( (cmasks.size(0), cmasks.size(1) - 2), dtype=torch.long, device=cmasks.device, ), ], dim=1, ) return attention_mask, token_type_ids def _mm_attention_mask(self, cmasks, vmasks): assert cmasks.size(0) == vmasks.size(0), "{}, {}, {}, {}".format( str(cmasks.size()), str(vmasks.size()), str(cmasks.size(0)), str(vmasks.size(0)), ) mm_mask = torch.cat([cmasks[:, :1], vmasks, cmasks[:, 1:]], dim=1) if self.last_iso_layer == 0: # hard attention mask. return mm_mask else: # a gpu iso mask; 0 : num_iso_layer is isolated; # num_iso_layer: are MM-fused. # make an iso layer batch_size = cmasks.size(0) iso_mask = self._make_iso_mask(batch_size, cmasks, vmasks) mm_mask = mm_mask[:, None, :].repeat(1, mm_mask.size(-1), 1) iso_mm_masks = [] # hard attention mask. iso_mask = iso_mask[:, None, :, :].repeat( 1, self.last_iso_layer, 1, 1) iso_mm_masks.append(iso_mask) if self.last_iso_layer < self.num_hidden_layers: mm_mask = mm_mask[:, None, :, :].repeat( 1, self.num_hidden_layers - self.last_iso_layer, 1, 1 ) iso_mm_masks.append(mm_mask) iso_mm_masks = torch.cat(iso_mm_masks, dim=1) return iso_mm_masks def _make_iso_mask(self, batch_size, cmasks, vmasks): cls_self_mask = torch.cat( [ torch.ones( (batch_size, 1), dtype=torch.bool, device=cmasks.device), torch.zeros( (batch_size, cmasks.size(1) + vmasks.size(1) - 1), dtype=torch.bool, device=cmasks.device) ], dim=1) iso_video_mask = torch.cat( [ # [CLS] is not used. torch.zeros( (batch_size, 1), dtype=torch.bool, device=cmasks.device ), vmasks, # assume to be 1. 
cmasks[:, 1:2], # 2 means [CLS] + [SEP] torch.zeros( (batch_size, cmasks.size(1) - 2), dtype=torch.bool, device=cmasks.device, ), ], dim=1, ) iso_text_mask = torch.cat( [ torch.zeros( (batch_size, 2 + vmasks.size(1)), dtype=torch.bool, device=cmasks.device, ), # [CLS] is not used. cmasks[:, 2:], # assume to be 1. ], dim=1, ) cls_self_mask = cls_self_mask[:, None, :] iso_video_mask = iso_video_mask[:, None, :].repeat( 1, vmasks.size(1) + 1, 1) iso_text_mask = iso_text_mask[:, None, :].repeat( 1, cmasks.size(1) - 2, 1) return torch.cat([cls_self_mask, iso_video_mask, iso_text_mask], dim=1) def _pooling_vt_layer( self, layered_sequence_output, cmasks, vmasks ): layer_idx = self.last_iso_layer \ if self.last_iso_layer > 0 else self.num_hidden_layers hidden_state = layered_sequence_output[layer_idx] # also output pooled_video and pooled_text. batch_size = cmasks.size(0) # pool the modality. text_offset = vmasks.size(1) + 2 # [CLS] + [SEP] # video tokens + [SEP] video_outputs = hidden_state[:, 1:text_offset] video_attention_mask = torch.cat( [ vmasks, torch.ones( (batch_size, 1), dtype=torch.bool, device=vmasks.device), ], dim=1, ) assert video_outputs.size(1) == video_attention_mask.size(1) pooled_video = torch.sum( video_outputs * video_attention_mask.unsqueeze(-1), dim=1 ) / video_attention_mask.sum(1, keepdim=True) # pooled_video = torch.mean(video_outputs[0], dim=1) # text tokens + [SEP] text_attention_mask = cmasks[:, 2:] text_outputs = hidden_state[:, text_offset:] assert text_outputs.size(1) == text_attention_mask.size(1) pooled_text = torch.sum( text_outputs * text_attention_mask.unsqueeze(-1), dim=1 ) / text_attention_mask.sum(1, keepdim=True) return pooled_video, pooled_text class MMFusionMFMMLM(MMFusion): """forward function for MFM and MLM.""" def forward( self, caps, cmasks, vfeats, vmasks, attention_mask=None, video_label=None, text_label=None, **kwargs ): output_hidden_states = False if self.is_train else True target_vfeats, non_masked_frame_mask = None, 
None if video_label is not None: target_vfeats = vfeats.masked_select( video_label.unsqueeze(-1)).view( -1, vfeats.size(-1) ) # mask video token. vfeats[video_label] = 0.0 non_masked_frame_mask = vmasks.clone() non_masked_frame_mask[video_label] = False attention_mask, token_type_ids = self._mm_on_the_fly( cmasks, vmasks, attention_mask) outputs = self.mm_encoder( input_ids=caps, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, masked_frame_labels=video_label, target_video_hidden_states=target_vfeats, non_masked_frame_mask=non_masked_frame_mask, masked_lm_labels=text_label, output_hidden_states=output_hidden_states, ) video_logits, text_logits = outputs[0], outputs[1] if self.is_train: # return earlier for training. return { "video_logits": video_logits, "text_logits": text_logits, } pooled_video, pooled_text = self._pooling_vt_layer( outputs[2], cmasks, vmasks) return {"pooled_video": pooled_video, "pooled_text": pooled_text} class MMFusionMTM(MMFusionMFMMLM): def __init__(self, config, **kwargs): super().__init__(config) """ For reproducibility: self.mm_encoder will be initialized then discarded. """ from .transformermodel import MMBertForMTM model_config = AutoConfig.from_pretrained(config.dataset.bert_name) model_config.max_video_len = config.dataset.max_video_len model_config.use_seg_emb = config.model.use_seg_emb self.mm_encoder = MMBertForMTM.from_pretrained( config.dataset.bert_name, config=model_config) class MMFusionShare(MMFusion): """A retrival wrapper using mm_encoder as both video/text backbone. TODO: move formally. 
""" def forward( self, caps, cmasks, vfeats, vmasks, attention_mask=None, video_label=None, text_label=None, output_hidden_states=False, **kwargs ): pooled_video = self.forward_video( vfeats, vmasks, caps, cmasks, output_hidden_states ) pooled_text = self.forward_text( caps, cmasks, output_hidden_states ) return {"pooled_video": pooled_video, "pooled_text": pooled_text} def forward_video( self, vfeats, vmasks, caps, cmasks, output_hidden_states=False, **kwargs ): input_ids = caps[:, :2] attention_mask = torch.cat([ cmasks[:, :1], vmasks, cmasks[:, 1:2] ], dim=1) token_type_ids = torch.zeros( (vmasks.size(0), vmasks.size(1) + 2), dtype=torch.long, device=vmasks.device) outputs = self.mm_encoder( input_ids=input_ids, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True ) video_outputs = outputs[0] if output_hidden_states: return video_outputs batch_size = cmasks.size(0) video_attention_mask = torch.cat( [ torch.zeros( (batch_size, 1), dtype=torch.bool, device=vmasks.device), vmasks, torch.ones( (batch_size, 1), dtype=torch.bool, device=vmasks.device), ], dim=1, ) assert video_outputs.size(1) == video_attention_mask.size(1) video_attention_mask = video_attention_mask.type(video_outputs.dtype) \ / video_attention_mask.sum(1, keepdim=True) pooled_video = torch.bmm( video_outputs.transpose(2, 1), video_attention_mask.unsqueeze(2) ).squeeze(-1) return pooled_video # video_outputs def forward_text( self, caps, cmasks, output_hidden_states=False, **kwargs ): input_ids = torch.cat([ caps[:, :1], caps[:, 2:], ], dim=1) attention_mask = torch.cat([ cmasks[:, :1], cmasks[:, 2:] ], dim=1) token_type_ids = torch.cat([ torch.zeros( (cmasks.size(0), 1), dtype=torch.long, device=cmasks.device), torch.ones( (cmasks.size(0), cmasks.size(1) - 2), dtype=torch.long, device=cmasks.device) ], dim=1) outputs = self.mm_encoder( input_ids=input_ids, input_video_embeds=None, attention_mask=attention_mask, 
token_type_ids=token_type_ids, output_hidden_states=True ) text_outputs = outputs[0] if output_hidden_states: return text_outputs batch_size = caps.size(0) # text tokens + [SEP] text_attention_mask = torch.cat([ torch.zeros( (batch_size, 1), dtype=torch.bool, device=cmasks.device), cmasks[:, 2:] ], dim=1) assert text_outputs.size(1) == text_attention_mask.size(1) text_attention_mask = text_attention_mask.type(text_outputs.dtype) \ / text_attention_mask.sum(1, keepdim=True) pooled_text = torch.bmm( text_outputs.transpose(2, 1), text_attention_mask.unsqueeze(2) ).squeeze(-1) return pooled_text # text_outputs class MMFusionSeparate(MMFusionShare): def forward_video( self, vfeats, vmasks, caps, cmasks, output_hidden_states=False, **kwargs ): input_ids = caps[:, :2] attention_mask = torch.cat([ cmasks[:, :1], vmasks, cmasks[:, 1:2] ], dim=1) token_type_ids = torch.zeros( (vmasks.size(0), vmasks.size(1) + 2), dtype=torch.long, device=vmasks.device) outputs = self.video_encoder( input_ids=input_ids, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True ) video_outputs = outputs[0] if output_hidden_states: return video_outputs batch_size = cmasks.size(0) video_attention_mask = torch.cat( [ torch.zeros( (batch_size, 1), dtype=torch.bool, device=vmasks.device), vmasks, torch.ones( (batch_size, 1), dtype=torch.bool, device=vmasks.device), ], dim=1, ) assert video_outputs.size(1) == video_attention_mask.size(1) video_attention_mask = video_attention_mask.type(video_outputs.dtype) \ / video_attention_mask.sum(1, keepdim=True) pooled_video = torch.bmm( video_outputs.transpose(2, 1), video_attention_mask.unsqueeze(2) ).squeeze(-1) return pooled_video # video_outputs def forward_text( self, caps, cmasks, output_hidden_states=False, **kwargs ): input_ids = torch.cat([ caps[:, :1], caps[:, 2:], ], dim=1) attention_mask = torch.cat([ cmasks[:, :1], cmasks[:, 2:] ], dim=1) # different from sharing, we use all-0 type. 
token_type_ids = torch.zeros( (cmasks.size(0), cmasks.size(1) - 1), dtype=torch.long, device=cmasks.device) outputs = self.text_encoder( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=True ) text_outputs = outputs[0] if output_hidden_states: return text_outputs batch_size = caps.size(0) # text tokens + [SEP] text_attention_mask = torch.cat([ torch.zeros( (batch_size, 1), dtype=torch.bool, device=cmasks.device), cmasks[:, 2:] ], dim=1) assert text_outputs.size(1) == text_attention_mask.size(1) text_attention_mask = text_attention_mask.type(text_outputs.dtype) \ / text_attention_mask.sum(1, keepdim=True) pooled_text = torch.bmm( text_outputs.transpose(2, 1), text_attention_mask.unsqueeze(2) ).squeeze(-1) return pooled_text # text_outputs class MMFusionJoint(MMFusion): """fine-tuning wrapper for retrival task.""" def forward( self, caps, cmasks, vfeats, vmasks, attention_mask=None, video_label=None, text_label=None, **kwargs ): # TODO (huxu): other ways to do negative examples; move the following # into your criterion forward. output_hidden_states = True attention_mask, token_type_ids = self._mm_on_the_fly( cmasks, vmasks, attention_mask) separate_forward_split = ( None if self.is_train else vmasks.size(1) + 2 ) # [CLS] + [SEP] outputs = self.mm_encoder( input_ids=caps, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=output_hidden_states, separate_forward_split=separate_forward_split, ) pooled_video, pooled_text = self._pooling_vt_layer( outputs[2], cmasks, vmasks) return {"pooled_video": pooled_video, "pooled_text": pooled_text} class MMFusionActionSegmentation(MMFusion): """Fine-tuning wrapper for action segmentation. TODO: rename this for VLM. """ def forward( self, caps, cmasks, vfeats, vmasks, attention_mask=None, **kwargs ): # ActionLocalization assume of batch_size=1, squeeze it. 
caps = caps.view(-1, caps.size(-1)) cmasks = cmasks.view(-1, cmasks.size(-1)) vfeats = vfeats.view(-1, vfeats.size(2), vfeats.size(3)) vmasks = vmasks.view(-1, vmasks.size(-1)) # this may not cover all shapes of attention_mask. attention_mask = attention_mask.view( -1, attention_mask.size(2), attention_mask.size(3)) \ if attention_mask is not None else None # TODO (huxu): other ways to do negative examples; move the following # into your criterion forward. output_hidden_states = True # video forwarding, text is dummy; never use attention_mask. attention_mask, token_type_ids = self._mm_on_the_fly( cmasks, vmasks, attention_mask) logits = self.mm_encoder( input_ids=caps, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=output_hidden_states, ) return {"logits": logits[0][:, 1:vmasks.size(1)+1]} class MMFusionActionLocalization(MMFusion): """fine-tuning model for retrival task.""" def __init__(self, config, **kwargs): super().__init__(config) tokenizer = AutoTokenizer.from_pretrained( config.dataset.bert_name) self.cls_token_id = tokenizer.cls_token_id self.sep_token_id = tokenizer.sep_token_id self.pad_token_id = tokenizer.pad_token_id def forward( self, caps, cmasks, vfeats, vmasks, attention_mask=None, **kwargs ): # ActionLocalization assume of batch_size=1, squeeze it. caps = caps.squeeze(0) cmasks = cmasks.squeeze(0) vfeats = vfeats.squeeze(0) vmasks = vmasks.squeeze(0) attention_mask = attention_mask.squeeze(0) if attention_mask is not None else None # TODO (huxu): other ways to do negative examples; move the following # into your criterion forward. output_hidden_states = True # a len1 dummy video token. 
dummy_vfeats = torch.zeros( (caps.size(0), 1, vfeats.size(-1)), device=vfeats.device, dtype=vfeats.dtype) dummy_vmasks = torch.ones( (caps.size(0), 1), dtype=torch.bool, device=vfeats.device) dummy_caps = torch.LongTensor( [[self.cls_token_id, self.sep_token_id, self.pad_token_id, self.sep_token_id]], ).to(caps.device).repeat(vfeats.size(0), 1) dummy_cmasks = torch.BoolTensor( [[0, 1, 0, 1]] # pad are valid for attention. ).to(caps.device).repeat(vfeats.size(0), 1) # video forwarding, text is dummy; never use attention_mask. attention_mask, token_type_ids = self._mm_on_the_fly( dummy_cmasks, vmasks, None) outputs = self.mm_encoder( input_ids=dummy_caps, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=output_hidden_states, ) layer_idx = self.last_iso_layer \ if self.last_iso_layer > 0 else self.num_hidden_layers video_seq = outputs[2][layer_idx][:, 1:vmasks.size(1)+1].masked_select( vmasks.unsqueeze(-1) ).view(-1, self.hidden_size) # text forwarding, video is dummy attention_mask, token_type_ids = self._mm_on_the_fly( cmasks, dummy_vmasks, None) outputs = self.mm_encoder( input_ids=caps, input_video_embeds=dummy_vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, output_hidden_states=output_hidden_states, ) _, pooled_text = self._pooling_vt_layer( outputs[2], cmasks, dummy_vmasks) # this line is not right. logits = torch.mm(video_seq, pooled_text.transpose(1, 0)) return {"logits": logits} # --------------- MMFusionSeparate for end tasks --------------- class MMFusionSeparateActionSegmentation(MMFusionSeparate): """Fine-tuning wrapper for action segmentation.""" def forward( self, caps, cmasks, vfeats, vmasks, attention_mask=None, **kwargs ): # ActionLocalization assume of batch_size=1, squeeze it. 
caps = caps.view(-1, caps.size(-1)) cmasks = cmasks.view(-1, cmasks.size(-1)) vfeats = vfeats.view(-1, vfeats.size(2), vfeats.size(3)) vmasks = vmasks.view(-1, vmasks.size(-1)) logits = self.forward_video( vfeats, vmasks, caps, cmasks, output_hidden_states=True ) return {"logits": logits[:, 1:vmasks.size(1)+1]} class MMFusionSeparateActionLocalization(MMFusionSeparate): def __init__(self, config, **kwargs): super().__init__(config) tokenizer = AutoTokenizer.from_pretrained( config.dataset.bert_name) self.cls_token_id = tokenizer.cls_token_id self.sep_token_id = tokenizer.sep_token_id self.pad_token_id = tokenizer.pad_token_id def forward( self, caps, cmasks, vfeats, vmasks, **kwargs ): # ActionLocalization assume of batch_size=1, squeeze it. caps = caps.squeeze(0) cmasks = cmasks.squeeze(0) vfeats = vfeats.squeeze(0) vmasks = vmasks.squeeze(0) # TODO (huxu): other ways to do negative examples; move the following # into your criterion forward. dummy_caps = torch.LongTensor( [[self.cls_token_id, self.sep_token_id, self.pad_token_id, self.sep_token_id]], ).to(caps.device).repeat(vfeats.size(0), 1) dummy_cmasks = torch.BoolTensor( [[0, 1, 0, 1]] # pad are valid for attention. ).to(caps.device).repeat(vfeats.size(0), 1) outputs = self.forward_video( vfeats, vmasks, dummy_caps, dummy_cmasks, output_hidden_states=True ) video_seq = outputs[:, 1:vmasks.size(1)+1].masked_select( vmasks.unsqueeze(-1) ).view(-1, self.hidden_size) pooled_text = self.forward_text( caps, cmasks, output_hidden_states=False ) # this line is not right. 
logits = torch.mm(video_seq, pooled_text.transpose(1, 0)) return {"logits": logits} class MMFusionShareActionLocalization(MMFusionShare): def __init__(self, config, **kwargs): super().__init__(config) tokenizer = AutoTokenizer.from_pretrained( config.dataset.bert_name) self.cls_token_id = tokenizer.cls_token_id self.sep_token_id = tokenizer.sep_token_id self.pad_token_id = tokenizer.pad_token_id def forward( self, caps, cmasks, vfeats, vmasks, **kwargs ): # ActionLocalization assume of batch_size=1, squeeze it. caps = caps.squeeze(0) cmasks = cmasks.squeeze(0) vfeats = vfeats.squeeze(0) vmasks = vmasks.squeeze(0) # TODO (huxu): other ways to do negative examples; move the following # into your criterion forward. dummy_caps = torch.LongTensor( [[self.cls_token_id, self.sep_token_id, self.pad_token_id, self.sep_token_id]], ).to(caps.device).repeat(vfeats.size(0), 1) dummy_cmasks = torch.BoolTensor( [[0, 1, 0, 1]] # pad are valid for attention. ).to(caps.device).repeat(vfeats.size(0), 1) outputs = self.forward_video( vfeats, vmasks, dummy_caps, dummy_cmasks, output_hidden_states=True ) video_seq = outputs[:, 1:vmasks.size(1)+1].masked_select( vmasks.unsqueeze(-1) ).view(-1, self.hidden_size) pooled_text = self.forward_text( caps, cmasks, output_hidden_states=False ) # this line is not right. logits = torch.mm(video_seq, pooled_text.transpose(1, 0)) return {"logits": logits} ================================================ FILE: examples/MMPT/mmpt/models/mmfusionnlg.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # Copyright (c) Facebook, Inc. All Rights Reserved import torch from torch.nn import functional as F from typing import Optional, Iterable try: from transformers import BertPreTrainedModel from transformers.modeling_bert import BertOnlyMLMHead from transformers.file_utils import ModelOutput from transformers.modeling_outputs import CausalLMOutput from transformers.generation_utils import ( BeamHypotheses, top_k_top_p_filtering ) except ImportError: pass from .mmfusion import MMFusion from .transformermodel import MMBertModel from ..modules import VideoTokenMLP class MMFusionNLG(MMFusion): def __init__(self, config, **kwargs): super().__init__(config) if config.model.max_decode_length is not None: self.max_length = min( config.model.max_decode_length, config.dataset.max_len - config.dataset.max_video_len - 3 ) else: self.max_length = \ config.dataset.max_len - config.dataset.max_video_len - 3 self.gen_param = config.gen_param if config.gen_param is not None \ else {} def forward( self, caps, cmasks, vfeats, vmasks, attention_mask, video_label=None, text_label=None, **kwargs ): """use pre-trained LM header for generation.""" attention_mask, token_type_ids = self._mm_on_the_fly( cmasks, vmasks, attention_mask) outputs = self.mm_encoder( input_ids=caps, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, masked_lm_labels=text_label, ) return {"logits": outputs[0]} @torch.no_grad() def generate( self, caps, cmasks, vfeats, vmasks, attention_mask=None, bos_token_id=None, eos_token_id=None, **kwargs ): # a simplified interface from # 
https://huggingface.co/transformers/v3.4.0/_modules/transformers/generation_utils.html#GenerationMixin.generate # caps now only have # [CLS], [SEP] (for video) and [CLS] (as bos_token) assert caps.size(1) == 3 attention_mask, token_type_ids = self._mm_on_the_fly( cmasks, vmasks, attention_mask) output = self.mm_encoder.generate( input_ids=caps, input_video_embeds=vfeats, attention_mask=attention_mask, token_type_ids=token_type_ids, bos_token_id=bos_token_id, eos_token_id=eos_token_id, max_length=self.max_length, **self.gen_param ) return output class MMBertForNLG(BertPreTrainedModel): def __init__(self, config): super().__init__(config) self.bert = MMBertModel(config) self.videomlp = VideoTokenMLP(config) # we do not use `BertGenerationOnlyLMHead` # because we can reuse pretraining. self.cls = BertOnlyMLMHead(config) self.hidden_size = config.hidden_size self.init_weights() def get_output_embeddings(self): return self.cls.predictions.decoder def forward( self, input_ids=None, input_video_embeds=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None, output_attentions=None, output_hidden_states=None, return_dict=None, ): # similar to MMBertForMFMMLM without MFM. video_tokens = self.videomlp(input_video_embeds) outputs = self.bert( input_ids, video_tokens, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) sequence_output = outputs[0] prediction_scores = None if masked_lm_labels is not None: text_offset = input_video_embeds.size(1) + 1 # [CLS] # recover caps format: [CLS] [SEP] text [SEP] text_sequence_output = torch.cat( [sequence_output[:, :1], sequence_output[:, text_offset:]], dim=1 ) # only compute select tokens to training to speed up. 
hidden_size = text_sequence_output.size(-1) # masked_lm_labels = masked_lm_labels.reshape(-1) labels_mask = masked_lm_labels != -100 selected_text_output = text_sequence_output.masked_select( labels_mask.unsqueeze(-1) ).view(-1, hidden_size) prediction_scores = self.cls(selected_text_output) if not return_dict: output = ( prediction_scores, ) + outputs[2:] return output # for generation. text_offset = input_video_embeds.size(1) + 2 # [CLS] text_sequence_output = sequence_output[:, text_offset:] prediction_scores = self.cls(text_sequence_output) return CausalLMOutput( loss=None, logits=prediction_scores, ) def prepare_inputs_for_generation( self, input_ids, input_video_embeds, attention_mask=None, token_type_ids=None, **model_kwargs ): # must return a dictionary. seq_len = input_ids.size(1) + input_video_embeds.size(1) if attention_mask is not None: if len(attention_mask.size()) == 4: attention_mask = attention_mask[:, :, :seq_len, :seq_len] elif len(attention_mask.size()) == 3: attention_mask = attention_mask[:, :seq_len, :seq_len] else: attention_mask = attention_mask[:, :seq_len] if token_type_ids is not None: token_type_ids = token_type_ids[:, :seq_len] return { "input_ids": input_ids, "input_video_embeds": input_video_embeds, "attention_mask": attention_mask, "token_type_ids": token_type_ids, } @torch.no_grad() def generate( self, input_ids: Optional[torch.LongTensor] = None, decoder_input_ids: Optional[torch.LongTensor] = None, max_length: Optional[int] = None, min_length: Optional[int] = None, do_sample: Optional[bool] = None, early_stopping: Optional[bool] = None, num_beams: Optional[int] = None, temperature: Optional[float] = None, top_k: Optional[int] = None, top_p: Optional[float] = None, repetition_penalty: Optional[float] = None, bad_words_ids: Optional[Iterable[int]] = None, bos_token_id: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, length_penalty: Optional[float] = None, no_repeat_ngram_size: 
Optional[int] = None, num_return_sequences: Optional[int] = None, attention_mask: Optional[torch.LongTensor] = None, decoder_start_token_id: Optional[int] = None, use_cache: Optional[bool] = None, **model_kwargs ) -> torch.LongTensor: r""" Generates sequences for models with a language modeling head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling. Adapted in part from `Facebook's XLM beam search code `__. Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values indicated are the default values of those config. Most of these parameters are explained in more detail in `this blog post `__. Parameters: input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): The sequence used as a prompt for the generation. If :obj:`None` the method initializes it as an empty :obj:`torch.LongTensor` of shape :obj:`(1,)`. decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): initial input_ids for the decoder of encoder-decoder type models. If :obj:`None` then only decoder_start_token_id is passed as the first token to the decoder. max_length (:obj:`int`, `optional`, defaults to 20): The maximum length of the sequence to be generated. min_length (:obj:`int`, `optional`, defaults to 10): The minimum length of the sequence to be generated. do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether or not to use sampling ; use greedy decoding otherwise. early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not. num_beams (:obj:`int`, `optional`, defaults to 1): Number of beams for beam search. 1 means no beam search. 
temperature (:obj:`float`, `optional`, defaults tp 1.0): The value used to module the next token probabilities. top_k (:obj:`int`, `optional`, defaults to 50): The number of highest probability vocabulary tokens to keep for top-k-filtering. top_p (:obj:`float`, `optional`, defaults to 1.0): If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation. repetition_penalty (:obj:`float`, `optional`, defaults to 1.0): The parameter for repetition penalty. 1.0 means no penalty. See `this paper `__ for more details. pad_token_id (:obj:`int`, `optional`): The id of the `padding` token. bos_token_id (:obj:`int`, `optional`): The id of the `beginning-of-sequence` token. eos_token_id (:obj:`int`, `optional`): The id of the `end-of-sequence` token. length_penalty (:obj:`float`, `optional`, defaults to 1.0): Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer sequences. no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0): If set to int > 0, all ngrams of that size can only occur once. bad_words_ids(:obj:`List[int]`, `optional`): List of token ids that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`. num_return_sequences(:obj:`int`, `optional`, defaults to 1): The number of independently computed returned sequences for each element in the batch. attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for tokens that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token. `What are attention masks? 
<../glossary.html#attention-mask>`__ decoder_start_token_id (:obj:`int`, `optional`): If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token. use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not the model should use the past last key/values attentions (if applicable to the model) to speed up decoding. model_kwargs: Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. Return: :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all batches finished early due to the :obj:`eos_token_id`. Examples:: tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. outputs = model.generate(max_length=40) # do greedy decoding print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache. input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog' for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache. 
input_context = 'The dog' input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True) # generate 3 candidates using sampling for i in range(3): # 3 output sequences were generated print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache. input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True))) tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache. input_context = 'My cute dog' # "Legal" is one of the control codes for ctrl bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']] input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated """ # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. 
`OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping use_cache = use_cache if use_cache is not None else self.config.use_cache num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size else: batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive 
integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(use_cache, bool), "`use_cache` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." assert temperature > 0, "`temperature` should be strictly positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictly positive." assert ( isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0 ), "`no_repeat_ngram_size` should be a positive integer." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictly positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." 
) input_ids = torch.full( (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device, ) else: assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids): attention_mask = input_ids.ne(pad_token_id).long() elif attention_mask is None: attention_mask = input_ids.new_ones(input_ids.shape) # set pad_token_id to eos_token_id if not set. 
Important that this is done after # attention_mask is created if pad_token_id is None and eos_token_id is not None: print( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # vocab size if hasattr(self.config, "vocab_size"): vocab_size = self.config.vocab_size elif ( self.config.is_encoder_decoder and hasattr(self.config, "decoder") and hasattr(self.config.decoder, "vocab_size") ): vocab_size = self.config.decoder.vocab_size else: raise ValueError("either self.config.vocab_size or self.config.decoder.vocab_size needs to be defined") # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: # see if BOS token can be used for decoder_start_token_id if bos_token_id is not None: decoder_start_token_id = bos_token_id elif ( hasattr(self.config, "decoder") and hasattr(self.config.decoder, "bos_token_id") and self.config.decoder.bos_token_id is not None ): decoder_start_token_id = self.config.decoder.bos_token_id else: raise ValueError( "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" ) assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs: ModelOutput = encoder(input_ids, attention_mask=attention_mask, return_dict=True) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: # TODO: make this a call-back function. 
# input_ids=caps, # input_video_embeds=vfeats, # attention_mask=attention_mask, # token_type_ids=token_type_ids, input_video_embeds = model_kwargs.pop("input_video_embeds", None) token_type_ids = model_kwargs.pop("token_type_ids", None) input_ids_len = input_ids.shape[-1] input_ids = input_ids.unsqueeze(1).expand( batch_size, effective_batch_mult * num_beams, input_ids_len) input_video_embeds_len, input_video_embeds_hidden = input_video_embeds.size(1), input_video_embeds.size(2) input_video_embeds = input_video_embeds.unsqueeze(1).expand( batch_size, effective_batch_mult * num_beams, input_video_embeds_len, input_video_embeds_hidden) attention_mask_from_len, attention_mask_to_len = attention_mask.size(1), attention_mask.size(2) attention_mask = attention_mask.unsqueeze(1).expand( batch_size, effective_batch_mult * num_beams, attention_mask_from_len, attention_mask_to_len ) token_type_ids_len = token_type_ids.size(1) token_type_ids = token_type_ids.unsqueeze(1).expand( batch_size, effective_batch_mult * num_beams, token_type_ids_len ) # contiguous ... 
input_ids = input_ids.contiguous().view( effective_batch_size * num_beams, input_ids_len ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) input_video_embeds = input_video_embeds.contiguous().view( effective_batch_size * num_beams, input_video_embeds_len, input_video_embeds_hidden) attention_mask = attention_mask.contiguous().view( effective_batch_size * num_beams, attention_mask_from_len, attention_mask_to_len ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) token_type_ids = token_type_ids.contiguous().view( effective_batch_size * num_beams, token_type_ids_len ) model_kwargs["input_video_embeds"] = input_video_embeds model_kwargs["token_type_ids"] = token_type_ids if self.config.is_encoder_decoder: device = next(self.parameters()).device if decoder_input_ids is not None: # give initial decoder input ids input_ids = decoder_input_ids.repeat(effective_batch_size * num_beams, 1).to(device) else: # create empty decoder input_ids input_ids = torch.full( (effective_batch_size * num_beams, 1), decoder_start_token_id, dtype=torch.long, device=device, ) cur_len = input_ids.shape[-1] assert ( batch_size == encoder_outputs.last_hidden_state.shape[0] ), f"expected encoder_outputs.last_hidden_state to have 1st dimension bs={batch_size}, got {encoder_outputs.last_hidden_state.shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = ( torch.arange(batch_size) .view(-1, 1) .repeat(1, num_beams * effective_batch_mult) .view(-1) .to(input_ids.device) ) # expand encoder_outputs encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.index_select( 0, expanded_batch_idxs ) # save encoder_outputs in `model_kwargs` model_kwargs["encoder_outputs"] = encoder_outputs else: cur_len = input_ids.shape[-1] assert ( cur_len < max_length ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" if num_beams > 1: output = self._generate_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, early_stopping=early_stopping, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, num_return_sequences=num_return_sequences, length_penalty=length_penalty, num_beams=num_beams, vocab_size=vocab_size, attention_mask=attention_mask, use_cache=use_cache, model_kwargs=model_kwargs, ) else: output = self._generate_no_beam_search( input_ids, cur_len=cur_len, max_length=max_length, min_length=min_length, do_sample=do_sample, temperature=temperature, top_k=top_k, top_p=top_p, repetition_penalty=repetition_penalty, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, pad_token_id=pad_token_id, eos_token_id=eos_token_id, batch_size=effective_batch_size, attention_mask=attention_mask, use_cache=use_cache, model_kwargs=model_kwargs, ) return output def _generate_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, early_stopping, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, pad_token_id, eos_token_id, batch_size, num_return_sequences, length_penalty, num_beams, vocab_size, attention_mask, use_cache, model_kwargs, ): """Generate sequences for each example with beam search.""" # generated hypotheses generated_hyps = [ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping) for _ in range(batch_size) ] # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) # for greedy decoding it is made sure that only 
tokens of the first beam are considered to avoid sampling the exact same tokens three times if do_sample is False: beam_scores[:, 1:] = -1e9 beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states past = None # done sentences done = [False for _ in range(batch_size)] while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs ) outputs = self(**model_inputs, return_dict=True) # (batch_size * num_beams, cur_len, vocab_size) next_token_logits = outputs.logits[:, -1, :] # (batch_size * num_beams, vocab_size) # if model has past, then set the past variable to speed up decoding if "past_key_values" in outputs: past = outputs.past_key_values elif "mems" in outputs: past = outputs.mems if self.config.is_encoder_decoder and do_sample is False: # TODO (PVP) still a bit hacky here - there might be a better solution next_token_logits = self.adjust_logits_during_generation( next_token_logits, cur_len=cur_len, max_length=max_length ) scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size) scores = self.postprocess_next_token_scores( scores=scores, input_ids=input_ids, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, cur_len=cur_len, min_length=min_length, max_length=max_length, eos_token_id=eos_token_id, repetition_penalty=repetition_penalty, batch_size=batch_size, num_beams=num_beams, ) assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format( scores.shape, (batch_size * num_beams, vocab_size) ) if do_sample: _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # Temperature if temperature != 1.0: _scores = _scores / temperature # Top-p/top-k filtering _scores = top_k_top_p_filtering( _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2 ) # (batch_size * num_beams, vocab_size) # re-organize to 
group the beam together to sample from all beam_idxs _scores = _scores.contiguous().view( batch_size, num_beams * vocab_size ) # (batch_size, num_beams * vocab_size) # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search) probs = F.softmax(_scores, dim=-1) next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2) # Compute next scores next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2) # sort the sampled vector to make sure that the first num_beams samples are the best next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1) next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2) else: next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) # re-organize to group the beam together (we are keeping top hypothesis accross beams) next_scores = next_scores.view( batch_size, num_beams * vocab_size ) # (batch_size, num_beams * vocab_size) next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams) # next batch beam content next_batch_beam = [] # for each sentence for batch_idx in range(batch_size): # if we are done with this sentence, add a pad token if done[batch_idx]: assert ( len(generated_hyps[batch_idx]) >= num_beams ), "Batch can only be done if at least {} beams have been generated".format(num_beams) assert ( eos_token_id is not None and pad_token_id is not None ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined" next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue # next sentence beam content, this will get added to next_batch_beam next_sent_beam = [] # next tokens for this sentence for beam_token_rank, (beam_token_id, beam_token_score) in enumerate( 
zip(next_tokens[batch_idx], next_scores[batch_idx]) ): # get beam and token IDs beam_id = beam_token_id // vocab_size token_id = beam_token_id % vocab_size effective_beam_id = batch_idx * num_beams + beam_id # add to generated hypotheses if end of sentence if (eos_token_id is not None) and (token_id.item() == eos_token_id): # if beam_token does not belong to top num_beams tokens, it should not be added is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams if is_beam_token_worse_than_top_num_beams: continue generated_hyps[batch_idx].add( input_ids[effective_beam_id].clone(), beam_token_score.item(), ) else: # add next predicted token since it is not eos_token next_sent_beam.append((beam_token_score, token_id, effective_beam_id)) # once the beam for next step is full, don't add more tokens to it. if len(next_sent_beam) == num_beams: break # Check if we are done so that we can save a pad step if all(done) done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done( next_scores[batch_idx].max().item(), cur_len ) # update next beam content assert len(next_sent_beam) == num_beams, "Beam should always be full" next_batch_beam.extend(next_sent_beam) assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step" # stop when we are done with each sentence if all(done): break # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) beam_tokens = input_ids.new([x[1] for x in next_batch_beam]) beam_idx = input_ids.new([x[2] for x in next_batch_beam]) # re-order batch and update current length input_ids = input_ids[beam_idx, :] input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1) cur_len = cur_len + 1 # re-order internal states if past is not None: past = self._reorder_cache(past, beam_idx) # extend attention_mask for new generated input if only decoder # (huxu): move out since we trim attention_mask by 
ourselves. # if self.config.is_encoder_decoder is False: # attention_mask = torch.cat( # [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 # ) # finalize all open beam hypotheses and add to generated hypotheses for batch_idx in range(batch_size): if done[batch_idx]: continue # test that beam scores match previously calculated scores if not eos and batch_idx not done if eos_token_id is not None and all( (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx] ): assert torch.all( next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx] ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format( next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx], ) # need to add best num_beams hypotheses to generated hyps for beam_id in range(num_beams): effective_beam_id = batch_idx * num_beams + beam_id final_score = beam_scores[effective_beam_id].item() final_tokens = input_ids[effective_beam_id] generated_hyps[batch_idx].add(final_tokens, final_score) # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch output_batch_size = batch_size if do_sample else batch_size * num_return_sequences output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences # select the best hypotheses sent_lengths = input_ids.new(output_batch_size) best = [] # retrieve best hypotheses for i, hypotheses in enumerate(generated_hyps): sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0]) for j in range(output_num_return_sequences_per_batch): effective_batch_idx = output_num_return_sequences_per_batch * i + j best_hyp = sorted_hyps.pop()[1] sent_lengths[effective_batch_idx] = len(best_hyp) best.append(best_hyp) # prepare for adding eos sent_max_len = min(sent_lengths.max().item() + 1, max_length) decoded = 
input_ids.new(output_batch_size, sent_max_len) # shorter batches are padded if needed if sent_lengths.min().item() != sent_lengths.max().item(): assert pad_token_id is not None, "`pad_token_id` has to be defined" decoded.fill_(pad_token_id) # fill with hypotheses and eos_token_id if the latter fits in for i, hypo in enumerate(best): decoded[i, : sent_lengths[i]] = hypo if sent_lengths[i] < max_length: decoded[i, sent_lengths[i]] = eos_token_id return decoded def _generate_no_beam_search( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, pad_token_id, eos_token_id, batch_size, attention_mask, use_cache, model_kwargs, ): """Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ # length of generated sentences / unfinished sentences unfinished_sents = input_ids.new(batch_size).fill_(1) sent_lengths = input_ids.new(batch_size).fill_(max_length) past = None while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_kwargs ) outputs = self(**model_inputs, return_dict=True) next_token_logits = outputs.logits[:, -1, :] scores = self.postprocess_next_token_scores( scores=next_token_logits, input_ids=input_ids, no_repeat_ngram_size=no_repeat_ngram_size, bad_words_ids=bad_words_ids, cur_len=cur_len, min_length=min_length, max_length=max_length, eos_token_id=eos_token_id, repetition_penalty=repetition_penalty, batch_size=batch_size, num_beams=1, ) # if model has past, then set the past variable to speed up decoding if "past_key_values" in outputs: past = outputs.past_key_values elif "mems" in outputs: past = outputs.mems if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: scores = scores / temperature # Top-p/top-k filtering 
next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p) # Sample probs = F.softmax(next_token_logscores, dim=-1) next_token = torch.multinomial(probs, num_samples=1).squeeze(1) else: # Greedy decoding next_token = torch.argmax(next_token_logits, dim=-1) # print(next_token_logits[0,next_token[0]], next_token_logits[0,eos_token_id]) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool() sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len) # unfinished_sents is set to zero if eos in sentence unfinished_sents.mul_((~eos_in_sents).long()) # stop when there is a in each sentence, or if we exceed the maximul length if unfinished_sents.max() == 0: break # extend attention_mask for new generated input if only decoder # if self.config.is_encoder_decoder is False: # attention_mask = torch.cat( # [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 # ) return input_ids ================================================ FILE: examples/MMPT/mmpt/models/transformermodel.py ================================================ # coding=utf-8 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
class MMBertForTokenClassification(BertPreTrainedModel):
    """Multimodal BERT encoder followed by a per-position linear classifier.

    Video features go through ``VideoTokenMLP`` and are fed, together with the
    text ids, to ``MMBertModel``; the first element of the encoder output is
    projected to 779 logits. Intended to be fine-tuned starting from another
    pretrained MMFusion model.
    """

    def __init__(self, config):
        super().__init__(config)
        self.videomlp = VideoTokenMLP(config)
        self.bert = MMBertModel(config)
        # Registered on the module but not applied in ``forward``.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # TODO(huxu): 779 is the number of classes for COIN: move to config?
        self.classifier = nn.Linear(config.hidden_size, 779)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        input_video_embeds=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        separate_forward_split=None,
    ):
        """Encode video and text jointly and classify every output position.

        ``next_sentence_label`` is accepted but unused. ``return_dict`` falls
        back to ``self.config.use_return_dict`` when None and is only passed
        on to the encoder.

        Returns:
            A 1-tuple holding ``self.classifier`` applied to the first
            element of the encoder output.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_kwargs = dict(
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            separate_forward_split=separate_forward_split,
        )
        video_tokens = self.videomlp(input_video_embeds)
        encoded = self.bert(input_ids, video_tokens, **encoder_kwargs)

        logits = self.classifier(encoded[0])
        return (logits,)
class MMBertForMFMMLM(BertPreTrainedModel):
    """A BertModel with shared prediction head on MFM-MLM.

    MFM = masked frame modelling (video side), MLM = masked language
    modelling (text side).  Both objectives are scored by the single head
    stored in `self.cls`.
    """

    def __init__(self, config):
        super().__init__(config)
        self.videomlp = VideoTokenMLP(config)
        self.bert = MMBertModel(config)
        self.cls = MFMMLMHead(config)
        self.hidden_size = config.hidden_size
        self.init_weights()

    def get_output_embeddings(self):
        """Return the vocabulary decoder of the prediction head."""
        return self.cls.predictions.decoder

    def forward(
        self,
        input_ids=None,
        input_video_embeds=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        masked_frame_labels=None,
        target_video_hidden_states=None,
        non_masked_frame_mask=None,
        masked_lm_labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Encode the inputs and, when labels are given, score masked items.

        Returns:
            `(mfm_scores, prediction_scores) + encoder_outputs`.  The two
            scores are `None` unless both `masked_frame_labels` and
            `masked_lm_labels` are provided.

        Raises:
            ValueError: if `target_video_hidden_states` is given without
                `input_video_embeds` / `non_masked_frame_mask`, or if labels
                are given without `target_video_hidden_states`.
        """
        return_dict = (
            return_dict if return_dict is not None
            else self.config.use_return_dict
        )
        if input_video_embeds is not None:
            video_tokens = self.videomlp(input_video_embeds)
        else:
            video_tokens = None

        # Bind the name unconditionally: it is consumed further down even
        # when no target frames were supplied.
        non_masked_frame_hidden_states = None
        if target_video_hidden_states is not None:
            if video_tokens is None or non_masked_frame_mask is None:
                raise ValueError(
                    "target_video_hidden_states requires both "
                    "input_video_embeds and non_masked_frame_mask"
                )
            target_video_hidden_states = self.videomlp(
                target_video_hidden_states)

            # Projected frames selected by `non_masked_frame_mask`,
            # flattened to (num_selected, hidden_size).
            non_masked_frame_hidden_states = video_tokens.masked_select(
                non_masked_frame_mask.unsqueeze(-1)
            ).view(-1, self.hidden_size)

        outputs = self.bert(
            input_ids,
            video_tokens,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        mfm_scores, prediction_scores = None, None
        if masked_frame_labels is not None and masked_lm_labels is not None:
            if non_masked_frame_hidden_states is None:
                raise ValueError(
                    "masked_frame_labels / masked_lm_labels require "
                    "target_video_hidden_states to score masked frames"
                )
            # split the sequence.
            text_offset = masked_frame_labels.size(1) + 1  # [CLS]
            video_sequence_output = sequence_output[
                :, 1:text_offset
            ]  # remove [SEP] as not in video_label.
            text_sequence_output = torch.cat(
                [sequence_output[:, :1], sequence_output[:, text_offset:]],
                dim=1
            )

            hidden_size = video_sequence_output.size(-1)
            selected_video_output = video_sequence_output.masked_select(
                masked_frame_labels.unsqueeze(-1)
            ).view(-1, hidden_size)

            # only compute select tokens to training to speed up.
            hidden_size = text_sequence_output.size(-1)
            # positions labelled -100 are skipped.
            labels_mask = masked_lm_labels != -100

            selected_text_output = text_sequence_output.masked_select(
                labels_mask.unsqueeze(-1)
            ).view(-1, hidden_size)
            mfm_scores, prediction_scores = self.cls(
                selected_video_output,
                target_video_hidden_states,
                non_masked_frame_hidden_states,
                selected_text_output,
            )

        output = (
            mfm_scores,
            prediction_scores,
        ) + outputs
        return output
class MFMMLMHead(nn.Module):
    """Thin wrapper exposing `BertMFMMLMPredictionHead` as `predictions`."""

    def __init__(self, config):
        super().__init__()
        self.predictions = BertMFMMLMPredictionHead(config)

    def forward(
        self,
        video_hidden_states=None,
        target_video_hidden_states=None,
        non_masked_frame_hidden_states=None,
        text_hidden_states=None,
    ):
        """Delegate to `self.predictions`; return (video_logits, text_logits)."""
        head_inputs = (
            video_hidden_states,
            target_video_hidden_states,
            non_masked_frame_hidden_states,
            text_hidden_states,
        )
        # Unpack so the result is always a 2-tuple, as before.
        video_logits, text_logits = self.predictions(*head_inputs)
        return video_logits, text_logits
class MMBertModel(BertModel):
    """MMBertModel has MMBertEmbedding to support video tokens.

    The stock embedding layer is replaced by `MMBertEmbeddings` and the
    encoder by `MultiLayerAttentionMaskBertEncoder`, which accepts one
    attention mask per layer.
    """

    def __init__(self, config, add_pooling_layer=True):
        # NOTE(review): `add_pooling_layer` is accepted but not forwarded to
        # `BertModel.__init__`; the parent's default is always used.
        super().__init__(config)
        # overwrite embedding
        self.embeddings = MMBertEmbeddings(config)
        self.encoder = MultiLayerAttentionMaskBertEncoder(config)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        input_video_embeds=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        separate_forward_split=None,
    ):
        """Encode text (ids or embeddings) together with optional video.

        Args:
            separate_forward_split: when not None, the fused sequence is cut
                at this position and the two parts are encoded independently,
                then concatenated again along the sequence dimension.

        Returns:
            `(sequence_output, pooled_output) + encoder_outputs[1:]`.

        Raises:
            ValueError: if both or neither of `input_ids` and
                `inputs_embeds` are given.
        """
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None
            else self.config.use_return_dict
        )

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both input_ids "
                "and inputs_embeds at the same time"
            )
        if input_ids is None and inputs_embeds is None:
            raise ValueError(
                "You have to specify either input_ids or inputs_embeds")

        # Whichever text input was supplied defines batch size, text length
        # and device.  (Previously the embeds-only/no-video case read the
        # shape from `input_ids`, which is None there.)
        text_input = input_ids if input_ids is not None else inputs_embeds
        total_len = text_input.size(1)
        if input_video_embeds is not None:
            total_len += input_video_embeds.size(1)
        input_shape = (text_input.size(0), total_len)
        device = text_input.device

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(
                input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case
        # we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = \
            self.get_extended_attention_mask(
                attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make broadcastable to
        # [batch_size, num_heads, seq_length, seq_length]
        if self.config.is_decoder and encoder_hidden_states is not None:
            (
                encoder_batch_size,
                encoder_sequence_length,
                _,
            ) = encoder_hidden_states.size()
            encoder_hidden_shape = (
                encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(
                    encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(
                encoder_attention_mask
            )
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or
        # [num_hidden_layers x num_heads]
        # and head_mask is converted to shape
        # [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(
            head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids,
            input_video_embeds,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )

        if separate_forward_split is not None:
            # NOTE(review): the slicing below uses five subscripts, so this
            # path appears to need the per-layer (5-D) extended mask, i.e. a
            # 4-D `attention_mask` -- confirm with callers.
            split_embedding_output = \
                embedding_output[:, :separate_forward_split]
            split_extended_attention_mask = extended_attention_mask[
                :, :, :, :separate_forward_split, :separate_forward_split
            ]
            split_encoder_outputs = self.encoder(
                split_embedding_output,
                attention_mask=split_extended_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            assert (
                len(split_encoder_outputs) <= 2
            ), "we do not support merge on attention for now."
            # Collect per-part results in lists, concatenated below.
            encoder_outputs = []
            encoder_outputs.append([split_encoder_outputs[0]])
            if len(split_encoder_outputs) == 2:
                encoder_outputs.append([])
                for _all_hidden_states in split_encoder_outputs[1]:
                    encoder_outputs[-1].append([_all_hidden_states])

            split_embedding_output = \
                embedding_output[:, separate_forward_split:]
            split_extended_attention_mask = extended_attention_mask[
                :, :, :, separate_forward_split:, separate_forward_split:
            ]

            split_encoder_outputs = self.encoder(
                split_embedding_output,
                attention_mask=split_extended_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            assert (
                len(split_encoder_outputs) <= 2
            ), "we do not support merge on attention for now."
            encoder_outputs[0].append(split_encoder_outputs[0])
            encoder_outputs[0] = torch.cat(encoder_outputs[0], dim=1)
            if len(split_encoder_outputs) == 2:
                for layer_idx, _all_hidden_states in enumerate(
                    split_encoder_outputs[1]
                ):
                    encoder_outputs[1][layer_idx].append(_all_hidden_states)
                    encoder_outputs[1][layer_idx] = torch.cat(
                        encoder_outputs[1][layer_idx], dim=1
                    )
            encoder_outputs = tuple(encoder_outputs)
        else:
            encoder_outputs = self.encoder(
                embedding_output,
                attention_mask=extended_attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_extended_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        sequence_output = encoder_outputs[0]
        pooled_output = (
            self.pooler(sequence_output) if self.pooler is not None else None
        )

        return (sequence_output, pooled_output) + encoder_outputs[1:]

    def get_extended_attention_mask(self, attention_mask, input_shape, device):
        """This is borrowed from `modeling_utils.py` with the support of
        multi-layer attention masks.
        The second dim is expected to be number of layers.
        See `MMAttentionMaskProcessor`.
        Makes broadcastable attention and causal masks so that future
        and masked tokens are ignored.

        Arguments:
            attention_mask (:obj:`torch.Tensor`):
                Mask with ones indicating tokens to attend to,
                zeros for tokens to ignore.
            input_shape (:obj:`Tuple[int]`):
                The shape of the input to the model.
            device: (:obj:`torch.device`):
                The device of the input to the model.

        Returns:
            :obj:`torch.Tensor` The extended attention mask; for a 4-D input
            it is cast to :obj:`self.dtype`, otherwise the parent
            implementation decides.
        """
        if attention_mask.dim() != 4:
            return super().get_extended_attention_mask(
                attention_mask, input_shape, device
            )
        # 4-D mask: insert a singleton dim at index 2 (before the two
        # sequence dims), giving a 5-D mask.
        extended_attention_mask = attention_mask[:, :, None, :, :]
        extended_attention_mask = extended_attention_mask.to(
            dtype=self.dtype
        )  # fp16 compatibility
        # 1 -> 0.0 (keep), 0 -> -10000.0 (added to the attention scores).
        extended_attention_mask = (1.0 - extended_attention_mask) \
            * -10000.0
        return extended_attention_mask


class MultiLayerAttentionMaskBertEncoder(BertEncoder):
    """extend BertEncoder with the capability of
    multiple layers of attention mask."""

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=False,
    ):
        """Run every layer, picking layer `i`'s slice of a 5-D mask.

        `return_dict` is accepted for signature compatibility; the result is
        always a tuple of the non-None items among
        `(hidden_states, all_hidden_states, all_attentions)`.
        """
        all_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        # Only a 5-D mask carries a per-layer dimension; `None` (the default)
        # and lower-rank masks are shared by all layers.
        per_layer_mask = (
            attention_mask is not None and attention_mask.dim() == 5
        )
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_attention_mask = (
                attention_mask[:, i, :, :, :]
                if per_layer_mask
                else attention_mask
            )

            if getattr(self.config, "gradient_checkpointing", False):

                def create_custom_forward(module):
                    def custom_forward(*inputs):
                        return module(*inputs, output_attentions)

                    return custom_forward

                layer_outputs = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(layer_module),
                    hidden_states,
                    layer_attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    layer_attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    output_attentions,
                )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return tuple(
            v
            for v in [hidden_states, all_hidden_states, all_attentions]
            if v is not None
        )
class MMBertEmbeddings(BertEmbeddings):
    """BERT embeddings that splice video tokens in right after [CLS].

    The text input has the layout `[CLS] [SEP] caption [SEP] padding`; the
    fused sequence becomes `[CLS] video tokens [SEP] caption [SEP]`.
    """

    def __init__(self, config):
        super().__init__(config)
        self.max_video_len = config.max_video_len
        if hasattr(config, "use_seg_emb") and config.use_seg_emb:
            # the original VLM paper uses seg_embeddings for temporal space.
            # although not used it changed the randomness of initialization.
            # we keep it for reproducibility.
            self.seg_embeddings = nn.Embedding(256, config.hidden_size)

    def forward(
        self,
        input_ids,
        input_video_embeds,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        """Embed text (ids or ready-made embeddings) plus optional video.

        Args:
            input_ids: text token ids, or None when `inputs_embeds` is given.
            input_video_embeds: video token embeddings, or None for the
                text-only case.
            token_type_ids: optional; zeros over the fused length if None.
            position_ids: optional; derived from `max_video_len` if None.
            inputs_embeds: optional precomputed text embeddings.

        Returns:
            The layer-normalised, dropped-out sum of token, position and
            token-type embeddings for the fused sequence.
        """
        input_tensor = input_ids if input_ids is not None else inputs_embeds
        text_len = input_tensor.size(1)
        if input_video_embeds is not None:
            input_shape = (
                input_tensor.size(0),
                text_len + input_video_embeds.size(1),
            )
        else:
            input_shape = (input_tensor.size(0), text_len)

        if position_ids is None:
            # Auto skip position embeddings for text only case.
            # use cases:
            # (1) action localization and segmentation:
            #     feed in len-1 dummy video token
            #     needs text part to skip input_video_embeds.size(1)
            #     for the right position_ids for video [SEP]
            #     and rest text tokens.
            # (2) MMFusionShare for two forward passings:
            #     in `forward_text`: input_video_embeds is None.
            #     need to skip video [SEP] token.
            #
            # video_len + 1: [CLS] + video_embed
            # self.max_video_len + 1: [SEP] for video.
            # self.max_video_len + 2: first text token.
            # self.max_video_len + text_len: rest for text.
            #
            # `text_len` comes from whichever text input was supplied, so the
            # embeds-only path no longer dereferences a None `input_ids`.
            if input_video_embeds is not None:
                video_len = input_video_embeds.size(1)
                starting_offset = self.max_video_len + 1  # video [SEP]
                ending_offset = self.max_video_len + text_len
            else:
                video_len = 0
                starting_offset = self.max_video_len + 2  # first text token.
                ending_offset = self.max_video_len + text_len + 1
            position_ids = torch.cat([
                self.position_ids[:, :video_len + 1],
                self.position_ids[:, starting_offset:ending_offset]
            ], dim=1)

        if token_type_ids is None:
            token_type_ids = torch.zeros(
                input_shape, dtype=torch.long, device=self.position_ids.device
            )

        # the format of input_ids is [CLS] [SEP] caption [SEP] padding.
        # the goal is to build [CLS] video tokens [SEP] caption [SEP] .
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        if input_video_embeds is not None:
            inputs_mm_embeds = torch.cat([
                inputs_embeds[:, :1], input_video_embeds, inputs_embeds[:, 1:]
            ], dim=1)
        else:
            # text only for `MMFusionShare`.
            inputs_mm_embeds = inputs_embeds

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_mm_embeds + position_embeddings
        embeddings += token_type_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class VectorRetriever(object):
    """
    How2 Video Retriver.
    Reference usage of FAISS:
    https://github.com/fairinternal/fairseq-py/blob/paraphrase_pretraining/fairseq/data/multilingual_faiss_dataset.py

    Vectors are stored in a FAISS index (`self.db`); `videoid_to_vectoridx`
    maps each video id to the row it occupies in that index.
    """

    def __init__(self, hidden_size, cent, db_type, examples_per_cent_to_train):
        if db_type == "flatl2":
            quantizer = faiss.IndexFlatL2(hidden_size)  # the other index
            self.db = faiss.IndexIVFFlat(
                quantizer, hidden_size, cent, faiss.METRIC_L2)
        elif db_type == "pq":
            self.db = faiss.index_factory(
                hidden_size, f"IVF{cent}_HNSW32,PQ32"
            )
        else:
            raise ValueError("unknown type of db", db_type)
        # Vectors are buffered until this many have been seen, then the
        # index is trained.
        self.train_thres = cent * examples_per_cent_to_train
        self.train_cache = []
        self.train_len = 0
        self.videoid_to_vectoridx = {}
        self.vectoridx_to_videoid = None  # reverse map, built lazily
        self.make_direct_maps_done = False

    def make_direct_maps(self):
        """Build the FAISS direct map and remember that it exists."""
        faiss.downcast_index(self.db).make_direct_map()
        self.make_direct_maps_done = True

    def _ensure_lookup_tables(self, direct_map=True):
        """Lazily build the direct map (optional) and the reverse id map."""
        if direct_map and not self.make_direct_maps_done:
            self.make_direct_maps()
        if self.vectoridx_to_videoid is None:
            self.vectoridx_to_videoid = {
                vector_idx: videoid
                for videoid, vector_idx in self.videoid_to_vectoridx.items()
            }
            assert len(self.vectoridx_to_videoid) \
                == len(self.videoid_to_vectoridx)

    def __len__(self):
        return self.db.ntotal

    def save(self, out_dir):
        """Write the index and the id map into `out_dir`."""
        faiss.write_index(
            self.db,
            os.path.join(out_dir, "faiss_idx")
        )
        with open(
                os.path.join(
                    out_dir, "videoid_to_vectoridx.pkl"),
                "wb") as fw:
            pickle.dump(
                self.videoid_to_vectoridx, fw,
                protocol=pickle.HIGHEST_PROTOCOL
            )

    def load(self, out_dir):
        """Read the index and the id map written by `save`.

        NOTE: the id map is unpickled; only load directories you trust.
        """
        fn = os.path.join(out_dir, "faiss_idx")
        self.db = faiss.read_index(fn)
        with open(
                os.path.join(out_dir, "videoid_to_vectoridx.pkl"), "rb") as fr:
            self.videoid_to_vectoridx = pickle.load(fr)
        # Anything derived from the previous index/map is now stale.
        self.vectoridx_to_videoid = None
        self.make_direct_maps_done = False

    def add(self, hidden_states, video_ids, last=False):
        """Add float32 row vectors for video ids not seen before.

        Until the index is trained, vectors are cached; training starts once
        `train_thres` vectors were collected.  `last` is unused.
        """
        assert len(hidden_states) == len(video_ids), "{}, {}".format(
            str(len(hidden_states)), str(len(video_ids)))
        assert len(hidden_states.shape) == 2
        assert hidden_states.dtype == np.float32

        valid_idx = []
        for idx, video_id in enumerate(video_ids):
            if video_id not in self.videoid_to_vectoridx:
                valid_idx.append(idx)
                self.videoid_to_vectoridx[video_id] = \
                    len(self.videoid_to_vectoridx)
        if valid_idx:
            # New ids make a previously built reverse map incomplete.
            self.vectoridx_to_videoid = None

        hidden_states = hidden_states[valid_idx]
        if not self.db.is_trained:
            self.train_cache.append(hidden_states)
            self.train_len += hidden_states.shape[0]
            if self.train_len < self.train_thres:
                return
            self.finalize_training()
        else:
            self.db.add(hidden_states)

    def finalize_training(self):
        """Train the index on the cached vectors, then add all of them."""
        hidden_states = np.concatenate(self.train_cache, axis=0)
        del self.train_cache
        local_rank = get_local_rank()
        if local_rank == 0:
            start = time.time()
            print("training db on", self.train_thres, "/", self.train_len)
        self.db.train(hidden_states[:self.train_thres])
        if local_rank == 0:
            print("training db for", time.time() - start)
        self.db.add(hidden_states)

    def search(
        self,
        query_hidden_states,
        orig_dist,
    ):
        """Return the nearest stored video id for every query vector.

        Entries without a hit, and entries whose retrieved distance is
        `<= orig_dist`, are set to -1.
        NOTE(review): `VectorRetrieverDM.search` keeps hits with
        `queried_dist < orig_dist` instead -- confirm which is intended.
        """
        if len(self.videoid_to_vectoridx) != self.db.ntotal:
            raise ValueError(
                "cannot search: size mismatch in-between index and db",
                len(self.videoid_to_vectoridx),
                self.db.ntotal
            )
        self._ensure_lookup_tables(direct_map=False)

        # MultilingualFaissDataset uses the following; not sure the purpose.
        # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10)
        queried_dist, index = self.db.search(query_hidden_states, 1)
        queried_dist, index = queried_dist[:, 0], index[:, 0]

        outputs = np.array(
            [self.vectoridx_to_videoid[_index]
                if _index != -1 else (-1, -1, -1) for _index in index],
            dtype=np.int32)
        outputs[queried_dist <= orig_dist] = -1
        return outputs

    def search_by_video_ids(
        self,
        video_ids,
        retri_factor
    ):
        """For each video id, list itself plus up to `retri_factor` neighbours.

        The query vectors are reconstructed from the index, so the video
        itself is normally among its own hits; it is filtered out of the
        neighbours and placed first instead.
        """
        if len(self.videoid_to_vectoridx) != self.db.ntotal:
            raise ValueError(
                len(self.videoid_to_vectoridx),
                self.db.ntotal
            )
        self._ensure_lookup_tables()

        vector_ids = [
            self.videoid_to_vectoridx[video_id] for video_id in video_ids
        ]
        query_hidden_states = np.stack(
            [self.db.reconstruct(vector_id) for vector_id in vector_ids]
        )

        # MultilingualFaissDataset uses the following; not sure the reason.
        # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10)
        _, index = self.db.search(query_hidden_states, retri_factor)
        outputs = []
        for sample_idx, sample in enumerate(index):
            # the first video_id is always the video itself.
            cands = [video_ids[sample_idx]]
            cands.extend(
                self.vectoridx_to_videoid[vector_idx]
                for vector_idx in sample
                if vector_idx >= 0 and vector_ids[sample_idx] != vector_idx
            )
            outputs.append(cands)
        return outputs


class VectorRetrieverDM(VectorRetriever):
    """
    with direct map.
    How2 Video Retriver.
    Reference usage of FAISS:
    https://github.com/fairinternal/fairseq-py/blob/paraphrase_pretraining/fairseq/data/multilingual_faiss_dataset.py
    """

    def __init__(
        self,
        hidden_size,
        cent,
        db_type,
        examples_per_cent_to_train
    ):
        super().__init__(
            hidden_size, cent, db_type, examples_per_cent_to_train)
        self.make_direct_maps_done = False

    def search(
        self,
        query_hidden_states,
        orig_dist,
    ):
        """Return, per query, the nearest video id or None.

        A hit is kept only when its distance is strictly smaller than the
        corresponding entry of `orig_dist`.
        """
        if len(self.videoid_to_vectoridx) != self.db.ntotal:
            raise ValueError(
                len(self.videoid_to_vectoridx),
                self.db.ntotal
            )
        self._ensure_lookup_tables()

        # MultilingualFaissDataset uses the following; not sure the reason.
        # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10)
        queried_dist, index = self.db.search(query_hidden_states, 1)
        # Take the single-neighbour column so each hit is a scalar; a
        # shape-(1,) row is not hashable and cannot be used as a dict key.
        queried_dist, index = queried_dist[:, 0], index[:, 0]
        outputs = []
        for sample_idx, sample in enumerate(index):
            if sample >= 0 \
                    and queried_dist[sample_idx] < orig_dist[sample_idx]:
                outputs.append(self.vectoridx_to_videoid[sample])
            else:
                outputs.append(None)
        return outputs

    def search_by_video_ids(
        self,
        video_ids,
        retri_factor=8
    ):
        """Same as the base implementation, with a default of 8 neighbours."""
        return super().search_by_video_ids(video_ids, retri_factor)


class MMVectorRetriever(VectorRetrieverDM):
    """
    multimodal vector retriver:
    text retrieve video or video retrieve text.

    `self.db` is a dict holding one index per modality
    (`"video"` and `"text"`).
    """

    def __init__(self, hidden_size, cent, db_type, examples_per_cent_to_train):
        # The parent constructor is run twice on purpose: each run creates a
        # fresh index, which is captured before the next run replaces it.
        super().__init__(
            hidden_size, cent, db_type, examples_per_cent_to_train)
        video_db = self.db
        super().__init__(
            hidden_size, cent, db_type, examples_per_cent_to_train)
        text_db = self.db
        self.db = {"video": video_db, "text": text_db}
        self.video_to_videoid = defaultdict(list)

    def __len__(self):
        assert self.db["video"].ntotal == self.db["text"].ntotal
        return self.db["video"].ntotal

    def make_direct_maps(self):
        """Build direct maps for both indexes and remember that they exist."""
        faiss.downcast_index(self.db["video"]).make_direct_map()
        faiss.downcast_index(self.db["text"]).make_direct_map()
        self.make_direct_maps_done = True

    def save(self, out_dir):
        """Write both indexes and the id map into `out_dir`."""
        faiss.write_index(
            self.db["video"],
            os.path.join(out_dir, "video_faiss_idx")
        )
        faiss.write_index(
            self.db["text"],
            os.path.join(out_dir, "text_faiss_idx")
        )

        with open(
                os.path.join(
                    out_dir, "videoid_to_vectoridx.pkl"),
                "wb") as fw:
            pickle.dump(
                self.videoid_to_vectoridx, fw,
                protocol=pickle.HIGHEST_PROTOCOL
            )

    def load(self, out_dir):
        """Read both indexes and the id map written by `save`.

        NOTE: the id map is unpickled; only load directories you trust.
        """
        fn = os.path.join(out_dir, "video_faiss_idx")
        video_db = faiss.read_index(fn)
        fn = os.path.join(out_dir, "text_faiss_idx")
        text_db = faiss.read_index(fn)
        self.db = {"video": video_db, "text": text_db}
        with open(
                os.path.join(out_dir, "videoid_to_vectoridx.pkl"), "rb") as fr:
            self.videoid_to_vectoridx = pickle.load(fr)
        self.video_to_videoid = defaultdict(list)
        # Anything derived from the previous indexes/map is now stale.
        self.vectoridx_to_videoid = None
        self.make_direct_maps_done = False

    def add(self, hidden_states, video_ids):
        """hidden_states is a pair `(video, text)`

        The array is 3-D; after keeping unseen ids it is transposed with
        `(1, 0, 2)` so that index 0 / 1 on the first axis select the video /
        text vectors.
        """
        assert len(hidden_states) == len(video_ids), "{}, {}".format(
            str(len(hidden_states)), str(len(video_ids)))
        assert len(hidden_states.shape) == 3
        assert len(self.video_to_videoid) == 0

        valid_idx = []
        for idx, video_id in enumerate(video_ids):
            if video_id not in self.videoid_to_vectoridx:
                valid_idx.append(idx)
                self.videoid_to_vectoridx[video_id] = \
                    len(self.videoid_to_vectoridx)
        if valid_idx:
            # New ids make a previously built reverse map incomplete.
            self.vectoridx_to_videoid = None

        batch_size = hidden_states.shape[0]
        hidden_states = hidden_states[valid_idx]

        hidden_states = np.transpose(hidden_states, (1, 0, 2)).copy()
        if not self.db["video"].is_trained:
            self.train_cache.append(hidden_states)
            # NOTE(review): an estimate -- it assumes every cached batch had
            # `batch_size` rows and ignores rows dropped as duplicates.
            train_len = batch_size * len(self.train_cache)
            if train_len < self.train_thres:
                return

            hidden_states = np.concatenate(self.train_cache, axis=1)
            del self.train_cache
            self.db["video"].train(hidden_states[0, :self.train_thres])
            self.db["text"].train(hidden_states[1, :self.train_thres])
        self.db["video"].add(hidden_states[0])
        self.db["text"].add(hidden_states[1])

    def get_clips_by_video_id(self, video_id):
        """Return every stored `(video_id, video_clip, text_clip)` key of a video.

        The grouping is built on first use from the keys of
        `videoid_to_vectoridx`, which are unpacked as 3-tuples.
        """
        if not self.video_to_videoid:
            # Distinct loop names: the requested `video_id` must survive the
            # loop, otherwise the last key's video would be returned.
            for vid, video_clip, text_clip in self.videoid_to_vectoridx:
                self.video_to_videoid[vid].append(
                    (vid, video_clip, text_clip))
        return self.video_to_videoid[video_id]

    def search(
        self,
        video_ids,
        target_modality,
        retri_factor=8
    ):
        """Cross-modal search: query with one modality, retrieve the other.

        Each query vector is reconstructed from the source-modality index and
        searched in the `target_modality` index.  Returns one list of stored
        ids per query.
        """
        if len(self.videoid_to_vectoridx) != len(self):
            raise ValueError(
                len(self.videoid_to_vectoridx),
                len(self)
            )
        self._ensure_lookup_tables()

        src_modality = "text" if target_modality == "video" else "video"

        vector_ids = [
            self.videoid_to_vectoridx[video_id] for video_id in video_ids
        ]
        query_hidden_states = np.stack([
            self.db[src_modality].reconstruct(vector_id)
            for vector_id in vector_ids
        ])

        # MultilingualFaissDataset uses the following; not sure the reason.
        # faiss.ParameterSpace().set_index_parameter(self.db, "nprobe", 10)
        _, index = self.db[target_modality].search(
            query_hidden_states, retri_factor)
        outputs = []
        for sample in index:
            outputs.append([
                self.vectoridx_to_videoid[vector_idx]
                for vector_idx in sample
                if vector_idx >= 0
            ])
        return outputs
""" def __init__(self, config): super().__init__(config) self.build_retriver(self.retriever_cls, self.hidden_size) def __call__(self, sample, subsampling, **kwargs): hidden_states = ( sample["pooled_video"] + sample["pooled_text"]) / 2. hidden_states = hidden_states.view( -1, subsampling, hidden_states.size(-1)) hidden_states = torch.mean(hidden_states, dim=1) hidden_states = hidden_states.cpu().detach().numpy() video_ids = [] for offset_idx, video_id in enumerate(sample["video_id"]): if isinstance(video_id, tuple) and len(video_id) == 3: # a sharded video_id. video_id = video_id[0] video_ids.append(video_id) assert len(video_ids) == len(hidden_states) self.retriver.add( hidden_states.astype("float32"), video_ids ) class DistributedVectorPool(VectorPool): """ support sync of multiple gpus/nodes. """ def __init__(self, config): super().__init__(config) self.out_dir = os.path.join( config.fairseq.checkpoint.save_dir, "retri") os.makedirs(self.out_dir, exist_ok=True) self.hidden_states = [] self.video_ids = [] def build_retriver( self, retriever_cls=None, hidden_size=None, centroids=4096, db_type="flatl2", examples_per_cent_to_train=48 ): if retriever_cls is None: retriever_cls = self.retriever_cls if hidden_size is None: hidden_size = self.hidden_size """merge results from multiple gpus and return a retriver..""" if torch.distributed.is_initialized(): self.save() # sync saving. torch.distributed.barrier() world_size = torch.distributed.get_world_size() else: world_size = 1 self.retriver = retriever_cls( hidden_size, centroids, db_type, examples_per_cent_to_train) # each gpu process has its own retriever. 
class DistributedVectorPool(VectorPool):
    """
    support sync of multiple gpus/nodes.

    Every process buffers hidden states and video ids in memory, dumps them
    to ``<save_dir>/retri`` as one shard per rank (:meth:`save`) and
    :meth:`build_retriver` reads all shards back, so each process ends up
    with its own retriever that contains the vectors of every rank.
    """

    def __init__(self, config):
        super().__init__(config)
        self.out_dir = os.path.join(
            config.fairseq.checkpoint.save_dir,
            "retri")
        os.makedirs(self.out_dir, exist_ok=True)
        # in-memory buffers; `DistributedVideoVectorPool.__call__` fills them.
        self.hidden_states = []
        self.video_ids = []

    def _shard_paths(self, rank):
        """Return `(hidden_state_path, video_id_path)` of the shard of `rank`."""
        suffix = str(rank)
        return (
            os.path.join(self.out_dir, "hidden_state" + suffix + ".npy"),
            os.path.join(self.out_dir, "video_id" + suffix + ".pkl"),
        )

    def build_retriver(
        self,
        retriever_cls=None,
        hidden_size=None,
        centroids=4096,
        db_type="flatl2",
        examples_per_cent_to_train=48
    ):
        """merge results from multiple gpus and return a retriver.

        Args:
            retriever_cls: retriever class; defaults to `self.retriever_cls`.
            hidden_size: vector dimension; defaults to `self.hidden_size`.
            centroids, db_type, examples_per_cent_to_train: forwarded
                unchanged to the retriever constructor.

        Returns:
            the freshly built retriever (also stored as `self.retriver`).
        """
        if retriever_cls is None:
            retriever_cls = self.retriever_cls
        if hidden_size is None:
            hidden_size = self.hidden_size

        if torch.distributed.is_initialized():
            self.save()
            # sync saving.
            torch.distributed.barrier()
            world_size = torch.distributed.get_world_size()
        else:
            world_size = 1
            # Single process: flush whatever has been buffered, otherwise
            # `load(0)` below reads a missing or stale shard. An empty buffer
            # is left alone so a shard saved earlier can still be re-loaded.
            if self.hidden_states:
                self.save()

        self.retriver = retriever_cls(
            hidden_size, centroids, db_type, examples_per_cent_to_train)
        # each gpu process has its own retriever.
        for local_rank in range(world_size):
            if get_local_rank() == 0:
                print("load local_rank", local_rank)
            hidden_states, video_ids = self.load(local_rank)
            hidden_states = hidden_states.astype("float32")
            self.retriver.add(hidden_states, video_ids)
        return self.retriver

    def load(self, local_rank):
        """Read back the shard written by rank `local_rank`.

        Returns:
            `(hidden_states, video_ids)`: the numpy array and the unpickled
            list stored by :meth:`save`.
        """
        hidden_path, id_path = self._shard_paths(local_rank)
        hidden_states = np.load(hidden_path)
        # NOTE(review): pickle must only be used on shards written by this
        # job; never point `out_dir` at untrusted files.
        with open(id_path, "rb") as fr:
            video_ids = pickle.load(fr)
        return hidden_states, video_ids

    def save(self):
        """Dump the buffered hidden states / video ids of this process."""
        hidden_states = np.vstack(self.hidden_states)
        assert len(hidden_states) == len(self.video_ids), "{}, {}".format(
            len(hidden_states),
            len(self.video_ids)
        )
        local_rank = torch.distributed.get_rank() \
            if torch.distributed.is_initialized() else 0

        hidden_path, id_path = self._shard_paths(local_rank)
        np.save(hidden_path, hidden_states)

        with open(id_path, "wb") as fw:
            pickle.dump(
                self.video_ids,
                fw,
                protocol=pickle.HIGHEST_PROTOCOL
            )


class DistributedVideoVectorPool(DistributedVectorPool):
    """
    average clips of a video as video representation.
    """

    def __call__(self, sample, subsampling, **kwargs):
        """Buffer one video-level vector per video of `sample`.

        The pooled video and text vectors are averaged, regrouped into
        `subsampling` clips per video and mean-pooled over the clip axis.
        """
        hidden_states = (
            sample["pooled_video"] + sample["pooled_text"]) / 2.
        hidden_states = hidden_states.view(
            -1, subsampling, hidden_states.size(-1))
        hidden_states = torch.mean(hidden_states, dim=1)
        hidden_states = hidden_states.cpu().detach().numpy()

        video_ids = []
        for video_id in sample["video_id"]:
            if isinstance(video_id, tuple) and len(video_id) == 3:
                # a sharded video_id.
                video_id = video_id[0]
            video_ids.append(video_id)
        assert len(video_ids) == len(hidden_states)

        self.hidden_states.append(hidden_states)
        self.video_ids.extend(video_ids)
video_id = video_id[0] video_ids.append(video_id) assert len(video_ids) == len(hidden_states) self.hidden_states.append(hidden_states) self.video_ids.extend(video_ids) # ------------ the following are deprecated -------------- class TextClipVectorPool(VectorPool): def __init__(self, config): from transformers import AutoConfig hidden_size = AutoConfig.from_pretrained( config.dataset.bert_name).hidden_size retriever_cls = getattr(retri, config.retriever_cls) self.build_retriver(retriever_cls, hidden_size) def __call__(self, sample, **kwargs): clip_meta = sample["clip_meta"].cpu() assert torch.all(torch.le(clip_meta[:, 4], clip_meta[:, 5])) text_meta = [tuple(item.tolist()) for item in clip_meta[:, 3:]] if hasattr(self, "retriver"): # build_retriver is called. self.retriver.add( sample["pooled_text"].cpu().numpy().astype("float32"), text_meta ) else: raise NotImplementedError class MMClipVectorPool(VectorPool): """ Multimodal Clip-level vector pool. """ def __init__(self, out_dir): """use hidden_states to store `(video, text)`.""" """use video_ids to store `(video_id, start, end)`.""" super().__init__(out_dir) def __call__(self, sample, **kwargs): pooled_video = sample["pooled_video"].cpu().unsqueeze(1).numpy() pooled_text = sample["pooled_text"].cpu().unsqueeze(1).numpy() self.hidden_states.append( np.concatenate([pooled_video, pooled_text], axis=1) ) video_starts = sample["video_start"].cpu() video_ends = sample["video_end"].cpu() assert torch.all(torch.le(video_starts, video_ends)) text_starts = sample["text_start"].cpu() text_ends = sample["text_end"].cpu() assert torch.all(torch.le(text_starts, text_ends)) subsample_size = sample["pooled_video"].size(0) // len(sample["video_id"]) video_ids = [video_id for video_id in sample["video_id"] for _ in range(subsample_size) ] for video_id, video_start, video_end, text_start, text_end in zip( video_ids, video_starts, video_ends, text_starts, text_ends): self.video_ids.append(( video_id, (int(video_start), 
int(video_end)), (int(text_start), int(text_end)) )) ================================================ FILE: examples/MMPT/mmpt/processors/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .processor import * from .how2processor import * from .how2retriprocessor import * from .dsprocessor import * try: from .rawvideoprocessor import * from .codecprocessor import * from .webvidprocessor import * from .expprocessor import * from .exphow2processor import * from .exphow2retriprocessor import * from .expcodecprocessor import * from .expfeatureencoder import * from .expdsprocessor import * except ImportError: pass ================================================ FILE: examples/MMPT/mmpt/processors/dedupprocessor.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import random import json import pickle from tqdm import tqdm import os import numpy as np class CaptionDedupProcessor(object): """remove overlapping of caption sentences(clip). 
class CaptionDedupProcessor(object):
    """remove overlapping of caption sentences(clip).

    Some statistics:
    caption:
    {'t_clip_len': 246.6448431320854,
    'video_len': 281.09174795676245,
    'clip_tps': 0.8841283727427481,
    'video_tps': 0.7821156477732097,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.196580003006861,
    'num_clip': 77.15897706301081}

    raw_caption:
    {'t_clip_len': 238.95908778424115,
    'video_len': 267.5914859862507,
    'clip_tps': 2.4941363624267963,
    'video_tps': 2.258989769647173,
    'min_clip_len': 0.0,
    'max_clip_len': 398.3,
    'mean_clip_len': 3.0537954186814265,
    'num_clip': 78.24986779481756}
    """

    def __init__(self, pkl_file):
        """Load the `video_id -> json caption string` mapping from `pkl_file`."""
        # NOTE(review): pickle executes code on load; only open trusted files.
        with open(pkl_file, "rb") as fd:
            self.data = pickle.load(fd)
        self.stat = {
            "t_clip_len": [],
            "video_len": [],
            "clip_tps": [],
            "video_tps": [],
            "clip_len": [],
        }

    def __call__(self):
        """Dedup every caption in place, then print statistics."""
        for idx, video_id in enumerate(tqdm(self.data)):
            caption = json.loads(self.data[video_id])
            caption = self._dedup(caption)
            if idx < 4096:
                # for the first 4096 examples, compute the statistics.
                self.save_stat(video_id, caption)
            self.data[video_id] = json.dumps(caption)
        self.print_stat()

    def single(self, video_id):
        """Debug helper: print the clips of one video before/after dedup."""
        caption = json.loads(self.data[video_id])
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)
        print("@" * 100)
        caption = self._dedup(caption)
        for start, end, text in zip(
            caption["start"], caption["end"], caption["text"]
        ):
            print(start, end, text)
        print("#" * 100)
        self.save_stat(video_id, caption)
        self.print_stat()

    def finalize(self, tgt_fn):
        """Pickle the (deduplicated) captions to `tgt_fn`."""
        with open(tgt_fn, "wb") as fw:
            pickle.dump(self.data, fw, pickle.HIGHEST_PROTOCOL)

    def save_stat(self, video_id, caption):
        """Accumulate length / token-rate statistics of one video.

        Videos without a feature file, or whose clip or video length is
        zero, are skipped: no meaningful rate exists for them (previously
        this raised `UnboundLocalError` / `ZeroDivisionError`).
        """
        video_fn = os.path.join(
            "data/feat/feat_how2_s3d", video_id + ".npy"
        )
        if not os.path.isfile(video_fn):
            return
        # Only the .npy header is parsed; the array itself is never loaded.
        with open(video_fn, "rb") as fr:
            version = np.lib.format.read_magic(fr)
            shape, fortran, dtype = np.lib.format._read_array_header(
                fr, version)
        video_len = shape[0]

        clip_lens = []
        t_clip_len = 0.0
        t_tokens = 0
        for idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            if idx > 0:
                # do not count the part overlapping the previous clip twice.
                clip_len = end - max(caption["end"][idx - 1], start)
            else:
                clip_len = end - start
            clip_lens.append(clip_len)
            t_clip_len += clip_len
            t_tokens += len(text.split(" "))

        if not t_clip_len or not video_len:
            return

        self.stat["clip_len"].extend(clip_lens)
        self.stat["t_clip_len"].append(t_clip_len)
        self.stat["video_len"].append(video_len)
        self.stat["clip_tps"].append(t_tokens / t_clip_len)
        self.stat["video_tps"].append(t_tokens / video_len)

    def print_stat(self):
        """Print the aggregated statistics (an empty dict if none exist)."""
        if not self.stat["clip_len"] or not self.stat["video_tps"]:
            print({})
            return
        result = {
            "t_clip_len": np.mean(self.stat["t_clip_len"]),
            "video_len": np.mean(self.stat["video_len"]),
            "clip_tps": np.mean(self.stat["clip_tps"]),
            "video_tps": np.mean(self.stat["video_tps"]),
            "min_clip_len": min(self.stat["clip_len"]),
            "max_clip_len": max(self.stat["clip_len"]),
            "mean_clip_len": np.mean(self.stat["clip_len"]),
            "num_clip": len(self.stat["clip_len"]) / len(self.stat["video_tps"]),
        }
        print(result)

    def _dedup(self, caption):
        """Merge clips whose text overlaps the text of the previous clip.

        Args:
            caption: dict with parallel lists `start`, `end` and `text`.

        Returns:
            a dict of the same layout with the overlaps removed. Clips whose
            text is not a string or is blank are dropped; a caption without
            any usable clip yields three empty lists.
        """
        def random_merge(end_idx, start, end, text, starts, ends, texts):
            # `text[:end_idx]` duplicates the tail of `texts[-1]`; assign the
            # shared part to the previous or the current clip at random.
            if random.random() > 0.5:
                # overlapped part goes to the end of previous.
                ends[-1] = max(ends[-1], start)  # ?
                rest_text = text[end_idx:].strip()
                if rest_text:
                    starts.append(max(ends[-1], start))
                    ends.append(max(end, starts[-1]))
                    texts.append(rest_text)
            else:
                # goes to the beginning of the current.
                # strip the previous.
                left_text = texts[-1][:-end_idx].strip()
                if left_text:
                    ends[-1] = min(ends[-1], start)
                    texts[-1] = left_text
                else:
                    # nothing is left of the previous clip: drop it.
                    starts.pop(-1)
                    ends.pop(-1)
                    texts.pop(-1)
                starts.append(start)
                ends.append(end)
                texts.append(text)

        starts, ends, texts = [], [], []
        for raw_idx, (start, end, text) in enumerate(
            zip(caption["start"], caption["end"], caption["text"])
        ):
            if not isinstance(text, str):
                continue
            text = text.replace("\n", " ").strip()
            if len(text) == 0:
                continue

            if not texts:
                # the first usable clip is taken as is.
                starts.append(start)
                ends.append(end)
                texts.append(text)
                continue

            if texts[-1].endswith(text):
                # subset of prev caption -> merge
                ends[-1] = max(ends[-1], end)
            elif text.startswith(texts[-1]):
                # superset of prev caption -> merge
                texts[-1] = text
                starts[-1] = min(starts[-1], start)
                ends[-1] = max(ends[-1], end)
            else:
                # overlapping or non-overlapping: the shortest prefix of
                # `text` that is also a suffix of the previous text decides.
                for end_idx in range(1, len(text) + 1):
                    if texts[-1].endswith(text[:end_idx]):
                        random_merge(end_idx, start, end, text,
                                     starts, ends, texts)
                        break
                else:
                    starts.append(start)
                    ends.append(end)
                    texts.append(text)

            assert (ends[-1] + 0.001) >= starts[-1] and len(
                texts[-1]
            ) > 0, "{} {} {} <- {} {} {}, {} {} {}".format(
                str(starts[-1]),
                str(ends[-1]),
                texts[-1],
                caption["start"][raw_idx - 1],
                caption["end"][raw_idx - 1],
                caption["text"][raw_idx - 1],
                str(start),
                str(end),
                text,
            )

        return {"start": starts, "end": ends, "text": texts}
for end_idx in range(1, len(text) + 1): if texts[-1].endswith(text[:end_idx]): random_merge(end_idx, start, end, text, starts, ends, texts) break else: starts.append(start) ends.append(end) texts.append(text) assert (ends[-1] + 0.001) >= starts[-1] and len( texts[-1] ) > 0, "{} {} {} <- {} {} {}, {} {} {}".format( str(starts[-1]), str(ends[-1]), texts[-1], caption["start"][clip_idx - 1], caption["end"][clip_idx - 1], caption["text"][clip_idx - 1], str(start), str(end), text, ) return {"start": starts, "end": ends, "text": texts} if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="dedup how2 caption") parser.add_argument('--how2dir', default="data/how2") args = parser.parse_args() raw_caption_json = os.path.join(args.how2dir, "raw_caption.json") raw_caption_pickle = os.path.join(args.how2dir, "raw_caption.pkl") raw_caption_dedup_pickle = os.path.join(args.how2dir, "raw_caption_dedup.pkl") def convert_to_pickle(src_fn, tgt_fn): with open(src_fn) as fd: captions = json.load(fd) for video_id in captions: captions[video_id] = json.dumps(captions[video_id]) with open(tgt_fn, "wb") as fw: pickle.dump(captions, fw, pickle.HIGHEST_PROTOCOL) if not os.path.isfile(raw_caption_pickle): convert_to_pickle(raw_caption_json, raw_caption_pickle) deduper = CaptionDedupProcessor(raw_caption_pickle) deduper() deduper.finalize(raw_caption_dedup_pickle) """ # demo deduper = CaptionDedupProcessor("data/how2/raw_caption.pkl") deduper.single("HfIeQ9pzL5U") """ ================================================ FILE: examples/MMPT/mmpt/processors/dsprocessor.py ================================================ # Copyright (c) Facebook, Inc. All Rights Reserved """ Processors for all downstream (ds) tasks. 
""" import json import os import pickle import random import math import numpy as np import torch from collections import defaultdict from .processor import ( MetaProcessor, VideoProcessor, TextProcessor, Aligner, MMAttentionMask2DProcessor, ) from .how2processor import TextGenerationProcessor # ------------- A General Aligner for all downstream tasks----------------- class DSAligner(Aligner): """ Downstream (DS) aligner shared by all datasets. """ def __call__(self, video_id, video_feature, text_feature, wps=0.7): # random sample a starting sec for video. video_start = 0 video_end = min(len(video_feature), self.max_video_len) # the whole sequence is a single clip. video_clips = {"start": [video_start], "end": [video_end]} text_feature = { "cap": [text_feature], "start": [video_start], "end": [len(text_feature) / wps], } text_clip_indexs = [0] vfeats, vmasks = self._build_video_seq( video_feature, video_clips ) caps, cmasks = self._build_text_seq( text_feature, text_clip_indexs ) return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, "vmasks": vmasks, "video_id": video_id, } class NLGTextProcessor(TextProcessor): """ Also return the original text as ref. 
""" def __call__(self, text_id): return super().__call__(text_id), text_id class DSNLGAligner(DSAligner): """extend with the capability of 2d mask for generation.""" def __init__(self, config): super().__init__(config) self.attnmasker = MMAttentionMask2DProcessor() from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained( self.bert_name, use_fast=self.use_fast, bos_token="[CLS]", eos_token="[SEP]" ) self.tokenizer = tokenizer self.bos_token_id = tokenizer.bos_token_id self.eos_token_id = tokenizer.eos_token_id self.textgen = TextGenerationProcessor(tokenizer) def __call__(self, video_id, video_feature, text_feature): output = super().__call__(video_id, video_feature, text_feature[0]) if self.split == "test": # output.update({"ref": text_feature[1]}) output.update({"ref": self.tokenizer.decode( output["caps"], skip_special_tokens=True)}) text_label = output["caps"] cmasks = torch.BoolTensor([1] * text_label.size(0)) caps = torch.LongTensor([ self.cls_token_id, self.sep_token_id, self.bos_token_id]) else: caps, text_label = self.textgen(output["caps"]) cmasks = output["cmasks"] attention_mask = self.attnmasker( output["vmasks"], cmasks, "textgen") output.update({ "caps": caps, "cmasks": cmasks, "text_label": text_label, "attention_mask": attention_mask, }) return output # -------------------- MSRVTT ------------------------ class MSRVTTMetaProcessor(MetaProcessor): """MSRVTT dataset. reference: `howto100m/msrvtt_dataloader.py` """ def __init__(self, config): super().__init__(config) import pandas as pd data = pd.read_csv(self._get_split_path(config)) # TODO: add a text1ka flag. if config.split == "train" \ and config.full_test_path is not None \ and config.jsfusion_path is not None: # add testing videos from full_test_path not used by jfusion. 
class MSRVTTMetaProcessor(MetaProcessor):
    """MSRVTT dataset.
    reference: `howto100m/msrvtt_dataloader.py`
    """

    def __init__(self, config):
        super().__init__(config)
        import pandas as pd
        data = pd.read_csv(self._get_split_path(config))
        # TODO: add a text1ka flag.
        if config.split == "train" \
                and config.full_test_path is not None \
                and config.jsfusion_path is not None:
            # add testing videos from full_test_path not used by jfusion.
            additional_data = pd.read_csv(config.full_test_path)
            jsfusion_data = pd.read_csv(config.jsfusion_path)
            jsfusion_ids = set(jsfusion_data["video_id"].values)
            extra_ids = [
                video_id for video_id in additional_data["video_id"]
                if video_id not in jsfusion_ids
            ]
            if extra_ids:
                # `DataFrame.append` no longer exists in pandas >= 2.0;
                # one `concat` also avoids copying the frame once per row.
                data = pd.concat(
                    [data, pd.DataFrame({"video_id": extra_ids})],
                    ignore_index=True)
        if config.dup is not None and config.split == "train" \
                and config.dup > 1:
            # repeat the training set so that it holds `dup` copies.
            data = pd.concat([data] * config.dup, ignore_index=True)
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """slightly modify with if condition to combine train/test.

        Returns `(video_id, sentence)`; when the csv has no `sentence`
        column the video id itself is returned as the text id.
        """
        vid = self.data["video_id"].values[idx]
        if "sentence" in self.data:  # for testing.
            sentence = self.data["sentence"].values[idx]
        else:  # for training.
            sentence = vid
        return vid, sentence


class MSRVTTTextProcessor(TextProcessor):
    """MSRVTT dataset.
    reference: `msrvtt_dataloader.py` `MSRVTT_TrainDataLoader`.
    TODO (huxu): add max_words.
    """

    def __init__(self, config):
        super().__init__(config)
        # video_id -> list of captions; stays None unless a json file is
        # configured for the train split.
        self.sentences = None
        if config.json_path is not None and config.split == "train":
            with open(config.json_path) as fd:
                self.data = json.load(fd)
            self.sentences = defaultdict(list)
            for s in self.data["sentences"]:
                self.sentences[s["video_id"]].append(s["caption"])

    def _select_sentence(self, text_id):
        """Sample a caption of `text_id`, or return `text_id` itself when no
        caption table has been loaded."""
        if self.sentences is None:
            return text_id
        rind = random.randint(0, len(self.sentences[text_id]) - 1)
        return self.sentences[text_id][rind]

    def __call__(self, text_id):
        """Return the token ids of a caption for `text_id`."""
        sentence = self._select_sentence(text_id)
        caption = self.tokenizer(sentence, add_special_tokens=False)
        return caption["input_ids"]


class MSRVTTNLGTextProcessor(MSRVTTTextProcessor):
    """TODO: change dsaligner and merge to avoid any NLG text processor."""

    def __call__(self, text_id):
        """Return `(token ids, raw sentence)` of a caption for `text_id`."""
        sentence = self._select_sentence(text_id)
        caption = self.tokenizer(sentence, add_special_tokens=False)
        return caption["input_ids"], sentence


class MSRVTTQAMetaProcessor(MetaProcessor):
    """MSRVTT-QA: retrieval-based multi-choice QA from JSFusion dataset.
    For simplicity, we use the train retrieval model.
    reference: `https://github.com/yj-yu/lsmdc`
    """

    def __init__(self, config):
        super().__init__(config)
        import pandas as pd
        csv_data = pd.read_csv(self._get_split_path(config), sep="\t")
        data = []
        for video_id, a1, a2, a3, a4, a5, answer in zip(
                csv_data["vid_key"].values,
                csv_data["a1"].values,
                csv_data["a2"].values,
                csv_data["a3"].values,
                csv_data["a4"].values,
                csv_data["a5"].values,
                csv_data["answer"].values):
            # the csv keys use the `msr` prefix where the ids stored here
            # use `video`.
            video_id = video_id.replace("msr", "video")
            data.append((video_id, (answer, [a1, a2, a3, a4, a5])))
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]
dataset. For simplicity, we use the train retrieval model. reference: `https://github.com/yj-yu/lsmdc` """ def __init__(self, config): super().__init__(config) import pandas as pd csv_data = pd.read_csv(self._get_split_path(config), sep="\t") data = [] for video_id, a1, a2, a3, a4, a5, answer in zip( csv_data["vid_key"].values, csv_data["a1"].values, csv_data["a2"].values, csv_data["a3"].values, csv_data["a4"].values, csv_data["a5"].values, csv_data["answer"].values): video_id = video_id.replace("msr", "video") data.append((video_id, (answer, [a1, a2, a3, a4, a5]))) self.data = data def __len__(self): return len(self.data) def __getitem__(self, idx): return self.data[idx] class MSRVTTQATextProcessor(TextProcessor): """MSRVTT-QA dataset. text_ans is of format `(answer, [a1, a2, a3, a4, a5])`. """ def __call__(self, text_ans): for ans_idx, ans in enumerate(text_ans[1]): if isinstance(ans, str): text_ans[1][ans_idx] = self.tokenizer(ans, add_special_tokens=False)["input_ids"] return text_ans class MSRVTTQAAligner(DSAligner): """MSRVTT dataset. similar to sample in how2. we call __call__ multiple times. """ def __call__(self, video_id, video_feature, text_feature, wps=0.7): caps = [] cmasks = [] answer = text_feature[0] for ans_idx, _text_feature in enumerate(text_feature[1]): output = super().__call__( video_id, video_feature, _text_feature, wps) caps.append(output["caps"]) cmasks.append(output["cmasks"]) output.update({ "caps": torch.stack(caps), "cmasks": torch.stack(cmasks), "answers": torch.LongTensor([answer]), }) return output # -------------------- Youcook ----------------------- class YoucookMetaProcessor(MetaProcessor): """Youcook dataset. reference: `howto100m/youcook_dataloader.py` note that the data can be different as the (1) some videos already in Howto100m are removed. (2) stop words are removed from caption TODO (huxu): make a flag to load the original caption. (see youcookii_annotations_trainval.json). 
The max_video_len can be 264 and text can be 64 tokens. In reality we may not need that long. see projects/task/youcook.yaml """ def __init__(self, config): super().__init__(config) vfeat_dir = config.vfeat_dir print(self._get_split_path(config)) with open(self._get_split_path(config), "rb") as fd: data = pickle.load(fd) all_valid_video_ids = set( [os.path.splitext(fn)[0] for fn in os.listdir(vfeat_dir)] ) recs = [] video_ids = set() valid_video_ids = set() for rec in data: # filter videos not available. udl_idx = rec["id"].rindex("_") video_id = rec["id"][:udl_idx] video_ids.add(video_id) if video_id in all_valid_video_ids: valid_video_ids.add(video_id) recs.append(rec) print("total video_ids in .pkl", len(video_ids)) print("valid video_ids in .pkl", len(valid_video_ids)) print("please verify {train,val}_list.txt") data = recs self.data = data with open(config.trainval_annotation) as fd: self.youcook_annotation = json.load(fd)["database"] if config.use_annotation_text is True: print("using text in annotation.") self.use_annotation_caption = True else: self.use_annotation_caption = False def __getitem__(self, idx): def _get_video_and_caption(rec): vid = rec["id"] udl_idx = vid.rindex("_") video_id, clip_id = vid[:udl_idx], int(vid[udl_idx + 1:]) clip = self.youcook_annotation[video_id]["annotations"][clip_id] start, end = clip["segment"] if self.use_annotation_caption: caption = clip["sentence"] else: caption = rec["caption"] return (video_id, start, end), caption rec = self.data[idx] video_info, text_info = _get_video_and_caption(rec) return video_info, text_info class YoucookVideoProcessor(VideoProcessor): """video_fn is a tuple of (video_id, start, end) now.""" def __call__(self, video_fn): video_id, start, end = video_fn feat = np.load(os.path.join(self.vfeat_dir, video_id + ".npy")) return feat[start:end] class YoucookNLGMetaProcessor(MetaProcessor): """NLG uses the original split: `train_list.txt` and `val_list.txt` """ def __init__(self, config): 
super().__init__(config) vfeat_dir = config.vfeat_dir print(self._get_split_path(config)) with open(self._get_split_path(config)) as fd: video_ids = [ line.strip().split("/")[1] for line in fd.readlines()] print("total video_ids in train/val_list.txt", len(video_ids)) all_valid_video_ids = set( [os.path.splitext(fn)[0] for fn in os.listdir(vfeat_dir)] ) video_ids = [ video_id for video_id in video_ids if video_id in all_valid_video_ids] print("valid video_ids in train/val_list.txt", len(video_ids)) with open(config.trainval_annotation) as fd: self.youcook_annotation = json.load(fd)["database"] data = [] for video_id in video_ids: for clip in self.youcook_annotation[video_id]["annotations"]: start, end = clip["segment"] caption = clip["sentence"] data.append(((video_id, start, end), caption)) self.data = data def __getitem__(self, idx): return self.data[idx] # --------------------- CrossTask ------------------------- class CrossTaskMetaProcessor(MetaProcessor): def __init__(self, config): super().__init__(config) np.random.seed(0) # deterministic random split. task_vids = self._get_vids( config.train_csv_path, config.vfeat_dir, config.annotation_path) val_vids = self._get_vids( config.val_csv_path, config.vfeat_dir, config.annotation_path) # filter out those task and vids appear in val_vids. task_vids = { task: [ vid for vid in vids if task not in val_vids or vid not in val_vids[task]] for task, vids in task_vids.items()} primary_info = self._read_task_info(config.primary_path) test_tasks = set(primary_info['steps'].keys()) # if args.use_related: related_info = self._read_task_info(config.related_path) task_steps = {**primary_info['steps'], **related_info['steps']} n_steps = {**primary_info['n_steps'], **related_info['n_steps']} # else: # task_steps = primary_info['steps'] # n_steps = primary_info['n_steps'] all_tasks = set(n_steps.keys()) # filter and keep task in primary or related. 
class CrossTaskMetaProcessor(MetaProcessor):
    """Meta processor of CrossTask: builds the `(task, vid)` list of a split.

    Every item is returned twice, as the video info and as the text info,
    each being `(task, vid, steps, n_steps)`.
    """

    def __init__(self, config):
        super().__init__(config)
        np.random.seed(0)  # deterministic random split.
        task_vids = self._get_vids(
            config.train_csv_path,
            config.vfeat_dir,
            config.annotation_path)

        val_vids = self._get_vids(
            config.val_csv_path,
            config.vfeat_dir,
            config.annotation_path)

        # filter out those task and vids appear in val_vids.
        task_vids = {
            task: [
                vid for vid in vids
                if task not in val_vids or vid not in val_vids[task]]
            for task, vids in task_vids.items()}

        primary_info = self._read_task_info(config.primary_path)
        test_tasks = set(primary_info['steps'].keys())

        # if args.use_related:
        related_info = self._read_task_info(config.related_path)
        task_steps = {**primary_info['steps'], **related_info['steps']}
        n_steps = {**primary_info['n_steps'], **related_info['n_steps']}
        # else:
        #     task_steps = primary_info['steps']
        #     n_steps = primary_info['n_steps']
        all_tasks = set(n_steps.keys())
        # filter and keep task in primary or related.
        task_vids = {
            task: vids for task, vids in task_vids.items()
            if task in all_tasks}
        # vocab-by-step matrix (A) and vocab (M)
        # (huxu): we do not use BoW.
        # A, M = self._get_A(task_steps, share="words")

        train_vids, test_vids = self._random_split(
            task_vids, test_tasks, config.n_train)
        print("train_num_videos", sum(len(vids) for vids in train_vids.values()))
        print("test_num_videos", sum(len(vids) for vids in test_vids.values()))
        # added by huxu to automatically determine the split.
        split_map = {
            "train": train_vids,
            "valid": test_vids,
            "test": test_vids
        }
        task_vids = split_map[config.split]

        self.vids = []
        for task, vids in task_vids.items():
            self.vids.extend([(task, vid) for vid in vids])
        self.task_steps = task_steps
        self.n_steps = n_steps

    def __getitem__(self, idx):
        task, vid = self.vids[idx]
        n_steps = self.n_steps[task]
        steps = self.task_steps[task]
        assert len(steps) == n_steps
        return (task, vid, steps, n_steps), (task, vid, steps, n_steps)

    def __len__(self):
        return len(self.vids)

    def _random_split(self, task_vids, test_tasks, n_train):
        """Split the videos of every task into train and test.

        Tasks in `test_tasks` with more than `n_train` videos contribute
        `n_train` randomly chosen videos to train and the rest to test;
        all videos of any other task go to train.
        """
        train_vids = {}
        test_vids = {}
        for task, vids in task_vids.items():
            if task in test_tasks and len(vids) > n_train:
                train_vids[task] = np.random.choice(
                    vids, n_train, replace=False).tolist()
                chosen = set(train_vids[task])
                test_vids[task] = [
                    vid for vid in vids if vid not in chosen]
            else:
                train_vids[task] = vids
        return train_vids, test_vids

    def _get_vids(self, path, vfeat_dir, annotation_path):
        """refactored from
        https://github.com/DmZhukov/CrossTask/blob/master/data.py
        changes: add `vfeat_dir` to check if the video is available.
        add `annotation_path` to check if the video is available.

        Returns a dict `task -> [vid, ...]` built from the `task,vid,url`
        rows of `path`; blank rows are ignored.
        """
        task_vids = {}
        with open(path, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                task, vid, url = line.split(',')
                # double check the video is available.
                if not os.path.exists(
                        os.path.join(vfeat_dir, vid + ".npy")):
                    continue
                # double check the annotation is available.
                if not os.path.exists(os.path.join(
                        annotation_path,
                        task + "_" + vid + ".csv")):
                    continue
                if task not in task_vids:
                    task_vids[task] = []
                task_vids[task].append(vid)
        return task_vids

    def _read_task_info(self, path):
        """Parse a task description file.

        Each record is five lines (task id, title, url, number of steps,
        comma separated steps) followed by one separator line.

        Returns a dict with the keys `title`, `url`, `n_steps` and `steps`,
        each mapping task id -> value.
        """
        titles = {}
        urls = {}
        n_steps = {}
        steps = {}
        with open(path, 'r') as f:
            idx = f.readline()
            while idx != '':
                idx = idx.strip()
                titles[idx] = f.readline().strip()
                urls[idx] = f.readline().strip()
                n_steps[idx] = int(f.readline().strip())
                steps[idx] = f.readline().strip().split(',')
                # skip the separator line; the default keeps a file whose
                # last record has no trailing separator from raising
                # StopIteration.
                next(f, None)
                idx = f.readline()
        return {
            'title': titles,
            'url': urls,
            'n_steps': n_steps,
            'steps': steps
        }

    def _get_A(self, task_steps, share="words"):
        """Step-to-component matrices; deliberately disabled.

        Always raises `ValueError`. The bag-of-words code that used to
        follow the `raise` could never run and has been removed.
        """
        raise ValueError("running get_A is not allowed for BERT.")
vocab = [] for task, steps in task_step_comps.items(): for step in steps: vocab.extend(step) vocab = {comp: m for m, comp in enumerate(set(vocab))} M = len(vocab) A = {} for task, steps in task_step_comps.items(): K = len(steps) a = torch.zeros(M, K) for k, step in enumerate(steps): a[[vocab[comp] for comp in step], k] = 1 a /= a.sum(dim=0) A[task] = a return A, M class CrossTaskVideoProcessor(VideoProcessor): def __call__(self, video_fn): task, vid, steps, n_steps = video_fn video_fn = os.path.join(self.vfeat_dir, vid + ".npy") feat = np.load(video_fn) return feat class CrossTaskTextProcessor(TextProcessor): def __call__(self, text_id): task, vid, steps, n_steps = text_id step_ids = [] for step_str in steps: step_ids.append( self.tokenizer(step_str, add_special_tokens=False)["input_ids"] ) return step_ids class CrossTaskAligner(Aligner): """ TODO: it's not clear yet the formulation of the task; finish this later. """ def __init__(self, config): super().__init__(config) self.annotation_path = config.annotation_path self.sliding_window = config.sliding_window self.sliding_window_size = config.sliding_window_size def __call__(self, video_id, video_feature, text_feature): task, vid, steps, n_steps = video_id annot_path = os.path.join( self.annotation_path, task + '_' + vid + '.csv') video_len = len(video_feature) labels = torch.from_numpy(self._read_assignment( video_len, n_steps, annot_path)).float() vfeats, vmasks, targets = [], [], [] # sliding window on video features and targets. 
for window_start in range(0, video_len, self.sliding_window): video_start = 0 video_end = min(video_len - window_start, self.sliding_window_size) video_clip = {"start": [video_start], "end": [video_end]} vfeat, vmask = self._build_video_seq( video_feature[window_start: window_start + video_end], video_clip ) target = labels[window_start: window_start + video_end] assert len(vfeat) >= len(target), "{},{}".format(len(vfeat), len(target)) # TODO: randomly drop all zero targets for training ? # if self.split == "train" and target.sum() == 0: # continue vfeats.append(vfeat) vmasks.append(vmask) targets.append(target) if (video_len - window_start) <= self.sliding_window_size: break vfeats = torch.stack(vfeats) vmasks = torch.stack(vmasks) targets = torch.cat(targets, dim=0) caps, cmasks = [], [] for step in text_feature: step_text_feature = {"start": [0], "end": [1], "cap": [step]} step_text_clip_index = [0] cap, cmask = self._build_text_seq( step_text_feature, step_text_clip_index ) caps.append(cap) cmasks.append(cmask) caps = torch.stack(caps) cmasks = torch.stack(cmasks) return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, # X for original code. "vmasks": vmasks, "targets": targets, "video_id": vid, "task": task, "video_len": video_len # for later checking. } def _read_assignment(self, T, K, path): """ refactored from https://github.com/DmZhukov/CrossTask/blob/master/data.py Howto interpret contraints on loss that is going to be minimized: lambd is a big number; self.lambd * C is a big number for all valid position (csv stores invalids) def forward(self, O, Y, C): return (Y*(self.lambd * C - self.lsm(O))).mean(dim=0).sum() This will load the csv file and fill-in the step col from start to end rows. 
""" Y = np.zeros([T, K], dtype=np.uint8) with open(path, 'r') as f: for line in f: step, start, end = line.strip().split(',') start = int(math.floor(float(start))) end = int(math.ceil(float(end))) step = int(step) - 1 Y[start:end, step] = 1 return Y # --------------------- COIN ------------------------- class MetaTextBinarizer(Aligner): def __call__(self, text_feature): text_feature = { "cap": [text_feature], "start": [0.], "end": [100.], } text_clip_indexs = [0] caps, cmasks = self._build_text_seq( text_feature, text_clip_indexs ) return {"caps": caps, "cmasks": cmasks} class COINActionSegmentationMetaProcessor(MetaProcessor): split_map = { "train": "training", "valid": "testing", "test": "testing", } def __init__(self, config): super().__init__(config) with open(self._get_split_path(config)) as fr: database = json.load(fr)["database"] id2label = {} data = [] # filter the data by split. for video_id, rec in database.items(): # always use testing to determine label_set if rec["subset"] == "testing": for segment in rec["annotation"]: id2label[int(segment["id"])] = segment["label"] # text_labels is used for ZS setting self.text_labels = ["none"] * len(id2label) for label_id in id2label: self.text_labels[label_id-1] = id2label[label_id] id2label[0] = "O" print("num of labels", len(id2label)) for video_id, rec in database.items(): if not os.path.isfile(os.path.join(config.vfeat_dir, video_id + ".npy")): continue if rec["subset"] == COINActionSegmentationMetaProcessor.split_map[self.split]: starts, ends, labels = [], [], [] for segment in rec["annotation"]: start, end = segment["segment"] label = int(segment["id"]) starts.append(start) ends.append(end) labels.append(label) data.append( (video_id, {"start": starts, "end": ends, "label": labels})) self.data = data def meta_text_labels(self, config): from transformers import default_data_collator from ..utils import get_local_rank text_processor = TextProcessor(config) binarizer = MetaTextBinarizer(config) # TODO: add 
class COINActionSegmentationAligner(Aligner):
    """Cut a COIN video into sliding windows with per-position segment labels.

    Besides the windowed tensors, a label vector over the whole video
    (`video_targets`) is returned.
    """

    def __init__(self, config):
        super().__init__(config)
        # Step between consecutive window starts (used as the `range` step).
        self.sliding_window = config.sliding_window
        # Maximum number of video positions taken per window.
        self.sliding_window_size = config.sliding_window_size

    def __call__(self, video_id, video_feature, text_feature):
        """Build windowed video features, masks and label targets.

        Args:
            video_id: identifier passed through unchanged to the output.
            video_feature: sequence of video features; `len()` of it is
                used as the video length.
            text_feature: dict with parallel lists under the keys
                `"start"`, `"end"` and `"label"`, one entry per segment.

        Returns:
            dict with `caps`, `cmasks`, `vfeats`, `vmasks`, `targets`
            (all stacked over windows), plus `video_id`, `video_len`
            and `video_targets`.
        """
        starts, ends, label_ids = text_feature["start"], text_feature["end"], text_feature["label"]
        # sliding window.
        video_len = len(video_feature)

        vfeats, vmasks, targets = [], [], []
        # sliding window on video features and targets.
        for window_start in range(0, video_len, self.sliding_window):
            # Clip boundaries are relative to the sliced window below.
            video_start = 0
            video_end = min(video_len - window_start, self.sliding_window_size)
            video_clip = {"start": [video_start], "end": [video_end]}
            vfeat, vmask = self._build_video_seq(
                video_feature[window_start: window_start + video_end],
                video_clip
            )
            # covers video length only.
            # Start from -100 everywhere, then 0 wherever `vmask` selects.
            # NOTE(review): -100 is presumably the loss ignore index and
            # `vmask` a boolean mask -- confirm against `_build_video_seq`.
            target = torch.full_like(vmask, -100, dtype=torch.long)
            target[vmask] = 0
            for start, end, label_id in zip(starts, ends, label_ids):
                # Only segments overlapping [window_start, window_start + video_end).
                if (window_start < end) and (start < (window_start + video_end)):
                    # Convert absolute times to window-relative offsets,
                    # clamped to the window.
                    start_offset = max(0, math.floor(start) - window_start)
                    end_offset = min(video_end, math.ceil(end) - window_start)
                    target[start_offset:end_offset] = label_id
            vfeats.append(vfeat)
            vmasks.append(vmask)
            targets.append(target)
            # Stop once the remainder of the video fits in this window.
            if (video_len - window_start) <= self.sliding_window_size:
                break

        vfeats = torch.stack(vfeats)
        vmasks = torch.stack(vmasks)
        targets = torch.stack(targets)
        # Labels over the full (un-windowed) video; 0 where no segment applies.
        video_targets = torch.full((video_len,), 0)
        for start, end, label_id in zip(starts, ends, label_ids):
            start_offset = max(0, math.floor(start))
            end_offset = min(video_len, math.ceil(end))
            video_targets[start_offset:end_offset] = label_id

        # One fixed 4-token text sequence per window: cls, sep, pad, sep.
        caps = torch.LongTensor(
            [[self.cls_token_id,
              self.sep_token_id,
              self.pad_token_id,
              self.sep_token_id]],
        ).repeat(vfeats.size(0), 1)
        cmasks = torch.BoolTensor(
            [[0, 1, 0, 1]]  # pad are valid for attention.
        ).repeat(vfeats.size(0), 1)
        return {
            "caps": caps,
            "cmasks": cmasks,
            "vfeats": vfeats,  # X for original code.
            "vmasks": vmasks,
            "targets": targets,
            "video_id": video_id,
            "video_len": video_len,  # for later checking.
            "video_targets": video_targets
        }
class DiDeMoTextProcessor(TextProcessor):
    """Turn a DiDeMo description string into token ids.

    reference: https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/eval.py
    https://github.com/LisaAnne/LocalizingMoments/blob/master/utils/data_processing.py
    """

    def __call__(self, text):
        # Special tokens are not added by the tokenizer call itself.
        encoded = self.tokenizer(text, add_special_tokens=False)
        return encoded["input_ids"]
class ShardedHow2MetaProcessor(How2MetaProcessor):
    """`How2MetaProcessor` that also resolves each video id to its shard."""

    def __init__(self, config):
        super().__init__(config)
        self.split = str(config.split)
        self.vfeat_dir = config.vfeat_dir
        self._init_shard()

    def _init_shard(self):
        """Load `<prefix>_meta.pkl` and index video ids by (shard, position)."""
        # "valid" and "test" both read the "val" meta file.
        prefix_by_split = {"train": "train", "valid": "val", "test": "val"}
        if self.split not in prefix_by_split:
            raise ValueError("unsupported for MetaProcessor:", self.split)
        if self.split == "test":
            print("use how2 val as test.")

        meta_fn = os.path.join(
            self.vfeat_dir, prefix_by_split[self.split] + "_meta.pkl")
        with open(meta_fn, "rb") as fr:
            meta = pickle.load(fr)

        self.video_id_to_shard = {
            video_id: (shard_id, video_idx)
            for shard_id in meta
            for video_idx, video_id in enumerate(meta[shard_id])
        }

    def __getitem__(self, idx):
        """Return `(meta, meta)` with `meta = (video_id, idx, shard_id, shard_idx)`."""
        # The parent yields the same id twice; only one copy is needed.
        _, video_id = super().__getitem__(idx)
        shard_id, shard_idx = self.video_id_to_shard[video_id]
        meta = (video_id, idx, shard_id, shard_idx)
        return meta, meta
""" def __init__(self, config): self.split = str(config.split) self.vfeat_dir = config.vfeat_dir def __call__(self, video_id): _, _, shard_id, video_idx = video_id if self.split == "train": shard = ShardedTensor.load( os.path.join(self.vfeat_dir, "train" + "_" + str(shard_id)), "r" ) elif self.split == "valid": shard = ShardedTensor.load( os.path.join(self.vfeat_dir, "val" + "_" + str(shard_id)), "r" ) elif self.split == "test": shard = ShardedTensor.load( os.path.join(self.vfeat_dir, "val" + "_" + str(shard_id)), "r" ) else: raise ValueError("unknown split", self.split) feat = shard[video_idx] return feat class ShardedTextProcessor(Processor): def __init__(self, config): self.tfeat_dir = str(config.tfeat_dir) self.split = str(config.split) def __call__(self, video_id): _, _, shard_id, shard_idx = video_id if self.split == "train": target_path = self.tfeat_dir + "train" + "_" + str(shard_id) elif self.split == "valid": target_path = self.tfeat_dir + "val" + "_" + str(shard_id) elif self.split == "test": target_path = self.tfeat_dir + "val" + "_" + str(shard_id) else: raise ValueError("unknown split", self.split) startend = ShardedTensor.load( target_path + ".startends", "r")[shard_idx] cap_ids = ShardedTensor.load( target_path + ".caps_ids", "r")[shard_idx] cap = [] for clip_idx in range(len(cap_ids)): clip = cap_ids[clip_idx] cap.append(clip[clip != -1].tolist()) start, end = startend[:, 0].tolist(), startend[:, 1].tolist() return {"start": start, "end": end, "cap": cap} class FixedLenAligner(Aligner): """ In the model we assume text is on the left (closer to BERT formulation) and video is on the right. We fix the total length of text + video. max_video_len is in number of secs. max_text_len is in number of tokens. special tokens formats: we use the format [CLS] [SEP] text tokens [SEP] [PAD] ... [CLS] will be splitted out into: [CLS] video tokens [SEP] text tokens [SEP] [PAD] ... token_type_ids will be generated by the model (for now). 
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence | so each sequence owns a [SEP] token for no-ops. """ def __init__(self, config): super().__init__(config) self.text_clip_sampler = TextClipSamplingProcessor( self.max_len - self.max_video_len - 3 ) """ decide subsampling: `config.subsampling` will change batch_size in trainer. `config.clip_per_video` (used by RetriTask) doesn't change batch_size in trainer. """ subsampling = config.subsampling \ if config.subsampling is not None else None if config.clip_per_video is not None: subsampling = config.clip_per_video self.subsampling = subsampling def _get_text_maxlen(self): # use max text len return self.text_clip_sampler.max_text_len def __call__(self, video_id, video_feature, text_feature): from transformers import default_data_collator video_idx = video_id[1] if self.subsampling is not None and self.subsampling >= 1: batch = [] for _ in range(self.subsampling): centerclip_idx = random.randint( 0, len(text_feature["start"]) - 1) batch.append( self.sampling( video_idx, video_feature, text_feature, centerclip_idx, self._get_text_maxlen() )) batch = self.batch_post_processing(batch, video_feature) batch = default_data_collator(batch) else: raise ValueError( "dataset.subsampling must be >= 1 for efficient video loading.") batch = self.sampling(video_idx, video_feature, text_feature) batch = self.batch_post_processing(batch, video_feature) batch["video_id"] = video_id if isinstance(video_id, str) \ else video_id[0] # e2e: make sure frame ids is into tensor. 
assert torch.is_tensor(batch["vfeats"]) return batch def sampling( self, video_idx, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): text_clip_indexs = self.text_clip_sampler( text_feature, centerclip_idx, sampled_max_text_len ) if isinstance(video_feature, np.ndarray): video_len = len(video_feature) else: video_len = math.ceil(text_feature["end"][-1]) video_end = min( math.ceil(text_feature["end"][text_clip_indexs[-1]]), video_len ) video_start = max( min( math.floor(text_feature["start"][text_clip_indexs[0]]), video_end), 0 ) video_clips = {"start": [video_start], "end": [video_end]} # tensorize. vfeats, vmasks = self._build_video_seq( video_feature, video_clips ) caps, cmasks = self._build_text_seq( text_feature, text_clip_indexs ) text_start = text_clip_indexs[0] text_end = text_clip_indexs[-1] + 1 return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, "vmasks": vmasks, "video_start": video_start, "video_end": video_end, "text_start": text_start, "text_end": text_end, } class VariedLenAligner(FixedLenAligner): def __init__(self, config): super().__init__(config) self.sampled_min_len = config.sampled_min_len self.sampled_max_len = config.sampled_max_len def _get_text_maxlen(self): return random.randint(self.sampled_min_len, self.sampled_max_len) class StartClipAligner(VariedLenAligner): def sampling( self, video_idx, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): return super().sampling( video_idx, video_feature, text_feature, 0) class OverlappedAligner(VariedLenAligner): """video clip and text clip has overlappings but may not be the same start/end.""" def __init__(self, config): super().__init__(config) self.sampled_video_min_len = config.sampled_video_min_len self.sampled_video_max_len = config.sampled_video_max_len self.video_clip_sampler = VideoClipSamplingProcessor() def _get_video_maxlen(self): return random.randint( self.sampled_video_min_len, self.sampled_video_max_len) def sampling( 
self, video_idx, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): text_clip_indexs = self.text_clip_sampler( text_feature, centerclip_idx, sampled_max_text_len ) if isinstance(video_feature, np.ndarray): video_len = len(video_feature) else: video_len = math.ceil(text_feature["end"][-1]) low = math.floor(text_feature["start"][text_clip_indexs[0]]) high = math.ceil(text_feature["end"][text_clip_indexs[-1]]) if low < high: center = random.randint(low, high) else: center = int((low + high) // 2) center = max(0, min(video_feature.shape[0] - 1, center)) assert 0 <= center < video_feature.shape[0] video_clips = self.video_clip_sampler( video_len, self._get_video_maxlen(), center ) video_start = video_clips["start"][0] video_end = video_clips["end"][0] # tensorize. vfeats, vmasks = self._build_video_seq( video_feature, video_clips ) caps, cmasks = self._build_text_seq( text_feature, text_clip_indexs ) text_start = text_clip_indexs[0] text_end = text_clip_indexs[-1] + 1 return { "caps": caps, "cmasks": cmasks, "vfeats": vfeats, "vmasks": vmasks, "video_start": video_start, "video_end": video_end, "text_start": text_start, "text_end": text_end, } class MFMMLMAligner(FixedLenAligner): """ `FixedLenAligner` with Masked Language Model and Masked Frame Model. 
""" def __init__(self, config): super().__init__(config) keep_prob = config.keep_prob if config.keep_prob is not None else 1.0 self.text_clip_sampler = TextClipSamplingProcessor( self.max_len - self.max_video_len - 3, keep_prob ) self.sampled_min_len = config.sampled_min_len self.sampled_max_len = config.sampled_max_len self.masked_token_sampler = TextMaskingProcessor(config) self.mm_type = config.mm_type \ if config.mm_type is not None else "full" self.attnmasker = MMAttentionMask2DProcessor() \ if self.mm_type == "textgen" else None self.masked_frame_sampler = FrameMaskingProcessor(config) self.lazy_vfeat_mask = ( False if config.lazy_vfeat_mask is None else config.lazy_vfeat_mask ) self.mm_prob = config.mm_prob if config.mm_prob is not None else 0. def __call__(self, video_id, video_feature, text_feature): from transformers import default_data_collator if self.subsampling is not None and self.subsampling > 1: batch = [] for _ in range(self.subsampling): centerclip_idx = random.randint( 0, len(text_feature["start"]) - 1) sampled_max_text_len = random.randint( self.sampled_min_len, self.sampled_max_len ) batch.append( self.sampling( video_id, video_feature, text_feature, centerclip_idx, sampled_max_text_len, ) ) batch = self.batch_post_processing(batch, video_feature) batch = default_data_collator(batch) else: batch = self.sampling(video_id, video_feature, text_feature) batch = self.batch_post_processing(batch, video_feature) batch["video_id"] = video_id if isinstance(video_id, str) \ else video_id[0] return batch def sampling( self, video_id, video_feature, text_feature, centerclip_idx=None, sampled_max_text_len=None, ): output = FixedLenAligner.sampling(self, video_id, video_feature, text_feature, centerclip_idx, sampled_max_text_len) masking_text, masking_video = None, None if random.random() < self.mm_prob: if random.random() > 0.5: masking_text, masking_video = self.mm_type, "no" else: masking_text, masking_video = "no", "full" video_feats = output["vfeats"] 
class FrameMaskingProcessor(Processor):
    """Sample which video positions are masked for masked-frame modeling."""

    def __init__(self, config):
        configured = config.mfm_probability
        self.mfm_probability = 0.15 if configured is None else configured

    def __call__(self, vmasks, modality_masking=None, vfeats=None):
        """
        We perform lazy masking to save data transfer time.
        It only generates video_labels by default and MFM model
        will do actual masking.

        `modality_masking` picks the per-position masking probability:
        `None` -> `self.mfm_probability`, "full" -> 1, "no" -> 0,
        "inverse" -> `1 - self.mfm_probability`.
        If `vfeats` is given, the selected positions are zeroed in place.

        Return: `video_label` is a binary mask.
        """
        video_label = vmasks.clone()

        if modality_masking is None:
            mask_prob = self.mfm_probability
        elif modality_masking == "full":
            mask_prob = 1.
        elif modality_masking == "no":
            mask_prob = 0.
        elif modality_masking == "inverse":
            mask_prob = 1. - self.mfm_probability
        else:
            raise ValueError("unknown modality masking.", modality_masking)

        probability_matrix = torch.full(video_label.shape, mask_prob)
        keep_label = torch.bernoulli(probability_matrix).bool()
        # We only compute loss on masked tokens
        video_label[~keep_label] = 0

        if vfeats is not None:
            vfeats[video_label, :] = 0.0
        return video_label
def __call__(
    self,
    inputs: torch.Tensor,
    modality_masking=None,
    special_tokens_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Mask text tokens and build the matching labels.

    `modality_masking` selects the behaviour:
        None: traditional bert masking with `self.mlm_probability`.
        "no": no masking (probability 0).
        "full": every non-special token is selected (probability 1).
        "inverse": probability `1 - self.mlm_probability`.
        "textgen...": inputs/labels come from `self.textgen`; when the
            string also contains "mask", `self.mask_input` is applied to
            the inputs afterwards.
        "mask": inputs go through `self.mask_input`; labels are all -100.
    Any other value raises `ValueError`.

    Args:
        inputs: token ids. NOTE: modified in place on the masking paths.
        modality_masking: see above.
        special_tokens_mask: optional mask of positions that must never
            be selected; computed via `get_special_tokens_mask` if None.

    Returns:
        `(inputs, labels)`; on the probabilistic paths labels are -100 at
        every position that was not selected.
    """
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for MLM training
    # (with probability `self.mlm_probability`)
    if modality_masking is not None:
        if modality_masking == "full":
            probability_matrix = torch.full(labels.shape, 1.)
        elif modality_masking == "no":
            probability_matrix = torch.full(labels.shape, 0.)
        elif modality_masking.startswith("textgen"):
            # [CLS] [SEP] ...
            inputs, labels = self.textgen(inputs)
            if "mask" not in modality_masking:
                return inputs, labels
            # masked variant: additionally corrupt the generation inputs.
            inputs = self.mask_input(inputs, special_tokens_mask)
            return inputs, labels
        elif modality_masking == "mask":
            inputs = self.mask_input(inputs, special_tokens_mask)
            # no position contributes a label on this path.
            labels = torch.full(inputs.shape, -100)
            return inputs, labels
        elif modality_masking == "inverse":
            probability_matrix = torch.full(labels.shape, 1. - self.mlm_probability)
        else:
            raise ValueError("unknown modality masking.", modality_masking)
    else:
        probability_matrix = torch.full(labels.shape, self.mlm_probability)

    if special_tokens_mask is None:
        special_tokens_mask = self.get_special_tokens_mask(
            labels.tolist(), already_has_special_tokens=True
        )
        special_tokens_mask = torch.tensor(
            special_tokens_mask, dtype=torch.bool)
    else:
        special_tokens_mask = special_tokens_mask.bool()

    # special tokens are never selected for masking.
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time,
    # we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = (
        torch.bernoulli(
            torch.full(labels.shape, 0.8)).bool() & masked_indices
    )
    inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(
        self.tokenizer.mask_token
    )

    # 10% of the time, we replace masked input tokens with random word
    # (half of the selected positions that were not replaced above).
    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
        & masked_indices
        & ~indices_replaced
    )
    random_words = torch.randint(
        len(self.tokenizer), labels.shape, dtype=torch.long
    )
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input
    # tokens unchanged
    return inputs, labels
def get_special_tokens_mask(
    self,
    token_ids_0: List[int],
    token_ids_1: Optional[List[int]] = None,
    already_has_special_tokens: bool = False
) -> List[int]:
    """Return a 0/1 list marking special-token positions.

    Note: the version from transformers do not consider pad as
    special tokens; here sep, cls and pad all count as special.

    With `already_has_special_tokens`, the ids in `token_ids_0` are
    inspected directly (a second sequence is rejected). Otherwise the
    mask describes where special tokens would sit around one or two
    plain sequences.
    """
    def flag(token_id):
        specials = (
            self.tokenizer.sep_token_id,
            self.tokenizer.cls_token_id,
            self.tokenizer.pad_token_id,
        )
        return 1 if token_id in specials else 0

    if not already_has_special_tokens:
        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            mask = mask + [0] * len(token_ids_1) + [1]
        return mask

    if token_ids_1 is not None:
        raise ValueError(
            "You should not supply a second sequence if"
            "the provided sequence of "
            "ids is already formated with special tokens "
            "for the model."
        )
    return [flag(token_id) for token_id in token_ids_0]
class VideoClipSamplingProcessor(Processor):
    """Grow a video clip outwards from a center position."""

    def __call__(self, video_len, max_video_len, center):
        """
        `video_len`: length of the video.
        `max_video_len`: maximum video tokens allowed in a sequence.
        `center`: initial starting index.

        Returns `{"start": [start], "end": [end]}` after at most
        `max_video_len` single-step extensions.
        """
        assert 0 <= center < video_len
        left = right = center
        grown = 0
        while grown < max_video_len and (left > 0 or right < video_len):
            # decide the direction to grow.
            room_left = left > 0
            room_right = right < video_len
            if not room_left:
                right += 1
            elif not room_right:
                left -= 1
            elif random.random() > 0.5:
                # both sides are open: pick one at random.
                right += 1
            else:
                left -= 1
            grown += 1
        return {"start": [left], "end": [right]}
class PKLJSONStrTextProcessor(TextProcessor):
    """`caption.json` from howto100m are preprocessed as a
    dict `[video_id, json_str]`.
    Json parsing tokenization are conducted on-the-fly and cached into dict.
    """

    def __init__(self, config, max_clip_text_len=96):
        print("[Warning] PKLJSONStrTextProcessor is slow for num_workers > 0.")
        self.caption_pkl_path = str(config.caption_pkl_path)
        # NOTE(review): pickle is only safe on trusted files.
        with open(self.caption_pkl_path, "rb") as fd:
            self.data = pickle.load(fd)
        self.max_clip_text_len = max_clip_text_len
        from transformers import AutoTokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            str(config.bert_name), use_fast=config.use_fast
        )

    def __call__(self, video_id):
        """Return the caption dict of `video_id`, tokenizing it on first use."""
        caption = self.data[video_id]
        if not isinstance(caption, str):
            # Already parsed and cached by an earlier call.
            return caption

        import json
        caption = json.loads(caption)

        token_ids_per_clip = []
        for text_clip in caption["text"]:
            if isinstance(text_clip, str):
                # Text is truncated by characters before tokenization.
                truncated = text_clip[: self.max_clip_text_len]
                encoded = self.tokenizer(truncated, add_special_tokens=False)
                token_ids_per_clip.append(encoded["input_ids"])
            else:
                # Non-string clips contribute an empty id list.
                token_ids_per_clip.append([])

        caption["cap"] = token_ids_per_clip
        caption.pop("text")  # save space.
        self.data[video_id] = caption
        return caption
class VideoRetriAligner(VariedLenAligner):
    """Align every candidate video of a retrieval batch and collate them."""

    # Retritask will trim dim-0.
    def __call__(self, sharded_video_idxs, video_features, text_features):
        """Run the parent aligner per video, then collate the results.

        The three arguments are parallel sequences; the returned batch
        carries the list of video ids under `"video_id"`.
        """
        from transformers import default_data_collator

        # Bind once: zero-argument `super()` must be resolved in the
        # method body itself.
        align_one = super().__call__

        per_video, ids = [], []
        for key, vfeat, tfeat in zip(
                sharded_video_idxs, video_features, text_features):
            per_video.append(align_one(key, vfeat, tfeat))
            # Tuple keys carry the video id as their first item.
            ids.append(key[0] if isinstance(key, tuple) else key)

        collated = default_data_collator(per_video)
        collated["video_id"] = ids
        return collated
def __call__(self, sharded_video_idxs, video_features, text_features): from transformers import default_data_collator batch, video_ids = [], [] for video_id, video_feature, text_feature in \ zip(sharded_video_idxs, video_features, text_features): sub_batch = super().__call__(video_id, video_feature, text_feature) batch.append(sub_batch) if isinstance(video_id, tuple): video_id = video_id[0] video_ids.append(video_id) batch = default_data_collator(batch) batch["video_id"] = video_ids return batch ================================================ FILE: examples/MMPT/mmpt/processors/models/s3dg.py ================================================ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """Contains a PyTorch definition for Gated Separable 3D network (S3D-G) with a text module for computing joint text-video embedding from raw text and video input. The following code will enable you to load the HowTo100M pretrained S3D Text-Video model from: A. Miech, J.-B. Alayrac, L. Smaira, I. Laptev, J. Sivic and A. Zisserman, End-to-End Learning of Visual Representations from Uncurated Instructional Videos. https://arxiv.org/abs/1912.06430. S3D-G was proposed by: S. Xie, C. Sun, J. Huang, Z. Tu and K. Murphy, Rethinking Spatiotemporal Feature Learning For Video Understanding. https://arxiv.org/abs/1712.04851. Tensorflow code: https://github.com/tensorflow/models/blob/master/research/slim/nets/s3dg.py The S3D architecture was slightly modified with a space to depth trick for TPU optimization. 
class InceptionBlock(nn.Module):
    """Four-branch inception block assembled from `STConv3D` units.

    Branch 0: 1x1x1 conv.
    Branch 1 and 2: 1x1x1 conv followed by a separable 3x3x3 conv.
    Branch 3: 3x3x3 max pool (stride 1, padding 1) followed by a 1x1x1 conv.
    The branch outputs are concatenated on dim 1, so `output_dim` is the sum
    of the four branch widths. With `gating=True` every branch output goes
    through its own `SelfGating` before concatenation.
    """

    def __init__(
        self,
        input_dim,
        num_outputs_0_0a,
        num_outputs_1_0a,
        num_outputs_1_0b,
        num_outputs_2_0a,
        num_outputs_2_0b,
        num_outputs_3_0b,
        gating=True,
    ):
        super().__init__()
        pointwise = [1, 1, 1]
        cube = [3, 3, 3]

        # Sub-modules are created in a fixed order so that parameter
        # registration (and random initialization) order stays stable.
        self.conv_b0 = STConv3D(input_dim, num_outputs_0_0a, pointwise)
        self.conv_b1_a = STConv3D(input_dim, num_outputs_1_0a, pointwise)
        self.conv_b1_b = STConv3D(
            num_outputs_1_0a, num_outputs_1_0b, cube, padding=1, separable=True
        )
        self.conv_b2_a = STConv3D(input_dim, num_outputs_2_0a, pointwise)
        self.conv_b2_b = STConv3D(
            num_outputs_2_0a, num_outputs_2_0b, cube, padding=1, separable=True
        )
        self.maxpool_b3 = th.nn.MaxPool3d((3, 3, 3), stride=1, padding=1)
        self.conv_b3_b = STConv3D(input_dim, num_outputs_3_0b, pointwise)

        self.gating = gating
        self.output_dim = (
            num_outputs_0_0a + num_outputs_1_0b + num_outputs_2_0b + num_outputs_3_0b
        )
        if gating:
            self.gating_b0 = SelfGating(num_outputs_0_0a)
            self.gating_b1 = SelfGating(num_outputs_1_0b)
            self.gating_b2 = SelfGating(num_outputs_2_0b)
            self.gating_b3 = SelfGating(num_outputs_3_0b)

    def forward(self, input):
        """Run the four branches on `input` and concatenate them on dim 1."""
        branch0 = self.conv_b0(input)
        branch1 = self.conv_b1_b(self.conv_b1_a(input))
        branch2 = self.conv_b2_b(self.conv_b2_a(input))
        branch3 = self.conv_b3_b(self.maxpool_b3(input))

        if self.gating:
            branch0 = self.gating_b0(branch0)
            branch1 = self.gating_b1(branch1)
            branch2 = self.gating_b2(branch2)
            branch3 = self.gating_b3(branch3)

        return th.cat([branch0, branch1, branch2, branch3], dim=1)
class STConv3D(nn.Module):
    """3D convolution + batch norm + ReLU, optionally factorized in space/time.

    With `separable=True` and a temporal kernel extent larger than 1, the
    convolution is split into a spatial conv (`conv1`/`bn1`, kernel
    `[1, kh, kw]`) followed by a temporal conv (`conv2`/`bn2`, kernel
    `[kt, 1, 1]`), each followed by ReLU. Otherwise a single `conv1`/`bn1`
    with the full kernel is used.

    Args:
        input_dim: number of input channels.
        output_dim: number of output channels.
        kernel_size: sequence of three ints `(kt, kh, kw)`.
        stride: int, or a list/tuple of three ints `(st, sh, sw)`.
        padding: int, or a list/tuple of three ints `(pt, ph, pw)`.
        separable: request the space/time factorization.
    """

    def __init__(
        self, input_dim, output_dim, kernel_size, stride=1, padding=0, separable=False
    ):
        super(STConv3D, self).__init__()
        assert len(kernel_size) == 3
        # A temporal extent of 1 leaves nothing to factorize: fall back to a
        # single conv instead of failing on undefined spatial/temporal kernels.
        self.separable = bool(separable) and kernel_size[0] != 1
        self.relu = nn.ReLU(inplace=True)

        if self.separable:
            spatial_kernel_size = [1, kernel_size[1], kernel_size[2]]
            temporal_kernel_size = [kernel_size[0], 1, 1]

            # Tuples are accepted as well as lists; previously a tuple was
            # treated as a scalar and produced a nested, invalid value.
            if isinstance(stride, (list, tuple)) and len(stride) == 3:
                spatial_stride = [1, stride[1], stride[2]]
                temporal_stride = [stride[0], 1, 1]
            else:
                spatial_stride = [1, stride, stride]
                temporal_stride = [stride, 1, 1]

            if isinstance(padding, (list, tuple)) and len(padding) == 3:
                spatial_padding = [0, padding[1], padding[2]]
                temporal_padding = [padding[0], 0, 0]
            else:
                spatial_padding = [0, padding, padding]
                temporal_padding = [padding, 0, 0]

            self.conv1 = nn.Conv3d(
                input_dim,
                output_dim,
                kernel_size=spatial_kernel_size,
                stride=spatial_stride,
                padding=spatial_padding,
                bias=False,
            )
            self.bn1 = nn.BatchNorm3d(output_dim)
            self.conv2 = nn.Conv3d(
                output_dim,
                output_dim,
                kernel_size=temporal_kernel_size,
                stride=temporal_stride,
                padding=temporal_padding,
                bias=False,
            )
            self.bn2 = nn.BatchNorm3d(output_dim)
        else:
            self.conv1 = nn.Conv3d(
                input_dim,
                output_dim,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding,
                bias=False,
            )
            self.bn1 = nn.BatchNorm3d(output_dim)

    def forward(self, input):
        """Apply conv1/bn1/ReLU and, when factorized, conv2/bn2/ReLU."""
        out = self.relu(self.bn1(self.conv1(input)))
        if self.separable:
            out = self.relu(self.bn2(self.conv2(out)))
        return out
class Sentence_Embedding(nn.Module):
    """Bag-of-words sentence encoder.

    Sentences are split into words, mapped to vocabulary ids, embedded,
    projected with `fc1` + ReLU, max-pooled over the word axis and projected
    with `fc2`.

    Args:
        embd_dim: size of the returned sentence embedding.
        num_embeddings: number of rows of the word embedding table.
        word_embedding_dim: size of each word embedding.
        token_to_word_path: path of a `.npy` array listing the vocabulary;
            the word at position `i` gets id `i + 1`.
        max_words: every sentence is truncated / zero-padded to this length.
        output_dim: width of the hidden projection.
    """

    def __init__(
        self,
        embd_dim,
        num_embeddings=66250,
        word_embedding_dim=300,
        token_to_word_path="dict.npy",
        max_words=16,
        output_dim=2048,
    ):
        super(Sentence_Embedding, self).__init__()
        self.word_embd = nn.Embedding(num_embeddings, word_embedding_dim)
        self.fc1 = nn.Linear(word_embedding_dim, output_dim)
        self.fc2 = nn.Linear(output_dim, embd_dim)
        self.word_to_token = {}
        self.max_words = max_words
        token_to_word = np.load(token_to_word_path)
        # id 0 is used for padding, so vocabulary ids start at 1.
        for token_id, word in enumerate(token_to_word, start=1):
            self.word_to_token[word] = token_id

    def _zero_pad_tensor_token(self, tensor, size):
        """Truncate or right-pad a 1-D id tensor with zeros to length `size`."""
        if len(tensor) >= size:
            return tensor[:size]
        padding = th.zeros(size - len(tensor)).long()
        return th.cat((tensor, padding), dim=0)

    def _split_text(self, sentence):
        """Split a sentence into runs of word characters and apostrophes."""
        return re.findall(r"[\w']+", str(sentence))

    def _words_to_token(self, words):
        """Map known words to ids (unknown words are dropped) and pad."""
        ids = [self.word_to_token[word] for word in words if word in self.word_to_token]
        if not ids:
            return th.zeros(self.max_words).long()
        return self._zero_pad_tensor_token(th.LongTensor(ids), self.max_words)

    def _words_to_ids(self, x):
        """Turn an iterable of sentences into a `(len(x), max_words)` id tensor."""
        rows = [self._words_to_token(self._split_text(sent.lower())) for sent in x]
        return th.stack(rows, dim=0)

    def forward(self, x):
        """Encode an iterable of sentences.

        Returns:
            dict with key `'text_embedding'` holding one vector per sentence.
        """
        # The ids are built on CPU; move them next to the embedding table so
        # the lookup also works after the module was moved to another device.
        x = self._words_to_ids(x).to(self.word_embd.weight.device)
        x = self.word_embd(x)
        x = F.relu(self.fc1(x))
        x = th.max(x, dim=1)[0]
        x = self.fc2(x)
        return {'text_embedding': x}
def _space_to_depth(self, input): """3D space to depth trick for TPU optimization. """ B, C, T, H, W = input.shape input = input.view(B, C, T // 2, 2, H // 2, 2, W // 2, 2) input = input.permute(0, 3, 5, 7, 1, 2, 4, 6) input = input.contiguous().view(B, 8 * C, T // 2, H // 2, W // 2) return input def forward(self, inputs): """Defines the S3DG base architecture.""" if self.space_to_depth: inputs = self._space_to_depth(inputs) net = self.conv1(inputs) if self.space_to_depth: # we need to replicate 'SAME' tensorflow padding net = net[:, :, 1:, 1:, 1:] net = self.maxpool_2a(net) net = self.conv_2b(net) net = self.conv_2c(net) if self.gating: net = self.gating(net) net = self.maxpool_3a(net) net = self.mixed_3b(net) net = self.mixed_3c(net) net = self.maxpool_4a(net) net = self.mixed_4b(net) net = self.mixed_4c(net) net = self.mixed_4d(net) net = self.mixed_4e(net) net = self.mixed_4f(net) net = self.maxpool_5a(net) net = self.mixed_5b(net) net = self.mixed_5c(net) net = th.mean(net, dim=[2, 3, 4]) return {'video_embedding': self.fc(net), 'mixed_5c': net} ================================================ FILE: examples/MMPT/mmpt/processors/processor.py ================================================ # Copyright (c) Facebook, Inc. All Rights Reserved import numpy as np import os import torch class Processor(object): """ A generic processor for video (codec, feature etc.) and text. """ def __call__(self, **kwargs): raise NotImplementedError class MetaProcessor(Processor): """ A meta processor is expected to load the metadata of a dataset: (e.g., video_ids, or captions). You must implement the `__getitem__` (meta datasets are rather diverse.). 
class VideoProcessor(Processor):
    """
    A generic video processor: by default, video tokens are read from a
    `<vfeat_dir>/<video id>.npy` file.
    """

    def __init__(self, config):
        self.vfeat_dir = config.vfeat_dir

    def __call__(self, video_fn):
        """Load the feature array of one video.

        `video_fn` is the file stem, or a tuple whose first element is the
        file stem.
        """
        stem = video_fn[0] if isinstance(video_fn, tuple) else video_fn
        assert isinstance(stem, str)
        feature_path = os.path.join(self.vfeat_dir, stem + ".npy")
        return np.load(feature_path)
""" def __init__(self, config): """__init__ needs to be light weight for more workers/threads.""" self.split = config.split self.max_video_len = config.max_video_len self.max_len = config.max_len from transformers import AutoTokenizer tokenizer = AutoTokenizer.from_pretrained( str(config.bert_name), use_fast=config.use_fast ) self.cls_token_id = tokenizer.cls_token_id self.sep_token_id = tokenizer.sep_token_id self.pad_token_id = tokenizer.pad_token_id self.mask_token_id = tokenizer.mask_token_id def __call__(self, video_id, video_feature, text_feature): raise NotImplementedError def _build_video_seq(self, video_feature, video_clips=None): """ `video_feature`: available video tokens. `video_clips`: video clip sequence to build. """ if not isinstance(video_feature, np.ndarray): raise ValueError( "unsupported type of video_feature", type(video_feature) ) if video_clips is None: # this is borrowed from DSAligner video_start = 0 video_end = min(len(video_feature), self.max_video_len) # the whole sequence is a single clip. video_clips = {"start": [video_start], "end": [video_end]} vfeats = np.zeros( (self.max_video_len, video_feature.shape[1]), dtype=np.float32 ) vmasks = torch.zeros((self.max_video_len,), dtype=torch.bool) video_len = 0 for start, end in zip(video_clips["start"], video_clips["end"]): clip_len = min(self.max_video_len - video_len, (end - start)) if clip_len > 0: vfeats[video_len: video_len + clip_len] = video_feature[ start: start + clip_len ] vmasks[video_len: video_len + clip_len] = 1 video_len += clip_len vfeats = torch.from_numpy(vfeats) return vfeats, vmasks def _build_text_seq(self, text_feature, text_clip_indexs=None): """ `text_feature`: all available clips. `text_clip_indexes`: clip sequence to build. 
""" if text_clip_indexs is None: text_clip_indexs = [0] full_caps = [] if isinstance(text_feature, dict): for clip_idx in text_clip_indexs: full_caps.extend(text_feature["cap"][clip_idx]) else: full_caps = text_feature max_text_len = self.max_len - self.max_video_len - 3 full_caps = full_caps[:max_text_len] full_caps = ( [self.cls_token_id, self.sep_token_id] + full_caps + [self.sep_token_id] ) text_pad_len = self.max_len - len(full_caps) - self.max_video_len padded_full_caps = full_caps + [self.pad_token_id] * text_pad_len caps = torch.LongTensor(padded_full_caps) cmasks = torch.zeros((len(padded_full_caps),), dtype=torch.bool) cmasks[: len(full_caps)] = 1 return caps, cmasks def batch_post_processing(self, batch, video_feature): return batch class MMAttentionMask2DProcessor(Processor): """text generation requires 2d mask that is harder to generate by GPU at this stage.""" def __call__(self, vmask, cmask, mtype): if mtype == "textgen": return self._build_textgeneration_mask(vmask, cmask) elif mtype == "videogen": return self._build_videogeneration_mask(vmask, cmask) else: return self._build_mm_mask(vmask, cmask) def _build_mm_mask(self, vmask, cmask): mask_1d = torch.cat([cmask[:1], vmask, cmask[1:]], dim=0) return mask_1d[None, :].repeat(mask_1d.size(0), 1) def _build_videogeneration_mask(self, vmask, cmask): # cls_mask is only about text otherwise it will leak generation. cls_text_mask = torch.cat([ # [CLS] torch.ones( (1,), dtype=torch.bool, device=cmask.device), # video tokens and [SEP] for video. torch.zeros( (vmask.size(0) + 1,), dtype=torch.bool, device=cmask.device), cmask[2:] ], dim=0) # concat horizontially. 
    def _build_textgeneration_mask(self, vmask, cmask):
        """Build the 2-D attention mask used for text generation.

        The joint sequence is laid out as one leading position, the
        `vmask.size(0)` video positions, one separator position and the
        remaining `cmask.size(0) - 2` text positions, so the result is a
        square bool matrix of side `vmask.size(0) + cmask.size(0)` created on
        `cmask.device`.

        Row groups, top to bottom:
          * `vmask.size(0) + 2` rows (leading position, video, separator):
            each row allows the leading position, the positions enabled in
            `vmask` and the separator -- no text position.
          * `text_len` rows, where `text_len = cmask[2:].sum()`: row `i`
            additionally allows text positions `0..i` (lower-triangular).
          * the remaining text rows (padding): same row as the first group.
        """
        # cls_mask is only about video otherwise it will leak generation.
        cls_video_mask = torch.cat([
            # [CLS]
            torch.ones(
                (1,), dtype=torch.bool, device=cmask.device),
            vmask,
            # [SEP]
            torch.ones((1,), dtype=torch.bool, device=cmask.device),
            # no text position is visible from these rows.
            torch.zeros(
                (cmask.size(0)-2,), dtype=torch.bool, device=cmask.device)
        ], dim=0)

        # concat horizontally.
        # number of enabled text positions (the first two cmask entries are
        # the leading and separator positions and are skipped).
        text_len = int(cmask[2:].sum())
        text_masks = torch.cat([
            # [CLS]
            torch.ones(
                (text_len, 1), dtype=torch.bool, device=cmask.device
            ),
            vmask.unsqueeze(0).repeat(text_len, 1),
            # [SEP] for video.
            torch.ones(
                (text_len, 1), dtype=torch.bool, device=cmask.device
            ),
            # each text position sees itself and the text positions before it.
            torch.tril(
                torch.ones(
                    (text_len, text_len),
                    dtype=torch.bool, device=cmask.device)),
            # padding.
            torch.zeros(
                (text_len, cmask.size(0) - text_len - 2),
                dtype=torch.bool, device=cmask.device
            )
        ], dim=1)

        # rows for the leading position, every video position and the separator.
        cls_video_masks = cls_video_mask[None, :].repeat(
            vmask.size(0) + 2, 1)
        # rows for the text padding positions.
        text_padding_masks = cls_video_mask[None, :].repeat(
            cmask.size(0) - text_len - 2, 1)
        return torch.cat([
            cls_video_masks, text_masks, text_padding_masks], dim=0)
parser.add_argument( "taskconfig", metavar="FILE", help=("taskconfig to load all configurations" "outside fairseq parser."), ) @classmethod def setup_task(cls, args, **kwargs): return FairseqMMTask(args) def __init__(self, args): super().__init__(args) config = utils.load_config(args) self.mmtask = Task.config_task(config) self.mmtask.build_dataset() self.mmtask.build_model() self.mmtask.build_loss() def load_dataset(self, split, **kwargs): split_map = { "train": self.mmtask.train_data, "valid": self.mmtask.val_data, "test": self.mmtask.test_data, } if split not in split_map: raise ValueError("unknown split type.") if split_map[split] is not None: self.datasets[split] = FairseqMMDataset(split_map[split]) def get_batch_iterator( self, dataset, max_tokens=None, max_sentences=None, max_positions=None, ignore_invalid_inputs=False, required_batch_size_multiple=1, seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1, data_buffer_size=0, disable_iterator_cache=False, skip_remainder_batch=False, grouped_shuffling=False, update_epoch_batch_itr=False, ): random.seed(epoch) if dataset.mmdataset.split == "train" and isinstance(self.mmtask, RetriTask): if epoch >= self.mmtask.config.retri_epoch: if not hasattr(self.mmtask, "retri_dataloader"): self.mmtask.build_dataloader() self.mmtask.retrive_candidates(epoch) return super().get_batch_iterator( dataset, max_tokens, max_sentences, max_positions, ignore_invalid_inputs, required_batch_size_multiple, seed, num_shards, shard_id, num_workers, epoch, data_buffer_size, disable_iterator_cache, grouped_shuffling, update_epoch_batch_itr, ) @property def source_dictionary(self): return None @property def target_dictionary(self): return None ================================================ FILE: examples/MMPT/mmpt/tasks/milncetask.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
class MILNCETask(Task):
    """Task whose caption tensors are flattened one level further than the rest."""

    def reshape_subsample(self, sample):
        """Flatten the tensors of `sample` in place when subsampling is on.

        Nothing is changed unless `config.dataset.subsampling` exists, is not
        None and is greater than 1. Every tensor goes through
        `flat_subsample`; `"caps"` and `"cmasks"` additionally get their
        first two dims merged afterwards.
        """
        dataset_config = self.config.dataset
        if not hasattr(dataset_config, "subsampling"):
            return sample
        if dataset_config.subsampling is None:
            return sample
        if not dataset_config.subsampling > 1:
            return sample

        twice_flattened = ["caps", "cmasks"]
        for key in sample:
            if not torch.is_tensor(sample[key]):
                continue
            flat = self.flat_subsample(sample[key])
            if key in twice_flattened:
                shape = flat.size()
                merged_shape = (shape[0] * shape[1],) + shape[2:]
                flat = flat.view(merged_shape)
            sample[key] = flat
        return sample
self.config.dataset.split = "train" meta_processor = ShardedHow2MetaProcessor(self.config.dataset) video_processor = ShardedVideoProcessor(self.config.dataset) text_processor = ShardedTextProcessor(self.config.dataset) aligner = VariedLenAligner(self.config.dataset) aligner.subsampling = self.config.dataset.clip_per_video self.retri_data = MMDataset( meta_processor, video_processor, text_processor, aligner ) retri_sampler = DistributedSampler(self.retri_data) infer_scale = 16 batch_size = self.config.dataset.num_video_per_batch \ * infer_scale self.retri_dataloader = DataLoader( self.retri_data, collate_fn=self.retri_data.collater, batch_size=batch_size, shuffle=False, sampler=retri_sampler, num_workers=self.config.fairseq.dataset.num_workers ) return self.retri_dataloader def retrive_candidates(self, epoch, dataloader=None): if get_local_rank() == 0: print("running retrieval model.") out_dir = os.path.join( self.config.fairseq.checkpoint.save_dir, "retri") os.makedirs(out_dir, exist_ok=True) if not os.path.isfile( os.path.join( out_dir, "batched_e" + str(epoch) + "_videos0.pkl") ): if dataloader is None: dataloader = self.retri_dataloader self.model.eval() self.model.is_train = False assert self.retri_data.meta_processor.data == \ self.train_data.meta_processor.data # video_ids not mutated. 
self._retri_predict(epoch, dataloader) self.model.train() self.model.is_train = True torch.distributed.barrier() output = self._retri_sync(epoch, out_dir) torch.distributed.barrier() self.train_data.meta_processor.set_candidates(output) return output class VideoRetriTask(RetriTask): """RetriTask on video level.""" def reshape_subsample(self, sample): if ( hasattr(self.config.dataset, "clip_per_video") and self.config.dataset.clip_per_video is not None and self.config.dataset.clip_per_video > 1 ): for key in sample: if torch.is_tensor(sample[key]): sample[key] = self.flat_subsample(sample[key]) return sample def flat_subsample(self, tensor): if tensor.size(0) == 1: tensor = tensor.squeeze(0) return Task.flat_subsample(self, tensor) def _retri_predict(self, epoch, dataloader): set_seed(epoch) # save for retrival. predictor = VideoPredictor(self.config) predictor.predict_loop( self.model, dataloader) set_seed(epoch) # get the same text clips. # retrival. retri_predictor = VideoRetriPredictor( self.config) retri_predictor.predict_loop( self.model, predictor.vecpool.retriver, epoch) del predictor del retri_predictor def _retri_sync(self, epoch, out_dir): # gpu do the same merge. 
class VideoPredictor(Predictor):
    """Feed batches through a model and collect the outputs in a vector pool.

    The pool class is looked up by name (`config.vectorpool_cls`) in the
    `vectorpool` module; `finalize` returns the pool's `retriver`.
    """

    def __init__(self, config):
        vectorpool_cls = getattr(vectorpool, config.vectorpool_cls)
        self.vecpool = vectorpool_cls(config)

    def predict_loop(
        self,
        model,
        dataloader,
        early_stop=-1,
    ):
        """Process every batch of `dataloader` without gradients.

        `early_stop`: index of the first batch that is NOT processed; the
        default -1 never matches, so all batches are used.
        Returns whatever `finalize()` returns.
        """
        with torch.no_grad():
            # progress bar on rank 0 only.
            if get_local_rank() == 0:
                dataloader = tqdm(dataloader)
            for batch_idx, batch in enumerate(dataloader):
                if batch_idx == early_stop:
                    break
                self(batch, model)
        return self.finalize()

    def __call__(self, sample, model, **kwargs):
        """Run `model` on one batch and add the result to the vector pool."""
        # use the dtype/device of the model's first parameter for the batch.
        param = next(model.parameters())
        dtype = param.dtype
        device = param.device
        # size of dim 1 of the video features, read before flattening.
        subsample = sample["vfeats"].size(1)
        # NOTE(review): `to_ctx` is not defined in this class; presumably it
        # is inherited from `Predictor` -- confirm.
        sample = self.to_ctx(sample, device, dtype)
        for key in sample:
            if torch.is_tensor(sample[key]):
                size = sample[key].size()
                if len(size) >= 2:
                    # merge the first two dims into a single batch dim.
                    batch_size = size[0] * size[1]
                    expanded_size = (
                        (batch_size,) + size[2:] if len(size) > 2
                        else (batch_size,)
                    )
                    sample[key] = sample[key].view(expanded_size)

        outputs = model(**sample)
        sample.update(outputs)
        self.vecpool(sample, subsample)

    def finalize(self):
        """Print the pool, finish index training if needed, return the retriever."""
        print("[INFO]", self.vecpool)
        if not self.vecpool.retriver.db.is_trained:
            self.vecpool.retriver.finalize_training()
        return self.vecpool.retriver
video_ids = list(retriver.videoid_to_vectoridx.keys()) dataloader = random.sample( video_ids, len(video_ids) // self.num_video_per_batch ) if get_local_rank() == 0: dataloader = tqdm(dataloader) for batch_idx, batch in enumerate(dataloader): # batch is one video id. if batch_idx == early_stop: break video_ids = retriver.search_by_video_ids( [batch], self.num_cands)[0] if len(video_ids) > self.num_video_per_batch: # we moved the center to make cluster robust. video_ids = random.sample(video_ids, self.num_video_per_batch) batched_videos.append(video_ids) return self.finalize(batched_videos, epoch) def finalize(self, batched_videos, epoch): fn = os.path.join( self.pred_dir, "batched_e" + str(epoch) + "_videos" + str(get_local_rank()) + ".pkl") with open(fn, "wb") as fw: pickle.dump(batched_videos, fw, pickle.HIGHEST_PROTOCOL) return batched_videos ================================================ FILE: examples/MMPT/mmpt/tasks/task.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from .. import tasks from .. import models from .. import losses from ..datasets import MMDataset from .. import processors class Task(object): """ A task refers to one generic training task (e.g., training one model). """ @classmethod def config_task(cls, config): """ determine whether to load a hard-coded task or config from a generic one. via if a task string is available in config. """ if config.task is not None: # TODO (huxu): expand the search scope. 
task_cls = getattr(tasks, config.task) return task_cls(config) else: return Task(config) def __init__(self, config): self.config = config self.train_data = None self.val_data = None self.test_data = None self.model = None self.loss_fn = None self.eval_fn = None def build_dataset(self): """TODO (huxu): move processor breakdown to MMDataset.""" """fill-in `self.train_data`, `self.val_data` and `self.test_data`.""" meta_processor_cls = getattr( processors, self.config.dataset.meta_processor) video_processor_cls = getattr( processors, self.config.dataset.video_processor) text_processor_cls = getattr( processors, self.config.dataset.text_processor) aligner_cls = getattr( processors, self.config.dataset.aligner) if self.config.dataset.train_path is not None: self.config.dataset.split = "train" # may be used by meta processor. # meta_processor controls different dataset. meta_processor = meta_processor_cls(self.config.dataset) video_processor = video_processor_cls(self.config.dataset) text_processor = text_processor_cls(self.config.dataset) aligner = aligner_cls(self.config.dataset) self.train_data = MMDataset( meta_processor, video_processor, text_processor, aligner ) print("train_len", len(self.train_data)) output = self.train_data[0] self.train_data.print_example(output) if self.config.dataset.val_path is not None: self.config.dataset.split = "valid" # may be used by meta processor. meta_processor = meta_processor_cls(self.config.dataset) video_processor = video_processor_cls(self.config.dataset) text_processor = text_processor_cls(self.config.dataset) aligner = aligner_cls(self.config.dataset) self.val_data = MMDataset( meta_processor, video_processor, text_processor, aligner ) print("val_len", len(self.val_data)) output = self.val_data[0] self.val_data.print_example(output) if self.config.dataset.split == "test": # the following is run via lauching fairseq-validate. 
meta_processor = meta_processor_cls(self.config.dataset) video_processor = video_processor_cls(self.config.dataset) text_processor = text_processor_cls(self.config.dataset) self.test_data = MMDataset( meta_processor, video_processor, text_processor, aligner ) print("test_len", len(self.test_data)) output = self.test_data[0] self.test_data.print_example(output) def build_model(self, checkpoint=None): if self.model is None: model_cls = getattr(models, self.config.model.model_cls) self.model = model_cls(self.config) if checkpoint is not None: self.load_checkpoint(checkpoint) return self.model def load_checkpoint(self, checkpoint): if self.model is None: raise ValueError("model is not initialized.") state_dict = torch.load(checkpoint) state_dict = self._trim_state_dict(state_dict) self.model.load_state_dict(state_dict, strict=False) # if it's a fp16 model, turn it back. if next(self.model.parameters()).dtype == torch.float16: self.model = self.model.float() return self.model def _trim_state_dict(self, state_dict): from collections import OrderedDict if "state_dict" in state_dict: state_dict = state_dict["state_dict"] if "model" in state_dict: # fairseq checkpoint format. state_dict = state_dict["model"] ret_state_dict = OrderedDict() for ( key, value, ) in state_dict.items(): # remove fairseq wrapper since this is a task. 
def flat_subsample(self, tensor):
    """Merge the first two dimensions of `tensor` into a single one.

    A tensor of size `(a, b, *rest)` becomes `(a * b, *rest)`; tensors with
    fewer than two dimensions are returned unchanged.

    Args:
        tensor: a `torch.Tensor`.

    Returns:
        The flattened tensor.  For contiguous input this is a view sharing
        storage with `tensor`, exactly as the previous `view`-based code
        produced.
    """
    if tensor.dim() >= 2:
        # `flatten` returns a view whenever one is possible and only copies
        # otherwise, so strided (e.g. transposed) inputs no longer raise
        # the RuntimeError that a plain `view` would.
        tensor = tensor.flatten(0, 1)
    return tensor
class RetriMeter(object):
    """
    Statistics on whether retrieval yields a better pair.

    The meter accumulates two counters over successive calls:
    `total` (number of rows seen) and `replace` (number of rows counted as
    replaced).  Every `freq` updates the meter prints itself on rank 0.
    """

    def __init__(self, freq=1024):
        # freq: print the running statistics every `freq` calls.
        self.freq = freq
        self.total = 0
        self.replace = 0
        self.updates = 0

    def __call__(self, data):
        """Accumulate one batch of retrieval outcomes.

        Args:
            data: either a 2-D `np.ndarray`, where a row whose first column
                equals -1 is counted as *not* replaced, or a tensor whose
                sum is the number of replaced rows and whose first
                dimension is the number of rows.

        Raises:
            ValueError: if `data` is neither an ndarray nor a tensor.
        """
        if isinstance(data, np.ndarray):
            self.replace += data.shape[0] - int((data[:, 0] == -1).sum())
            self.total += data.shape[0]
        elif torch.is_tensor(data):
            self.replace += int(data.sum())
            self.total += data.size(0)
        else:
            raise ValueError("unsupported RetriMeter data type.", type(data))

        self.updates += 1
        # Check the cheap counter first so the rank is only queried on the
        # updates that may actually print.
        if self.updates % self.freq == 0 and get_local_rank() == 0:
            print("[INFO]", self)

    def __repr__(self):
        # Before the first update `total` is 0; report a 0.0 ratio instead
        # of raising ZeroDivisionError.
        ratio = self.replace / self.total if self.total else 0.0
        return "RetriMeter (" + str(ratio) \
            + "/" + str(self.replace) + "/" + str(self.total) + ")"
def suffix_rundir(save_dir, run_dir):
    """Return the path of the next numbered run directory in `save_dir`.

    Existing entries of `save_dir` named `<run_dir>_<n>` (with `n` a
    non-negative integer) are scanned and the result is
    `<save_dir>/<run_dir>_<max n + 1>`.  An entry named exactly `run_dir`
    counts as id 0.  When nothing matches, the id is 0.

    The id is taken from the text after the `<run_dir>_` prefix rather than
    from the second `_`-separated token, so a `run_dir` that itself contains
    underscores (e.g. `tb_logs`) works, and unrelated entries that merely
    share the prefix (e.g. `run_notes`, `runner_5`) are ignored instead of
    raising `ValueError` or being miscounted.

    Args:
        save_dir: existing directory to scan.
        run_dir: base name of the run directory.

    Returns:
        The joined path as a string; the directory itself is not created.
    """
    prefix = run_dir + "_"
    max_id = -1
    for entry in os.listdir(save_dir):
        if entry == run_dir:
            cur_id = 0
        elif entry.startswith(prefix) and entry[len(prefix):].isdecimal():
            cur_id = int(entry[len(prefix):])
        else:
            continue
        max_id = max(max_id, cur_id)
    return os.path.join(save_dir, run_dir + "_" + str(max_id + 1))
class ShardedTensor(object):
    """A list of variable-length arrays stored as one concatenated array.

    `data` holds all shards concatenated along axis 0 and `starts` holds
    the offsets, so shard `i` is `data[starts[i]:starts[i + 1]]`.
    `starts` has one more entry than there are shards.
    """

    def __init__(self, data, starts):
        self.data = data
        self.starts = starts
        # Sanity checks: offsets begin at 0, end at len(data), never
        # decrease and contain no unfilled (-1) slot.
        assert self.starts[0] == 0
        assert self.starts[-1] == len(self.data)
        assert (self.starts[1:] >= self.starts[:-1]).all()
        assert (self.starts > -1).all()

    @staticmethod
    def from_list(xs):
        """Build a ShardedTensor from a non-empty list of arrays.

        The arrays are concatenated along axis 0, so they must agree on
        all remaining dimensions.
        """
        # `np.long` was removed in NumPy 1.24; use an explicit 64-bit
        # integer dtype for the offsets instead.
        starts = np.full((len(xs) + 1,), -1, dtype=np.int64)
        data = np.concatenate(xs, axis=0)
        starts[0] = 0
        for i, x in enumerate(xs):
            starts[i + 1] = starts[i] + x.shape[0]
        assert (starts > -1).all()
        return ShardedTensor(data, starts)

    def __getitem__(self, i):
        """Return shard `i` as a slice of the underlying data."""
        return self.data[self.starts[i]: self.starts[i + 1]]

    def __len__(self):
        """Return the number of shards."""
        return len(self.starts) - 1

    def lengths(self):
        """Return the length (along axis 0) of every shard."""
        return self.starts[1:] - self.starts[:-1]

    def save(self, path):
        """Write `<path>_starts.npy` and `<path>_data.npy`."""
        np.save(path + "_starts", self.starts)
        np.save(path + "_data", self.data)

    @staticmethod
    def load(path, mmap_mode=None):
        """Load a ShardedTensor written by `save`.

        `mmap_mode` is forwarded to `np.load` for both files.
        """
        starts = np.load(path + "_starts.npy", mmap_mode=mmap_mode)
        data = np.load(path + "_data.npy", mmap_mode=mmap_mode)
        return ShardedTensor(data, starts)
class BaseJob(object):
    """Base class of a job described by a yaml config file.

    Stores the yaml path, the config loaded from it via `recursive_config`,
    and a dry-run flag.  Subclasses implement `submit`.
    """

    def __init__(self, yaml_file, dryrun=False):
        self.yaml_file = yaml_file
        self.config = recursive_config(yaml_file)
        self.dryrun = dryrun

    def submit(self, **kwargs):
        """Launch the job; must be provided by subclasses."""
        raise NotImplementedError

    def _normalize_cmd(self, cmd_list):
        """Return a new list with the first "[yaml]" replaced by the yaml path.

        The input sequence is left untouched.  `ValueError` is raised (by
        `list.index`) when no "[yaml]" placeholder is present.
        """
        resolved = [*cmd_list]
        slot = resolved.index("[yaml]")
        return resolved[:slot] + [self.yaml_file] + resolved[slot + 1:]
class JobStatus(object):
    """Minimal status handle for a launched job.

    This implementation is a stub: `done()` and `running()` always return
    False, and `result()`, `stderr()` and `stdout()` all return the same
    one-line status message built from `job_id`.
    """

    def __init__(self, job_id):
        self.job_id = job_id

    def __repr__(self):
        return self.job_id

    def __str__(self):
        return self.job_id

    def done(self):
        """Whether the job has finished (always False here)."""
        return False

    def running(self):
        """Whether the job is running (always False here)."""
        return False

    def result(self):
        """Return a status message chosen by `done()`."""
        template = "{} is done." if self.done() else "{} is running."
        return template.format(self.job_id)

    def stderr(self):
        """Same message as `result()`."""
        return self.result()

    def stdout(self):
        """Same message as `result()`."""
        return self.result()
def main(args):
    """Run prediction/evaluation for the task config given on the command line.

    The config is loaded from `args` and printed, the task and its model
    are built, and a test dataloader is created.  What happens next depends
    on the basename of `args.taskconfig`:

    * starts with "test": unless the configured checkpoint path contains
      "best", every `checkpoint*` file next to `config.eval.save_path` is
      evaluated first; the checkpoint named in
      `config.fairseq.common_eval.path` is always evaluated last.  Metrics
      are printed for every evaluated checkpoint, followed by the best one
      according to `evaluator.metric.best_metric`.
    * starts with "vis": the predictor named by `config.predictor` is run
      on the configured checkpoint.

    Raises:
        ValueError: if the config file name has any other prefix.
    """
    config = load_config(args)

    if isinstance(config, omegaconf.dictconfig.DictConfig):
        print(OmegaConf.to_yaml(config))
    else:
        pp = pprint.PrettyPrinter(indent=4)
        # PrettyPrinter has no `print` method; `pprint` is the printing API.
        pp.pprint(config)

    mmtask = Task.config_task(config)
    mmtask.build_model()

    test_dataloader = get_dataloader(config)
    checkpoint_search_path = os.path.dirname(config.eval.save_path)
    results = []

    prefix = os.path.basename(args.taskconfig)
    if prefix.startswith("test"):
        # loop all checkpoint for datasets without validation set.
        if "best" not in config.fairseq.common_eval.path:
            print("eval each epoch.")
            for checkpoint in glob.glob(checkpoint_search_path + "/checkpoint*"):
                model = mmtask.load_checkpoint(checkpoint)
                ckpt = os.path.basename(checkpoint)
                evaluator = Evaluator(config)
                output = evaluator.evaluate(
                    model, test_dataloader, ckpt + "_merged")
                results.append((checkpoint, output))
        # use the one specified by the config lastly.
        model = mmtask.load_checkpoint(config.fairseq.common_eval.path)
        evaluator = Evaluator(config)
        output = evaluator.evaluate(model, test_dataloader)
        results.append((config.fairseq.common_eval.path, output))

        best_result = None
        best_metric = 0.
        for checkpoint, result in results:
            print(checkpoint)
            evaluator.metric.print_computed_metrics(result)
            best_score = evaluator.metric.best_metric(result)
            # Always accept the first result so that `best_result` is set
            # even when no score exceeds the initial 0; afterwards only a
            # strictly better score replaces it (same winner as before
            # whenever any score is positive).
            if best_result is None or best_score > best_metric:
                best_result = (checkpoint, result)
                best_metric = best_score
        print("best results:")
        print(best_result[0])
        evaluator.metric.print_computed_metrics(best_result[1])

    elif prefix.startswith("vis"):
        model = mmtask.load_checkpoint(config.fairseq.common_eval.path)
        predictor_cls = getattr(predictor_path, config.predictor)
        predictor = predictor_cls(config)
        predictor.predict_loop(model, test_dataloader, mmtask, None)
    else:
        raise ValueError("unknown prefix of the config file", args.taskconfig)
### Data Sharding
Pretraining on Howto100M is heavy on IO since we have millions of videos or captions on the hard disk that cannot fit into memory.
It is desirable to have an optimized preprocessing step before the actual dataloading.

We support data sharding to pack multiple videos into shards of training data for both videos and captions (see [dataset](DATASET.md) for preprocessing).
These shards will be mapped into memory to reduce the frequency of IO access on millions of files (see the processors starting with `Sharded*`).
This will be the default config for a how2 dataset `projects/task/how2.yaml`.

Great thanks to Dmytro Okhonko for sharing the code from the MARGE project.

### Training
Pretraining on Howto100M is expected to run on one or multiple nodes, where each node has 8 GPUs with 32 GB of memory each.
Launching a pretraining run on MFM+MLM can be done via:
```python locallaunch.py projects/mfmmlm/how2.yaml```

### Pre-training with a Retrieval Model (VideoCLIP)
This project now supports alternating between running a retrieval model and pre-training.
We implement a basic retrieval model that is built on the hidden states of a video and faiss.

You may need to install faiss via `conda install faiss-cpu -c pytorch`.

Right now, the hidden states of a video are computed as the average of 8 clips of their pooled visual/text hidden states.
See `mmpt/tasks/retritask.py` for more details.
The `.yaml` config for running pre-training with a retrieval model can be found at `projects/retri/videoretri.yaml`.
================================================ FILE: examples/MMPT/projects/mfmmlm.yaml ================================================ project_dir: mfmmlm run_task: - how2.yaml - [vtt.yaml, vttcap.yaml, vttqa.yaml, youcook.yaml, youcookcap.yaml, crosstask.yaml, coin.yaml] base_dir: task task_group: pretrain: task_list: - how2.yaml dataset: subsampling: 32 sampled_min_len: 10 sampled_max_len: 64 max_video_len: 32 max_len: 96 aligner: MFMMLMAligner lazy_vfeat_mask: True mfm_probability: 0.15 mlm_probability: 0.15 mm_prob: 0.5 model: model_cls: MMFusionMFMMLM mm_encoder_cls: MMFusionForMFMMLM loss: loss_cls: MFMMLM fairseq: common: fp16: true dataset: batch_size: 256 optimization: max_epoch: 15 finetune: task_list: - vtt.yaml - vttqa.yaml - youcook.yaml - youcookcap.yaml - crosstask.yaml - coin.yaml dataset: max_video_len: 32 max_len: 96 fairseq: common: fp16: true # do not write any model or loss here (they are expected to be fixed in mmfusion). test: task_list: - test_vtt.yaml - test_vttqa.yaml - test_youcook.yaml - test_youcookcap.yaml - test_crosstask.yaml - test_crosstask_zs.yaml - test_coin.yaml dataset: max_video_len: 32 max_len: 96 ================================================ FILE: examples/MMPT/projects/mtm/mmfusionmtm.yaml ================================================ includes: projects/mfmmlm.yaml project_dir: mtm/mmfusionmtm task_group: pretrain: task: VLMTask # reproducible dataset: aligner: MFMMLMAligner model: use_seg_emb: True # reproducible model_cls: MMFusionMTM mm_encoder_cls: MMBertForMFMMLM loss: loss_cls: MTM finetune: model: use_seg_emb: True # reproducible test: model: use_seg_emb: True # reproducible ================================================ FILE: examples/MMPT/projects/mtm/vlm/coin.yaml ================================================ dataset: video_processor: VideoProcessor bert_name: bert-base-uncased meta_processor: COINActionSegmentationMetaProcessor train_path: data/coin/COIN.json val_path: data/coin/COIN.json 
vfeat_dir: data/feat/feat_coin_s3d text_processor: COINActionSegmentationTextProcessor aligner: COINActionSegmentationAligner num_iso_layer: 12 sliding_window: 8 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 1 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 8 checkpoint: restore_file: runs/mtm/vlm/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/mtm/vlm/coin task_type: sweep_big model: model_cls: MMFusionActionSegmentation mm_encoder_cls: MMBertForTokenClassification use_seg_emb: true loss: loss_cls: CrossEntropy ================================================ FILE: examples/MMPT/projects/mtm/vlm/crosstask.yaml ================================================ dataset: video_processor: CrossTaskVideoProcessor bert_name: bert-base-uncased meta_processor: CrossTaskMetaProcessor train_path: data/crosstask/crosstask_release/videos.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 text_processor: CrossTaskTextProcessor aligner: CrossTaskAligner num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 1 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 
warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 5 checkpoint: restore_file: runs/mtm/vlm/checkpoint11.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/mtm/vlm/crosstask task_type: sweep_small model: model_cls: MMFusionActionLocalization mm_encoder_cls: MMBertForJoint use_seg_emb: true loss: loss_cls: BCE ================================================ FILE: examples/MMPT/projects/mtm/vlm/how2.yaml ================================================ dataset: video_processor: ShardedVideoProcessor bert_name: bert-base-uncased meta_processor: ShardedHow2MetaProcessor train_path: data/how2/how2_s3d_train.lst val_path: data/how2/how2_s3d_val.lst vfeat_dir: data/feat/feat_how2_s3d_shard_small text_processor: ShardedTextProcessor tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased. aligner: MFMMLMAligner subsampling: 32 sampled_min_len: 8 sampled_max_len: 64 max_video_len: 32 max_len: 96 lazy_vfeat_mask: true mfm_probability: 0.15 mlm_probability: 0.15 mm_prob: 0.5 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 256 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 1000 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 15 checkpoint: save_dir: runs/mtm/vlm save_interval_updates: 1024 keep_interval_updates: 2 keep_last_epochs: 30 task_type: sweep_big slurm_config: big eval: save_path: runs/mtm/vlm model: model_cls: MMFusionMTM mm_encoder_cls: MMBertForMFMMLM use_seg_emb: true loss: loss_cls: MTM task: VLMTask ================================================ FILE: examples/MMPT/projects/mtm/vlm/test_coin.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: COINActionSegmentationAligner bert_name: bert-base-uncased test_path: 
data/coin/COIN.json meta_processor: COINActionSegmentationMetaProcessor vfeat_dir: data/feat/feat_coin_s3d text_processor: COINActionSegmentationTextProcessor num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/coin/checkpoint_best.pt model: model_cls: MMFusionActionSegmentation mm_encoder_cls: MMBertForTokenClassification use_seg_emb: true eval: save_path: runs/mtm/vlm/coin/eval metric: COINActionSegmentationMetric predictor: COINPredictor ================================================ FILE: examples/MMPT/projects/mtm/vlm/test_crosstask.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: CrossTaskVideoProcessor aligner: CrossTaskAligner bert_name: bert-base-uncased meta_processor: CrossTaskMetaProcessor test_path: data/crosstask/crosstask_release/videos_val.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 text_processor: CrossTaskTextProcessor num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/crosstask/checkpoint_best.pt model: model_cls: MMFusionActionLocalization mm_encoder_cls: MMBertForJoint use_seg_emb: true eval: save_path: runs/mtm/vlm/crosstask/eval metric: CrossTaskMetric predictor: CrossTaskPredictor ================================================ FILE: examples/MMPT/projects/mtm/vlm/test_crosstask_zs.yaml 
================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: CrossTaskVideoProcessor aligner: CrossTaskAligner bert_name: bert-base-uncased meta_processor: CrossTaskMetaProcessor test_path: data/crosstask/crosstask_release/videos_val.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 text_processor: CrossTaskTextProcessor num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/checkpoint_best.pt model: model_cls: MMFusionActionLocalization mm_encoder_cls: MMBertForJoint use_seg_emb: true eval: save_path: runs/mtm/vlm/crosstask_zs/eval metric: CrossTaskMetric predictor: CrossTaskPredictor ================================================ FILE: examples/MMPT/projects/mtm/vlm/test_vtt.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: DSAligner bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/vtt/checkpoint_last.pt model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint use_seg_emb: true eval: save_path: runs/mtm/vlm/vtt/eval metric: RetrievalMetric predictor: RetrievalPredictor 
================================================ FILE: examples/MMPT/projects/mtm/vlm/test_vttqa.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: MSRVTTQAAligner bert_name: bert-base-uncased meta_processor: MSRVTTQAMetaProcessor test_path: data/msrvtt-qa/MSR_MC_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTQATextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/vttqa/checkpoint_last.pt model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint use_seg_emb: true eval: save_path: runs/mtm/vlm/vttqa/eval metric: QAMetric predictor: QAPredictor ================================================ FILE: examples/MMPT/projects/mtm/vlm/test_youcook.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: YoucookVideoProcessor aligner: DSAligner bert_name: bert-base-uncased meta_processor: YoucookMetaProcessor test_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: true vfeat_dir: data/feat/feat_youcook_s3d text_processor: TextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/youcook/checkpoint_last.pt model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint use_seg_emb: true eval: save_path: runs/mtm/vlm/youcook/eval metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/mtm/vlm/test_youcookcap.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: YoucookVideoProcessor aligner: DSNLGAligner bert_name: bert-base-uncased 
meta_processor: YoucookNLGMetaProcessor test_path: data/youcook/val_list.txt trainval_annotation: data/youcook/youcookii_annotations_trainval.json vfeat_dir: data/feat/feat_youcook_s3d text_processor: NLGTextProcessor max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/mtm/vlm/youcookcap/checkpoint_best.pt model: model_cls: MMFusionNLG mm_encoder_cls: MMBertForNLG max_decode_length: 24 use_seg_emb: true eval: save_path: runs/mtm/vlm/youcookcap/eval metric: NLGMetric predictor: NLGPredictor gen_param: num_beams: 5 ================================================ FILE: examples/MMPT/projects/mtm/vlm/vtt.yaml ================================================ dataset: video_processor: VideoProcessor bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor train_path: data/msrvtt/MSRVTT_train.csv jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv full_test_path: data/msrvtt/MSRVTT_FULL_test.csv dup: 20 val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor json_path: data/msrvtt/MSRVTT_data.json aligner: DSAligner num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 256 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 10 checkpoint: restore_file: runs/mtm/vlm/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/mtm/vlm/vtt task_type: sweep_small model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint use_seg_emb: true loss: loss_cls: T2VContraLoss ================================================ FILE: examples/MMPT/projects/mtm/vlm/vttqa.yaml ================================================ dataset: video_processor: 
VideoProcessor bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor train_path: data/msrvtt/MSRVTT_train.csv dup: 20 val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor json_path: data/msrvtt/MSRVTT_data.json aligner: DSAligner num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 128 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 5 checkpoint: restore_file: runs/mtm/vlm/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/mtm/vlm/vttqa task_type: sweep_small model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint use_seg_emb: true loss: loss_cls: V2TContraLoss ================================================ FILE: examples/MMPT/projects/mtm/vlm/youcook.yaml ================================================ dataset: video_processor: YoucookVideoProcessor bert_name: bert-base-uncased meta_processor: YoucookMetaProcessor train_path: data/youcook/youcook_train.pkl val_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: true vfeat_dir: data/feat/feat_youcook_s3d text_processor: TextProcessor aligner: DSAligner num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 128 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 10 checkpoint: restore_file: runs/mtm/vlm/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: 
runs/mtm/vlm/youcook task_type: sweep_small model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint use_seg_emb: true loss: loss_cls: T2VContraLoss ================================================ FILE: examples/MMPT/projects/mtm/vlm/youcookcap.yaml ================================================ dataset: video_processor: YoucookVideoProcessor bert_name: bert-base-uncased meta_processor: YoucookNLGMetaProcessor train_path: data/youcook/train_list.txt val_path: data/youcook/val_list.txt trainval_annotation: data/youcook/youcookii_annotations_trainval.json vfeat_dir: data/feat/feat_youcook_s3d text_processor: NLGTextProcessor aligner: DSNLGAligner max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 128 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 10 checkpoint: restore_file: runs/mtm/vlm/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/mtm/vlm/youcookcap task_type: sweep_small model: model_cls: MMFusionNLG mm_encoder_cls: MMBertForNLG use_seg_emb: true loss: loss_cls: NLGLoss ================================================ FILE: examples/MMPT/projects/mtm/vlm.yaml ================================================ includes: projects/mtm/mmfusionmtm.yaml project_dir: mtm/vlm task_group: pretrain: dataset: sampled_min_len: 8 loss: loss_cls: MTM ================================================ FILE: examples/MMPT/projects/retri/videoclip/coin_videoclip.yaml ================================================ dataset: video_processor: VideoProcessor bert_name: bert-base-uncased meta_processor: COINActionSegmentationMetaProcessor train_path: data/coin/COIN.json val_path: data/coin/COIN.json vfeat_dir: data/feat/feat_coin_s3d text_processor: 
COINActionSegmentationTextProcessor aligner: COINActionSegmentationAligner num_iso_layer: 12 sliding_window: 8 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 1 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 8 checkpoint: restore_file: runs/retri/videoclip/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/retri/videoclip/coin task_type: sweep_big model: model_cls: MMFusionSeparateActionSegmentation mm_encoder_cls: null video_encoder_cls: MMBertForTokenClassification text_encoder_cls: BertModel num_hidden_video_layers: 6 loss: loss_cls: CrossEntropy ================================================ FILE: examples/MMPT/projects/retri/videoclip/crosstask_videoclip.yaml ================================================ dataset: video_processor: CrossTaskVideoProcessor bert_name: bert-base-uncased meta_processor: CrossTaskMetaProcessor train_path: data/crosstask/crosstask_release/videos.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 text_processor: CrossTaskTextProcessor aligner: CrossTaskAligner num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 1 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) 
lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 5 checkpoint: restore_file: runs/retri/videoclip/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/retri/videoclip/crosstask task_type: sweep_small model: model_cls: MMFusionSeparateActionLocalization mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 loss: loss_cls: BCE ================================================ FILE: examples/MMPT/projects/retri/videoclip/how2.yaml ================================================ dataset: video_processor: ShardedVideoRetriVideoProcessor bert_name: bert-base-uncased meta_processor: ShardedHow2VideoRetriMetaProcessor train_path: data/how2/how2_s3d_train.lst val_path: data/how2/how2_s3d_val.lst vfeat_dir: data/feat/feat_how2_s3d_shard_small text_processor: ShardedVideoRetriTextProcessor tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased. 
aligner: VideoRetriOverlappedAligner subsampling: 1 sampled_min_len: 8 sampled_max_len: 64 max_video_len: 32 max_len: 96 lazy_vfeat_mask: true mfm_probability: 0.15 mlm_probability: 0.15 mm_prob: 0.5 sampled_video_min_len: 3 sampled_video_max_len: 32 num_video_per_batch: 32 clip_per_video: 16 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 1 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 1000 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 25 checkpoint: save_dir: runs/retri/videoclip save_interval_updates: 1024 keep_interval_updates: 2 keep_last_epochs: 30 task_type: sweep_big slurm_config: big eval: save_path: runs/retri/videoclip model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 loss: loss_cls: MMContraLoss task: VideoRetriTask retri_epoch: 1 vectorpool_cls: VideoVectorPool retriever_cls: VectorRetriever num_cands: 64 ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_coin_videoclip.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: COINActionSegmentationAligner bert_name: bert-base-uncased test_path: data/coin/COIN.json meta_processor: COINActionSegmentationMetaProcessor vfeat_dir: data/feat/feat_coin_s3d text_processor: COINActionSegmentationTextProcessor num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/coin/checkpoint_best.pt model: model_cls: MMFusionSeparateActionSegmentation mm_encoder_cls: null video_encoder_cls: MMBertForTokenClassification text_encoder_cls: BertModel num_hidden_video_layers: 6 
eval: save_path: runs/retri/videoclip/coin/eval metric: COINActionSegmentationMetric predictor: COINPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_coin_zs.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: COINActionSegmentationAligner bert_name: bert-base-uncased test_path: data/coin/COIN.json meta_processor: COINActionSegmentationMetaProcessor vfeat_dir: data/feat/feat_coin_s3d text_processor: COINActionSegmentationTextProcessor num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/checkpoint_best.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/coin_zs/eval metric: COINActionSegmentationMetric predictor: COINZSPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_crosstask_videoclip.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: CrossTaskVideoProcessor aligner: CrossTaskAligner bert_name: bert-base-uncased meta_processor: CrossTaskMetaProcessor test_path: data/crosstask/crosstask_release/videos_val.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 text_processor: CrossTaskTextProcessor num_iso_layer: 12 
sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/crosstask/checkpoint_best.pt model: model_cls: MMFusionSeparateActionLocalization mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/crosstask/eval metric: CrossTaskMetric predictor: CrossTaskPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_crosstask_zs_videoclip.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: CrossTaskVideoProcessor aligner: CrossTaskAligner bert_name: bert-base-uncased meta_processor: CrossTaskMetaProcessor test_path: data/crosstask/crosstask_release/videos_val.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 text_processor: CrossTaskTextProcessor num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 1 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/checkpoint_best.pt model: model_cls: MMFusionSeparateActionLocalization mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/crosstask_zs/eval metric: CrossTaskMetric predictor: CrossTaskPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_didemo_zs.yaml 
================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: DiDeMoAligner bert_name: bert-base-uncased meta_processor: DiDeMoMetaProcessor test_path: data/didemo/test_data.json vfeat_dir: data/feat/feat_didemo_s3d text_processor: DiDeMoTextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/checkpoint_best.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/didemo_zs/eval metric: DiDeMoMetric predictor: DiDeMoPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_vtt_videoclip.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: DSAligner bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/vtt/checkpoint_last.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/vtt/eval metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_vtt_zs.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: DSAligner bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor 
test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/checkpoint_best.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/vtt_zs/eval metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_vttqa_videoclip.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: MSRVTTQAAligner bert_name: bert-base-uncased meta_processor: MSRVTTQAMetaProcessor test_path: data/msrvtt-qa/MSR_MC_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTQATextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/vttqa/checkpoint_last.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/vttqa/eval metric: QAMetric predictor: QAPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_vttqa_zs.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: MSRVTTQAAligner bert_name: bert-base-uncased meta_processor: MSRVTTQAMetaProcessor test_path: data/msrvtt-qa/MSR_MC_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTQATextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test 
num_workers: 2 common_eval: path: runs/retri/videoclip/checkpoint_best.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/vttqa_zs/eval metric: QAMetric predictor: QAPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_youcook_videoclip.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: YoucookVideoProcessor aligner: DSAligner bert_name: bert-base-uncased meta_processor: YoucookMetaProcessor test_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: true vfeat_dir: data/feat/feat_youcook_s3d text_processor: TextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: path: runs/retri/videoclip/youcook/checkpoint_last.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/youcook/eval metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/test_youcook_zs.yaml ================================================ slurm_config: big task_type: local_predict dataset: split: test video_processor: YoucookVideoProcessor aligner: DSAligner bert_name: bert-base-uncased meta_processor: YoucookMetaProcessor test_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: true vfeat_dir: data/feat/feat_youcook_s3d text_processor: TextProcessor num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 common_eval: 
path: runs/retri/videoclip/checkpoint_best.pt model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/retri/videoclip/youcook_zs/eval metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/retri/videoclip/vtt_videoclip.yaml ================================================ dataset: video_processor: VideoProcessor bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor train_path: data/msrvtt/MSRVTT_train.csv jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv full_test_path: data/msrvtt/MSRVTT_FULL_test.csv dup: 20 val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor json_path: data/msrvtt/MSRVTT_data.json aligner: DSAligner num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 224 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 10 checkpoint: restore_file: runs/retri/videoclip/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/retri/videoclip/vtt task_type: sweep_small model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 loss: loss_cls: T2VContraLoss ================================================ FILE: examples/MMPT/projects/retri/videoclip/vttqa_videoclip.yaml ================================================ dataset: video_processor: VideoProcessor bert_name: bert-base-uncased meta_processor: MSRVTTMetaProcessor train_path: data/msrvtt/MSRVTT_train.csv dup: 20 val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv 
vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor json_path: data/msrvtt/MSRVTT_data.json aligner: DSAligner num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 128 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 5 checkpoint: restore_file: runs/retri/videoclip/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/retri/videoclip/vttqa task_type: sweep_small model: model_cls: MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 loss: loss_cls: V2TContraLoss ================================================ FILE: examples/MMPT/projects/retri/videoclip/youcook_videoclip.yaml ================================================ dataset: video_processor: YoucookVideoProcessor bert_name: bert-base-uncased meta_processor: YoucookMetaProcessor train_path: data/youcook/youcook_train.pkl val_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: true vfeat_dir: data/feat/feat_youcook_s3d text_processor: TextProcessor aligner: DSAligner num_iso_layer: 12 max_video_len: 32 max_len: 96 fairseq: common: tensorboard_logdir: run log_interval: 1000 fp16: true dataset: num_workers: 4 batch_size: 128 optimization: lr: - 5.0e-05 clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 warmup_updates: 122 weight_decay: 0.0 ddp_backend: no_c10d max_epoch: 10 checkpoint: restore_file: runs/retri/videoclip/checkpoint_best.pt reset_optimizer: true reset_dataloader: true reset_meters: true save_dir: runs/retri/videoclip/youcook task_type: sweep_small model: model_cls: 
MMFusionSeparate mm_encoder_cls: null video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 loss: loss_cls: T2VContraLoss ================================================ FILE: examples/MMPT/projects/retri/videoclip.yaml ================================================ includes: projects/retri/videoretri.yaml project_dir: retri/videoclip task_group: pretrain: model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/retri/videoretri.yaml ================================================ includes: projects/mfmmlm.yaml project_dir: retri/videoretri run_task: - how2.yaml task_group: pretrain: task: VideoRetriTask retri_epoch: 1 vectorpool_cls: VideoVectorPool retriever_cls: VectorRetriever num_cands: 64 dataset: train_path: data/how2/how2_s3d_train.lst meta_processor: ShardedHow2VideoRetriMetaProcessor video_processor: ShardedVideoRetriVideoProcessor text_processor: ShardedVideoRetriTextProcessor aligner: VideoRetriOverlappedAligner sampled_video_min_len: 3 sampled_video_max_len: 32 sampled_min_len: 8 sampled_max_len: 64 num_video_per_batch: 32 # do not use subsampling as it changes fairseq batch_size. 
subsampling: 1 # disable subsampling clip_per_video: 16 fairseq: dataset: batch_size: 1 optimization: max_epoch: 25 model: model_cls: MMFusionShare mm_encoder_cls: MMBertForEncoder loss: loss_cls: MMContraLoss finetune: task_list: [vtt_videoclip.yaml, youcook_videoclip.yaml, vttqa_videoclip.yaml, crosstask_videoclip.yaml, coin_videoclip.yaml] test: task_list: - test_youcook_zs.yaml - test_vtt_zs.yaml - test_vttqa_zs.yaml - test_crosstask_zs_videoclip.yaml - test_coin_zs.yaml - test_didemo_zs.yaml - test_youcook_videoclip.yaml - test_vtt_videoclip.yaml - test_vttqa_videoclip.yaml - test_crosstask_videoclip.yaml - test_coin_videoclip.yaml ================================================ FILE: examples/MMPT/projects/task/coin.yaml ================================================ includes: projects/task/ft.yaml task_type: sweep_big dataset: meta_processor: COINActionSegmentationMetaProcessor train_path: data/coin/COIN.json val_path: data/coin/COIN.json vfeat_dir: data/feat/feat_coin_s3d video_processor: VideoProcessor text_processor: COINActionSegmentationTextProcessor aligner: COINActionSegmentationAligner num_iso_layer: 12 sliding_window: 8 sliding_window_size: 32 model: model_cls: MMFusionActionSegmentation mm_encoder_cls: MMBertForTokenClassification loss: loss_cls: CrossEntropy fairseq: dataset: batch_size: 1 optimization: max_epoch: 8 checkpoint: save_dir: runs/task/coin ================================================ FILE: examples/MMPT/projects/task/coin_videoclip.yaml ================================================ includes: projects/task/coin.yaml model: model_cls: MMFusionSeparateActionSegmentation mm_encoder_cls: video_encoder_cls: MMBertForTokenClassification text_encoder_cls: BertModel # dummy, not used. 
num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/crosstask.yaml ================================================ includes: projects/task/ft.yaml dataset: meta_processor: CrossTaskMetaProcessor train_path: data/crosstask/crosstask_release/videos.csv # dummy train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv # dummy val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 video_processor: CrossTaskVideoProcessor text_processor: CrossTaskTextProcessor aligner: CrossTaskAligner num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 model: model_cls: MMFusionActionLocalization mm_encoder_cls: MMBertForJoint loss: loss_cls: BCE fairseq: dataset: batch_size: 1 optimization: max_epoch: 5 checkpoint: save_dir: runs/task/crosstask restore_file: runs/task/checkpoint11.pt # for VLM ================================================ FILE: examples/MMPT/projects/task/crosstask_videoclip.yaml ================================================ includes: projects/task/crosstask.yaml model: model_cls: MMFusionSeparateActionLocalization mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel # dummy, not used. num_hidden_video_layers: 6 fairseq: checkpoint: restore_file: runs/task/checkpoint_best.pt # overwrite the default of VLM. ================================================ FILE: examples/MMPT/projects/task/default.yaml ================================================ # this yaml cannot be run alone. you must use `how2.yaml`, `vtt.yaml` etc for training. 
dataset: video_processor: VideoProcessor bert_name: bert-base-uncased fairseq: common: tensorboard_logdir: run log_interval: 1000 dataset: num_workers: 4 optimization: lr: [ 0.00005 ] clip_norm: 2.0 optimizer: adam adam_betas: (0.9, 0.98) lr_scheduler: polynomial_decay total_num_update: 1000000 # backward compatible on fairseq 1.0.0a0+af0389f for reproducibility. warmup_updates: 1000 weight_decay: 0.0 ddp_backend: no_c10d ================================================ FILE: examples/MMPT/projects/task/ft.yaml ================================================ includes: projects/task/default.yaml # all derived config will be run by fairseq-train. task_type: sweep_small fairseq: optimization: warmup_updates: 122 # copied from roberta glue: https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.glue.md checkpoint: # save_interval_updates: 512 # borrowed from Roberta script. restore_file: runs/task/checkpoint_best.pt reset_optimizer: True reset_dataloader: True reset_meters: True ================================================ FILE: examples/MMPT/projects/task/how2.yaml ================================================ includes: projects/task/default.yaml task_type: sweep_big slurm_config: big dataset: meta_processor: ShardedHow2MetaProcessor train_path: data/how2/how2_s3d_train.lst val_path: data/how2/how2_s3d_val.lst video_processor: ShardedVideoProcessor vfeat_dir: data/feat/feat_how2_s3d_shard_small text_processor: ShardedTextProcessor tfeat_dir: data/feat/feat_how2_s3d_shard_small/raw_caption_dedup.bert-base-uncased. 
aligner: FixedLenAligner # disable direct running of this yaml eval: save_path: runs/task fairseq: checkpoint: save_dir: runs/task save_interval_updates: 1024 keep_interval_updates: 2 keep_last_epochs: 30 ================================================ FILE: examples/MMPT/projects/task/test.yaml ================================================ # this yaml cannot be run alone: implement a test_${dataset}.yaml slurm_config: big task_type: local_predict dataset: split: test video_processor: VideoProcessor aligner: DSAligner bert_name: bert-base-uncased fairseq: dataset: batch_size: 256 valid_subset: test num_workers: 2 ================================================ FILE: examples/MMPT/projects/task/test_coin.yaml ================================================ includes: projects/task/test.yaml dataset: split: test test_path: data/coin/COIN.json meta_processor: COINActionSegmentationMetaProcessor vfeat_dir: data/feat/feat_coin_s3d video_processor: VideoProcessor text_processor: COINActionSegmentationTextProcessor aligner: COINActionSegmentationAligner num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 model: model_cls: MMFusionActionSegmentation mm_encoder_cls: MMBertForTokenClassification eval: save_path: runs/task/coin/eval fairseq: dataset: batch_size: 1 common_eval: path: runs/task/coin/checkpoint_best.pt metric: COINActionSegmentationMetric predictor: COINPredictor ================================================ FILE: examples/MMPT/projects/task/test_coin_videoclip.yaml ================================================ includes: projects/task/test_coin.yaml model: model_cls: MMFusionSeparateActionSegmentation mm_encoder_cls: video_encoder_cls: MMBertForTokenClassification text_encoder_cls: BertModel # dummy, not used. 
num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/test_coin_zs.yaml ================================================ includes: projects/task/test_coin.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/task/coin_zs/eval fairseq: common_eval: path: runs/task/checkpoint_best.pt predictor: COINZSPredictor ================================================ FILE: examples/MMPT/projects/task/test_crosstask.yaml ================================================ includes: projects/task/test.yaml dataset: split: test meta_processor: CrossTaskMetaProcessor test_path: data/crosstask/crosstask_release/videos_val.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv # dummy val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 video_processor: CrossTaskVideoProcessor text_processor: CrossTaskTextProcessor aligner: CrossTaskAligner num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 model: model_cls: MMFusionActionLocalization mm_encoder_cls: MMBertForJoint eval: save_path: runs/task/crosstask/eval fairseq: # read code and find what is the checkpoint arg. 
dataset: batch_size: 1 common_eval: path: runs/task/crosstask/checkpoint_best.pt metric: CrossTaskMetric predictor: CrossTaskPredictor ================================================ FILE: examples/MMPT/projects/task/test_crosstask_videoclip.yaml ================================================ includes: projects/task/test_crosstask.yaml model: model_cls: MMFusionSeparateActionLocalization mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel # dummy, not used. num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/test_crosstask_zs.yaml ================================================ includes: projects/task/test.yaml dataset: split: test meta_processor: CrossTaskMetaProcessor test_path: data/crosstask/crosstask_release/videos_val.csv train_csv_path: data/crosstask/crosstask_release/videos.csv val_path: data/crosstask/crosstask_release/videos_val.csv # dummy val_csv_path: data/crosstask/crosstask_release/videos_val.csv primary_path: data/crosstask/crosstask_release/tasks_primary.txt related_path: data/crosstask/crosstask_release/tasks_related.txt vfeat_dir: data/feat/feat_crosstask_s3d annotation_path: data/crosstask/crosstask_release/annotations n_train: 30 video_processor: CrossTaskVideoProcessor text_processor: CrossTaskTextProcessor aligner: CrossTaskAligner num_iso_layer: 12 sliding_window: 16 sliding_window_size: 32 model: model_cls: MMFusionActionLocalization mm_encoder_cls: MMBertForJoint eval: save_path: runs/task/crosstask_zs/eval fairseq: # read code and find what is the checkpoint arg. 
dataset: batch_size: 1 common_eval: path: runs/task/checkpoint_best.pt # load the best from how2 on ACL submission: runs/task/checkpoint11.pt metric: CrossTaskMetric predictor: CrossTaskPredictor ================================================ FILE: examples/MMPT/projects/task/test_crosstask_zs_videoclip.yaml ================================================ includes: projects/task/test_crosstask_zs.yaml model: model_cls: MMFusionSeparateActionLocalization mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel # dummy, not used. num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/test_didemo_zs.yaml ================================================ includes: projects/task/test.yaml dataset: meta_processor: DiDeMoMetaProcessor test_path: data/didemo/test_data.json video_processor: VideoProcessor vfeat_dir: data/feat/feat_didemo_s3d text_processor: DiDeMoTextProcessor aligner: DiDeMoAligner num_iso_layer: 12 model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/task/didemo_zs/eval fairseq: # read code and find what is the checkpoint arg. common_eval: path: runs/task/checkpoint_best.pt metric: DiDeMoMetric predictor: DiDeMoPredictor ================================================ FILE: examples/MMPT/projects/task/test_vtt.yaml ================================================ includes: projects/task/test.yaml dataset: meta_processor: MSRVTTMetaProcessor test_path: data/msrvtt/MSRVTT_JSFUSION_test.csv video_processor: VideoProcessor vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor num_iso_layer: 12 model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint eval: save_path: runs/task/vtt/eval fairseq: # read code and find what is the checkpoint arg. 
common_eval: path: runs/task/vtt/checkpoint_last.pt metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/task/test_vtt_videoclip.yaml ================================================ includes: projects/task/test_vtt.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/test_vtt_zs.yaml ================================================ includes: projects/task/test_vtt.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/task/vtt_zs/eval fairseq: # read code and find what is the checkpoint arg. common_eval: path: runs/task/checkpoint_best.pt ================================================ FILE: examples/MMPT/projects/task/test_vttqa.yaml ================================================ includes: projects/task/test.yaml dataset: meta_processor: MSRVTTQAMetaProcessor test_path: data/msrvtt-qa/MSR_MC_test.csv video_processor: VideoProcessor vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTQATextProcessor aligner: MSRVTTQAAligner num_iso_layer: 12 model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint eval: save_path: runs/task/vttqa/eval fairseq: # read code and find what is the checkpoint arg. 
common_eval: path: runs/task/vttqa/checkpoint_last.pt metric: QAMetric predictor: QAPredictor ================================================ FILE: examples/MMPT/projects/task/test_vttqa_videoclip.yaml ================================================ includes: projects/task/test_vttqa.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/test_vttqa_zs.yaml ================================================ includes: projects/task/test_vttqa.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/task/vttqa_zs/eval fairseq: # read code and find what is the checkpoint arg. common_eval: path: runs/task/checkpoint_best.pt ================================================ FILE: examples/MMPT/projects/task/test_youcook.yaml ================================================ includes: projects/task/test.yaml dataset: meta_processor: YoucookMetaProcessor test_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: True video_processor: YoucookVideoProcessor vfeat_dir: data/feat/feat_youcook_s3d # /checkpoint/huxu/feat/youcook_vmz # /checkpoint/prarora/berniehuang/feat_youcook_vmz text_processor: TextProcessor aligner: DSAligner num_iso_layer: 12 model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint eval: save_path: runs/task/youcook/eval fairseq: # read code and find what is the checkpoint arg. 
common_eval: path: runs/task/youcook/checkpoint_last.pt metric: RetrievalMetric predictor: RetrievalPredictor ================================================ FILE: examples/MMPT/projects/task/test_youcook_videoclip.yaml ================================================ includes: projects/task/test_youcook.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 ================================================ FILE: examples/MMPT/projects/task/test_youcook_zs.yaml ================================================ includes: projects/task/test_youcook.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 eval: save_path: runs/task/youcook_zs/eval fairseq: # read code and find what is the checkpoint arg. common_eval: path: runs/task/checkpoint_best.pt ================================================ FILE: examples/MMPT/projects/task/test_youcookcap.yaml ================================================ includes: projects/task/test.yaml dataset: meta_processor: YoucookNLGMetaProcessor test_path: data/youcook/val_list.txt trainval_annotation: data/youcook/youcookii_annotations_trainval.json video_processor: YoucookVideoProcessor vfeat_dir: data/feat/feat_youcook_s3d text_processor: NLGTextProcessor aligner: DSNLGAligner model: model_cls: MMFusionNLG mm_encoder_cls: MMBertForNLG max_decode_length: 24 eval: save_path: runs/task/youcookcap/eval fairseq: # read code and find what is the checkpoint arg. 
common_eval: path: runs/task/youcookcap/checkpoint_best.pt metric: NLGMetric predictor: NLGPredictor gen_param: num_beams: 5 ================================================ FILE: examples/MMPT/projects/task/vtt.yaml ================================================ includes: projects/task/ft.yaml dataset: meta_processor: MSRVTTMetaProcessor train_path: data/msrvtt/MSRVTT_train.csv jsfusion_path: data/msrvtt/MSRVTT_JSFUSION_test.csv full_test_path: data/msrvtt/MSRVTT_FULL_test.csv dup: 20 val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor json_path: data/msrvtt/MSRVTT_data.json aligner: DSAligner num_iso_layer: 12 model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint loss: loss_cls: T2VContraLoss fairseq: dataset: batch_size: 256 optimization: max_epoch: 10 checkpoint: save_dir: runs/task/vtt ================================================ FILE: examples/MMPT/projects/task/vtt_videoclip.yaml ================================================ includes: projects/task/vtt.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 fairseq: dataset: batch_size: 224 # model_cls: MMFusionShare # mm_encoder_cls: MMBertForEncoder ================================================ FILE: examples/MMPT/projects/task/vttqa.yaml ================================================ includes: projects/task/ft.yaml dataset: meta_processor: MSRVTTMetaProcessor train_path: data/msrvtt/MSRVTT_train.csv dup: 20 val_path: data/msrvtt/MSRVTT_JSFUSION_test.csv vfeat_dir: data/feat/feat_vtt_s3d text_processor: MSRVTTTextProcessor json_path: data/msrvtt/MSRVTT_data.json aligner: DSAligner num_iso_layer: 12 model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint loss: loss_cls: V2TContraLoss fairseq: dataset: batch_size: 128 optimization: max_epoch: 5 checkpoint: save_dir: runs/task/vttqa 
================================================ FILE: examples/MMPT/projects/task/vttqa_videoclip.yaml ================================================ includes: projects/task/vttqa.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 # model_cls: MMFusionShare # mm_encoder_cls: MMBertForEncoder ================================================ FILE: examples/MMPT/projects/task/youcook.yaml ================================================ includes: projects/task/ft.yaml dataset: meta_processor: YoucookMetaProcessor train_path: data/youcook/youcook_train.pkl val_path: data/youcook/youcook_val.pkl trainval_annotation: data/youcook/youcookii_annotations_trainval.json use_annotation_text: True video_processor: YoucookVideoProcessor vfeat_dir: data/feat/feat_youcook_s3d # /checkpoint/huxu/feat/youcook_vmz # /checkpoint/prarora/berniehuang/feat_youcook_vmz text_processor: TextProcessor aligner: DSAligner num_iso_layer: 12 model: model_cls: MMFusionJoint mm_encoder_cls: MMBertForJoint loss: loss_cls: T2VContraLoss fairseq: dataset: batch_size: 128 optimization: max_epoch: 10 checkpoint: save_dir: runs/task/youcook ================================================ FILE: examples/MMPT/projects/task/youcook_videoclip.yaml ================================================ includes: projects/task/youcook.yaml model: model_cls: MMFusionSeparate mm_encoder_cls: video_encoder_cls: MMBertForEncoder text_encoder_cls: BertModel num_hidden_video_layers: 6 # model_cls: MMFusionShare # mm_encoder_cls: MMBertForEncoder ================================================ FILE: examples/MMPT/projects/task/youcookcap.yaml ================================================ # finetuning for youcook captioning. 
includes: projects/task/ft.yaml dataset: meta_processor: YoucookNLGMetaProcessor train_path: data/youcook/train_list.txt val_path: data/youcook/val_list.txt trainval_annotation: data/youcook/youcookii_annotations_trainval.json video_processor: YoucookVideoProcessor vfeat_dir: data/feat/feat_youcook_s3d text_processor: NLGTextProcessor aligner: DSNLGAligner model: model_cls: MMFusionNLG mm_encoder_cls: MMBertForNLG loss: loss_cls: NLGLoss fairseq: dataset: batch_size: 128 optimization: max_epoch: 10 checkpoint: save_dir: runs/task/youcookcap ================================================ FILE: examples/MMPT/scripts/text_token_extractor/configs/bert-base-uncased.yaml ================================================ dataset: bert_name: bert-base-uncased caption_pkl_path: data/how2/raw_caption_dedup.pkl use_fast: true target_dir: data/feat/feat_how2_s3d_shard_small ================================================ FILE: examples/MMPT/scripts/text_token_extractor/pretokenization.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import pickle
import os
import argparse
import numpy as np

from torch.utils.data import Dataset, DataLoader
from mmpt.processors import PKLJSONStrTextProcessor
from mmpt.utils import ShardedTensor, recursive_config


class TokenizerDataset(Dataset):
    """Dataset that pairs each video id with its processed caption.

    The ids are the keys of ``PKLJSONStrTextProcessor(config).data``; indexing
    returns ``(video_id, text_processor(video_id))``.
    """

    def __init__(self, config):
        self.text_processor = PKLJSONStrTextProcessor(config)
        self.video_ids = list(self.text_processor.data.keys())

    def __getitem__(self, idx):
        video_id = self.video_ids[idx]
        return video_id, self.text_processor(video_id)

    def __len__(self):
        return len(self.video_ids)


def numpify(shard_idx, video_ids, captions, target_dir, split, prefix, max_cap_len=32):
    """Convert the captions of one shard to arrays and save them.

    Args:
        shard_idx: shard identifier, used in the output file name.
        video_ids: ids of the videos belonging to this shard.
        captions: mapping from video id to a dict with ``"start"``, ``"end"``
            and ``"cap"`` sequences that are iterated in lockstep.
        target_dir: directory the shard files are written to.
        split: split name, used in the output file name.
        prefix: file-name prefix placed before ``split``.
        max_cap_len: captions are truncated to this many ids and shorter ones
            are padded with ``-1``.

    Two files are written: ``<target_dir>/<prefix><split>_<shard_idx>.startends``
    and the same path with ``.caps_ids``.
    """
    startends = []
    caps_ids = []
    for video_id in video_ids:
        caption = captions[video_id]
        startend = []
        cap_ids = []
        for start, end, cap in zip(
                caption["start"], caption["end"], caption["cap"]):
            startend.append(np.array([start, end]).astype("float32"))
            # fixed-width row: -1 marks padding after the (truncated) caption.
            cap_id = np.full((max_cap_len,), -1, dtype=np.int32)
            cap = cap[:max_cap_len]
            cap_id[:len(cap)] = cap
            cap_ids.append(cap_id)
        # one stacked array per video.
        startends.append(np.stack(startend))
        caps_ids.append(np.stack(cap_ids))

    startends = ShardedTensor.from_list(startends)
    target_path = os.path.join(
        target_dir,
        prefix + split + "_" + str(shard_idx)
    )
    print("save to", target_path)
    startends.save(target_path + ".startends")
    caps_ids = ShardedTensor.from_list(caps_ids)
    caps_ids.save(target_path + ".caps_ids")


def sharding(config, out_file):
    """Write caption shards for the "train" and "val" splits.

    Loads the tokenized captions from ``out_file`` and, for each split, the
    ``<split>_meta.pkl`` file in ``config.target_dir``; every shard listed in
    that meta is passed to :func:`numpify`.
    """
    # NOTE: pickle.load can execute arbitrary code; only feed it files
    # produced by this pipeline.
    with open(out_file, "rb") as fr:
        captions = pickle.load(fr)
    target_dir = config.target_dir
    # "<caption pkl basename without extension>.<bert_name>."
    prefix = os.path.basename(
        os.path.splitext(config.caption_pkl_path)[0]
    ) + "." + config.bert_name + "."
    for split in ["train", "val"]:
        target_path = os.path.join(target_dir, split + "_meta")
        with open(target_path + ".pkl", "rb") as fr:
            meta = pickle.load(fr)
        print("load meta", target_path, len(meta))
        for shard_id in meta:
            numpify(
                shard_id, meta[shard_id], captions,
                target_dir, split, prefix
            )


def tokenize(config, out_file):
    """Run the text processor over every video and pickle the result.

    The output is a dict from video id to the processed caption, written to
    ``out_file``.
    """
    def collator(samples):
        # identity collate: keep the list of (video_id, caption) pairs as is.
        return samples
    dataset = TokenizerDataset(config)
    data = {}
    for idx, batch in enumerate(
            DataLoader(dataset, collate_fn=collator, num_workers=16)):
        for video_id, caption in batch:
            data[video_id] = caption
        # progress indicator.
        if idx % 5000 == 0:
            print(idx)
    with open(out_file, "wb") as fw:
        pickle.dump(data, fw, pickle.HIGHEST_PROTOCOL)


def main(args):
    """Tokenize captions (unless the output already exists), then shard them."""
    config = recursive_config(args.config).dataset
    out_file = os.path.splitext(config.caption_pkl_path)[0] \
        + "." + config.bert_name + ".pkl"
    # tokenization is skipped when a previous run already produced out_file.
    if not os.path.isfile(out_file):
        tokenize(config, out_file)
    sharding(config, out_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="pretokenize (raw_)caption.json into pkl.")
    parser.add_argument('config', type=str)
    args = parser.parse_args()
    main(args)
================================================
FILE: examples/MMPT/scripts/video_feature_extractor/extract.py
================================================
# Copyright Howto100M authors.
# Copyright (c) Facebook, Inc.
All Rights Reserved

import torch as th
import torch.nn.functional as F
import math
import numpy as np
import argparse

from torch.utils.data import DataLoader
from model import get_model
from preprocessing import Preprocessing
from random_sequence_shuffler import RandomSequenceSampler

from tqdm import tqdm
from pathbuilder import PathBuilder
from videoreader import VideoLoader


# Command-line script: decode every video found under --vdir, run the selected
# network over it in batches and save one feature array per video under --fdir.
parser = argparse.ArgumentParser(description='Easy video feature extractor')

parser.add_argument('--vdir', type=str)
parser.add_argument('--fdir', type=str)
parser.add_argument('--hflip', type=int, default=0)

parser.add_argument('--batch_size', type=int, default=64,
                    help='batch size')
parser.add_argument('--type', type=str, default='2d',
                    help='CNN type')
parser.add_argument('--half_precision', type=int, default=0,
                    help='output half precision float')
parser.add_argument('--num_decoding_thread', type=int, default=4,
                    help='Num parallel thread for video decoding')
parser.add_argument('--l2_normalize', type=int, default=1,
                    help='l2 normalize feature')
parser.add_argument('--resnext101_model_path', type=str, default='model/resnext101.pth',
                    help='Resnext model path')
parser.add_argument('--vmz_model_path', type=str, default='model/r2plus1d_34_clip8_ig65m_from_scratch-9bae36ae.pth',
                    help='vmz model path')

args = parser.parse_args()


# TODO: refactor all args into config. (current code is from different people.)
# Per --type decoding settings: "fps", "size" and "centercrop" go to
# VideoLoader, "shards" goes to PathBuilder.build.
CONFIGS = {
    "2d": {
        "fps": 1,
        "size": 224,
        "centercrop": False,
        "shards": 0,
    },
    "3d": {
        "fps": 24,
        "size": 112,
        "centercrop": True,
        "shards": 0,
    },
    "s3d": {
        "fps": 30,
        "size": 224,
        "centercrop": True,
        "shards": 0,
    },
    "vmz": {
        "fps": 24,
        "size": 112,
        "centercrop": True,
        "shards": 0,
    },
    "vae": {
        "fps": 2,
        "size": 256,
        "centercrop": True,
        "shards": 100,
    }
}

config = CONFIGS[args.type]


video_dirs = args.vdir
feature_dir = args.fdir

# features are always written with a ".npy" extension.
video_dict = PathBuilder.build(video_dirs, feature_dir, ".npy", config["shards"])

dataset = VideoLoader(
    video_dict=video_dict,
    framerate=config["fps"],
    size=config["size"],
    centercrop=config["centercrop"],
    hflip=args.hflip
)
n_dataset = len(dataset)
sampler = RandomSequenceSampler(n_dataset, 10)
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    num_workers=args.num_decoding_thread,
    # the sequence sampler is only used when there are more than 10 videos.
    sampler=sampler if n_dataset > 10 else None,
)
preprocess = Preprocessing(args.type)
model = get_model(args)

# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# copy of this file; confirm that the final `else` pairs with the outer
# `len(data['video'].shape) > 3` check.
with th.no_grad():
    for k, data in tqdm(enumerate(loader), total=loader.__len__(), ascii=True):
        input_file = data['input'][0]
        output_file = data['output'][0]
        if len(data['video'].shape) > 3:
            video = data['video'].squeeze()
            if len(video.shape) == 4:
                video = preprocess(video)
                n_chunk = len(video)
                # allocate the output buffer on the GPU; width depends on type.
                if args.type == 'vmz':
                    # only every third chunk is fed to the model (see `factor`).
                    n_chunk = math.ceil(n_chunk/float(3))
                    features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
                elif args.type == 's3d':
                    features = th.cuda.FloatTensor(n_chunk, 512).fill_(0)
                elif args.type == "vae":
                    features = th.cuda.LongTensor(n_chunk, 1024).fill_(0)
                else:
                    features = th.cuda.FloatTensor(n_chunk, 2048).fill_(0)
                n_iter = int(math.ceil(n_chunk / float(args.batch_size)))
                for i in range(n_iter):
                    # stride used when slicing chunks out of `video`.
                    factor = 1
                    if args.type == 'vmz':
                        factor = 3
                    min_ind = factor * i * args.batch_size
                    max_ind = factor * (i + 1) * args.batch_size
                    video_batch = video[min_ind:max_ind:factor].cuda()
                    if args.type == '2d':
                        batch_features = model(video_batch)  # (51, 487), (51, 512)
                    elif args.type == 's3d':
                        # the s3d model returns a dict; keep the video embedding.
                        batch_features = model(video_batch)
                        batch_features = batch_features['video_embedding']
                    elif args.type == "vae":
                        # image_code.
                        batch_features = model(video_batch)
                    else:
                        # remaining types return (predictions, features).
                        batch_pred, batch_features = model(video_batch)  # (51, 487), (51, 512)
                    if args.l2_normalize:
                        batch_features = F.normalize(batch_features, dim=1)
                    features[i*args.batch_size:(i+1)*args.batch_size] = batch_features
                features = features.cpu().numpy()
                # output dtype: 16-bit when --half_precision is set, else
                # 32-bit; "vae" features are stored as integers.
                if args.half_precision:
                    if args.type == "vae":
                        features = features.astype(np.int16)
                    else:
                        features = features.astype('float16')
                else:
                    if args.type == "vae":
                        features = features.astype(np.int32)
                    else:
                        features = features.astype('float32')
                np.save(output_file, features)
        else:
            print('Video {} error.'.format(input_file))
================================================
FILE: examples/MMPT/scripts/video_feature_extractor/how2/s3d.sh
================================================
#!/bin/bash


# NOTE(review): --vdir has no value in this copy; supply the video directory.
python scripts/video_feature_extractor/extract.py \
    --vdir \
    --fdir data/feat/feat_how2_s3d \
    --type=s3d --num_decoding_thread=4 \
    --batch_size 32 --half_precision 1
================================================
FILE: examples/MMPT/scripts/video_feature_extractor/model.py
================================================
# Copyright (c) Howto100M authors and Facebook, Inc.
All Rights Reserved

import torch as th

from torch import nn


class GlobalAvgPool(nn.Module):
    """Module that averages its input over the last two dimensions."""

    def __init__(self):
        super(GlobalAvgPool, self).__init__()

    def forward(self, x):
        return th.mean(x, dim=[-2, -1])


def get_model(args):
    """Build the feature extractor selected by ``args.type``.

    Args:
        args: namespace with ``type`` (one of '2d', '3d', 'vmz', 's3d',
            'vae'); ``vmz_model_path`` is read for 'vmz' and
            ``resnext101_model_path`` for '3d'.

    Returns:
        The model, moved to the GPU with ``.cuda()`` and switched to eval mode.

    Raises:
        AssertionError: if ``args.type`` is not one of the supported values.
    """
    # the assert already rejects unknown types, so the final `else` below can
    # only fire when asserts are disabled (python -O).
    assert args.type in ['2d', '3d', 'vmz', 's3d', 'vae']
    if args.type == '2d':
        print('Loading 2D-ResNet-152 ...')
        import torchvision.models as models
        model = models.resnet152(pretrained=True)
        # drop the last two child modules and average over the spatial dims.
        model = nn.Sequential(*list(model.children())[:-2], GlobalAvgPool())
        model = model.cuda()
    elif args.type == 'vmz':
        print('Loading VMZ ...')
        from vmz34 import r2plus1d_34
        model = r2plus1d_34(pretrained_path=args.vmz_model_path, pretrained_num_classes=487)
        model = model.cuda()
    elif args.type == 's3d':
        # we use one copy of s3d instead of dup another one for feature extraction.
        from mmpt.processors.models.s3dg import S3D
        # weights and dictionary are read from fixed relative paths.
        model = S3D('pretrained_models/s3d_dict.npy', 512)
        model.load_state_dict(th.load('pretrained_models/s3d_howto100m.pth'))
        model = model.cuda()
    elif args.type == '3d':
        print('Loading 3D-ResneXt-101 ...')
        from videocnn.models import resnext
        model = resnext.resnet101(
            num_classes=400,
            shortcut_type='B',
            cardinality=32,
            sample_size=112,
            sample_duration=16,
            last_fc=False)
        # here the model is moved to the GPU before the weights are loaded.
        model = model.cuda()
        model_data = th.load(args.resnext101_model_path)
        model.load_state_dict(model_data)
    elif args.type == 'vae':
        from openaivae import OpenAIParallelDiscreteVAE
        model = OpenAIParallelDiscreteVAE()
        model = model.cuda()
    else:
        raise ValueError("model not supported yet.")

    model.eval()
    print('loaded')
    return model
================================================
FILE: examples/MMPT/scripts/video_feature_extractor/pathbuilder.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import urllib.parse
import json
import pandas as pd

from tqdm import tqdm


# TODO: extending to other datasets.
supported_formats = {} class PathBuilder(object): @classmethod def build(cls, video_dirs, feature_dir, ext, shards=0, split=None): meta_fn = os.path.join(feature_dir, "meta_plan.json") os.makedirs(feature_dir, exist_ok=True) if os.path.isfile(meta_fn): with open(meta_fn) as fr: meta = json.load(fr) return meta print("searching videos...") video_id_to_path = {} for video_dir in video_dirs.split(","): # TODO: add supports of recursive listdir. if video_dir in supported_formats: supported_formats[video_dir].load(video_dir, video_id_to_path) else: for idx, fn in enumerate(tqdm(os.listdir(video_dir))): video_fn = os.path.join(video_dir, fn) if os.path.isfile(video_fn): video_id = os.path.splitext(fn)[0] video_id_to_path[video_id] = video_fn elif os.path.isdir(video_fn): # shards of folders. shard_dir = video_fn for idx, fn in enumerate(os.listdir(shard_dir)): video_fn = os.path.join(shard_dir, fn) if os.path.isfile(video_fn): video_id = os.path.splitext(fn)[0] video_id_to_path[video_id] = video_fn video_path, feature_path = [], [] valid_ext = set() for idx, video_id in enumerate(video_id_to_path): video_path.append(video_id_to_path[video_id]) if ext is None: # use original file ext for format compatibility. 
video_id_to_path[video_id] path = urllib.parse.urlparse(video_id_to_path[video_id]).path ext = os.path.splitext(path)[1] if ext not in valid_ext: valid_ext.add(ext) print("adding", ext) if shards: shard_id = str(idx % shards) feature_fn = os.path.join( feature_dir, shard_id, video_id + ext) else: feature_fn = os.path.join( feature_dir, video_id + ext) feature_path.append(feature_fn) print("targeting", len(feature_path), "videos") meta = { "video_path": video_path, "feature_path": feature_path} with open(meta_fn, "w") as fw: json.dump(meta, fw) if split is not None: splits = split.split("/") assert len(splits) == 2 cur, total = int(splits[0]), int(splits[1]) assert cur < total import math chunk = math.ceil(len(meta["video_path"]) / total) start = cur * chunk end = (cur + 1) * chunk meta = { "video_path": meta["video_path"][start:end], "feature_path": meta["feature_path"][start:end] } return meta ================================================ FILE: examples/MMPT/scripts/video_feature_extractor/preprocessing.py ================================================ # Copyright Howto100m authors. # Copyright (c) Facebook, Inc. 
All Rights Reserved import torch as th class Normalize(object): def __init__(self, mean, std): self.mean = th.FloatTensor(mean).view(1, 3, 1, 1) self.std = th.FloatTensor(std).view(1, 3, 1, 1) def __call__(self, tensor): tensor = (tensor - self.mean) / (self.std + 1e-8) return tensor class Preprocessing(object): def __init__(self, type): self.type = type if type == '2d': self.norm = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) elif type == '3d': self.norm = Normalize(mean=[110.6, 103.2, 96.3], std=[1.0, 1.0, 1.0]) elif type == 'vmz': self.norm = Normalize(mean=[110.201, 100.64, 95.997], std=[58.1489, 56.4701, 55.3324]) def _zero_pad(self, tensor, size): n = size - len(tensor) % size if n == size: return tensor else: z = th.zeros(n, tensor.shape[1], tensor.shape[2], tensor.shape[3]) return th.cat((tensor, z), 0) def __call__(self, tensor): if self.type == '2d': tensor = tensor / 255.0 tensor = self.norm(tensor) elif self.type == 'vmz': #tensor = self._zero_pad(tensor, 8) tensor = self._zero_pad(tensor, 10) tensor = self.norm(tensor) #tensor = tensor.view(-1, 8, 3, 112, 112) tensor = tensor.view(-1, 10, 3, 112, 112) tensor = tensor.transpose(1, 2) elif self.type == '3d': tensor = self._zero_pad(tensor, 16) tensor = self.norm(tensor) tensor = tensor.view(-1, 16, 3, 112, 112) tensor = tensor.transpose(1, 2) elif self.type == 's3d': tensor = tensor / 255.0 tensor = self._zero_pad(tensor, 30) tensor = tensor.view(-1, 30, 3, 224, 224) # N x 30 x 3 x H x W tensor = tensor.transpose(1, 2) # N x 3 x 30 x H x W # for vae do nothing return tensor ================================================ FILE: examples/MMPT/scripts/video_feature_extractor/random_sequence_shuffler.py ================================================ # Copyright (c) Facebook, Inc. 
All Rights Reserved import numpy as np from torch.utils.data.sampler import Sampler class RandomSequenceSampler(Sampler): def __init__(self, n_sample, seq_len): self.n_sample = n_sample self.seq_len = seq_len def _pad_ind(self, ind): zeros = np.zeros(self.seq_len - self.n_sample % self.seq_len) ind = np.concatenate((ind, zeros)) return ind def __iter__(self): idx = np.arange(self.n_sample) if self.n_sample % self.seq_len != 0: idx = self._pad_ind(idx) idx = np.reshape(idx, (-1, self.seq_len)) np.random.shuffle(idx) idx = np.reshape(idx, (-1)) return iter(idx.astype(int)) def __len__(self): return self.n_sample + (self.seq_len - self.n_sample % self.seq_len) ================================================ FILE: examples/MMPT/scripts/video_feature_extractor/shard_feature.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class Shard(object):
    """Group per-video ``.npy`` features into fixed-size shards on disk.

    Video ids are read at construction time, one per line, from the files in
    ``file_paths``; the first file is stored under ``"train"`` and the second
    under ``"val"``. Calling the instance writes one ``ShardedTensor`` per
    shard plus one pickled ``{shard_idx: [video_id, ...]}`` index per split
    into ``target_dir``.

    Args:
        vfeat_dir: directory holding ``<video_id>.npy`` feature files.
        tfeat_dir: stored on the instance; not read by this class.
        target_dir: directory the shards and meta files are written to.
        file_paths: id-list files, paired in order with ``"train"``, ``"val"``.
        shard_size: maximum number of videos per shard.
    """

    def __init__(
        self, vfeat_dir, tfeat_dir, target_dir, file_paths, shard_size=4096
    ):
        self.vfeat_dir = vfeat_dir
        self.tfeat_dir = tfeat_dir
        self.target_dir = target_dir
        self.video_ids = {}
        split_names = ["train", "val"]
        for split_name, list_path in zip(split_names, file_paths):
            with open(list_path) as reader:
                ids = []
                for raw_line in reader:
                    ids.append(raw_line.strip())
                self.video_ids[split_name] = ids
        self.shard_size = shard_size

    def __call__(self, split="train"):
        """Write the shards and the meta index for both splits.

        The ``split`` argument is accepted but ignored: both ``"train"`` and
        ``"val"`` are always processed.
        """
        for split_name in ["train", "val"]:
            all_ids = self.video_ids[split_name]
            index = {}
            offsets = range(0, len(all_ids), self.shard_size)
            for shard_no, start in enumerate(offsets):
                print(shard_no)
                shard_ids = list(all_ids[start:start + self.shard_size])
                features = [
                    np.load(os.path.join(self.vfeat_dir, video_id + ".npy"))
                    for video_id in shard_ids
                ]
                index[shard_no] = shard_ids
                shard_file = os.path.join(
                    self.target_dir, split_name + "_" + str(shard_no))
                ShardedTensor.from_list(features).save(shard_file)

            meta_file = os.path.join(self.target_dir, split_name + "_meta")
            with open(meta_file + ".pkl", "wb") as writer:
                pickle.dump(index, writer, pickle.HIGHEST_PROTOCOL)
class VideoLoader(Dataset):
    """modified from how2's video_feature_extractor.

    Each item decodes one video with ffmpeg into a float32 tensor of shape
    ``(frames, 3, height, width)``. The table of work items must provide the
    columns ``video_path`` (input) and ``feature_path`` (output).

    Args:
        csv: path of a CSV file holding the table; used if not ``None``.
        video_dict: dict convertible to a DataFrame; takes precedence over
            ``csv`` when both are given.
        framerate: value passed to ffmpeg's ``fps`` filter.
        size: either an int (length of the shorter side after scaling) or a
            ``(height, width)`` tuple used as the output size directly.
        centercrop: crop a centred ``size x size`` square (int ``size`` only).
        hflip: mirror the frames horizontally.
        **kwargs: accepted and ignored.

    Raises:
        ValueError: if both ``csv`` and ``video_dict`` are ``None``.
    """

    def __init__(
        self,
        csv=None,
        video_dict=None,
        framerate=1,
        size=112,
        centercrop=False,
        hflip=False,
        **kwargs
    ):
        if csv is None and video_dict is None:
            raise ValueError("csv and video_dict cannot be both None.")
        if csv is not None:
            self.csv = pd.read_csv(csv)
        if video_dict is not None:
            self.csv = pd.DataFrame.from_dict(video_dict)
        self.centercrop = centercrop
        self.size = size
        self.framerate = framerate
        self.hflip = hflip
        # (height, width) of the frames ffmpeg was asked to produce for the
        # video currently being decoded; written by _decode, read by _run.
        self._scaled_dim = None

    def __len__(self):
        return len(self.csv)

    def _get_video_dim(self, video_path):
        """Return ``(height, width)`` of the first video stream."""
        video_stream = self._get_video_info(video_path)
        width = int(video_stream['width'])
        height = int(video_stream['height'])
        return height, width

    def _get_video_info(self, video_path):
        """Return ffprobe's description of the first video stream, or None."""
        probe = ffmpeg.probe(video_path)
        video_stream = next((stream for stream in probe['streams']
                             if stream['codec_type'] == 'video'), None)
        return video_stream

    def _get_output_dim(self, h, w):
        """Return the ``(height, width)`` the video is scaled to.

        A 2-tuple ``size`` is returned as is; otherwise the shorter side
        becomes ``size`` and the longer side keeps the aspect ratio.
        """
        if isinstance(self.size, tuple) and len(self.size) == 2:
            return self.size
        elif h >= w:
            return int(h * self.size / w), self.size
        else:
            return self.size, int(w * self.size / h)

    def __getitem__(self, idx):
        video_path = self.csv['video_path'].values[idx]
        output_file = self.csv['feature_path'].values[idx]
        return self._decode(output_file, video_path)

    def _decode(self, output_file, video_path):
        """Decode ``video_path`` unless ``output_file`` already exists.

        Returns:
            dict with keys ``'video'``, ``'input'`` and ``'output'``.
            ``'video'`` is ``th.zeros(1)`` when the item is skipped (output
            present or input missing) or when probing/decoding fails.
        """
        if os.path.isfile(output_file) or not os.path.isfile(video_path):
            return {'video': th.zeros(1), 'input': video_path,
                    'output': output_file}

        try:
            h, w = self._get_video_dim(video_path)
        except Exception:
            print('ffprobe failed at: {}'.format(video_path))
            return {'video': th.zeros(1), 'input': video_path,
                    'output': output_file}

        try:
            # dirname is '' for a bare file name and makedirs('') raises,
            # which used to turn every such item into a failure.
            output_dir = os.path.dirname(output_file)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            height, width = self._get_output_dim(h, w)
            cmd = (
                ffmpeg
                .input(video_path)
                .filter('fps', fps=self.framerate)
                .filter('scale', width, height)
            )
            if self.hflip:
                cmd = cmd.filter('hflip')
            if self.centercrop:
                x = int((width - self.size) / 2.0)
                y = int((height - self.size) / 2.0)
                cmd = cmd.crop(x, y, self.size, self.size)
            # _run keeps its (cmd, output_file) signature because subclasses
            # override it, so the frame size is handed over via the instance.
            self._scaled_dim = (height, width)
            video = self._run(cmd, output_file)
        except Exception:
            # Stay best-effort, but report the failure like the probe does.
            print('ffmpeg failed at: {}'.format(video_path))
            video = th.zeros(1)
        return {'video': video, 'input': video_path, 'output': output_file}

    def _run(self, cmd, output_file):
        """Run ``cmd`` and return the frames as ``(frames, 3, H, W)`` float32.

        Raises:
            RuntimeError: if the frame size is unknown, i.e. no centre crop
                is used and ``_decode`` has not recorded the scaled size.
        """
        if self.centercrop and isinstance(self.size, int):
            height, width = self.size, self.size
        else:
            # Without a crop the frames have the size requested from the
            # 'scale' filter. The original code left height/width undefined
            # here, so every non-cropped decode failed.
            scaled_dim = getattr(self, '_scaled_dim', None)
            if scaled_dim is None:
                raise RuntimeError(
                    'output frame size is unknown; call _decode first.')
            height, width = scaled_dim
        out, _ = (
            cmd.output('pipe:', format='rawvideo', pix_fmt='rgb24')
            .run(capture_stdout=True, quiet=True)
        )
        video = np.frombuffer(out, np.uint8).reshape([-1, height, width, 3])
        video = th.from_numpy(video.astype('float32'))
        return video.permute(0, 3, 1, 2)
TODO: consider to merge with `CodecProcessor`. """ def __init__( self, csv=None, video_dict=None, framerate=1, size=112, centercrop=False, max_num_frames=5, **kwargs ): super().__init__(csv, video_dict, framerate, size, centercrop) self.max_num_frames = max_num_frames def _get_video_dim(self, video_fn): """decord cannot probe the size of a video, we use pyav instead.""" import av with av.open(video_fn) as container: height = container.streams.video[0].codec_context.height width = container.streams.video[0].codec_context.width return height, width def _get_output_dim(self, height, width): """ keep the shorter side be `self.size`, strech the other. """ if height >= width: return int(height * self.size / width), self.size else: return self.size, int(width * self.size / height) def __getitem__(self, idx): import av video_path = self.csv['video_path'].values[idx] output_file = self.csv['feature_path'].values[idx] if not(os.path.isdir(output_file)) and os.path.isfile(video_path): try: h, w = self._get_video_dim(video_path) except Exception: print('probe failed at: {}'.format(video_path)) return {'video': th.zeros(1), 'input': video_path, 'output': output_file} try: height, width = self._get_output_dim(h, w) # new for av. with av.open(video_path) as container: container.streams.video[0].thread_type = "AUTO" container.streams.video[0].codec_context.height = height container.streams.video[0].codec_context.width = width if self.framerate == 0: # keyframe. 
container.streams.video[0].codec_context.skip_frame = 'NONKEY' frames = [] for frame in container.decode(video=0): frames.append(frame) frames = random.sample(frames, self.max_num_frames) os.makedirs(output_file, exist_ok=True) for frame in frames: frame.to_image().save( os.path.join( output_file, "%04d.jpg" % frame.index)) except Exception: print('extract failed at: {}'.format(video_path)) return {'video': th.zeros(1), 'input': video_path, 'output': output_file} video = th.zeros(1) return {'video': video, 'input': video_path, 'output': output_file} ================================================ FILE: examples/MMPT/setup.py ================================================ import setuptools with open("README.md", "r") as fh: long_description = fh.read() setuptools.setup( name="mmpt", version="0.0.1", author="Hu Xu, Po-yao Huang", author_email="huxu@fb.com", description="A package for multimodal pretraining.", long_description=long_description, long_description_content_type="text/markdown", url="https://github.com/pytorch/fairseq/examples/MMPT", packages=setuptools.find_packages(), install_requires=[ ], classifiers=[ "Programming Language :: Python :: 3", "License :: CC-BY-NC", "Operating System :: OS Independent", ], python_requires='>=3.6', ) ================================================ FILE: examples/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. try: from fairseq.version import __version__ # noqa except ImportError: pass ================================================ FILE: examples/adaptive_span/README.md ================================================ # Adaptive Span Adaptive Span is a novel self-attention mechanism that can learn its optimal attention span. 
This allows us to significantly extend the maximum context size used in Transformers, while maintaining control over their memory footprint and computational time. It uses the Truncated BPTT technique for training, as in [transformerXL](https://github.com/pytorch/fairseq/blob/main/examples/truncated_bptt/README.md). Adaptive Span was introduced by the paper [Adaptive Attention Span in Transformers](https://arxiv.org/abs/1905.07799), which achieved state-of-the-art language modeling results at the time of publication. We managed to reproduce their results in fairseq and kept most of the [original implementation](https://github.com/facebookresearch/adaptive-span) untouched. You can also refer to their sweep file if any combination of hyperparameters is unclear. ##### 0. Setup First you need to process the Enwik8 dataset; we use the pre-tokenized dataset from the [adaptive span paper](https://github.com/facebookresearch/adaptive-span/blob/master/get_data.sh). You can download the dataset, and then run: ```bash fairseq-preprocess --only-source --trainpref ~/data/enwik8/train.txt \ --validpref ~/data/enwik8/valid.txt --testpref ~/data/enwik8/test.txt \ --destdir ~/data/enwik8/data-bin/ --joined-dictionary --workers 20 ``` ##### 1. Train an Adaptive Span model on Enwik8 We will train a 12-layer Adaptive Span model following the [hyperparameters used in the original paper](https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8.sh). The following command assumes 4 GPUs, so that the total batch size is 64 sequences (4 x 16).
Training should take 2-3 days on 4 V100 GPUs: ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \ --user-dir examples/adaptive_span \ --data ~/data/enwik8/data-bin/ \ --fp16 --fp16-no-flatten-grads --max-update 600000 \ --task truncated_bptt_lm --tokens-per-sample 512 --arch adaptive_span \ --n-layer 12 --d-model 512 --n-head 8 --d-inner 2048 --dropout 0.3 \ --attn-span 8192 --optimizer adagrad_with_grad_clip --adagrad-clip 0.03 \ --validate-interval-updates 1000 \ --lr-scheduler fixed --warmup-updates 32000 --batch-size-valid 32 \ --lr 0.07 --criterion adaptive_span_loss --batch-size 16 --update-freq 1 \ --seed 2 --log-format json --log-interval 25 --aux-loss-scaler 5e-07 ``` This should land around 1.05 on validation, 1.03 on test. You can lower `--aux-loss-scaler` for better performance (longer span). It gives a ~0.03 bpc improvement over the transformerXL baseline here. If training on a single GPU, set `--update-freq=4` to accumulate 4x gradients and simulate training on 4 GPUs. You can also reproduce the transformerXL result on enwik8 using this code base. It should land around 1.06 on test, matching the [original paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_enwik8_base.sh). You can try it with: ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \ --user-dir examples/truncated_bptt \ ~/data/enwik8/data-bin/ \ --task truncated_bptt_lm --fp16 --max-update 400000 \ --tokens-per-sample 512 --arch transformer_xl --n-layer 12 \ --d-model 512 --n-head 8 --d-head 64 --d-inner 2048 --dropout 0.1 \ --dropatt 0.0 --mem-len 512 --optimizer adam --clip-norm 0.25 \ --lr-scheduler cosine --warmup-updates 0 \ --lr 0.0 --lr 0.00025 --batch-size 15 \ --update-freq 1 --seed 2 --log-format json --log-interval 25 \ --fp16 ``` ##### 2.
Evaluate For Adaptive Span: ```bash fairseq-eval-lm ~/data/enwik8/data-bin/ --path model/checkpoint_best.pt \ --user-dir examples/adaptive_span \ --task truncated_bptt_lm --batch-size 8 --tokens-per-sample 512 --gen-subset test ``` For Transformer-XL evaluation: ```bash fairseq-eval-lm ~/data/enwik8/data-bin/ --path model/checkpoint_best.pt \ --user-dir examples/truncated_bptt/ --task truncated_bptt_lm --batch-size 8 \ --tokens-per-sample 80 \ --model-overrides '{"mem_len":2100,"clamp_len":820,"same_length":True}' \ --gen-subset valid ``` *Note:* During training the model saw 512 tokens of context (``--tokens-per-sample=512``), with batch size 8. These settings match the evaluation settings from [the original paper](https://github.com/facebookresearch/adaptive-span/blob/master/experiments/enwik8.sh). ================================================ FILE: examples/adaptive_span/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import importlib import os # automatically import any Python files in the current directory cur_dir = os.path.dirname(__file__) for file in os.listdir(cur_dir): path = os.path.join(cur_dir, file) if ( not file.startswith("_") and not file.startswith(".") and (file.endswith(".py") or os.path.isdir(path)) ): mod_name = file[: file.find(".py")] if file.endswith(".py") else file module = importlib.import_module(__name__ + "." + mod_name) ================================================ FILE: examples/adaptive_span/adagrad_with_grad_clip.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from torch.optim import Adagrad from fairseq.optim import LegacyFairseqOptimizer, register_optimizer @register_optimizer("adagrad_with_grad_clip") class FairseqAdagradWithGradClip(LegacyFairseqOptimizer): def __init__(self, args, params): super().__init__(args) self._optimizer = AdagradWithGradClip(params, **self.optimizer_config) @staticmethod def add_args(parser): """Add optimizer-specific arguments to the parser.""" # fmt: off parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', help='weight decay') parser.add_argument('--adagrad-clip', default=0.0, type=float, metavar='D', help='internal grad clip') # fmt: on @property def optimizer_config(self): """ Return a kwarg dictionary that will be used to override optimizer args stored in checkpoints. This allows us to load a checkpoint and resume training using a different set of optimizer args, e.g., with a different learning rate. """ return { "lr": self.args.lr[0], "weight_decay": self.args.weight_decay, "grad_clip": self.args.adagrad_clip, } @property def supports_flat_params(self): return False def _clip_grad(clr, grad, group_grad_clip): if group_grad_clip > 0: norm = grad.norm(2).item() if norm > group_grad_clip: clr *= group_grad_clip / (norm + 1e-10) return clr class AdagradWithGradClip(Adagrad): """Adagrad algorithm with custom gradient clipping""" def __init__( self, params, lr=1e-2, lr_decay=0, weight_decay=0, initial_accumulator_value=0, grad_clip=0, ): Adagrad.__init__( self, params, lr=lr, lr_decay=lr_decay, weight_decay=weight_decay, initial_accumulator_value=initial_accumulator_value, ) self.defaults["grad_clip"] = grad_clip self.param_groups[0].setdefault("grad_clip", grad_clip) def step(self, closure=None): loss = None if closure is not None: loss = closure() for group in self.param_groups: for p in group["params"]: if p.grad is None: continue grad = p.grad.data state = self.state[p] state["step"] += 1 if group["weight_decay"] != 0: if p.grad.data.is_sparse: raise 
class AdaptiveMask(nn.Module):
    """Learnable soft mask applied along the last dimension of the input.

    The mask is a linear ramp clamped to ``[0, 1]``: positions at the end of
    the last dimension keep weight 1 and the weights fall off linearly
    towards earlier positions. Where the ramp sits is controlled by the
    learnable ``current_val`` (a fraction of ``max_size``), so the unmasked
    size can be trained with back-propagation.

    Args:
        max_size: maximum size (i.e. input dimension)
        ramp_size: number of positions over which the mask goes from 0 to 1
        init_val: initial value of ``current_val``
        shape: shape of ``current_val``; more than one element learns several
            sizes independently of each other
    """

    def __init__(self, max_size, ramp_size, init_val=0, shape=(1,)):
        super().__init__()
        self._max_size = max_size
        self._ramp_size = ramp_size
        self.current_val = nn.Parameter(torch.zeros(*shape) + init_val)
        self.register_buffer(
            "mask_template", torch.linspace(1 - max_size, 0, steps=max_size)
        )

    def forward(self, x):
        """Return ``x`` multiplied by the current mask, in ``x``'s dtype."""
        offset = self.current_val.float() * self._max_size
        ramp = (self.mask_template.float() + offset) / self._ramp_size + 1
        mask = ramp.clamp(0, 1)
        width = x.size(-1)
        if width < self._max_size:
            # the input could have been trimmed beforehand to save
            # computation; keep only the matching tail of the mask
            mask = mask.narrow(-1, self._max_size - width, width)
        return (x * mask).type_as(x)

    def _ratio_to_size(self, ratio, include_ramp):
        """Convert a size ratio into a length clipped to ``[0, max_size]``."""
        size = math.ceil(ratio * self._max_size)
        if include_ramp:
            size += self._ramp_size
        return max(0, min(self._max_size, size))

    def get_current_max_size(self, include_ramp=True):
        """Length implied by the largest entry of ``current_val``."""
        largest = self.current_val.max().item()
        return self._ratio_to_size(largest, include_ramp)

    def get_current_avg_size(self, include_ramp=True):
        """Length implied by the mean of ``current_val``."""
        average = self.current_val.float().mean().item()
        return self._ratio_to_size(average, include_ramp)

    def clamp_param(self):
        """this need to be called after each update"""
        self.current_val.data.clamp_(0, 1)
Args: attn_span: maximum attention span adapt_span_loss: loss coefficient for the span length adapt_span_ramp: length of the masking ramp adapt_span_init: initial size ratio adapt_span_cache: adapt cache size to reduce memory usage """ def __init__( self, attn_span, adapt_span_ramp, adapt_span_init, n_head, adapt_span_layer, **kargs ): nn.Module.__init__(self) self._max_span = attn_span self._n_head = n_head self._adapt_span_layer = adapt_span_layer if self._adapt_span_layer: self._mask = AdaptiveMask( max_size=self._max_span, ramp_size=adapt_span_ramp, init_val=adapt_span_init, ) else: self._mask = AdaptiveMask( max_size=self._max_span, ramp_size=adapt_span_ramp, init_val=adapt_span_init, shape=(n_head, 1, 1), ) def forward(self, attn, normalize=True): """mask attention with the right span""" # batch and head dimensions are merged together, so separate them first self.clamp_param() if self._adapt_span_layer: attn = self._mask(attn) else: B = attn.size(0) # batch size M = attn.size(1) # block size attn = attn.reshape(B // self._n_head, self._n_head, M, -1) attn = self._mask(attn) attn = attn.view(B, M, -1) return attn def get_trim_len(self): """how much of memory can be trimmed to reduce computation""" L = self._max_span trim_len = min(L - 1, L - self._mask.get_current_max_size()) # too fine granularity might be bad for the memory management trim_len = math.floor(trim_len / 64) * 64 return trim_len def trim_memory(self, query, key, value, key_pe): """trim out unnecessary memory beforehand to reduce computation""" trim_len = self.get_trim_len() cache_size = key.size(1) - query.size(1) trim_len_cache = trim_len - (self._max_span - cache_size) if trim_len_cache > 0: key = key[:, trim_len_cache:, :] value = value[:, trim_len_cache:, :] elif trim_len_cache < 0: # cache is too short! this happens when validation resumes # after a lot of updates. 
key = F.pad(key, [0, 0, -trim_len_cache, 0]) value = F.pad(value, [0, 0, -trim_len_cache, 0]) if trim_len > 0: if key_pe is not None: key_pe = key_pe[:, :, trim_len:] return key, value, key_pe def get_cache_size(self): """determine how long the cache should be""" trim_len = self.get_trim_len() # give a buffer of 64 steps since a span might increase # in future updates return min(self._max_span, self._max_span - trim_len + 64) def get_loss(self): """a loss term for regularizing the span length""" return self._max_span * self._mask.current_val.float().mean() def get_current_max_span(self): return self._mask.get_current_max_size() def get_current_avg_span(self): return self._mask.get_current_avg_size() def clamp_param(self): self._mask.clamp_param() ================================================ FILE: examples/adaptive_span/adaptive_span_loss.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math from dataclasses import dataclass import torch.nn.functional as F from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import register_criterion from fairseq.criterions.cross_entropy import CrossEntropyCriterion from fairseq.dataclass import FairseqDataclass from omegaconf import II @dataclass class AdaptiveSpanCriterionConfig(FairseqDataclass): sentence_avg: bool = II("optimization.sentence_avg") @register_criterion("adaptive_span_loss", dataclass=AdaptiveSpanCriterionConfig) class AdaptiveSpanCriterion(CrossEntropyCriterion): def __init__(self, task, sentence_avg): super().__init__(task, sentence_avg) def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. 
Returns a tuple with three elements: 1) the loss here is summed, different from the adaptive span code 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ net_output = model(**sample["net_input"]) loss, aux_loss, avg_span, max_span = self.compute_loss( model, net_output, sample, reduce=reduce ) sample_size = ( sample["target"].size(0) if self.sentence_avg else sample["ntokens"] ) loss /= sample_size total_loss = loss + aux_loss sample_size = 1 logging_output = { "loss": loss.data, "ntokens": sample["ntokens"], "nsentences": sample["target"].size(0), "sample_size": sample_size, "total_loss": total_loss.data, "avg_span": avg_span * sample_size, "max_span": max_span * sample_size, } return total_loss, sample_size, logging_output def compute_loss(self, model, net_output, sample, reduce=True): loss, _ = super().compute_loss(model, net_output, sample, reduce) aux_loss = model.get_aux_loss() avg_span = model.get_current_avg_span() max_span = model.get_current_max_span() return loss, aux_loss, avg_span, max_span @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" loss_sum = sum(log.get("loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) total_loss_sum = sum(log.get("total_loss", 0) for log in logging_outputs) avg_span_sum = sum(log.get("avg_span", 0) for log in logging_outputs) max_span_sum = sum(log.get("max_span", 0) for log in logging_outputs) # we divide by log(2) to convert the loss from base e to base 2 metrics.log_scalar( "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) metrics.log_scalar("avg_span", avg_span_sum / sample_size, sample_size, round=3) metrics.log_scalar("max_span", max_span_sum / sample_size, sample_size, round=3) # total loss contains the L1 norm on adaptive-span 
metrics.log_scalar( "total_loss", total_loss_sum / sample_size / math.log(2), sample_size, round=3, ) if sample_size != ntokens: metrics.log_scalar( "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 ) metrics.log_derived( "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) ) else: metrics.log_derived( "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) ) @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True ================================================ FILE: examples/adaptive_span/adaptive_span_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. import math import torch import torch.nn as nn import torch.nn.functional as F from fairseq.modules.layer_norm import LayerNorm from .adaptive_span_attention import AdaptiveSpan # Size notations: # B = batch_size, H = d_model, M = block_size, L = attn_span def _skew(X, pad_value): """shift every row 1 step to right""" # X = B x M x L B, M, L = X.size() X = F.pad(X, (0, M + 1), value=pad_value) # B x M x (L+M+1) X = X.view(B, -1) # B x ML+MM+M X = X[:, :-M] # B x ML+MM X = X.view(B, M, M + L) # B x M x L+M return X def _unskew(X): """reverse _skew operation""" # X = B x M x L+M B, M, L = X.size() L -= M X = X.view(B, -1) # B x ML+MM X = F.pad(X, (0, M)) # B x ML+MM+M X = X.view(B, M, M + L + 1) # B x M x L+M+1 X = X[:, :, :L] # B x M x L return X class SeqAttention(nn.Module): """Sequential self-attention layer. Each token will attend to its previous fixed number of steps. Note that attention doesn't include the current step itself. 
""" def __init__(self, d_model, n_head, attn_span, dropout, adapt_span_layer, **kargs): nn.Module.__init__(self) self.dropout = nn.Dropout(dropout) self.d_model = d_model # size of a single head self.attn_span = attn_span self.adaptive_span = AdaptiveSpan( attn_span=attn_span, n_head=n_head, adapt_span_layer=adapt_span_layer, **kargs ) def forward(self, query, key, value, key_pe): # query size = B x M x H # key, value sizes = B x (M+L) x H key, value, key_pe = self.adaptive_span.trim_memory(query, key, value, key_pe) # compute attention from context # B x M (dest) x (M+L) (src) attn_cont = torch.matmul(query, key.transpose(-1, -2)) attn_cont = _unskew(attn_cont) # B x M x L # compute the effect of position embedding attn_pos = torch.matmul(query, key_pe) # B x M x L_pos attn = attn_cont + attn_pos attn = attn / math.sqrt(self.d_model) # B x M X L_pos attn = F.softmax(attn.float(), dim=-1).type_as(attn) # trim attention lengths according to the learned span attn = self.adaptive_span(attn) attn = self.dropout(attn) # B x M X L_pos attn_cont = _skew(attn, 0) # B x M X (L+M) out = torch.matmul(attn_cont, value) # B x M x H return out def get_cache_size(self): return self.adaptive_span.get_cache_size() class MultiHeadSeqAttention(nn.Module): def __init__(self, d_model, n_head, **kargs): nn.Module.__init__(self) assert d_model % n_head == 0 self.n_head = n_head self.head_dim = d_model // n_head self.attn = SeqAttention(d_model=self.head_dim, n_head=n_head, **kargs) self.proj_query = nn.Linear(d_model, d_model, bias=False) nn.init.xavier_normal_(self.proj_query.weight) self.proj_out = nn.Linear(d_model, d_model, bias=False) nn.init.xavier_normal_(self.proj_out.weight) self.proj_val = nn.Linear(d_model, d_model, bias=False) nn.init.xavier_normal_(self.proj_val.weight) self.proj_key = nn.Linear(d_model, d_model, bias=False) nn.init.xavier_normal_(self.proj_key.weight) def head_reshape(self, x): K = self.n_head D = self.head_dim x = x.view(x.size()[:-1] + (K, D)) # B x (M+L) 
class FeedForwardLayer(nn.Module):
    """Two-layer feed-forward block: linear, ReLU, dropout, linear.

    Args:
        d_model: size of the input and of the output features.
        d_inner: size of the hidden layer.
        dropout: dropout probability applied to the hidden activations.
        **kargs: accepted and ignored.
    """

    def __init__(self, d_model, d_inner, dropout, **kargs):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_inner)
        self.fc2 = nn.Linear(d_inner, d_model)
        # both weight matrices are re-initialized with Xavier-uniform values
        for linear in (self.fc1, self.fc2):
            nn.init.xavier_uniform_(linear.weight)
        self.dropout = nn.Dropout(dropout)

    def forward(self, h):
        """Map ``h`` (last dimension ``d_model``) to a tensor of the same shape."""
        hidden = self.dropout(F.relu(self.fc1(h)))
        return self.fc2(hidden)
nn.init.normal_(self.in_emb.weight, mean=0, std=d_model ** -0.5) self.out_emb = nn.Linear(d_model, vocab_size) self.aux_loss_scaler = aux_loss_scaler if emb_dropout > 0: self.emb_dropout = nn.Dropout(emb_dropout) else: self.emb_dropout = None # position embeddings self.key_pe = nn.Parameter(torch.randn(1, d_model // n_head, attn_span)) self.layers = nn.ModuleList() self.layers.extend( TransformerSeqLayer( d_model=d_model, n_head=n_head, attn_span=attn_span, adapt_span_layer=adapt_span_layer, **kargs ) for _ in range(n_layer) ) def forward(self, x, h_cache, target=None): # x size = B x M block_size = x.size(1) h = self.in_emb(x) # B x M x H if self.emb_dropout is not None: h = self.emb_dropout(h) h_cache_next = [] for l, layer in enumerate(self.layers): cache_size = layer.attn.attn.get_cache_size() if cache_size > block_size: h_cache_next_l = torch.cat( [h_cache[l][:, -cache_size + block_size :, :], h], dim=1 ).detach() else: h_cache_next_l = h[:, -cache_size:, :].detach() h_cache_next.append(h_cache_next_l) h = layer(h, h_cache[l], self.key_pe) # B x M x H if self.emb_dropout is not None: h = self.emb_dropout(h) out = F.log_softmax(self.out_emb(h).float(), dim=-1).type_as(h) dummy_loss = None return out, h_cache_next, dummy_loss def get_aux_loss(self): loss = 0.0 for layer in self.layers: loss += layer.attn.attn.adaptive_span.get_loss() return self.aux_loss_scaler * loss def get_current_max_span(self): max_span = 0.0 for layer in self.layers: max_span = max( max_span, layer.attn.attn.adaptive_span.get_current_max_span() ) return max_span def get_current_avg_span(self): avg_span = 0.0 for layer in self.layers: avg_span += layer.attn.attn.adaptive_span.get_current_avg_span() return avg_span / len(self.layers) ================================================ FILE: examples/adaptive_span/adaptive_span_model_wrapper.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
class AdaptiveSpanDecoder(FairseqIncrementalDecoder):
    """fairseq decoder wrapper around the adaptive-span ``TransformerSeq`` model.

    The wrapped model consumes a per-layer cache of hidden states ("mems").
    Outside of incremental decoding the cache is kept on the module itself
    (``self._mems``); during incremental decoding it is stored in
    ``incremental_state`` under the key ``"mems"``.
    """

    def __init__(self, cfg, task):
        super().__init__(task.target_dictionary)

        self.config = cfg
        # Rebuild the config so that the vocabulary size comes from the task
        # dictionary rather than from the user-supplied value.
        config = AdaptiveSpanSmallConfig(
            vocab_size=len(task.target_dictionary),
            d_model=cfg.d_model,
            n_head=cfg.n_head,
            d_inner=cfg.d_inner,
            n_layer=cfg.n_layer,
            attn_span=cfg.attn_span,
            dropout=cfg.dropout,
            emb_dropout=cfg.emb_dropout,
            adapt_span_ramp=cfg.adapt_span_ramp,
            adapt_span_init=cfg.adapt_span_init,
            aux_loss_scaler=cfg.aux_loss_scaler,
            adapt_span_layer=cfg.adapt_span_layer,
        )
        logger.info(config)
        self.model = AdaptiveSpanTransformerModel(**config.__dict__)

        # Hidden-state cache used when no incremental_state is supplied.
        self._mems = None

    def forward(
        self,
        src_tokens,
        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
        encoder_out=None,
    ):
        """Score ``src_tokens`` and update the hidden-state cache.

        Args:
            src_tokens: token ids, batch-first.
            incremental_state: when given, the cache is read from and written
                to this dictionary and only the last token is fed to the model.
            encoder_out: unused.

        Returns:
            A one-element tuple holding the model's log-probabilities.
        """
        bsz = src_tokens.size(0)
        if incremental_state is not None:  # used during inference
            # get_incremental_state takes the state dictionary first and the
            # key second; omitting the dictionary raises a TypeError.
            mems = self.get_incremental_state(incremental_state, "mems")
            src_tokens = src_tokens[:, -1:]  # only keep the most recent token
        else:
            mems = self._mems

        if mems is None:
            # first time init
            mems = self.init_hid_cache(bsz)
        output = self.model(x=src_tokens, h_cache=mems)
        if incremental_state is not None:
            self.set_incremental_state(incremental_state, "mems", output[1])
        else:
            self._mems = output[1]
        return (output[0],)

    def max_positions(self):
        """Return the configured attention span."""
        return self.config.attn_span

    def init_hid_cache(self, batch_sz):
        """Create an all-zero hidden-state cache, one tensor per layer.

        Each tensor has shape ``(batch_sz, layer cache size, d_model)`` and
        the dtype/device of the model's first parameter.
        """
        # The reference parameter does not change between layers.
        param = next(self.model.parameters())
        return [
            torch.zeros(
                batch_sz,
                layer.get_cache_size(),
                self.config.d_model,
                dtype=param.dtype,
                device=param.device,
            )
            for layer in self.model.layers
        ]

    def get_aux_loss(self):
        """Return the wrapped model's auxiliary (span) loss."""
        return self.model.get_aux_loss()

    def get_current_max_span(self):
        """Return the wrapped model's current maximum span."""
        return self.model.get_current_max_span()

    def get_current_avg_span(self):
        """Return the wrapped model's current average span."""
        return self.model.get_current_avg_span()

    def reorder_incremental_state(
        self,
        incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]],
        new_order: torch.Tensor,
    ):
        """Reorder incremental state.

        This will be called when the order of the input has changed from the
        previous time step. A typical use case is beam search, where the input
        order changes between time steps based on the selection of beams.

        Raises:
            NotImplementedError: always; reordering is not implemented.
        """
        # NOTE(review): an implementation would fetch "mems" from
        # incremental_state, index_select every cached tensor with new_order
        # and store the result back. The caches built by init_hid_cache are
        # batch-first, so the batch axis is presumably dim 0 -- confirm
        # before enabling generation/beam search.
        raise NotImplementedError("This is required for generation/beam search")
## Training a multilingual ASR model with attention head selection ```bash data_dir= train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx" valid_subset="valid_ar_ar_tedx,valid_de_de_tedx,valid_el_el_tedx,valid_es_es_tedx,valid_fr_fr_tedx,valid_it_it_tedx,valid_pt_pt_tedx,valid_ru_ru_tedx" strategy= fairseq-train ${data_dir} \ --user-dir examples/attention_head_selection/src \ --train-subset "${train_subset}" \ --valid-subset "${valid_subset}" \ --config-yaml 'config_asr.yaml' \ --arch 'head_selection_s2t_transformer_s' \ --task 'speech_to_text_head_selection' \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \ --lr 5e-4 \ --clip-norm 10.0 \ --seed 1 \ --max-epoch 400 \ --max-tokens 32000 \ --ignore-prefix-size 1 \ --dropout 0.3 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --skip-invalid-size-inputs-valid-test \ --encoder-attn-head-select \ --total-encoder-attention-heads 8 \ --decoder-self-attn-head-select \ --total-decoder-attention-heads 8 \ --attn-head-select-strategy ${strategy} \ --task-type lang ``` ## Training a multi-domain ASR model with attention head selection ```bash data_dir= train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep" valid_subset="dev_es_es_tedx,dev_fr_fr_tedx,dev_pt_pt_tedx,dev_it_it_tedx,dev_ru_ru_tedx,dev_el_el_tedx,dev_ar_ar_tedx,dev_de_de_tedx,dev_ar_ar_cv,dev_de_de_cv,dev_es_es_cv,dev_fr_fr_cv,dev_it_it_cv,dev_pt_pt_cv,dev_ru_ru_cv,dev_de_de_ep,dev_es_es_ep,dev_fr_fr_ep,dev_it_it_ep,dev_pt_pt_ep" strategy= fairseq-train ${data_dir} \ --user-dir
examples/attention_head_selection/src \ --train-subset "${train_subset}" \ --valid-subset "${valid_subset}" \ --config-yaml 'config_asr.yaml' \ --arch head_selection_s2t_transformer_s \ --task speech_to_text_head_selection \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler 'inverse_sqrt' --stop-min-lr -1.0 --warmup-updates 10000 \ --lr 5e-4 \ --clip-norm 10.0 \ --seed 1 \ --max-epoch 400 \ --max-tokens 32000 \ --ignore-prefix-size 1 \ --dropout 0.3 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --skip-invalid-size-inputs-valid-test \ --encoder-attn-head-select \ --total-encoder-attention-heads 8 \ --decoder-self-attn-head-select \ --total-decoder-attention-heads 8 \ --attn-head-select-strategy ${strategy} \ --task-type domain ``` ## Inference in multilingual setting ```bash MODEL_DIR= data_dir= gen_subset= train_subset="train_ar_ar_tedx,train_de_de_tedx,train_el_el_tedx,train_es_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_pt_tedx,train_ru_ru_tedx" last_n=10 CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt" CHECKPOINT="_avg" RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}" if [ ! 
-d $RESULTS ]; then mkdir -p $RESULTS fi; python scripts/average_checkpoints.py \ --inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \ --output "${MODEL_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${data_dir} \ --user-dir examples/attention_head_selection/src \ --arch 'head_selection_s2t_transformer_s' \ --task 'speech_to_text_head_selection' \ --train-subset ${train_subset} \ --gen-subset ${gen_subset} \ --path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \ --config-yaml 'config_asr.yaml' \ --prefix-size 1 \ --max-tokens 40000 --beam 5 \ --skip-invalid-size-inputs-valid-test \ --results-path ${RESULTS} \ --scoring wer --wer-tokenizer 13a \ --wer-lowercase --wer-remove-punct --remove-bpe ``` ## Inference in multi-domain setting ```bash MODEL_DIR= data_dir= gen_subset= train_subset="train_es_es_tedx,train_fr_fr_tedx,train_pt_pt_tedx,train_it_it_tedx,train_ru_ru_tedx,train_el_el_tedx,train_ar_ar_tedx,train_de_de_tedx,train_ar_ar_cv,train_de_de_cv,train_es_es_cv,train_fr_fr_cv,train_it_it_cv,train_pt_pt_cv,train_ru_ru_cv,train_de_de_ep,train_es_es_ep,train_fr_fr_ep,train_it_it_ep,train_pt_pt_ep" last_n=10 CHECKPOINT_FILENAME="avg_last_${last_n}_checkpoint.pt" CHECKPOINT="_avg" RESULTS="${MODEL_DIR}/ckpt${CHECKPOINT}" if [ ! 
-d $RESULTS ]; then mkdir -p $RESULTS fi; python scripts/average_checkpoints.py \ --inputs ${MODEL_DIR} --num-epoch-checkpoints ${last_n} \ --output "${MODEL_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${data_dir} \ --user-dir examples/attention_head_selection/src \ --arch 'head_selection_s2t_transformer_s' \ --task 'speech_to_text_head_selection' \ --train-subset ${train_subset} \ --gen-subset ${gen_subset} \ --path "${MODEL_DIR}/${CHECKPOINT_FILENAME}" \ --config-yaml 'config_asr.yaml' \ --prefix-size 1 \ --max-tokens 40000 --beam 5 \ --skip-invalid-size-inputs-valid-test \ --results-path ${RESULTS} \ --scoring wer --wer-tokenizer 13a \ --wer-lowercase --wer-remove-punct --remove-bpe ``` ## Citation ```bibtex @article{gong2021pay, title={Pay Better Attention to Attention: Head Selection in Multilingual and Multi-Domain Sequence Modeling}, author={Gong, Hongyu and Tang, Yun and Pino, Juan and Li, Xian}, journal={arXiv preprint arXiv:2106.10840}, year={2021} } ``` ================================================ FILE: examples/attention_head_selection/src/__init__.py ================================================ ================================================ FILE: examples/attention_head_selection/src/data/__init__.py ================================================ ================================================ FILE: examples/attention_head_selection/src/data/speech_to_text_dataset_with_domain.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree.
@dataclass
class SpeechToTextDatasetItemWithDomain(SpeechToTextDatasetItem):
    """Speech-to-text dataset item extended with language and domain ids.

    All three extra fields default to ``None``.
    """

    # Identifier of the sample's source language.
    # NOTE(review): annotated as a tensor, but the dataset in this file fills
    # it from a per-sample id list -- confirm the intended type.
    src_lang_id: Optional[torch.Tensor] = None
    # Identifier of the sample's target language.
    tgt_lang_id: Optional[torch.Tensor] = None
    # Identifier of the sample's domain.
    domain_id: Optional[torch.Tensor] = None
class SpeechToTextDatasetCreatorWithDomain(SpeechToTextDatasetCreator):
    """Builds ``SpeechToTextDatasetWithDomain`` objects from TSV manifests.

    On top of the parent creator, every sample is tagged with a source
    language id, a target language id and a domain id. The ids are derived
    from the split name, which must look like
    ``<subset>_<src_lang>_<tgt_lang>_<domain>`` (e.g. ``train_de_de_tedx``).
    """

    KEY_SRC_LANG_ID, KEY_TGT_LANG_ID = "src_lang_id", "tgt_lang_id"
    KEY_DOMAIN_ID = "domain_id"
    # default values
    DEFAULT_SRC_LANG_ID, DEFAULT_TGT_LANG_ID, DEFAULT_DOMAIN_ID = 0, 0, 0

    @classmethod
    def _from_list(
        cls,
        split_name: str,
        is_train_split,
        samples: List[Dict],
        cfg: S2TDataConfig,
        tgt_dict,
        pre_tokenizer,
        bpe_tokenizer,
        n_frames_per_step,
        speaker_to_id
    ) -> SpeechToTextDatasetWithDomain:
        """Turn a list of manifest rows into a dataset.

        Optional columns fall back to the class-level ``DEFAULT_*`` values.
        """
        audio_root = Path(cfg.audio_root)
        ids = [s[cls.KEY_ID] for s in samples]
        audio_paths = [(audio_root / s[cls.KEY_AUDIO]).as_posix() for s in samples]
        n_frames = [int(s[cls.KEY_N_FRAMES]) for s in samples]
        tgt_texts = [s[cls.KEY_TGT_TEXT] for s in samples]
        src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples]
        speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples]
        src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples]
        tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples]
        src_lang_ids = [s.get(cls.KEY_SRC_LANG_ID, cls.DEFAULT_SRC_LANG_ID) for s in samples]
        tgt_lang_ids = [s.get(cls.KEY_TGT_LANG_ID, cls.DEFAULT_TGT_LANG_ID) for s in samples]
        domain_ids = [s.get(cls.KEY_DOMAIN_ID, cls.DEFAULT_DOMAIN_ID) for s in samples]
        return SpeechToTextDatasetWithDomain(
            split_name,
            is_train_split,
            cfg,
            audio_paths,
            n_frames,
            src_texts=src_texts,
            tgt_texts=tgt_texts,
            speakers=speakers,
            src_langs=src_langs,
            tgt_langs=tgt_langs,
            ids=ids,
            tgt_dict=tgt_dict,
            pre_tokenizer=pre_tokenizer,
            bpe_tokenizer=bpe_tokenizer,
            n_frames_per_step=n_frames_per_step,
            speaker_to_id=speaker_to_id,
            src_lang_ids=src_lang_ids,
            tgt_lang_ids=tgt_lang_ids,
            domain_ids=domain_ids
        )

    @classmethod
    def _load_samples_from_tsv(
        cls,
        root: str,
        split: str,
        src_lang_map,
        tgt_lang_map,
        domain_map
    ):
        """Load the rows of one split and tag them with language/domain ids.

        Raises:
            ValueError: if ``split`` does not consist of exactly four
                underscore-separated fields.
            KeyError: if a language or domain parsed from ``split`` is missing
                from the corresponding map.
        """
        # metadata from split
        fields = split.split("_")
        if len(fields) != 4:
            # Same exception type as the bare tuple unpacking would raise,
            # but with a message that names the offending split.
            raise ValueError(
                f"split name {split!r} must have the form "
                "'<subset>_<src_lang>_<tgt_lang>_<domain>'"
            )
        _, src_lang, tgt_lang, domain = fields
        src_lang_id = src_lang_map[src_lang]
        tgt_lang_id = tgt_lang_map[tgt_lang]
        domain_id = domain_map[domain]

        samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split)
        for s in samples:
            s.update({
                cls.KEY_SRC_LANG_ID: src_lang_id,
                cls.KEY_TGT_LANG_ID: tgt_lang_id,
                cls.KEY_DOMAIN_ID: domain_id
            })
        return samples

    @classmethod
    def _from_tsv(
        cls,
        root: str,
        cfg: S2TDataConfig,
        split: str,
        tgt_dict,
        is_train_split: bool,
        pre_tokenizer,
        bpe_tokenizer,
        n_frames_per_step,
        speaker_to_id,
        src_lang_map: Dict[str, int],
        tgt_lang_map: Dict[str, int],
        domain_map: Dict[str, int]
    ) -> SpeechToTextDatasetWithDomain:
        """Build the dataset for a single split."""
        samples = cls._load_samples_from_tsv(
            root, split, src_lang_map, tgt_lang_map, domain_map
        )
        return cls._from_list(
            split, is_train_split, samples, cfg, tgt_dict, pre_tokenizer,
            bpe_tokenizer, n_frames_per_step, speaker_to_id
        )

    @classmethod
    def from_tsv(
        cls,
        root: str,
        cfg: S2TDataConfig,
        splits: str,
        tgt_dict,
        pre_tokenizer,
        bpe_tokenizer,
        is_train_split: bool,
        epoch: int,
        seed: int,
        src_lang_map: Dict[str, int],
        tgt_lang_map: Dict[str, int],
        domain_map: Dict[str, int],
        n_frames_per_step: int = 1,
        speaker_to_id=None
    ) -> SpeechToTextDatasetWithDomain:
        """Build one dataset from a comma-separated list of split names.

        Several training splits are resampled by temperature-based sampling
        when ``cfg.sampling_alpha`` differs from 1.0; multiple splits are
        concatenated, a single split is returned as is.
        """
        datasets = [
            cls._from_tsv(
                root, cfg, split, tgt_dict, is_train_split, pre_tokenizer,
                bpe_tokenizer, n_frames_per_step, speaker_to_id,
                src_lang_map, tgt_lang_map, domain_map
            )
            for split in splits.split(",")
        ]

        if is_train_split and len(datasets) > 1 and cfg.sampling_alpha != 1.0:
            # temperature-based sampling
            size_ratios = cls.get_size_ratios(datasets, alpha=cfg.sampling_alpha)
            datasets = [
                ResamplingDataset(
                    d, size_ratio=r, seed=seed, epoch=epoch, replace=(r >= 1.0)
                )
                for r, d in zip(size_ratios, datasets)
            ]

        return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0]
class HeadSelectionLoss(_Loss):
    """KL-style regulariser on sampled attention-head selection weights.

    The weight of the term is read from ``args.kl_weight`` and defaults to
    0.0 when the attribute is absent.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.kl_weight = getattr(args, "kl_weight", 0.0)

    def forward(self, head_samples, sample_sizes, prior=0.5, eps=1e-7):
        """
        head_samples: (num_tasks, num_layers, num_heads)
        sample_sizes: (num_tasks, )

        Returns the per-task KL terms, averaged over layers and heads,
        weighted by ``sample_sizes`` and scaled by ``kl_weight``.
        """
        # eps keeps the logarithm finite when a sample is exactly zero.
        log_ratio = torch.log(head_samples + eps) - math.log(prior)
        per_task = (head_samples * log_ratio).sum(-1).sum(-1)

        # Number of (layer, head) entries that belong to one task.
        entries_per_task = torch.numel(head_samples) / head_samples.size(0)
        per_task = per_task / entries_per_task

        return self.kl_weight * torch.matmul(per_task, sample_sizes)
@register_model("head_selection_s2t_transformer")
class HeadSelectionS2TTransformerModel(S2TTransformerModel):
    """
    Head selection implemented in S2TTransformer
    """

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @staticmethod
    def add_args(parser):
        """Register the parent's options plus the head-selection options."""
        S2TTransformerModel.add_args(parser)
        # Kept in registration order: encoder selection, decoder
        # self-attention selection, decoder-encoder attention selection,
        # selection strategy.
        head_selection_options = [
            (
                "--encoder-attn-head-select",
                dict(action="store_true", default=False, help="encoder head selection"),
            ),
            (
                "--total-encoder-attention-heads",
                dict(type=int, help="total number of encoder attention heads"),
            ),
            (
                "--decoder-self-attn-head-select",
                dict(
                    action="store_true",
                    default=False,
                    help="decoder self-attention head selection",
                ),
            ),
            (
                "--dec-enc-attn-head-select",
                dict(
                    action="store_true",
                    default=False,
                    help="decoder-encoder attention head selection",
                ),
            ),
            (
                "--total-decoder-attention-heads",
                dict(type=int, help="total number of decoder attention heads"),
            ),
            (
                "--attn-head-select-strategy",
                dict(
                    type=str,
                    help="attention head selection strategy, subset or group",
                ),
            ),
        ]
        for flag, options in head_selection_options:
            parser.add_argument(flag, **options)

    @classmethod
    def build_encoder(cls, args):
        """Build the encoder, optionally loading pretrained weights.

        The head-selection encoder is used when
        ``args.encoder_attn_head_select`` is set, the plain S2T encoder
        otherwise.
        """
        select_heads = (
            safe_hasattr(args, "encoder_attn_head_select")
            and args.encoder_attn_head_select
        )
        if select_heads:
            encoder = HeadSelectionS2TTransformerEncoder(args)
        else:
            encoder = S2TTransformerEncoder(args)

        pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
        if pretraining_path is None:
            return encoder

        if not Path(pretraining_path).exists():
            # A missing checkpoint is not fatal: keep the fresh encoder.
            logger.warning(
                f"skipped pretraining because {pretraining_path} does not exist"
            )
            return encoder

        encoder = checkpoint_utils.load_pretrained_component_from_model(
            component=encoder, checkpoint=pretraining_path
        )
        logger.info(f"loaded pretrained encoder from: {pretraining_path}")
        return encoder

    @classmethod
    def build_decoder(cls, args, task, embed_tokens):
        """Build the decoder.

        The head-selection decoder is used when either decoder
        self-attention selection or decoder-encoder attention selection is
        enabled.
        """
        select_heads = any(
            safe_hasattr(args, flag) and getattr(args, flag)
            for flag in ("decoder_self_attn_head_select", "dec_enc_attn_head_select")
        )
        if select_heads:
            return HeadSelectionTransformerDecoderScriptable(
                args, task.target_dictionary, embed_tokens
            )
        return TransformerDecoderScriptable(args, task.target_dictionary, embed_tokens)
@register_model_architecture(model_name="head_selection_s2t_transformer", arch_name="head_selection_s2t_transformer")
def base_architecture(args):
    """Fill in S2T base defaults, then the head-selection defaults.

    Attributes that are already present on ``args`` are left untouched.
    """
    s2t_base_architecture(args)
    head_selection_defaults = (
        ("encoder_attn_head_select", False),
        ("decoder_self_attn_head_select", False),
        ("dec_enc_attn_head_select", False),
        ("total_encoder_attention_heads", 8),
        ("total_decoder_attention_heads", 8),
        ("attn_head_select_strategy", "group"),
    )
    for name, default in head_selection_defaults:
        setattr(args, name, getattr(args, name, default))
from typing import Any, List, Dict, Optional import torch import torch.nn as nn from torch import Tensor from fairseq.utils import safe_hasattr from fairseq.models.transformer import ( TransformerModel, TransformerEncoder, TransformerDecoder ) from ..modules.attn_head_selector import AttnHeadSelector from ..modules.head_selection_transformer_layer import ( HeadSelectionTransformerEncoderLayer, HeadSelectionTransformerDecoderLayer ) class HeadSelectionTransformerModel(TransformerModel): def __init__(self, args, encoder, decoder): super().__init__(args, encoder, decoder) @staticmethod def add_args(parser): TransformerModel.add_args(parser) # encoder head selection parser.add_argument( "--encoder-attn-head-select", action="store_true", default=False, help="encoder head selection" ) parser.add_argument( "--total-encoder-attention-heads", type=int, help="total number of encoder attention heads" ) # decoder self attention parser.add_argument( "--decoder-self-attn-head-select", action="store_true", default=False, help="decoder self-attention head selection" ) # decoder-encoder attention parser.add_argument( "--dec-enc-attn-head-select", action="store_true", default=False, help="decoder-encoder attention head selection" ) parser.add_argument( "--total-decoder-attention-heads", type=int, help="total number of decoder attention heads" ) # selection strategy parser.add_argument( "--attn-head-select-strategy", type=str, help="attention head selection strategy, subset or group" ) @classmethod def build_encoder(cls, args, src_dict, embed_tokens): if safe_hasattr(args, "encoder_attn_head_select") and args.encoder_attn_head_select: return HeadSelectionTransformerEncoder( args, src_dict, embed_tokens ) else: return TransformerEncoder(args, src_dict, embed_tokens) @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): if (safe_hasattr(args, "decoder_self_attn_head_select") and args.decoder_self_attn_head_select) or (safe_hasattr(args, "dec_enc_attn_head_select") and 
class HeadSelectionTransformerDecoder(TransformerDecoder):
    """Transformer decoder whose layers pick attention heads per task.

    Separate selectors are created for decoder self-attention and for
    decoder-encoder attention, each only when its flag is set on ``args``.
    """

    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
    ):
        # Plain (non-module) attributes, set before the parent constructor.
        self.num_tasks = args.decoder_tasks
        self.num_layers = args.decoder_layers
        self.total_num_heads = args.total_decoder_attention_heads
        self.num_heads = args.decoder_attention_heads
        self.select_strategy = args.attn_head_select_strategy
        super().__init__(
            args, dictionary, embed_tokens,
            no_encoder_attn=no_encoder_attn,
            output_projection=output_projection
        )

        def new_selector():
            return AttnHeadSelector(
                self.num_tasks, self.num_layers,
                self.total_num_heads, self.num_heads,
                self.select_strategy
            )

        select_self = (
            safe_hasattr(args, "decoder_self_attn_head_select")
            and args.decoder_self_attn_head_select
        )
        select_enc = (
            safe_hasattr(args, "dec_enc_attn_head_select")
            and args.dec_enc_attn_head_select
        )
        self.self_attn_head_selector = new_selector() if select_self else None
        self.enc_attn_head_selector = new_selector() if select_enc else None

        self.task_ids = None
        # Replace the layers built by the parent with head-selection layers
        # that share the selectors created above.
        selection_layers = [
            self.build_head_selection_decoder_layer(args, no_encoder_attn, idx)
            for idx in range(args.decoder_layers)
        ]
        self.layers = nn.ModuleList(selection_layers)

    def set_task_ids(self, task_ids):
        """Store the task ids used by the next forward pass."""
        self.task_ids = task_ids

    def build_head_selection_decoder_layer(self, args, no_encoder_attn=False, layer_idx=None):
        """Create one decoder layer wired to this decoder's selectors."""
        return HeadSelectionTransformerDecoderLayer(
            args,
            layer_idx,
            self.self_attn_head_selector,
            self.enc_attn_head_selector,
            no_encoder_attn=no_encoder_attn
        )

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
    ):
        """Sample head selections for the stored task ids, then decode."""
        # Self-attention selector first, encoder-attention selector second.
        for selector in (self.self_attn_head_selector, self.enc_attn_head_selector):
            if selector is not None:
                selector.head_select(self.task_ids)
        return super().forward(
            prev_output_tokens=prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            features_only=features_only,
            full_context_alignment=full_context_alignment,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens
        )
class AttnHeadSelector(nn.Module):
    """
    Latent variable modeling of attention head selection

    One logit is learned per (task, layer, head). ``head_select`` draws a
    relaxed sample from those logits, picks ``num_heads`` heads out of
    ``total_num_heads`` for every task and layer, and keeps the rows that
    belong to the given task ids. Calling the module with a layer index
    returns that layer's selected head indices and their weights.
    """

    def __init__(
        self, num_tasks, num_layers,
        total_num_heads, num_heads,
        select_strategy="group",
        head_select_temp=5.0
    ):
        super(AttnHeadSelector, self).__init__()
        self.num_tasks = num_tasks
        self.num_layers = num_layers
        self.total_num_heads = total_num_heads
        self.num_heads = num_heads
        self.select_strategy = select_strategy
        self.temp = head_select_temp

        self.head_logits = torch.nn.Parameter(
            torch.Tensor(self.num_tasks, self.num_layers, total_num_heads),
            requires_grad=True
        )
        nn.init.uniform_(
            self.head_logits, a=math.log(0.01),
            b=math.log(1.0)
        )

    def gumbel_sample(self, logits, tau=1.0):
        """Return sigmoid((logits + g1 - g2) / tau) with Gumbel noise g1, g2."""
        gumbels1 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
        gumbels2 = -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format).exponential_().log()
        gumbels1 = (logits + gumbels1 - gumbels2) / tau
        y_soft = gumbels1.sigmoid()
        return y_soft

    def subset_select(self, y_soft, topk, dim=-1):
        """Pick the ``topk`` highest-scoring heads along ``dim``.

        Returns the (detached) head indices and their weights.
        """
        top_values, top_inds = torch.topk(y_soft, k=topk, dim=dim)
        # Straight-through: the value is 1.0, the gradient is that of
        # top_values.
        top_ret = 1.0 - top_values.detach() + top_values
        return top_inds.detach(), top_ret

    def group_select(self, y_soft, topk, dim=-1):
        """Split the heads into groups and pick one head per position.

        The last axis of ``y_soft`` is viewed as (groups, topk), so its size
        must be a multiple of ``topk``; for each of the ``topk`` positions
        the group with the highest score is selected. ``dim`` is accepted
        for symmetry with ``subset_select`` but is not used.

        Returns the (detached) head indices and their weights.
        """
        # top_values: (num_tasks, num_layers, topk)
        top_values, top_inds = torch.max(
            y_soft.view(self.num_tasks, self.num_layers, -1, topk), dim=2
        )
        # Convert (group, position) into a flat head index.
        top_inds = top_inds * topk + torch.arange(topk, device=top_inds.device).unsqueeze(0).unsqueeze(1)
        top_ret = 1.0 - top_values.detach() + top_values
        return top_inds.detach(), top_ret

    def group_selet(self, y_soft, topk, dim=-1):
        """Misspelled former name of ``group_select``; kept for compatibility."""
        return self.group_select(y_soft, topk, dim=dim)

    def head_select(self, task_ids=None):
        """Sample head selections and cache the rows for ``task_ids``.

        Raises:
            ValueError: if ``select_strategy`` is neither "subset" nor "group".
        """
        # gumbel_sample
        self.head_samples = self.gumbel_sample(self.head_logits, tau=self.temp)
        # head select
        if self.select_strategy == "subset":
            self.subset_heads, self.subset_weights = self.subset_select(
                self.head_samples,
                topk=self.num_heads,
            )
        elif self.select_strategy == "group":
            self.subset_heads, self.subset_weights = self.group_select(
                self.head_samples,
                topk=self.num_heads,
            )
        else:
            raise ValueError("{} is not supported".format(self.select_strategy))

        # NOTE(review): task_ids is presumably a 1-D index tensor with one
        # entry per batch element; indexing with the default None instead
        # inserts a new leading axis -- confirm callers always set task ids.
        self.batch_subset = self.subset_heads[task_ids, :, :]
        self.batch_weights = self.subset_weights[task_ids, :, :]

    def forward(self, layer_idx):
        """Return the cached head indices and weights for ``layer_idx``.

        ``head_select`` must have been called beforehand.
        """
        assert layer_idx is not None
        batch_subset = self.batch_subset[:, layer_idx, :]
        batch_weights = self.batch_weights[:, layer_idx, :]
        return batch_subset, batch_weights
from fairseq.utils import safe_getattr from fairseq.modules import TransformerEncoderLayer, TransformerDecoderLayer from ..modules.multihead_attention_selection import MultiheadAttentionSelection class HeadSelectionTransformerEncoderLayer(TransformerEncoderLayer): def __init__(self, args, layer_idx, attn_head_selector=None): super().__init__(args) self.layer_idx = layer_idx self.self_attn = self.build_self_attention_selection( self.embed_dim, args, attn_head_selector ) def build_self_attention_selection(self, embed_dim, args, attn_head_selector=None): return MultiheadAttentionSelection( embed_dim, args.total_encoder_attention_heads, args.encoder_attention_heads, dropout=args.attention_dropout, self_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, layer_idx=self.layer_idx, attn_head_selector=attn_head_selector ) class HeadSelectionTransformerDecoderLayer(TransformerDecoderLayer): def __init__( self, args, layer_idx, self_attn_head_selector=None, enc_attn_head_selector=None, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False, ): self.layer_idx = layer_idx super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn) if self_attn_head_selector is not None: self.self_attn = self.build_self_attention_selection( self.embed_dim, args, self_attn_head_selector=self_attn_head_selector, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn ) if enc_attn_head_selector is not None: self.encoder_attn = self.build_encoder_attention_selection( self.embed_dim, args, enc_attn_head_selector=enc_attn_head_selector ) def build_self_attention_selection( self, embed_dim, args, self_attn_head_selector=None, add_bias_kv=False, add_zero_attn=False ): return MultiheadAttentionSelection( embed_dim, args.total_decoder_attention_heads, args.decoder_attention_heads, dropout=args.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=not safe_getattr(args, "cross_self_attention"), q_noise=self.quant_noise, 
qn_block_size=self.quant_noise_block_size, layer_idx=self.layer_idx, attn_head_selector=self_attn_head_selector, ) def build_encoder_attention_selection(self, embed_dim, args, enc_attn_head_selector=None): return MultiheadAttentionSelection( embed_dim, args.total_decoder_attention_heads, args.decoder_attention_heads, kdim=args.encoder_embed_dim, vdim=args.encoder_embed_dim, dropout=args.attention_dropout, encoder_decoder_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, layer_idx=self.layer_idx, attn_head_selector=enc_attn_head_selector, ) ================================================ FILE: examples/attention_head_selection/src/modules/multihead_attention_selection.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from typing import Dict, Optional, Tuple import torch from fairseq import utils from fairseq.modules.quant_noise import quant_noise from torch import Tensor, nn from torch.nn import Parameter from fairseq.modules.multihead_attention import MultiheadAttention from ..modules.multihead_functional import multi_head_attention_forward class MultiheadAttentionSelection(MultiheadAttention): def __init__( self, embed_dim, total_num_heads, num_heads, kdim=None, vdim=None, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, q_noise=0.0, qn_block_size=8, layer_idx=0, attn_head_selector=None ): super().__init__( embed_dim, num_heads, kdim=kdim, vdim=vdim, dropout=dropout, bias=bias, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=self_attention, encoder_decoder_attention=encoder_decoder_attention, q_noise=q_noise, qn_block_size=qn_block_size, ) self.layer_idx = layer_idx self.attn_head_selector = attn_head_selector self.total_num_heads = total_num_heads 
self.total_embed_dim = self.head_dim * total_num_heads self.k_proj = quant_noise( nn.Linear(self.kdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size ) self.v_proj = quant_noise( nn.Linear(self.vdim, self.total_embed_dim, bias=bias), q_noise, qn_block_size ) self.q_proj = quant_noise( nn.Linear(embed_dim, self.total_embed_dim, bias=bias), q_noise, qn_block_size ) if add_bias_kv: self.bias_k = Parameter(torch.Tensor(1, 1, self.total_embed_dim)) self.bias_v = Parameter(torch.Tensor(1, 1, self.total_embed_dim)) else: self.bias_k = self.bias_v = None self.reset_parameters() def forward( self, query, key: Optional[Tensor], value: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, need_weights: bool = True, static_kv: bool = False, attn_mask: Optional[Tensor] = None, before_softmax: bool = False, need_head_weights: bool = False, # subset_heads: Optional[Tensor] = None, # subset_weights: Optional[Tensor] = None ) -> Tuple[Tensor, Optional[Tensor]]: if need_head_weights: need_weights = True is_tpu = query.device.type == "xla" subset_heads, subset_weights = self.attn_head_selector(self.layer_idx) tgt_len, bsz, embed_dim = query.size() src_len = tgt_len assert list(query.size()) == [tgt_len, bsz, self.embed_dim] if key is not None: src_len, key_bsz, _ = key.size() if not torch.jit.is_scripting(): assert key_bsz == bsz assert value is not None assert src_len, bsz == value.shape[:2] if ( not self.onnx_trace and not is_tpu # don't use PyTorch version on TPUs and incremental_state is None and not static_kv # A workaround for quantization to work. Otherwise JIT compilation # treats bias in linear module as method. 
and not torch.jit.is_scripting() ): assert key is not None and value is not None return multi_head_attention_forward( query, key, value, self.embed_dim, self.total_num_heads, self.num_heads, torch.empty([0]), torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), self.bias_k, self.bias_v, self.add_zero_attn, self.dropout_module.p, self.out_proj.weight, self.out_proj.bias, self.training or self.dropout_module.apply_during_inference, key_padding_mask, need_weights, attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, v_proj_weight=self.v_proj.weight, subset_heads=subset_heads, subset_weights=subset_weights ) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) if saved_state is not None and "prev_key" in saved_state: # previous time steps are cached - no need to recompute # key and value if they are static if static_kv: assert self.encoder_decoder_attention and not self.self_attention key = value = None else: saved_state = None if self.self_attention: q = self.q_proj(query) k = self.k_proj(query) v = self.v_proj(query) elif self.encoder_decoder_attention: # encoder-decoder attention q = self.q_proj(query) if key is None: assert value is None k = v = None else: k = self.k_proj(key) v = self.v_proj(key) else: assert key is not None and value is not None q = self.q_proj(query) k = self.k_proj(key) v = self.v_proj(value) q *= self.scaling if self.bias_k is not None: assert self.bias_v is not None k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) if attn_mask is not None: attn_mask = torch.cat( [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 ) if key_padding_mask is not None: key_padding_mask = torch.cat( [ key_padding_mask, key_padding_mask.new_zeros(key_padding_mask.size(0), 1), ], dim=1, ) q = ( q.contiguous() .view(tgt_len, bsz * self.total_num_heads, self.head_dim) .transpose(0, 1) ) if k is not 
None: k = ( k.contiguous() .view(-1, bsz * self.total_num_heads, self.head_dim) .transpose(0, 1) ) if v is not None: v = ( v.contiguous() .view(-1, bsz * self.total_num_heads, self.head_dim) .transpose(0, 1) ) if saved_state is not None: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None prev_key = _prev_key.view(bsz * self.total_num_heads, -1, self.head_dim) if static_kv: k = prev_key else: assert k is not None k = torch.cat([prev_key, k], dim=1) src_len = k.size(1) if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None prev_value = _prev_value.view(bsz * self.total_num_heads, -1, self.head_dim) if static_kv: v = prev_value else: assert v is not None v = torch.cat([prev_value, v], dim=1) prev_key_padding_mask: Optional[Tensor] = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( key_padding_mask=key_padding_mask, prev_key_padding_mask=prev_key_padding_mask, batch_size=bsz, src_len=k.size(1), static_kv=static_kv, ) saved_state["prev_key"] = k.view(bsz, self.total_num_heads, -1, self.head_dim) saved_state["prev_value"] = v.view(bsz, self.total_num_heads, -1, self.head_dim) saved_state["prev_key_padding_mask"] = key_padding_mask # In this branch incremental_state is never None assert incremental_state is not None incremental_state = self._set_input_buffer(incremental_state, saved_state) assert k is not None assert k.size(1) == src_len # This is part of a workaround to get around fork/join parallelism # not supporting Optional types. 
if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None if key_padding_mask is not None: assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == src_len if self.add_zero_attn: assert v is not None src_len += 1 k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) if attn_mask is not None: attn_mask = torch.cat( [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 ) if key_padding_mask is not None: key_padding_mask = torch.cat( [ key_padding_mask, torch.zeros(key_padding_mask.size(0), 1).type_as( key_padding_mask ), ], dim=1, ) attn_weights = torch.bmm(q, k.transpose(1, 2)) attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) assert list(attn_weights.size()) == [bsz * self.total_num_heads, tgt_len, src_len] if attn_mask is not None: attn_mask = attn_mask.unsqueeze(0) if self.onnx_trace: attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) attn_weights += attn_mask if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.total_num_heads, tgt_len, src_len) if not is_tpu: attn_weights = attn_weights.masked_fill( key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf"), ) else: attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if before_softmax: return attn_weights, v attn_weights_float = utils.softmax( attn_weights, dim=-1, onnx_trace=self.onnx_trace ) attn_weights = attn_weights_float.type_as(attn_weights) attn_probs = self.dropout_module(attn_weights) assert v is not None # evaluation if subset_heads is not None and subset_heads.numel() == 1: subset_heads = subset_heads.repeat(bsz) subset_weights = subset_weights.repeat(bsz) if subset_heads is None: 
attn = torch.bmm(attn_probs, v) else: # training with head selection mixed_attn = torch.bmm(attn_probs, v).contiguous().view(bsz, self.total_num_heads, tgt_len, self.head_dim) attn = torch.stack( [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1 ) attn = attn * subset_weights.unsqueeze(2).unsqueeze(3) attn = attn.contiguous().view(bsz * self.num_heads, tgt_len, self.head_dim) assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] if self.onnx_trace and attn.size(1) == 1: # when ONNX tracing a single decoder step (sequence length == 1) # the transpose is a no-op copy before view, thus unnecessary attn = attn.contiguous().view(tgt_len, bsz, embed_dim) else: attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) attn_weights: Optional[Tensor] = None if need_weights: if subset_heads is None: attn_weights = attn_weights_float.view( bsz, self.num_heads, tgt_len, src_len ).transpose(1, 0) else: mixed_attn_weights = attn_weights_float.view( bsz, self.total_num_heads, tgt_len, src_len ) attn_weights = torch.stack( [mixed_attn_weights[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1 ).transpose(1, 0) if not need_head_weights: # average attention weights over heads attn_weights = attn_weights.mean(dim=0) return attn, attn_weights ================================================ FILE: examples/attention_head_selection/src/modules/multihead_functional.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from typing import Optional, Tuple
import torch
from torch import Tensor
from torch.nn.functional import (
    linear, softmax, dropout, pad,
    has_torch_function,
    handle_torch_function,
    _in_projection_packed,
)
import math
import warnings


def _scaled_dot_product_attention(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    attn_mask: Optional[Tensor] = None,
    dropout_p: float = 0.0,
    bsz: int = 1,
    subset_heads: Optional[Tensor] = None,
    subset_weights: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor]:
    """
    Scaled dot-product attention with optional per-example head selection.

    ``q`` is unpacked as (B, Nt, E) and scaled by 1/sqrt(E). ``attn_mask``,
    when given, is added to the scores before the softmax. Dropout is applied
    to the attention probabilities only when ``dropout_p > 0``.

    When ``subset_heads`` is given, outputs and attention probabilities are
    reshaped to (bsz, -1, ...) and only the heads listed in each column of
    ``subset_heads`` are kept; the kept outputs are scaled by
    ``subset_weights``.
    # NOTE(review): subset_heads / subset_weights are presumably
    # (bsz or 1, n_selected) — confirm against the head selector.

    Returns:
        ``(output, attn)``. With head selection ``output`` is flattened back
        to (-1, Nt, E) while ``attn`` stays (bsz, n_selected, Nt, Ns).
    """
    B, Nt, E = q.shape
    q = q / math.sqrt(E)
    # B: bsz * total_num_heads
    # (B, Nt, E) x (B, E, Ns) -> (B, Nt, Ns)
    attn = torch.bmm(q, k.transpose(-2, -1))
    if attn_mask is not None:
        attn += attn_mask
    attn = softmax(attn, dim=-1)
    if dropout_p > 0.0:
        attn = dropout(attn, p=dropout_p)
    if subset_heads is None:
        # (B, Nt, Ns) x (B, Ns, E) -> (B, Nt, E)
        output = torch.bmm(attn, v)
    else:
        mixed_output = torch.bmm(attn, v).contiguous().view(bsz, -1, Nt, E)
        # One gather per selected-head column; stacked along a new head axis.
        output = torch.stack(
            [mixed_output[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))],
            dim=1
        )
        output = output * subset_weights.unsqueeze(2).unsqueeze(3)
        output = output.contiguous().view(-1, Nt, E)
    if subset_heads is not None:
        # Keep only the attention maps of the selected heads as well.
        _, Nt, Ns = attn.size()
        mixed_attn = attn.view(bsz, -1, Nt, Ns)
        attn = torch.stack(
            [mixed_attn[torch.arange(bsz), subset_heads[:, col], :, :] for col in range(subset_heads.size(1))], dim=1
        )
    return output, attn


def _in_projection(
    q: Tensor,
    k: Tensor,
    v: Tensor,
    w_q: Tensor,
    w_k: Tensor,
    w_v: Tensor,
    b_q: Optional[Tensor] = None,
    b_k: Optional[Tensor] = None,
    b_v: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Tensor]:
    """Apply three independent linear projections (weight, optional bias) to
    ``q``, ``k`` and ``v`` and return the projected tensors in that order."""
    return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)


def multi_head_attention_forward(
    query: Tensor,
    key: Tensor,
    value: Tensor,
    embed_dim_to_check: int,
    total_num_heads: int,
    num_heads: int,
    in_proj_weight: Tensor,
    in_proj_bias: Optional[Tensor],
    bias_k: Optional[Tensor],
    bias_v: Optional[Tensor],
    add_zero_attn: bool,
    dropout_p: float,
    out_proj_weight: Tensor,
    out_proj_bias: Optional[Tensor],
    training: bool = True,
    key_padding_mask: Optional[Tensor] = None,
    need_weights: bool = True,
    attn_mask: Optional[Tensor] = None,
    use_separate_proj_weight: bool = False,
    q_proj_weight: Optional[Tensor] = None,
    k_proj_weight: Optional[Tensor] = None,
    v_proj_weight: Optional[Tensor] = None,
    static_k: Optional[Tensor] = None,
    static_v: Optional[Tensor] = None,
    subset_heads: Optional[Tensor] = None,
    subset_weights: Optional[Tensor] = None,
):
    """
    Functional multi-head attention that computes ``total_num_heads`` heads
    and keeps ``num_heads`` of them via ``subset_heads`` / ``subset_weights``.

    ``query`` is unpacked as (tgt_len, bsz, embed_dim) and ``key`` as
    (src_len, _, _). ``head_dim`` is derived from ``embed_dim // num_heads``
    while q/k/v are reshaped with ``total_num_heads``, so the projections must
    produce ``head_dim * total_num_heads`` features. Masks are validated,
    merged (key padding into the attention mask) and converted to an additive
    float mask before attention is computed. Dropout is disabled when
    ``training`` is False.

    Returns:
        ``(attn_output, attn_weights)`` where ``attn_weights`` is the average
        over the ``num_heads`` kept heads, or ``None`` if ``need_weights`` is
        False.

    Raises:
        RuntimeError: if ``attn_mask`` has an unsupported rank or shape.
    """
    tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
    if has_torch_function(tens_ops):
        return handle_torch_function(
            multi_head_attention_forward,
            tens_ops,
            query,
            key,
            value,
            embed_dim_to_check,
            total_num_heads,
            num_heads,
            in_proj_weight,
            in_proj_bias,
            bias_k,
            bias_v,
            add_zero_attn,
            dropout_p,
            out_proj_weight,
            out_proj_bias,
            training=training,
            key_padding_mask=key_padding_mask,
            need_weights=need_weights,
            attn_mask=attn_mask,
            use_separate_proj_weight=use_separate_proj_weight,
            q_proj_weight=q_proj_weight,
            k_proj_weight=k_proj_weight,
            v_proj_weight=v_proj_weight,
            static_k=static_k,
            static_v=static_v,
            subset_heads=subset_heads,
            subset_weights=subset_weights
        )

    # set up shape vars
    tgt_len, bsz, embed_dim = query.shape
    src_len, _, _ = key.shape
    assert embed_dim == embed_dim_to_check, \
        f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
    if isinstance(embed_dim, torch.Tensor):
        # embed_dim can be a tensor when JIT tracing
        head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
    else:
        head_dim = embed_dim // num_heads
    assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
    if use_separate_proj_weight:
        # allow MHA to have different embedding dimensions when separate projection weights are used
        assert key.shape[:2] == value.shape[:2], \
            f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
    else:
        assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"

    #
    # compute in-projection
    #
    if not use_separate_proj_weight:
        q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
    else:
        assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
        assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
        assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
        if in_proj_bias is None:
            b_q = b_k = b_v = None
        else:
            # The packed bias is split into equal thirds for q, k and v.
            b_q, b_k, b_v = in_proj_bias.chunk(3)
        q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)

    # prep attention mask
    if attn_mask is not None:
        if attn_mask.dtype == torch.uint8:
            warnings.warn("Byte tensor for attn_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
            attn_mask = attn_mask.to(torch.bool)
        else:
            assert attn_mask.is_floating_point() or attn_mask.dtype == torch.bool, \
                f"Only float, byte, and bool types are supported for attn_mask, not {attn_mask.dtype}"
        # ensure attn_mask's dim is 3
        if attn_mask.dim() == 2:
            correct_2d_size = (tgt_len, src_len)
            if attn_mask.shape != correct_2d_size:
                raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
            attn_mask = attn_mask.unsqueeze(0)
        elif attn_mask.dim() == 3:
            correct_3d_size = (bsz * total_num_heads, tgt_len, src_len)
            if attn_mask.shape != correct_3d_size:
                raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
        else:
            raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")

    # prep key padding mask
    if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8:
        warnings.warn("Byte tensor for key_padding_mask in nn.MultiheadAttention is deprecated. Use bool tensor instead.")
        key_padding_mask = key_padding_mask.to(torch.bool)

    # add bias along batch dimension (currently second)
    if bias_k is not None and bias_v is not None:
        assert static_k is None, "bias cannot be added to static key."
        assert static_v is None, "bias cannot be added to static value."
        k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
        v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
        # The extra key position needs one more (unmasked) mask column.
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))
    else:
        assert bias_k is None
        assert bias_v is None

    #
    # reshape q, k, v for multihead attention and make em batch first
    #
    q = q.contiguous().view(tgt_len, bsz * total_num_heads, head_dim).transpose(0, 1)
    if static_k is None:
        k = k.contiguous().view(k.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1)
    else:
        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
        assert static_k.size(0) == bsz * total_num_heads, \
            f"expecting static_k.size(0) of {bsz * total_num_heads}, but got {static_k.size(0)}"
        assert static_k.size(2) == head_dim, \
            f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
        k = static_k
    if static_v is None:
        v = v.contiguous().view(v.shape[0], bsz * total_num_heads, head_dim).transpose(0, 1)
    else:
        # TODO finish disentangling control flow so we don't do in-projections when statics are passed
        assert static_v.size(0) == bsz * total_num_heads, \
            f"expecting static_v.size(0) of {bsz * total_num_heads}, but got {static_v.size(0)}"
        assert static_v.size(2) == head_dim, \
            f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
        v = static_v

    # add zero attention along batch dimension (now first)
    if add_zero_attn:
        zero_attn_shape = (bsz * total_num_heads, 1, head_dim)
        k = torch.cat([k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=1)
        v = torch.cat([v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=1)
        if attn_mask is not None:
            attn_mask = pad(attn_mask, (0, 1))
        if key_padding_mask is not None:
            key_padding_mask = pad(key_padding_mask, (0, 1))

    # update source sequence length after adjustments
    src_len = k.size(1)

    # merge key padding and attention masks
    if key_padding_mask is not None:
        assert key_padding_mask.shape == (bsz, src_len), \
            f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
        # Broadcast the per-example mask to every candidate head.
        key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len).   \
            expand(-1, total_num_heads, -1, -1).reshape(bsz * total_num_heads, 1, src_len)
        if attn_mask is None:
            attn_mask = key_padding_mask
        elif attn_mask.dtype == torch.bool:
            attn_mask = attn_mask.logical_or(key_padding_mask)
        else:
            attn_mask = attn_mask.masked_fill(key_padding_mask, float("-inf"))

    # convert mask to float
    if attn_mask is not None and attn_mask.dtype == torch.bool:
        new_attn_mask = torch.zeros_like(attn_mask, dtype=torch.float)
        new_attn_mask.masked_fill_(attn_mask, float("-inf"))
        attn_mask = new_attn_mask

    # adjust dropout probability
    if not training:
        dropout_p = 0.0

    #
    # (deep breath) calculate attention and out projection
    #
    attn_output, attn_output_weights = _scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, bsz, subset_heads, subset_weights)
    attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
    attn_output = linear(attn_output, out_proj_weight, out_proj_bias)

    if need_weights:
        # average attention weights over heads
        attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
        return attn_output, attn_output_weights.sum(dim=1) / num_heads
    else:
        return attn_output, None

================================================
FILE: examples/attention_head_selection/src/speech_to_text_head_selection.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch from fairseq.optim.amp_optimizer import AMPOptimizer from fairseq.tasks import register_task from fairseq.tasks.speech_to_text import SpeechToTextTask from .data.speech_to_text_dataset_with_domain import SpeechToTextDatasetCreatorWithDomain from .loss.attention_head_selection import HeadSelectionLoss @register_task("speech_to_text_head_selection") class SpeechToTextHeadSelectionTask(SpeechToTextTask): @classmethod def add_args(cls, parser): SpeechToTextTask.add_args(parser) parser.add_argument( "--task-type", type=str, default="lang", help="task type for head selection, lang or domain" ) parser.add_argument( "--kl-weight", type=float, default=0.0, help="the weight of KL loss" ) def __init__(self, args, tgt_dict): super().__init__(args, tgt_dict) self.task_type = args.task_type assert self.task_type in ["lang", "domain"], "invalid task_type: {}, should be either lang or domain".format(self.task_type) self.map_task_to_id(args.train_subset) self.encoder_head_prior = float(args.decoder_attention_heads) / args.total_decoder_attention_heads self.decoder_head_prior = float(args.encoder_attention_heads) / args.total_encoder_attention_heads self.kl_loss = HeadSelectionLoss(args) def map_task_to_id(self, train_subset): src_lang_set, tgt_lang_set, domain_set = set(), set(), set() for split in train_subset.split(","): seq = split.split("_") assert len(seq) == 4, "subset {} should be in the format of train_src_tgt_domain".format(split) _, src_lang, tgt_lang, domain = seq src_lang_set.add(src_lang) tgt_lang_set.add(tgt_lang) domain_set.add(domain) src_langs = sorted(src_lang_set) tgt_langs = sorted(tgt_lang_set) domains = sorted(domain_set) self.src_lang_map = {src_lang: i for (i, src_lang) in enumerate(src_langs)} self.tgt_lang_map = {tgt_lang: i for (i, tgt_lang) in enumerate(tgt_langs)} self.domain_map = {domain: i for (i, domain) in enumerate(domains)} if self.task_type == "lang": self.encoder_tasks = len(self.src_lang_map) self.decoder_tasks = 
len(self.tgt_lang_map) elif self.task_type == "domain": self.encoder_tasks = len(self.domain_map) self.decoder_tasks = len(self.domain_map) def load_dataset(self, split, epoch=1, combine=False, **kwargs): is_train_split = split.startswith("train") pre_tokenizer = self.build_tokenizer(self.args) bpe_tokenizer = self.build_bpe(self.args) self.datasets[split] = SpeechToTextDatasetCreatorWithDomain.from_tsv( self.args.data, self.data_cfg, split, self.tgt_dict, pre_tokenizer, bpe_tokenizer, is_train_split=is_train_split, epoch=epoch, seed=self.args.seed, src_lang_map=self.src_lang_map, tgt_lang_map=self.tgt_lang_map, domain_map=self.domain_map, speaker_to_id=self.speaker_to_id ) def build_model(self, args): args.encoder_tasks = self.encoder_tasks args.decoder_tasks = self.decoder_tasks return super(SpeechToTextHeadSelectionTask, self).build_model(args) def get_sample_sizes(self, sample, task_ids, num_tasks): """ task_ids: (bsz,) get sample sizes for each task """ bsz = task_ids.size(0) mat = torch.zeros((num_tasks, bsz), device=task_ids.device) mat[task_ids, torch.arange(bsz)] = 1.0 ntokens = torch.sum(sample['target'] != 1, dim=-1) sample_sizes = torch.matmul(mat, ntokens.float()) return sample_sizes def train_step( self, sample, model, criterion, optimizer, update_num, ignore_grad=False ): model.train() model.set_num_updates(update_num) # task ids if self.task_type == "lang": encoder_task_ids = sample["src_lang_ids"] decoder_task_ids = sample["tgt_lang_ids"] elif self.task_type == "domain": encoder_task_ids = sample["domain_ids"] decoder_task_ids = sample["domain_ids"] model.encoder.set_task_ids(encoder_task_ids) model.decoder.set_task_ids(decoder_task_ids) with torch.autograd.profiler.record_function("forward"): with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))): loss, sample_size, logging_output = criterion(model, sample) # KL loss if self.args.encoder_attn_head_select: sample_sizes = self.get_sample_sizes(sample, encoder_task_ids, 
self.encoder_tasks) loss += self.kl_loss( model.encoder.attn_head_selector.head_samples, sample_sizes, self.encoder_head_prior ) if self.args.decoder_self_attn_head_select: sample_sizes = self.get_sample_sizes(sample, decoder_task_ids, self.decoder_tasks) loss += self.kl_loss( model.decoder.self_attn_head_selector.head_samples, sample_sizes, self.decoder_head_prior ) if self.args.dec_enc_attn_head_select: sample_sizes = self.get_sample_sizes(sample, decoder_task_ids, self.decoder_tasks) loss += self.kl_loss( model.decoder.enc_attn_head_selector.head_sampes, sample_sizes, self.decoder_head_prior ) if ignore_grad: loss *= 0 with torch.autograd.profiler.record_function("backward"): optimizer.backward(loss) return loss, sample_size, logging_output def valid_step(self, sample, model, criterion): model.eval() # task ids if self.task_type == "lang": encoder_task_ids = sample["src_lang_ids"] decoder_task_ids = sample["tgt_lang_ids"] elif self.task_type == "domain": encoder_task_ids = sample["domain_ids"] decoder_task_ids = sample["domain_ids"] model.encoder.set_task_ids(encoder_task_ids) model.decoder.set_task_ids(decoder_task_ids) with torch.no_grad(): loss, sample_size, logging_output = criterion(model, sample) return loss, sample_size, logging_output def inference_step( self, generator, models, sample, prefix_tokens=None, constraints=None ): with torch.no_grad(): # task ids if self.task_type == "lang": encoder_task_ids = sample["src_lang_ids"][:1] decoder_task_ids = sample["tgt_lang_ids"][:1] elif self.task_type == "domain": encoder_task_ids = sample["domain_ids"][:1] decoder_task_ids = sample["domain_ids"][:1] for model in models: model.encoder.set_task_ids(encoder_task_ids) model.decoder.set_task_ids(decoder_task_ids) return generator.generate( models, sample, prefix_tokens=prefix_tokens, constraints=constraints ) ================================================ FILE: examples/audio_nlp/nlu/README.md ================================================ # End-to-end NLU 
End-to-end spoken language understanding (SLU) predicts intent directly from audio using a single model. It promises to improve the performance of assistant systems by leveraging acoustic information lost in the intermediate textual representation and preventing cascading errors from Automatic Speech Recognition (ASR). Further, having one unified model has efficiency advantages when deploying assistant systems on-device. This page releases the code for reproducing the results in [STOP: A dataset for Spoken Task Oriented Semantic Parsing](https://arxiv.org/abs/2207.10643) The dataset can be downloaded here: [download link](https://dl.fbaipublicfiles.com/stop/stop.tar.gz) The low-resource splits can be downloaded here: [download link](http://dl.fbaipublicfiles.com/stop/low_resource_splits.tar.gz) ## Pretrained models end-to-end NLU Models | Speech Pretraining | ASR Pretraining | Test EM Accuracy | Test EM-Tree Accuracy | Link | | ----------- | ----------- |----------|----------|----------| | None | None | 36.54 | 57.01 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-none-none.pt) | | Wav2Vec | None | 68.05 | 82.53 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-none.pt) | | HuBERT | None | 68.40 | 82.85 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-none.pt) | | Wav2Vec | STOP | 68.70 | 82.78 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-stop.pt) | | HuBERT | STOP | 69.23 | 82.87 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-stop.pt) | | Wav2Vec | Librispeech | 68.47 | 82.49 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-wav2vec-ls.pt) | | HuBERT | Librispeech | 68.70 | 82.78 | [link](https://dl.fbaipublicfiles.com/stop/end-to-end-nlu-hubert-ls.pt) | ## Pretrained models ASR Models | Speech Pre-training | ASR Dataset | STOP Eval WER | STOP Test WER | dev\_other WER | dev\_clean WER | test\_clean WER | test\_other WER | Link | | ----------- | ----------- | 
----------- | ----------- | ----------- | ----------- | ----------- | ----------- | ----------- | | HuBERT | Librispeech | 8.47 | 2.99 | 3.25 | 8.06 | 25.68 | 26.19 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-ls.pt) | | Wav2Vec | Librispeech | 9.215 | 3.204 | 3.334 | 9.006 | 27.257 | 27.588 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-ls.pt) | | HuBERT | STOP | 46.31 | 31.30 | 31.52 | 47.16 | 4.29 | 4.26 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-stop.pt) | | Wav2Vec | STOP | 43.103 | 27.833 | 28.479 | 28.479 | 4.679 | 4.667 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-stop.pt) | | HuBERT | Librispeech + STOP | 9.015 | 3.211 | 3.372 | 8.635 | 5.133 | 5.056 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-hubert-ls-stop.pt) | | Wav2Vec | Librispeech + STOP | 9.549 | 3.537 | 3.625 | 9.514 | 5.59 | 5.562 | [link](https://dl.fbaipublicfiles.com/stop/ctc-asr-wav2vec-ls-stop.pt) | ## Creating the fairseq datasets from STOP First, create the audio file manifests and label files: ``` python examples/audio_nlp/nlu/generate_manifests.py --stop_root $STOP_DOWNLOAD_DIR/stop --output $FAIRSEQ_DATASET_OUTPUT/ ``` Run `./examples/audio_nlp/nlu/create_dict_stop.sh $FAIRSEQ_DATASET_OUTPUT` to generate the fairseq dictionaries. 
## Training an End-to-end NLU Model Download a wav2vec or hubert model from [link](https://github.com/facebookresearch/fairseq/tree/main/examples/hubert) or [link](https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec). ``` python fairseq_cli/hydra_train.py --config-dir examples/audio_nlp/nlu/configs/ --config-name nlu_finetuning task.data=$FAIRSEQ_DATASET_OUTPUT model.w2v_path=$PRETRAINED_MODEL_PATH ``` ================================================ FILE: examples/audio_nlp/nlu/configs/nlu_finetuning.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 10 tensorboard_logdir: tb checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: em_error save_interval: 10 task: _name: nlu_finetuning data: ??? labels: parse eval_wer_parse: true autoregressive: true dataset: num_workers: 6 max_tokens: 1600000 skip_invalid_size_inputs_valid_test: true valid_subset: eval,test train_subset: train validate_interval: 10 criterion: _name: label_smoothed_cross_entropy optimization: max_update: 320000 lr: [0.0001] sentence_avg: true update_freq: [1] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_seq2seq w2v_path: ??? autoregressive: true apply_mask: true mask_prob: 0.5 mask_channel_prob: 0.5 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 0 ================================================ FILE: examples/audio_nlp/nlu/create_dict_stop.sh ================================================ #!/bin/bash ### Script handling creation of data binaries ### for model training within fairseq fairseq_root="." 
import argparse
from pathlib import Path


def get_insl_frame(parse):
    """Turn a bracketed parse string into a space-separated target sequence.

    Tokens whose first character is ``[`` or ``]`` are upper-cased and kept
    as single symbols. Each run of other tokens between two such symbols is
    upper-cased, joined with ``_`` and then emitted one character at a time.
    The returned string always ends with ``' | '``.

    Tokens that follow the last bracket token are never flushed, so they do
    not appear in the output.
    """
    def is_ont_token(tok):
        return tok[0] in ("[", "]")

    res = []
    pending = []
    for tok in parse.split():
        if is_ont_token(tok):
            # ``extend`` on a string adds its characters one by one, which
            # yields letter-level symbols with '_' between words.
            res.extend('_'.join(pending))
            pending = []
            res.append(tok.upper())
        else:
            pending.append(tok.upper())

    return " ".join(res) + ' | '


def sequencify_utterance(utterance):
    """Return *utterance* upper-cased as space-separated characters.

    Spaces are replaced by ``|`` and one extra ``|`` is appended before the
    characters are separated, e.g. ``"hi you"`` -> ``"H I | Y O U |"``.
    """
    utterance = utterance.upper()
    utterance = utterance.replace(' ', '|') + '|'
    return ' '.join(utterance)


def generate_fairseq_manifests(manifest, output_path, audio_root=None):
    """Write the ``.parse``, ``.ltr`` and ``.tsv`` files for one split.

    Args:
        manifest: path of a tab-separated file whose first line names the
            columns. The columns ``decoupled_normalized_seqlogical``,
            ``normalized_utterance`` and ``file_id`` are read.
        output_path: output path without extension; the three suffixes are
            applied to it with ``Path.with_suffix``.
        audio_root: directory that every ``file_id`` is relative to. It is
            written as the first line of the ``.tsv`` file.

    Raises:
        ValueError: if ``audio_root`` is not given (the previous behaviour
            was to write a ``None`` header and then fail on path joining).
        FileNotFoundError: if a referenced audio file does not exist.
    """
    if audio_root is None:
        raise ValueError('audio_root is required to resolve the audio files')

    # Imported here so the text-only helpers above stay importable on
    # machines without the audio dependency.
    import soundfile

    audio_root = Path(audio_root)
    output_path = Path(output_path)

    parses = []
    utterances = []
    filepaths = []
    with open(manifest, 'r') as rows:
        keys = None
        for idx, line in enumerate(rows):
            if idx == 0:
                keys = line.strip().split('\t')
                continue
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing one) in the manifest.
                continue
            # Drop the line terminator first so the last column does not
            # carry a trailing '\n' into paths or label text.
            data = dict(zip(keys, line.rstrip('\n').split('\t')))
            parses.append(get_insl_frame(data['decoupled_normalized_seqlogical']))
            utterances.append(sequencify_utterance(data['normalized_utterance']))
            filepaths.append(data['file_id'])

    with open(str(output_path.with_suffix('.parse')), 'w') as o:
        for p in parses:
            o.write(p + '\n')

    with open(str(output_path.with_suffix('.ltr')), 'w') as o:
        for u in utterances:
            o.write(u + '\n')

    with open(str(output_path.with_suffix('.tsv')), 'w') as o:
        o.write(str(audio_root) + '\n')
        for f in filepaths:
            fullpath = audio_root / f
            # Explicit check instead of ``assert`` so it survives ``python -O``.
            if not fullpath.exists():
                raise FileNotFoundError(f'{fullpath}')
            frames = soundfile.info(fullpath).frames
            o.write(f'{f}\t{frames}\n')


def main(args):
    """Generate the fairseq files for the train, eval and test splits.

    Reads ``<stop_root>/manifests/<split>.tsv`` and writes
    ``<output>/<split>.{parse,ltr,tsv}``.
    """
    root = Path(args.stop_root)
    output_root = Path(args.output)
    output_root.mkdir(parents=True, exist_ok=True)

    for split in ('train', 'eval', 'test'):
        stop_manifest_path = root / 'manifests' / (split + '.tsv')
        output_path = output_root / split
        generate_fairseq_manifests(stop_manifest_path, output_path, root)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Create fairseq audio manifests and label files from STOP.'
    )
    parser.add_argument('--stop_root', type=str, required=True,
                        help='path to stop root directory')
    parser.add_argument('--output', type=str, required=True,
                        help='output directory')
    args = parser.parse_args()
    main(args)
([Edunov et al., 2018](https://arxiv.org/abs/1808.09381))
WMT'18 winner | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz)
See NOTE in the archive ## Example usage (torch.hub) We require a few additional Python dependencies for preprocessing: ```bash pip install subword_nmt sacremoses ``` Then to generate translations from the full model ensemble: ```python import torch # List available models torch.hub.list('pytorch/fairseq') # [..., 'transformer.wmt18.en-de', ... ] # Load the WMT'18 En-De ensemble en2de_ensemble = torch.hub.load( 'pytorch/fairseq', 'transformer.wmt18.en-de', checkpoint_file='wmt18.model1.pt:wmt18.model2.pt:wmt18.model3.pt:wmt18.model4.pt:wmt18.model5.pt', tokenizer='moses', bpe='subword_nmt') # The ensemble contains 5 models len(en2de_ensemble.models) # 5 # Translate en2de_ensemble.translate('Hello world!') # 'Hallo Welt!' ``` ## Training your own model (WMT'18 English-German) The following instructions can be adapted to reproduce the models from the paper. #### Step 1. Prepare parallel data and optionally train a baseline (English-German) model First download and preprocess the data: ```bash # Download and prepare the data cd examples/backtranslation/ bash prepare-wmt18en2de.sh cd ../.. 
# Binarize the data TEXT=examples/backtranslation/wmt18_en_de fairseq-preprocess \ --joined-dictionary \ --source-lang en --target-lang de \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --destdir data-bin/wmt18_en_de --thresholdtgt 0 --thresholdsrc 0 \ --workers 20 # Copy the BPE code into the data-bin directory for future use cp examples/backtranslation/wmt18_en_de/code data-bin/wmt18_en_de/code ``` (Optionally) Train a baseline model (English-German) using just the parallel data: ```bash CHECKPOINT_DIR=checkpoints_en_de_parallel fairseq-train --fp16 \ data-bin/wmt18_en_de \ --source-lang en --target-lang de \ --arch transformer_wmt_en_de_big --share-all-embeddings \ --dropout 0.3 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ --max-tokens 3584 --update-freq 16 \ --max-update 30000 \ --save-dir $CHECKPOINT_DIR # Note: the above command assumes 8 GPUs. Adjust `--update-freq` if you have a # different number of GPUs. 
``` Average the last 10 checkpoints: ```bash python scripts/average_checkpoints.py \ --inputs $CHECKPOINT_DIR \ --num-epoch-checkpoints 10 \ --output $CHECKPOINT_DIR/checkpoint.avg10.pt ``` Evaluate BLEU: ```bash # tokenized BLEU on newstest2017: bash examples/backtranslation/tokenized_bleu.sh \ wmt17 \ en-de \ data-bin/wmt18_en_de \ data-bin/wmt18_en_de/code \ $CHECKPOINT_DIR/checkpoint.avg10.pt # BLEU4 = 29.57, 60.9/35.4/22.9/15.5 (BP=1.000, ratio=1.014, syslen=63049, reflen=62152) # compare to 29.46 in Table 1, which is also for tokenized BLEU # generally it's better to report (detokenized) sacrebleu though: bash examples/backtranslation/sacrebleu.sh \ wmt17 \ en-de \ data-bin/wmt18_en_de \ data-bin/wmt18_en_de/code \ $CHECKPOINT_DIR/checkpoint.avg10.pt # BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt17+tok.13a+version.1.4.3 = 29.0 60.6/34.7/22.4/14.9 (BP = 1.000 ratio = 1.013 hyp_len = 62099 ref_len = 61287) ``` #### Step 2. Back-translate monolingual German data Train a reverse model (German-English) to do the back-translation: ```bash CHECKPOINT_DIR=checkpoints_de_en_parallel fairseq-train --fp16 \ data-bin/wmt18_en_de \ --source-lang de --target-lang en \ --arch transformer_wmt_en_de_big --share-all-embeddings \ --dropout 0.3 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr 0.001 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ --max-tokens 3584 --update-freq 16 \ --max-update 30000 \ --save-dir $CHECKPOINT_DIR # Note: the above command assumes 8 GPUs. Adjust `--update-freq` if you have a # different number of GPUs. 
``` Let's evaluate the back-translation (BT) model to make sure it is well trained: ```bash bash examples/backtranslation/sacrebleu.sh \ wmt17 \ de-en \ data-bin/wmt18_en_de \ data-bin/wmt18_en_de/code \ $CHECKPOINT_DIR/checkpoint_best.pt # BLEU+case.mixed+lang.de-en+numrefs.1+smooth.exp+test.wmt17+tok.13a+version.1.4.3 = 34.9 66.9/41.8/28.5/19.9 (BP = 0.983 ratio = 0.984 hyp_len = 63342 ref_len = 64399) # compare to the best system from WMT'17 which scored 35.1: http://matrix.statmt.org/matrix/systems_list/1868 ``` Next prepare the monolingual data: ```bash # Download and prepare the monolingual data # By default the script samples 25M monolingual sentences, which after # deduplication should be just over 24M sentences. These are split into 25 # shards, each with 1M sentences (except for the last shard). cd examples/backtranslation/ bash prepare-de-monolingual.sh cd ../.. # Binarize each shard of the monolingual data TEXT=examples/backtranslation/wmt18_de_mono for SHARD in $(seq -f "%02g" 0 24); do \ fairseq-preprocess \ --only-source \ --source-lang de --target-lang en \ --joined-dictionary \ --srcdict data-bin/wmt18_en_de/dict.de.txt \ --testpref $TEXT/bpe.monolingual.dedup.${SHARD} \ --destdir data-bin/wmt18_de_mono/shard${SHARD} \ --workers 20; \ cp data-bin/wmt18_en_de/dict.en.txt data-bin/wmt18_de_mono/shard${SHARD}/; \ done ``` Now we're ready to perform back-translation over the monolingual data. 
The following command generates via sampling, but it's possible to use greedy decoding (`--beam 1`), beam search (`--beam 5`), top-k sampling (`--sampling --beam 1 --sampling-topk 10`), etc.: ```bash mkdir backtranslation_output for SHARD in $(seq -f "%02g" 0 24); do \ fairseq-generate --fp16 \ data-bin/wmt18_de_mono/shard${SHARD} \ --path $CHECKPOINT_DIR/checkpoint_best.pt \ --skip-invalid-size-inputs-valid-test \ --max-tokens 4096 \ --sampling --beam 1 \ > backtranslation_output/sampling.shard${SHARD}.out; \ done ``` After BT, use the `extract_bt_data.py` script to re-combine the shards, extract the back-translations and apply length ratio filters: ```bash python examples/backtranslation/extract_bt_data.py \ --minlen 1 --maxlen 250 --ratio 1.5 \ --output backtranslation_output/bt_data --srclang en --tgtlang de \ backtranslation_output/sampling.shard*.out # Ensure lengths are the same: # wc -l backtranslation_output/bt_data.{en,de} # 21795614 backtranslation_output/bt_data.en # 21795614 backtranslation_output/bt_data.de # 43591228 total ``` Binarize the filtered BT data and combine it with the parallel data: ```bash TEXT=backtranslation_output fairseq-preprocess \ --source-lang en --target-lang de \ --joined-dictionary \ --srcdict data-bin/wmt18_en_de/dict.en.txt \ --trainpref $TEXT/bt_data \ --destdir data-bin/wmt18_en_de_bt \ --workers 20 # We want to train on the combined data, so we'll symlink the parallel + BT data # in the wmt18_en_de_para_plus_bt directory. We link the parallel data as "train" # and the BT data as "train1", so that fairseq will combine them automatically # and so that we can use the `--upsample-primary` option to upsample the # parallel data (if desired). 
PARA_DATA=$(readlink -f data-bin/wmt18_en_de) BT_DATA=$(readlink -f data-bin/wmt18_en_de_bt) COMB_DATA=data-bin/wmt18_en_de_para_plus_bt mkdir -p $COMB_DATA for LANG in en de; do \ ln -s ${PARA_DATA}/dict.$LANG.txt ${COMB_DATA}/dict.$LANG.txt; \ for EXT in bin idx; do \ ln -s ${PARA_DATA}/train.en-de.$LANG.$EXT ${COMB_DATA}/train.en-de.$LANG.$EXT; \ ln -s ${BT_DATA}/train.en-de.$LANG.$EXT ${COMB_DATA}/train1.en-de.$LANG.$EXT; \ ln -s ${PARA_DATA}/valid.en-de.$LANG.$EXT ${COMB_DATA}/valid.en-de.$LANG.$EXT; \ ln -s ${PARA_DATA}/test.en-de.$LANG.$EXT ${COMB_DATA}/test.en-de.$LANG.$EXT; \ done; \ done ``` #### 3. Train an English-German model over the combined parallel + BT data Finally we can train a model over the parallel + BT data: ```bash CHECKPOINT_DIR=checkpoints_en_de_parallel_plus_bt fairseq-train --fp16 \ data-bin/wmt18_en_de_para_plus_bt \ --upsample-primary 16 \ --source-lang en --target-lang de \ --arch transformer_wmt_en_de_big --share-all-embeddings \ --dropout 0.3 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr 0.0007 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ --max-tokens 3584 --update-freq 16 \ --max-update 100000 \ --save-dir $CHECKPOINT_DIR # Note: the above command assumes 8 GPUs. Adjust `--update-freq` if you have a # different number of GPUs. 
``` Average the last 10 checkpoints: ```bash python scripts/average_checkpoints.py \ --inputs $CHECKPOINT_DIR \ --num-epoch-checkpoints 10 \ --output $CHECKPOINT_DIR/checkpoint.avg10.pt ``` Evaluate BLEU: ```bash # tokenized BLEU on newstest2017: bash examples/backtranslation/tokenized_bleu.sh \ wmt17 \ en-de \ data-bin/wmt18_en_de \ data-bin/wmt18_en_de/code \ $CHECKPOINT_DIR/checkpoint.avg10.pt # BLEU4 = 32.35, 64.4/38.9/26.2/18.3 (BP=0.977, ratio=0.977, syslen=60729, reflen=62152) # compare to 32.35 in Table 1, which is also for tokenized BLEU # generally it's better to report (detokenized) sacrebleu: bash examples/backtranslation/sacrebleu.sh \ wmt17 \ en-de \ data-bin/wmt18_en_de \ data-bin/wmt18_en_de/code \ $CHECKPOINT_DIR/checkpoint.avg10.pt # BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt17+tok.13a+version.1.4.3 = 31.5 64.3/38.2/25.6/17.6 (BP = 0.971 ratio = 0.971 hyp_len = 59515 ref_len = 61287) ``` ## Citation ```bibtex @inproceedings{edunov2018backtranslation, title = {Understanding Back-Translation at Scale}, author = {Edunov, Sergey and Ott, Myle and Auli, Michael and Grangier, David}, booktitle = {Conference of the Association for Computational Linguistics (ACL)}, year = 2018, } ``` ================================================ FILE: examples/backtranslation/deduplicate_lines.py ================================================ #!/usr/bin/python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse
import fileinput


def _safe_index(toks, index, default):
    """Return ``toks[index]``, or *default* when the index is out of range."""
    try:
        return toks[index]
    except IndexError:
        return default


def _is_valid_pair(src, tgt, minlen=None, maxlen=None, ratio=None):
    """Return True when the (src, tgt) pair passes every enabled filter.

    Lengths are counted by splitting on single spaces; an empty string has
    length 0. A filter whose threshold is ``None`` is disabled.

    When the ratio filter is enabled and either side is empty, the pair is
    rejected; the length ratio is undefined there and the division used to
    raise ``ZeroDivisionError``.
    """
    srclen = len(src.split(" ")) if src != "" else 0
    tgtlen = len(tgt.split(" ")) if tgt != "" else 0

    if minlen is not None and (srclen < minlen or tgtlen < minlen):
        return False
    if maxlen is not None and (srclen > maxlen or tgtlen > maxlen):
        return False
    if ratio is not None:
        shorter = min(srclen, tgtlen)
        if shorter == 0 or max(srclen, tgtlen) / float(shorter) > ratio:
            return False
    return True


def main():
    """Extract filtered (hypothesis, source) pairs from fairseq-generate logs.

    For every ``S-`` line the second tab-separated field is remembered as the
    target-side text. The first following ``H-`` line supplies the
    source-side text from its third field. Pairs that pass the length and
    ratio filters are written line-aligned to ``<output>.<srclang>`` and
    ``<output>.<tgtlang>``. Further ``H-`` lines for the same ``S-`` line
    are ignored.
    """
    # Progress bar only; imported here so the filtering helpers can be used
    # without tqdm installed.
    from tqdm import tqdm

    parser = argparse.ArgumentParser(
        description=(
            "Extract back-translations from the stdout of fairseq-generate. "
            "If there are multiply hypotheses for a source, we only keep the first one. "
        )
    )
    parser.add_argument("--output", required=True, help="output prefix")
    parser.add_argument(
        "--srclang", required=True, help="source language (extracted from H-* lines)"
    )
    parser.add_argument(
        "--tgtlang", required=True, help="target language (extracted from S-* lines)"
    )
    parser.add_argument("--minlen", type=int, help="min length filter")
    parser.add_argument("--maxlen", type=int, help="max length filter")
    parser.add_argument("--ratio", type=float, help="ratio filter")
    parser.add_argument("files", nargs="*", help="input files")
    args = parser.parse_args()

    # Start with no pending sentence so an ``H-`` line that appears before
    # any ``S-`` line is skipped instead of raising UnboundLocalError.
    tgt = None

    src_path = args.output + "." + args.srclang
    tgt_path = args.output + "." + args.tgtlang
    with open(src_path, "w") as src_h, open(tgt_path, "w") as tgt_h, \
            fileinput.input(args.files) as lines:
        for line in tqdm(lines):
            if line.startswith("S-"):
                tgt = _safe_index(line.rstrip().split("\t"), 1, "")
            elif line.startswith("H-"):
                if tgt is None:
                    continue
                src = _safe_index(line.rstrip().split("\t"), 2, "")
                if _is_valid_pair(src, tgt, args.minlen, args.maxlen, args.ratio):
                    print(src, file=src_h)
                    print(tgt, file=tgt_h)
                # Only the first hypothesis per source sentence is kept.
                tgt = None


if __name__ == "__main__":
    main()
"news.2012.de.shuffled.gz" "news.2013.de.shuffled.gz" "news.2014.de.shuffled.v2.gz" "news.2015.de.shuffled.gz" "news.2016.de.shuffled.gz" "news.2017.de.shuffled.deduped.gz" ) cd $orig for ((i=0;i<${#URLS[@]};++i)); do file=${FILES[i]} if [ -f $file ]; then echo "$file already exists, skipping download" else url=${URLS[i]} wget "$url" fi done cd .. if [ -f $tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG} ]; then echo "found monolingual sample, skipping shuffle/sample/tokenize" else gzip -c -d -k $(for FILE in "${FILES[@]}"; do echo $orig/$FILE; done) \ | shuf -n $SUBSAMPLE_SIZE \ | perl $NORM_PUNC $LANG \ | perl $REM_NON_PRINT_CHAR \ | perl $TOKENIZER -threads 8 -a -l $LANG \ > $tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG} fi if [ -f $tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG} ]; then echo "found BPE monolingual sample, skipping BPE step" else python $BPEROOT/apply_bpe.py -c $BPE_CODE \ < $tmp/monolingual.${SUBSAMPLE_SIZE}.${LANG} \ > $tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG} fi if [ -f $tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG} ]; then echo "found deduplicated monolingual sample, skipping deduplication step" else python deduplicate_lines.py $tmp/bpe.monolingual.${SUBSAMPLE_SIZE}.${LANG} \ > $tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG} fi if [ -f $OUTDIR/bpe.monolingual.dedup.00.de ]; then echo "found sharded data, skipping sharding step" else split --lines 1000000 --numeric-suffixes \ --additional-suffix .${LANG} \ $tmp/bpe.monolingual.dedup.${SUBSAMPLE_SIZE}.${LANG} \ $OUTDIR/bpe.monolingual.dedup. fi ================================================ FILE: examples/backtranslation/prepare-wmt18en2de.sh ================================================ #!/bin/bash # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh echo 'Cloning Moses github repository (for tokenization scripts)...' git clone https://github.com/moses-smt/mosesdecoder.git echo 'Cloning Subword NMT repository (for BPE pre-processing)...' 
git clone https://github.com/rsennrich/subword-nmt.git SCRIPTS=mosesdecoder/scripts TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl CLEAN=$SCRIPTS/training/clean-corpus-n.perl NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl BPEROOT=subword-nmt/subword_nmt BPE_TOKENS=32000 URLS=( "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz" "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz" "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz" "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz" "http://data.statmt.org/wmt17/translation-task/dev.tgz" "http://statmt.org/wmt14/test-full.tgz" ) FILES=( "training-parallel-europarl-v7.tgz" "training-parallel-commoncrawl.tgz" "training-parallel-nc-v13.tgz" "rapid2016.tgz" "dev.tgz" "test-full.tgz" ) CORPORA=( "training/europarl-v7.de-en" "commoncrawl.de-en" "training-parallel-nc-v13/news-commentary-v13.de-en" "rapid2016.de-en" ) if [ ! -d "$SCRIPTS" ]; then echo "Please set SCRIPTS variable correctly to point to Moses scripts." exit 1 fi OUTDIR=wmt18_en_de src=en tgt=de lang=en-de prep=$OUTDIR tmp=$prep/tmp orig=orig mkdir -p $orig $tmp $prep cd $orig for ((i=0;i<${#URLS[@]};++i)); do file=${FILES[i]} if [ -f $file ]; then echo "$file already exists, skipping download" else url=${URLS[i]} wget "$url" if [ -f $file ]; then echo "$url successfully downloaded." else echo "$url not successfully downloaded." exit 1 fi if [ ${file: -4} == ".tgz" ]; then tar zxvf $file elif [ ${file: -4} == ".tar" ]; then tar xvf $file fi fi done cd .. echo "pre-processing train data..." for l in $src $tgt; do rm $tmp/train.tags.$lang.tok.$l for f in "${CORPORA[@]}"; do cat $orig/$f.$l | \ perl $NORM_PUNC $l | \ perl $REM_NON_PRINT_CHAR | \ perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l done done echo "pre-processing test data..." 
for l in $src $tgt; do if [ "$l" == "$src" ]; then t="src" else t="ref" fi grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \ sed -e 's/<seg id="[0-9]*">\s*//g' | \ sed -e 's/\s*<\/seg>\s*//g' | \ sed -e "s/\’/\'/g" | \ perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l echo "" done echo "splitting train and valid..." for l in $src $tgt; do awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l done TRAIN=$tmp/train.de-en BPE_CODE=$prep/code rm -f $TRAIN for l in $src $tgt; do cat $tmp/train.$l >> $TRAIN done echo "learn_bpe.py on ${TRAIN}..." python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE for L in $src $tgt; do for f in train.$L valid.$L test.$L; do echo "apply_bpe.py to ${f}..." python $BPEROOT/apply_bpe.py -c $BPE_CODE < $tmp/$f > $tmp/bpe.$f done done perl $CLEAN -ratio 1.5 $tmp/bpe.train $src $tgt $prep/train 1 250 perl $CLEAN -ratio 1.5 $tmp/bpe.valid $src $tgt $prep/valid 1 250 for L in $src $tgt; do cp $tmp/bpe.test.$L $prep/test.$L done ================================================ FILE: examples/backtranslation/sacrebleu.sh ================================================ #!/bin/bash if [ $# -ne 5 ]; then echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]" exit fi DATASET=$1 LANGPAIR=$2 DATABIN=$3 BPECODE=$4 MODEL=$5 SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1) TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2) BPEROOT=examples/backtranslation/subword-nmt/subword_nmt if [ ! -e $BPEROOT ]; then BPEROOT=subword-nmt/subword_nmt if [ ! -e $BPEROOT ]; then echo 'Cloning Subword NMT repository (for BPE pre-processing)...' 
git clone https://github.com/rsennrich/subword-nmt.git fi fi sacrebleu -t $DATASET -l $LANGPAIR --echo src \ | sacremoses tokenize -a -l $SRCLANG -q \ | python $BPEROOT/apply_bpe.py -c $BPECODE \ | fairseq-interactive $DATABIN --path $MODEL \ -s $SRCLANG -t $TGTLANG \ --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \ | grep ^H- | cut -f 3- \ | sacremoses detokenize -l $TGTLANG -q \ | sacrebleu -t $DATASET -l $LANGPAIR ================================================ FILE: examples/backtranslation/tokenized_bleu.sh ================================================ #!/bin/bash if [ $# -ne 5 ]; then echo "usage: $0 [dataset=wmt14/full] [langpair=en-de] [databin] [bpecode] [model]" exit fi DATASET=$1 LANGPAIR=$2 DATABIN=$3 BPECODE=$4 MODEL=$5 SRCLANG=$(echo $LANGPAIR | cut -d '-' -f 1) TGTLANG=$(echo $LANGPAIR | cut -d '-' -f 2) BPEROOT=examples/backtranslation/subword-nmt/subword_nmt if [ ! -e $BPEROOT ]; then BPEROOT=subword-nmt/subword_nmt if [ ! -e $BPEROOT ]; then echo 'Cloning Subword NMT repository (for BPE pre-processing)...' 
git clone https://github.com/rsennrich/subword-nmt.git fi fi TMP_REF=$(mktemp) sacrebleu -t $DATASET -l $LANGPAIR --echo ref -q \ | sacremoses normalize -l $TGTLANG -q \ | sacremoses tokenize -a -l $TGTLANG -q \ > $TMP_REF sacrebleu -t $DATASET -l $LANGPAIR --echo src -q \ | sacremoses normalize -l $SRCLANG -q \ | sacremoses tokenize -a -l $SRCLANG -q \ | python $BPEROOT/apply_bpe.py -c $BPECODE \ | fairseq-interactive $DATABIN --path $MODEL \ -s $SRCLANG -t $TGTLANG \ --beam 5 --remove-bpe --buffer-size 1024 --max-tokens 8000 \ | grep ^H- | cut -f 3- \ | fairseq-score --ref $TMP_REF rm -f $TMP_REF ================================================ FILE: examples/bart/README.glue.md ================================================ # Fine-tuning BART on GLUE tasks ### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands: ```bash wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py python download_glue_data.py --data_dir glue_data --tasks all ``` ### 2) Preprocess GLUE task data (same as RoBERTa): ```bash ./examples/roberta/preprocess_GLUE_tasks.sh glue_data ``` `glue_task_name` is one of the following: `{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` Use `ALL` for preprocessing all the glue tasks. ### 3) Fine-tuning on GLUE task: Example fine-tuning cmd for `RTE` task ```bash TOTAL_NUM_UPDATES=2036 # 10 epochs through RTE for bsz 16 WARMUP_UPDATES=61 # 6 percent of the number of updates LR=1e-05 # Peak LR for polynomial LR scheduler. NUM_CLASSES=2 MAX_SENTENCES=16 # Batch size. 
BART_PATH=/path/to/bart/model.pt CUDA_VISIBLE_DEVICES=0,1 fairseq-train RTE-bin/ \ --restore-file $BART_PATH \ --batch-size $MAX_SENTENCES \ --max-tokens 4400 \ --task sentence_prediction \ --add-prev-output-tokens \ --layernorm-embedding \ --share-all-embeddings \ --share-decoder-input-output-embed \ --reset-optimizer --reset-dataloader --reset-meters \ --required-batch-size-multiple 1 \ --init-token 0 \ --arch bart_large \ --criterion sentence_prediction \ --num-classes $NUM_CLASSES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 \ --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; ``` For each of the GLUE tasks, you will need to use the following cmd-line arguments: Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ---|---|---|---|---|---|---|---|--- `--num-classes` | 3 | 2 | 2 | 2 | 2 | 2 | 2 | 1 `--lr` | 5e-6 | 1e-5 | 1e-5 | 1e-5 | 5e-6 | 2e-5 | 2e-5 | 2e-5 `bsz` | 128 | 32 | 32 | 32 | 128 | 64 | 64 | 32 `--total-num-update` | 30968 | 33112 | 113272 | 1018 | 5233 | 1148 | 1334 | 1799 `--warmup-updates` | 1858 | 1986 | 6796 | 61 | 314 | 68 | 80 | 107 For `STS-B` additionally add `--regression-target --best-checkpoint-metric loss` and remove `--maximize-best-checkpoint-metric`. **Note:** a) `--total-num-update` is used by the `polynomial_decay` scheduler and is calculated for `--max-epoch=10` and `--batch-size=32/64/128` depending on the task. b) Above cmd-args and hyperparams are tested on Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. 
### Inference on GLUE task After training the model as mentioned in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using the following Python code snippet: ```python from fairseq.models.bart import BARTModel bart = BARTModel.from_pretrained( 'checkpoints/', checkpoint_file='checkpoint_best.pt', data_name_or_path='RTE-bin' ) label_fn = lambda label: bart.task.label_dictionary.string( [label + bart.task.label_dictionary.nspecial] ) ncorrect, nsamples = 0, 0 bart.cuda() bart.eval() with open('glue_data/RTE/dev.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[1], tokens[2], tokens[3] tokens = bart.encode(sent1, sent2) prediction = bart.predict('sentence_classification_head', tokens).argmax().item() prediction_label = label_fn(prediction) ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) ``` ================================================ FILE: examples/bart/README.md ================================================ # BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension [https://arxiv.org/abs/1910.13461](https://arxiv.org/abs/1910.13461) ## Introduction BART is a sequence-to-sequence model trained with denoising as the pretraining objective. We show that this pretraining objective is more generic and show that we can match [RoBERTa](../roberta) results on SQuAD and GLUE and gain state-of-the-art results on summarization (XSum, CNN dataset), long form generative question answering (ELI5) and dialog response generation (ConvAI2). See the associated paper for more details. 
## Pre-trained models Model | Description | # params | Download ---|---|---|--- `bart.base` | BART model with 6 encoder and decoder layers | 140M | [bart.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz) `bart.large` | BART model with 12 encoder and decoder layers | 400M | [bart.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz) `bart.large.mnli` | `bart.large` finetuned on `MNLI` | 400M | [bart.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz) `bart.large.cnn` | `bart.large` finetuned on `CNN-DM` | 400M | [bart.large.cnn.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz) `bart.large.xsum` | `bart.large` finetuned on `Xsum` | 400M | [bart.large.xsum.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz) ## Results **[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)** _(dev set, single model, single-task finetuning)_ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ---|---|---|---|---|---|---|---|--- `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 `bart.large` | 89.9 | 94.9 | 92.5 | 87.0 | 96.6 | 90.4 | 62.8 | 91.2 **[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)** _(dev set, no additional data used)_ Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 ---|---|--- `roberta.large` | 88.9/94.6 | 86.5/89.4 `bart.large` | 88.8/94.6 | 86.1/89.2 **[CNN/Daily Mail](http://nlpprogress.com/english/summarization.html)** _(test set, no additional data used)_ Model | R1 | R2 | RL ---|---|---|--- `BERTSUMEXTABS` | 42.13 | 19.60 | 39.18 `bart.large` | 44.16 | 21.28 | 40.90 ## Example usage ##### Load BART from torch.hub (PyTorch >= 1.1): ```python import torch bart = torch.hub.load('pytorch/fairseq', 'bart.large') bart.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Load BART (for PyTorch 1.0 or custom models): ```python # Download bart.large model wget 
https://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz tar -xzvf bart.large.tar.gz # Load the model in fairseq from fairseq.models.bart import BARTModel bart = BARTModel.from_pretrained('/path/to/bart.large', checkpoint_file='model.pt') bart.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Apply Byte-Pair Encoding (BPE) to input text: ```python tokens = bart.encode('Hello world!') assert tokens.tolist() == [0, 31414, 232, 328, 2] bart.decode(tokens) # 'Hello world!' ``` ##### Extract features from BART: ```python # Extract the last layer's features last_layer_features = bart.extract_features(tokens) assert last_layer_features.size() == torch.Size([1, 5, 1024]) # Extract all layer's features from decoder (layer 0 is the embedding layer) all_layers = bart.extract_features(tokens, return_all_hiddens=True) assert len(all_layers) == 13 assert torch.all(all_layers[-1] == last_layer_features) ``` ##### Use BART for sentence-pair classification tasks: ```python # Download BART already finetuned for MNLI bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli') bart.eval() # disable dropout for evaluation # Encode a pair of sentences and make a prediction tokens = bart.encode('BART is a seq2seq model.', 'BART is not sequence to sequence.') bart.predict('mnli', tokens).argmax() # 0: contradiction # Encode another pair of sentences tokens = bart.encode('BART is denoising autoencoder.', 'BART is version of autoencoder.') bart.predict('mnli', tokens).argmax() # 2: entailment ``` ##### Register a new (randomly initialized) classification head: ```python bart.register_classification_head('new_task', num_classes=3) logprobs = bart.predict('new_task', tokens) ``` ##### Batched prediction: ```python import torch from fairseq.data.data_utils import collate_tokens bart = torch.hub.load('pytorch/fairseq', 'bart.large.mnli') bart.eval() batch_of_pairs = [ ['BART is a seq2seq model.', 'BART is not sequence to sequence.'], ['BART is denoising 
autoencoder.', 'BART is version of autoencoder.'], ] batch = collate_tokens( [bart.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1 ) logprobs = bart.predict('mnli', batch) print(logprobs.argmax(dim=1)) # tensor([0, 2]) ``` ##### Using the GPU: ```python bart.cuda() bart.predict('new_task', tokens) ``` #### Filling masks: BART can be used to fill multiple `<mask>` tokens in the input. ```python bart = torch.hub.load('pytorch/fairseq', 'bart.base') bart.eval() bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10) # [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))]] ``` Note that by default we enforce the output length to match the input length. This can be disabled by setting ``match_source_len=False``: ``` bart.fill_mask(['The cat <mask> on the <mask>.'], topk=3, beam=10, match_source_len=False) # [[('The cat was on the ground.', tensor(-0.6185)), ('The cat was asleep on the couch.', tensor(-0.6276)), ('The cat was on the floor.', tensor(-0.6800))]] ``` Example code to fill masks for a batch of sentences using GPU ``` bart.cuda() bart.fill_mask(['The cat <mask> on the <mask>.', 'The dog <mask> on the <mask>.'], topk=3, beam=10) # [[('The cat was on the ground.', tensor(-0.6183)), ('The cat was on the floor.', tensor(-0.6798)), ('The cat sleeps on the couch.', tensor(-0.6830))], [('The dog was on the ground.', tensor(-0.6190)), ('The dog lay on the ground.', tensor(-0.6711)), ('The dog was asleep on the couch', tensor(-0.6796))]] ``` #### Evaluating the `bart.large.mnli` model: Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set. 
```python label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'} ncorrect, nsamples = 0, 0 bart.cuda() bart.eval() with open('glue_data/MNLI/dev_matched.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[8], tokens[9], tokens[-1] tokens = bart.encode(sent1, sent2) prediction = bart.predict('mnli', tokens).argmax().item() prediction_label = label_map[prediction] ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) # Expected output: 0.9010 ``` #### Evaluating the `bart.large.cnn` model: - Follow instructions [here](https://github.com/abisee/cnn-dailymail) to download and process into data-files such that `test.source` and `test.target` has one line for each non-tokenized sample. - For simpler preprocessing, you can also `wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz`, although there is no guarantee of identical scores - `huggingface/transformers` has a simpler interface that supports [single-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_eval.py) and [multi-gpu](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/run_distributed_eval.py) beam search. In `huggingface/transformers`, the BART models' paths are `facebook/bart-large-cnn` and `facebook/bart-large-xsum`. In `fairseq`, summaries can be generated using: ```bash cp data-bin/cnn_dm/dict.source.txt checkpoints/ python examples/bart/summarize.py \ --model-dir pytorch/fairseq \ --model-file bart.large.cnn \ --src cnn_dm/test.source \ --out cnn_dm/test.hypo ``` For calculating rouge, install `files2rouge` from [here](https://github.com/pltrdy/files2rouge). ```bash export CLASSPATH=/path/to/stanford-corenlp-full-2016-10-31/stanford-corenlp-3.7.0.jar # Tokenize hypothesis and target files. 
cat test.hypo | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.tokenized cat test.target | java edu.stanford.nlp.process.PTBTokenizer -ioFileList -preserveLines > test.hypo.target files2rouge test.hypo.tokenized test.hypo.target # Expected output: (ROUGE-2 Average_F: 0.21238) ``` ## Finetuning - [Finetuning on GLUE](README.glue.md) - [Finetuning on CNN-DM](README.summarization.md) ## Citation ```bibtex @article{lewis2019bart, title = {BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension}, author = {Mike Lewis and Yinhan Liu and Naman Goyal and Marjan Ghazvininejad and Abdelrahman Mohamed and Omer Levy and Veselin Stoyanov and Luke Zettlemoyer }, journal={arXiv preprint arXiv:1910.13461}, year = {2019}, } ``` ================================================ FILE: examples/bart/README.summarization.md ================================================ # Fine-tuning BART on CNN-Dailymail summarization task ### 1) Download the CNN and Daily Mail data and preprocess it into data files with non-tokenized cased samples. Follow the instructions [here](https://github.com/abisee/cnn-dailymail) to download the original CNN and Daily Mail datasets. To preprocess the data, refer to the pointers in [this issue](https://github.com/pytorch/fairseq/issues/1391) or check out the code [here](https://github.com/artmatsak/cnn-dailymail). Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to download the original Extreme Summarization datasets, or check out the code [here](https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset), Please keep the raw dataset and make sure no tokenization nor BPE on the dataset. 
### 2) BPE preprocess: ```bash wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' TASK=cnn_dm for SPLIT in train val do for LANG in source target do python -m examples.roberta.multiprocessing_bpe_encoder \ --encoder-json encoder.json \ --vocab-bpe vocab.bpe \ --inputs "$TASK/$SPLIT.$LANG" \ --outputs "$TASK/$SPLIT.bpe.$LANG" \ --workers 60 \ --keep-empty; done done ``` ### 3) Binarize dataset: ```bash fairseq-preprocess \ --source-lang "source" \ --target-lang "target" \ --trainpref "${TASK}/train.bpe" \ --validpref "${TASK}/val.bpe" \ --destdir "${TASK}-bin/" \ --workers 60 \ --srcdict dict.txt \ --tgtdict dict.txt; ``` ### 4) Fine-tuning on CNN-DM summarization task: Example fine-tuning CNN-DM ```bash TOTAL_NUM_UPDATES=20000 WARMUP_UPDATES=500 LR=3e-05 MAX_TOKENS=2048 UPDATE_FREQ=4 BART_PATH=/path/to/bart/model.pt CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train cnn_dm-bin \ --restore-file $BART_PATH \ --max-tokens $MAX_TOKENS \ --task translation \ --source-lang source --target-lang target \ --truncate-source \ --layernorm-embedding \ --share-all-embeddings \ --share-decoder-input-output-embed \ --reset-optimizer --reset-dataloader --reset-meters \ --required-batch-size-multiple 1 \ --arch bart_large \ --criterion label_smoothed_cross_entropy \ --label-smoothing 0.1 \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \ --clip-norm 0.1 \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --update-freq $UPDATE_FREQ \ --skip-invalid-size-inputs-valid-test \ --find-unused-parameters; ``` Above is expected to run on `1` node with `8 32gb-V100`. Expected training time is about `5 hours`. 
Training time can be reduced with distributed training on `4` nodes and `--update-freq 1`. Use `TOTAL_NUM_UPDATES=15000` and `UPDATE_FREQ=2` for the Xsum task. ### Inference for CNN-DM test data using the checkpoint trained above. After training the model as described in the previous step, you can perform inference with checkpoints in the `checkpoints/` directory using `examples/bart/summarize.py`, for example ```bash cp data-bin/cnn_dm/dict.source.txt checkpoints/ python examples/bart/summarize.py \ --model-dir checkpoints \ --model-file checkpoint_best.pt \ --src cnn_dm/test.source \ --out cnn_dm/test.hypo ``` For XSUM, which uses beam=6, lenpen=1.0, max_len_b=60, min_len=10: ```bash cp data-bin/cnn_dm/dict.source.txt checkpoints/ python examples/bart/summarize.py \ --model-dir checkpoints \ --model-file checkpoint_best.pt \ --src cnn_dm/test.source \ --out cnn_dm/test.hypo \ --xsum-kwargs ``` ================================================ FILE: examples/bart/summarize.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@torch.no_grad()
def generate(bart, infile, outfile="bart_hypo.txt", bsz=32, n_obs=None, **eval_kwargs):
    """Summarize ``infile`` line by line and write one hypothesis per line.

    Args:
        bart: object exposing ``sample(list_of_str, **kwargs)`` that returns
            one hypothesis string per input string.
        infile: path of the text file to summarize, one sample per line.
        outfile: path the hypotheses are written to (overwritten if present).
        bsz: number of input lines handed to ``bart.sample`` per call.
        n_obs: if not ``None``, summarize at most this many leading lines.
        **eval_kwargs: forwarded unchanged to ``bart.sample``.

    Raises:
        ValueError: if ``bsz`` is smaller than 1.
    """
    # Validate before opening the output so a bad call does not truncate it.
    if bsz < 1:
        raise ValueError(f"bsz must be a positive integer, got {bsz}")

    def _write_batch(batch, fout):
        # One hypothesis per line, in the same order as the inputs.
        for hypothesis in bart.sample(batch, **eval_kwargs):
            fout.write(hypothesis + "\n")
        fout.flush()

    with open(infile) as source, open(outfile, "w") as fout:
        batch = []
        for index, sline in enumerate(source):
            # Stop after exactly n_obs lines (the previous check let one
            # extra line through).
            if n_obs is not None and index >= n_obs:
                break
            # Blank lines are kept so outputs stay aligned with inputs.
            batch.append(sline.strip())
            if len(batch) == bsz:
                _write_batch(batch, fout)
                batch = []

        # Remaining partial batch; an empty input produces no call at all.
        if batch:
            _write_batch(batch, fout)


def main():
    """
    Usage::

         python examples/bart/summarize.py \
            --model-dir $HOME/bart.large.cnn \
            --model-file model.pt \
            --src $HOME/data-bin/cnn_dm/test.source
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model-dir",
        required=True,
        type=str,
        default="bart.large.cnn/",
        help="path containing model file and src_dict.txt",
    )
    parser.add_argument(
        "--model-file",
        default="checkpoint_best.pt",
        help="where in model_dir are weights saved",
    )
    parser.add_argument(
        "--src", default="test.source", help="text to summarize", type=str
    )
    parser.add_argument(
        "--out", default="test.hypo", help="where to save summaries", type=str
    )
    parser.add_argument(
        "--bsz", default=32, help="number of lines summarized per batch", type=int
    )
    parser.add_argument(
        "--n", default=None, help="how many examples to summarize", type=int
    )
    parser.add_argument(
        "--xsum-kwargs",
        action="store_true",
        default=False,
        help="if true use XSUM_KWARGS else CNN_KWARGS",
    )
    args = parser.parse_args()
    eval_kwargs = XSUM_KWARGS if args.xsum_kwargs else CNN_KWARGS

    # The literal "pytorch/fairseq" selects a torch.hub model; any other
    # value is treated as a local directory holding the checkpoint.
    if args.model_dir == "pytorch/fairseq":
        bart = torch.hub.load("pytorch/fairseq", args.model_file)
    else:
        bart = BARTModel.from_pretrained(
            args.model_dir,
            checkpoint_file=args.model_file,
            data_name_or_path=args.model_dir,
        )
    bart = bart.eval()
    if torch.cuda.is_available():
        bart = bart.cuda().half()
    generate(
        bart, args.src, bsz=args.bsz, n_obs=args.n, outfile=args.out, **eval_kwargs
    )
--sentencepiece-model data/spm_bpe4096.model # BPE=--bpe sentencepiece --sentencepiece-model data/spm_bpe16384.model ``` ```bash fairseq-generate "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \ --source-lang fr --gen-subset test --sacrebleu --path "checkpoints/${VOCAB}/checkpoint_last.pt" \ --tokenizer moses --moses-target-lang en ${BPE} ``` When using `fairseq-interactive`, bytes (BBPE) encoder/decoder is required to tokenize input data and detokenize model predictions: ```bash fairseq-interactive "data/bin_${VOCAB}" --task translation --user-dir examples/byte_level_bpe/gru_transformer \ --path "checkpoints/${VOCAB}/checkpoint_last.pt" --input data/test.fr --tokenizer moses --moses-source-lang fr \ --moses-target-lang en ${BPE} --buffer-size 1000 --max-tokens 10000 ``` ## Results | Vocabulary | Model | BLEU | |:-------------:|:-------------:|:-------------:| | Joint BPE 16k ([Kudo, 2018](https://arxiv.org/abs/1804.10959)) | 512d LSTM 2+2 | 33.81 | | Joint BPE 16k | Transformer base 2+2 (w/ GRU) | 36.64 (36.72) | | Joint BPE 4k | Transformer base 2+2 (w/ GRU) | 35.49 (36.10) | | Joint BBPE 4k | Transformer base 2+2 (w/ GRU) | 35.61 (35.82) | | Joint BPE 2k | Transformer base 2+2 (w/ GRU) | 34.87 (36.13) | | Joint BBPE 2k | Transformer base 2+2 (w/ GRU) | 34.98 (35.43) | | Characters | Transformer base 2+2 (w/ GRU) | 31.78 (33.30) | | Bytes | Transformer base 2+2 (w/ GRU) | 31.57 (33.62) | ## Citation ``` @misc{wang2019neural, title={Neural Machine Translation with Byte-Level Subwords}, author={Changhan Wang and Kyunghyun Cho and Jiatao Gu}, year={2019}, eprint={1909.03341}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ## Contact Changhan Wang ([changhan@fb.com](mailto:changhan@fb.com)), Kyunghyun Cho ([kyunghyuncho@fb.com](mailto:kyunghyuncho@fb.com)), Jiatao Gu ([jgu@fb.com](mailto:jgu@fb.com)) ================================================ FILE: examples/byte_level_bpe/get_bitext.py 
================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import os import os.path as op from collections import namedtuple from multiprocessing import cpu_count from typing import List, Optional import sentencepiece as sp from fairseq.data.encoders.byte_bpe import ByteBPE from fairseq.data.encoders.byte_utils import byte_encode from fairseq.data.encoders.bytes import Bytes from fairseq.data.encoders.characters import Characters from fairseq.data.encoders.moses_tokenizer import MosesTokenizer from fairseq.data.encoders.sentencepiece_bpe import SentencepieceBPE SPLITS = ["train", "valid", "test"] def _convert_xml(in_path: str, out_path: str): with open(in_path) as f, open(out_path, "w") as f_o: for s in f: ss = s.strip() if not ss.startswith("", "").split('">') assert len(ss) == 2 f_o.write(ss[1].strip() + "\n") def _convert_train(in_path: str, out_path: str): with open(in_path) as f, open(out_path, "w") as f_o: for s in f: ss = s.strip() if ss.startswith("<"): continue f_o.write(ss.strip() + "\n") def _get_bytes(in_path: str, out_path: str): with open(in_path) as f, open(out_path, "w") as f_o: for s in f: f_o.write(Bytes.encode(s.strip()) + "\n") def _get_chars(in_path: str, out_path: str): with open(in_path) as f, open(out_path, "w") as f_o: for s in f: f_o.write(Characters.encode(s.strip()) + "\n") def pretokenize(in_path: str, out_path: str, src: str, tgt: str): Args = namedtuple( "Args", [ "moses_source_lang", "moses_target_lang", "moses_no_dash_splits", "moses_no_escape", ], ) args = Args( moses_source_lang=src, moses_target_lang=tgt, moses_no_dash_splits=False, moses_no_escape=False, ) pretokenizer = MosesTokenizer(args) with open(in_path) as f, open(out_path, "w") as f_o: for s in f: f_o.write(pretokenizer.encode(s.strip()) + "\n") def _convert_to_bchar(in_path_prefix: str, 
src: str, tgt: str, out_path: str): with open(out_path, "w") as f_o: for lang in [src, tgt]: with open(f"{in_path_prefix}.{lang}") as f: for s in f: f_o.write(byte_encode(s.strip()) + "\n") def _get_bpe(in_path: str, model_prefix: str, vocab_size: int): arguments = [ f"--input={in_path}", f"--model_prefix={model_prefix}", f"--model_type=bpe", f"--vocab_size={vocab_size}", "--character_coverage=1.0", "--normalization_rule_name=identity", f"--num_threads={cpu_count()}", ] sp.SentencePieceTrainer.Train(" ".join(arguments)) def _apply_bbpe(model_path: str, in_path: str, out_path: str): Args = namedtuple("Args", ["sentencepiece_model_path"]) args = Args(sentencepiece_model_path=model_path) tokenizer = ByteBPE(args) with open(in_path) as f, open(out_path, "w") as f_o: for s in f: f_o.write(tokenizer.encode(s.strip()) + "\n") def _apply_bpe(model_path: str, in_path: str, out_path: str): Args = namedtuple("Args", ["sentencepiece_model"]) args = Args(sentencepiece_model=model_path) tokenizer = SentencepieceBPE(args) with open(in_path) as f, open(out_path, "w") as f_o: for s in f: f_o.write(tokenizer.encode(s.strip()) + "\n") def _concat_files(in_paths: List[str], out_path: str): with open(out_path, "w") as f_o: for p in in_paths: with open(p) as f: for r in f: f_o.write(r) def preprocess_iwslt17( root: str, src: str, tgt: str, bpe_size: Optional[int], need_chars: bool, bbpe_size: Optional[int], need_bytes: bool, ): # extract bitext in_root = op.join(root, f"{src}-{tgt}") for lang in [src, tgt]: _convert_train( op.join(in_root, f"train.tags.{src}-{tgt}.{lang}"), op.join(root, f"train.{lang}"), ) _convert_xml( op.join(in_root, f"IWSLT17.TED.dev2010.{src}-{tgt}.{lang}.xml"), op.join(root, f"valid.{lang}"), ) _convert_xml( op.join(in_root, f"IWSLT17.TED.tst2015.{src}-{tgt}.{lang}.xml"), op.join(root, f"test.{lang}"), ) # pre-tokenize for lang in [src, tgt]: for split in SPLITS: pretokenize( op.join(root, f"{split}.{lang}"), op.join(root, f"{split}.moses.{lang}"), src, tgt, ) # 
tokenize with BPE vocabulary if bpe_size is not None: # learn vocabulary concated_train_path = op.join(root, "train.all") _concat_files( [op.join(root, "train.moses.fr"), op.join(root, "train.moses.en")], concated_train_path, ) bpe_model_prefix = op.join(root, f"spm_bpe{bpe_size}") _get_bpe(concated_train_path, bpe_model_prefix, bpe_size) os.remove(concated_train_path) # apply for lang in [src, tgt]: for split in SPLITS: _apply_bpe( bpe_model_prefix + ".model", op.join(root, f"{split}.moses.{lang}"), op.join(root, f"{split}.moses.bpe{bpe_size}.{lang}"), ) # tokenize with bytes vocabulary if need_bytes: for lang in [src, tgt]: for split in SPLITS: _get_bytes( op.join(root, f"{split}.moses.{lang}"), op.join(root, f"{split}.moses.bytes.{lang}"), ) # tokenize with characters vocabulary if need_chars: for lang in [src, tgt]: for split in SPLITS: _get_chars( op.join(root, f"{split}.moses.{lang}"), op.join(root, f"{split}.moses.chars.{lang}"), ) # tokenize with byte-level BPE vocabulary if bbpe_size is not None: # learn vocabulary bchar_path = op.join(root, "train.bchar") _convert_to_bchar(op.join(root, "train.moses"), src, tgt, bchar_path) bbpe_model_prefix = op.join(root, f"spm_bbpe{bbpe_size}") _get_bpe(bchar_path, bbpe_model_prefix, bbpe_size) os.remove(bchar_path) # apply for lang in [src, tgt]: for split in SPLITS: _apply_bbpe( bbpe_model_prefix + ".model", op.join(root, f"{split}.moses.{lang}"), op.join(root, f"{split}.moses.bbpe{bbpe_size}.{lang}"), ) def main(): parser = argparse.ArgumentParser() parser.add_argument("--root", type=str, default="data") parser.add_argument( "--bpe-vocab", default=None, type=int, help="Generate tokenized bitext with BPE of size K." "Default to None (disabled).", ) parser.add_argument( "--bbpe-vocab", default=None, type=int, help="Generate tokenized bitext with BBPE of size K." 
"Default to None (disabled).", ) parser.add_argument( "--byte-vocab", action="store_true", help="Generate tokenized bitext with bytes vocabulary", ) parser.add_argument( "--char-vocab", action="store_true", help="Generate tokenized bitext with chars vocabulary", ) args = parser.parse_args() preprocess_iwslt17( args.root, "fr", "en", args.bpe_vocab, args.char_vocab, args.bbpe_vocab, args.byte_vocab, ) if __name__ == "__main__": main() ================================================ FILE: examples/byte_level_bpe/get_data.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. PY_BIN_ROOT= # PyPI dependency ${PY_BIN_ROOT}pip install sentencepiece sacremoses # Get data if [ ! -d "data" ]; then mkdir data fi if [ ! -f "data/fr-en.tgz" ]; then wget https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz -P data tar xvf data/fr-en.tgz -C data fi ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab 16384 --byte-vocab --char-vocab for VOCAB_SIZE in 2048 4096; do ${PY_BIN_ROOT}python get_bitext.py --bpe-vocab ${VOCAB_SIZE} --bbpe-vocab ${VOCAB_SIZE} done rm -r data/fr-en data/fr-en.tgz # Generate binary dataset ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bpe16384 --joined-dictionary \ --workers "$(nproc)" --trainpref data/train.moses.bpe16384 --validpref data/valid.moses.bpe16384 \ --testpref data/test.moses.bpe16384 ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_bytes --joined-dictionary \ --workers "$(nproc)" --trainpref data/train.moses.bytes --validpref data/valid.moses.bytes \ --testpref data/test.moses.bytes ${PY_BIN_ROOT}/fairseq-preprocess --source-lang fr --target-lang en --destdir data/bin_chars --joined-dictionary \ --workers "$(nproc)" --trainpref data/train.moses.chars --validpref 
@register_model("gru_transformer")
class GRUTransformerModel(TransformerModel):
    """Transformer model whose encoder is a :class:`GRUTransformerEncoder`."""

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        """Return the GRU-contextualized encoder for this model."""
        return GRUTransformerEncoder(args, src_dict, embed_tokens)


class GRUTransformerEncoder(TransformerEncoder):
    """Transformer encoder that runs a bidirectional GRU over its embeddings."""

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(args, dictionary, embed_tokens)
        # Two directions of embedding_dim // 2 hidden units each, so the
        # concatenated output matches embedding_dim when it is even.
        self.emb_ctx = nn.GRU(
            input_size=embed_tokens.embedding_dim,
            hidden_size=embed_tokens.embedding_dim // 2,
            num_layers=1,
            bidirectional=True,
        )

    def forward_embedding(self, src_tokens, token_embedding=None):
        """Embed ``src_tokens`` and contextualize the result with the GRU.

        Args:
            src_tokens: token indices to embed.
            token_embedding: optional pre-computed token embeddings used
                instead of ``self.embed_tokens(src_tokens)``.

        Returns:
            A tuple ``(x, embed)``: the contextualized embeddings and the
            scaled token embeddings before positions were added.
        """
        # NOTE(review): the parent encoder is understood to call
        # forward_embedding(src_tokens, token_embeddings) positionally, so
        # the override must accept the second argument - confirm against
        # TransformerEncoder.
        if token_embedding is None:
            token_embedding = self.embed_tokens(src_tokens)
        # embed tokens and positions
        x = embed = self.embed_scale * token_embedding
        if self.embed_positions is not None:
            x = embed + self.embed_positions(src_tokens)

        # contextualize embeddings; the GRU is built without batch_first, so
        # the first two axes are swapped around the call
        x = x.transpose(0, 1)
        x = self.dropout_module(x)
        # Call the module rather than .forward() so registered hooks run.
        x, _ = self.emb_ctx(x)
        x = x.transpose(0, 1)

        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)
        x = self.dropout_module(x)
        return x, embed


@register_model_architecture("gru_transformer", "gru_transformer")
def gru_transformer_base_architecture(args):
    """Fill in every base-architecture hyper-parameter missing from ``args``.

    Values already present on ``args`` are kept; only absent attributes
    receive the defaults below. Decoder sizes default to the encoder ones.
    """
    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
    args.encoder_layers = getattr(args, "encoder_layers", 6)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
    args.decoder_ffn_embed_dim = getattr(
        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
    )
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
    args.activation_fn = getattr(args, "activation_fn", "relu")
    args.dropout = getattr(args, "dropout", 0.1)
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.share_decoder_input_output_embed = getattr(
        args, "share_decoder_input_output_embed", False
    )
    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
    args.no_token_positional_embeddings = getattr(
        args, "no_token_positional_embeddings", False
    )
    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.no_cross_attention = getattr(args, "no_cross_attention", False)
    args.cross_self_attention = getattr(args, "cross_self_attention", False)
    args.layer_wise_attention = getattr(args, "layer_wise_attention", False)

    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)

    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)


@register_model_architecture("gru_transformer", "gru_transformer_big")
def gru_transformer_big(args):
    """Set the larger "big" defaults, then fall back to the base architecture."""
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
    args.dropout = getattr(args, "dropout", 0.3)
    # Everything not set above takes the base defaults.
    gru_transformer_base_architecture(args)
| Training data | |--------------------------------|---------|--------------------------------------------------------------------------------------------------------------------------|-------|-----------------------------------| | `camembert` / `camembert-base` | 110M | [camembert-base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz) | Base | OSCAR (138 GB of text) | | `camembert-large` | 335M | [camembert-large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-large.tar.gz) | Large | CCNet (135 GB of text) | | `camembert-base-ccnet` | 110M | [camembert-base-ccnet.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet.tar.gz) | Base | CCNet (135 GB of text) | | `camembert-base-wikipedia-4gb` | 110M | [camembert-base-wikipedia-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-wikipedia-4gb.tar.gz) | Base | Wikipedia (4 GB of text) | | `camembert-base-oscar-4gb` | 110M | [camembert-base-oscar-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-oscar-4gb.tar.gz) | Base | Subsample of OSCAR (4 GB of text) | | `camembert-base-ccnet-4gb` | 110M | [camembert-base-ccnet-4gb.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/camembert-base-ccnet-4gb.tar.gz) | Base | Subsample of CCNet (4 GB of text) | ## Example usage ### fairseq ##### Load CamemBERT from torch.hub (PyTorch >= 1.1): ```python import torch camembert = torch.hub.load('pytorch/fairseq', 'camembert') camembert.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Load CamemBERT (for PyTorch 1.0 or custom models): ```python # Download camembert model wget https://dl.fbaipublicfiles.com/fairseq/models/camembert-base.tar.gz tar -xzvf camembert-base.tar.gz # Load the model in fairseq from fairseq.models.roberta import CamembertModel camembert = CamembertModel.from_pretrained('/path/to/camembert') camembert.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Filling masks: 
```python masked_line = 'Le camembert est <mask> :)' camembert.fill_mask(masked_line, topk=3) # [('Le camembert est délicieux :)', 0.4909118115901947, ' délicieux'), # ('Le camembert est excellent :)', 0.10556942224502563, ' excellent'), # ('Le camembert est succulent :)', 0.03453322499990463, ' succulent')] ``` ##### Extract features from Camembert: ```python # Extract the last layer's features line = "J'aime le camembert !" tokens = camembert.encode(line) last_layer_features = camembert.extract_features(tokens) assert last_layer_features.size() == torch.Size([1, 10, 768]) # Extract all layer's features (layer 0 is the embedding layer) all_layers = camembert.extract_features(tokens, return_all_hiddens=True) assert len(all_layers) == 13 assert torch.all(all_layers[-1] == last_layer_features) ``` ## Citation If you use our work, please cite: ```bibtex @inproceedings{martin2020camembert, title={CamemBERT: a Tasty French Language Model}, author={Martin, Louis and Muller, Benjamin and Su{\'a}rez, Pedro Javier Ortiz and Dupont, Yoann and Romary, Laurent and de la Clergerie, {\'E}ric Villemonte and Seddah, Djam{\'e} and Sagot, Beno{\^\i}t}, booktitle={Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics}, year={2020} } ``` ================================================ FILE: examples/constrained_decoding/README.md ================================================ # (Vectorized) Lexically constrained decoding with dynamic beam allocation This page provides instructions for how to use lexically constrained decoding in Fairseq. 
Fairseq implements the code described in the following papers: * [Fast Lexically Constrained Decoding With Dynamic Beam Allocation](https://www.aclweb.org/anthology/N18-1119/) (Post & Vilar, 2018) * [Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting](https://www.aclweb.org/anthology/N19-1090/) (Hu et al., 2019) ## Quick start Constrained search is enabled by adding the command-line argument `--constraints` to `fairseq-interactive`. Constraints are appended to each line of input, separated by tabs. Each constraint (one or more tokens) is a separate field. The following command, using [Fairseq's WMT19 German--English model](https://github.com/pytorch/fairseq/blob/main/examples/wmt19/README.md), translates the sentence *Die maschinelle Übersetzung ist schwer zu kontrollieren.* with the constraints "hard" and "to influence". echo -e "Die maschinelle Übersetzung ist schwer zu kontrollieren.\thard\ttoinfluence" \ | normalize.py | tok.py \ | fairseq-interactive /path/to/model \ --path /path/to/model/model1.pt \ --bpe fastbpe \ --bpe-codes /path/to/model/bpecodes \ --constraints \ -s de -t en \ --beam 10 (tok.py and normalize.py can be found in the same directory as this README; they are just shortcuts around Fairseq's WMT19 preprocessing). This will generate the following output: [snip] S-0 Die masch@@ in@@ elle Über@@ setzung ist schwer zu kontrollieren . W-0 1.844 seconds C-0 hard C-0 influence H-0 -1.5333266258239746 Mach@@ ine trans@@ lation is hard to influence . D-0 -1.5333266258239746 Machine translation is hard to influence . P-0 -0.5434 -0.1423 -0.1930 -0.1415 -0.2346 -1.8031 -0.1701 -11.7727 -0.1815 -0.1511 By default, constraints are generated in the order supplied, with any number (zero or more) of tokens generated between constraints. If you wish for the decoder to order the constraints, then use `--constraints unordered`. Note that you may want to use a larger beam. 
## Implementation details The heart of the implementation is in `fairseq/search.py`, which adds a `LexicallyConstrainedBeamSearch` instance. This instance of beam search tracks the progress of each hypothesis in the beam through the set of constraints provided for each input sentence. It does this using one of two classes, both found in `fairseq/token_generation_constraints.py`: * OrderedConstraintState: assumes the `C` input constraints will be generated in the provided order * UnorderedConstraintState: tries to apply `C` (phrasal) constraints in all `C!` orders ## Differences from Sockeye There are a number of [differences from Sockeye's implementation](https://awslabs.github.io/sockeye/inference.html#lexical-constraints). * Generating constraints in the order supplied (the default option here) is not available in Sockeye. * Due to an improved beam allocation method, there is no need to prune the beam. * Again due to better allocation, beam sizes as low as 10 or even 5 are often sufficient. * [The vector extensions described in Hu et al.](https://github.com/edwardjhu/sockeye/tree/trie_constraints) (NAACL 2019) were never merged into the main Sockeye branch.
## Citation The paper first describing lexical constraints for seq2seq decoding is: ```bibtex @inproceedings{hokamp-liu-2017-lexically, title = "Lexically Constrained Decoding for Sequence Generation Using Grid Beam Search", author = "Hokamp, Chris and Liu, Qun", booktitle = "Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", month = jul, year = "2017", address = "Vancouver, Canada", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/P17-1141", doi = "10.18653/v1/P17-1141", pages = "1535--1546", } ``` The fairseq implementation uses the extensions described in ```bibtex @inproceedings{post-vilar-2018-fast, title = "Fast Lexically Constrained Decoding with Dynamic Beam Allocation for Neural Machine Translation", author = "Post, Matt and Vilar, David", booktitle = "Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)", month = jun, year = "2018", address = "New Orleans, Louisiana", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/N18-1119", doi = "10.18653/v1/N18-1119", pages = "1314--1324", } ``` and ```bibtex @inproceedings{hu-etal-2019-improved, title = "Improved Lexically Constrained Decoding for Translation and Monolingual Rewriting", author = "Hu, J. 
def main(args):
    """Normalize punctuation on every line read from standard input.

    A ``MosesPunctNormalizer`` is configured from ``args.lang`` and
    ``args.penn``. Each input line has its trailing whitespace removed, is
    normalized, and is printed immediately (stdout is flushed per line so
    the script can sit in the middle of an interactive pipeline).
    """
    punct_normalizer = MosesPunctNormalizer(lang=args.lang, penn=args.penn)
    for raw_line in sys.stdin:
        cleaned = punct_normalizer.normalize(raw_line.rstrip())
        print(cleaned, flush=True)
def main(args):
    """Tokenize stdin line by line, tokenizing each tab-separated field on its own.

    Every line is split on tab characters, each field is run through a
    ``MosesTokenizer`` built for ``args.lang``, and the tokenized fields are
    printed re-joined with tabs. Output is flushed after every line.
    """
    tokenizer = sacremoses.MosesTokenizer(lang=args.lang)

    for raw_line in sys.stdin:
        tokenized_fields = [
            tokenizer.tokenize(field, return_str=True)
            for field in raw_line.split("\t")
        ]
        print("\t".join(tokenized_fields), flush=True)
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2)
newstest2012/2013:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.ntst1213.tar.bz2) Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-German](http://statmt.org/wmt14/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-de.newstest2014.tar.bz2) Convolutional
([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT17 English-German](http://statmt.org/wmt17/translation-task.html#Download) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2) | newstest2014:
[download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.v2.en-de.newstest2014.tar.bz2) ## Example usage See the [translation README](../translation/README.md) for instructions on reproducing results for WMT'14 En-De and WMT'14 En-Fr using the `fconv_wmt_en_de` and `fconv_wmt_en_fr` model architectures. ## Citation ```bibtex @inproceedings{gehring2017convs2s, title = {Convolutional Sequence to Sequence Learning}, author = {Gehring, Jonas and Auli, Michael and Grangier, David and Yarats, Denis and Dauphin, Yann N}, booktitle = {Proc. of ICML}, year = 2017, } ``` ================================================ FILE: examples/criss/README.md ================================================ # Cross-lingual Retrieval for Iterative Self-Supervised Training https://arxiv.org/pdf/2006.09526.pdf ## Introduction CRISS is a multilingual sequence-to-sequence pretraining method where mining and training processes are applied iteratively, improving cross-lingual alignment and translation ability at the same time. ## Requirements: * faiss: https://github.com/facebookresearch/faiss * mosesdecoder: https://github.com/moses-smt/mosesdecoder * flores: https://github.com/facebookresearch/flores * LASER: https://github.com/facebookresearch/LASER ## Unsupervised Machine Translation ##### 1. Download and decompress CRISS checkpoints ``` cd examples/criss wget https://dl.fbaipublicfiles.com/criss/criss_3rd_checkpoints.tar.gz tar -xf criss_3rd_checkpoints.tar.gz ``` ##### 2. Download and preprocess Flores test dataset Make sure to run all scripts from examples/criss directory ``` bash download_and_preprocess_flores_test.sh ``` ##### 3. Run Evaluation on Sinhala-English ``` bash unsupervised_mt/eval.sh ``` ## Sentence Retrieval ##### 1. Download and preprocess Tatoeba dataset ``` bash download_and_preprocess_tatoeba.sh ``` ##### 2. Run Sentence Retrieval on Tatoeba Kazakh-English ``` bash sentence_retrieval/sentence_retrieval_tatoeba.sh ``` ## Mining ##### 1.
# Paths are relative to examples/criss, where the script is expected to run.
SPM_ENCODE=flores/scripts/spm_encode.py
DATA=data_tmp
SPM_MODEL=criss_checkpoints/sentence.bpe.model
DICT=criss_checkpoints/dict.txt

# download_data <destination-file> <url>
# Fetch <url> into <destination-file> unless the file already exists.
# A partial file left by a failed download is removed.
# Returns 0 when the file is present afterwards, 1 otherwise.
download_data() {
  local corpora=$1
  local url=$2

  if [ -f "$corpora" ]; then
    echo "$corpora already exists, skipping download"
    return 0
  fi

  echo "Downloading $url"
  # NOTE(review): --no-check-certificate disables TLS verification; kept
  # from the original script, consider dropping it.
  wget "$url" -O "$corpora" --no-check-certificate || rm -f "$corpora"
  if [ -f "$corpora" ]; then
    echo "$url successfully downloaded."
  else
    echo "$url not successfully downloaded."
    rm -f "$corpora"
    return 1
  fi
}

# `git clone` creates a directory, so the existence test must use -d
# (the previous -f test never matched and re-running the script failed).
if [[ -d flores ]]; then
  echo "flores already cloned"
else
  git clone https://github.com/facebookresearch/flores
fi

mkdir -p "$DATA"
download_data "$DATA/wikipedia_en_ne_si_test_sets.tgz" "https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz"
pushd "$DATA"
pwd
tar -vxf wikipedia_en_ne_si_test_sets.tgz
popd


for lang in ne_NP si_LK; do
  datadir="$DATA/${lang}-en_XX-flores"
  rm -rf "$datadir"
  mkdir -p "$datadir"
  TEST_PREFIX="$DATA/wikipedia_en_ne_si_test_sets/wikipedia.test"
  # The Flores files are named with the two-letter language code (ne, si),
  # hence ${lang:0:2}.
  python "$SPM_ENCODE" \
    --model "${SPM_MODEL}" \
    --output_format=piece \
    --inputs "${TEST_PREFIX}.${lang:0:2}-en.${lang:0:2}" "${TEST_PREFIX}.${lang:0:2}-en.en" \
    --outputs "$datadir/test.bpe.${lang}-en_XX.${lang}" "$datadir/test.bpe.${lang}-en_XX.en_XX"

  # binarize data
  fairseq-preprocess \
    --source-lang "${lang}" --target-lang en_XX \
    --testpref "$datadir/test.bpe.${lang}-en_XX" \
    --destdir "$datadir" \
    --srcdict "${DICT}" \
    --joined-dictionary \
    --workers 4
done
# Paths are relative to examples/criss, where the script is expected to run.
SPM_ENCODE=flores/scripts/spm_encode.py
DATA=data_tmp
SPM_MODEL=criss_checkpoints/sentence.bpe.model
DICT=criss_checkpoints/dict.txt

# `git clone` creates directories, so the existence tests must use -d
# (the previous -f tests never matched and re-running the script failed).
if [[ -d flores ]]; then
  echo "flores already cloned"
else
  git clone https://github.com/facebookresearch/flores
fi
if [[ -d LASER ]]; then
  echo "LASER already cloned"
else
  git clone https://github.com/facebookresearch/LASER
fi
mkdir -p "$DATA"

# Language code used by the CRISS dictionary -> three-letter code used in
# the Tatoeba file names shipped with LASER.
declare -A lang_tatoeba_map=(
  ["ar_AR"]="ara"
  ["de_DE"]="deu"
  ["es_XX"]="spa"
  ["et_EE"]="est"
  ["fi_FI"]="fin"
  ["fr_XX"]="fra"
  ["hi_IN"]="hin"
  ["it_IT"]="ita"
  ["ja_XX"]="jpn"
  ["ko_KR"]="kor"
  ["kk_KZ"]="kaz"
  ["nl_XX"]="nld"
  ["ru_RU"]="rus"
  ["tr_TR"]="tur"
  ["vi_VN"]="vie"
  ["zh_CN"]="cmn"
)

for lang in ar_AR de_DE es_XX et_EE fi_FI fr_XX hi_IN it_IT ja_XX kk_KZ ko_KR nl_XX ru_RU tr_TR vi_VN zh_CN; do
  lang_tatoeba=${lang_tatoeba_map[$lang]}
  echo "$lang_tatoeba"
  datadir="$DATA/${lang}-en_XX-tatoeba"
  rm -rf "$datadir"
  mkdir -p "$datadir"
  TEST_PREFIX=LASER/data/tatoeba/v1/tatoeba
  python "$SPM_ENCODE" \
    --model "${SPM_MODEL}" \
    --output_format=piece \
    --inputs "${TEST_PREFIX}.${lang_tatoeba}-eng.${lang_tatoeba}" "${TEST_PREFIX}.${lang_tatoeba}-eng.eng" \
    --outputs "$datadir/test.bpe.${lang}-en_XX.${lang}" "$datadir/test.bpe.${lang}-en_XX.en_XX"

  # binarize data
  fairseq-preprocess \
    --source-lang "${lang}" --target-lang en_XX \
    --testpref "$datadir/test.bpe.${lang}-en_XX" \
    --destdir "$datadir" \
    --srcdict "${DICT}" \
    --joined-dictionary \
    --workers 4
done
GB = 1024 * 1024 * 1024


def call(cmd):
    """Echo ``cmd`` and execute it through the shell.

    Raises ``subprocess.CalledProcessError`` if the command exits non-zero.
    The command runs with ``shell=True``; never pass untrusted text.
    """
    print(cmd)
    check_call(cmd, shell=True)


def get_batches(directory, lang, prefix="all_avg_pool"):
    """Locate the embedding shards of ``lang`` and their sentence files.

    Args:
        directory: directory holding the shards.
        lang: language code embedded in the shard file names.
        prefix: file-name prefix of the embedding shards.

    Returns:
        ``(emb_files, txt_files)``: two equally long lists; ``txt_files[i]``
        is ``emb_files[i]`` with the file-name prefix replaced by
        ``"sentences"``. Shards are returned in sorted order so that runs
        are reproducible.
    """
    import os  # local import: the module header does not import os

    pattern = f"{directory}/{prefix}.{lang}*"
    print(f"Finding in {pattern}")
    emb_files = sorted(glob.glob(pattern))
    txt_files = []
    for emb_path in emb_files:
        # Rewrite only the file name; a directory that happens to contain
        # the prefix must not be touched.
        head, tail = os.path.split(emb_path)
        txt_files.append(os.path.join(head, tail.replace(prefix, "sentences", 1)))
    return emb_files, txt_files


def load_batch(emb_file, dim):
    """Read a raw float32 embedding shard and L2-normalize its rows.

    Args:
        emb_file: path of a file containing consecutive float32 values.
        dim: embedding dimension; the file is reshaped to ``(-1, dim)``.

    Returns:
        A ``(num_rows, dim)`` float32 array, normalized in place by faiss.
    """
    embeddings = np.fromfile(emb_file, dtype=np.float32)
    num_rows = embeddings.shape[0] // dim
    embeddings = embeddings.reshape((num_rows, dim))
    faiss.normalize_L2(embeddings)
    return embeddings


def knnGPU_sharded(x_batches_f, y_batches_f, dim, k, direction="x2y"):
    """Find the ``k`` most similar y rows for every x row, shard by shard.

    Every x shard is searched against every y shard with an inner-product
    faiss index placed on all GPUs; the per-shard results are merged and the
    overall top ``k`` kept.

    Args:
        x_batches_f: paths of the query embedding shards.
        y_batches_f: paths of the candidate embedding shards.
        dim: embedding dimension.
        k: number of neighbours to keep per query row.
        direction: unused; accepted for backward compatibility.

    Returns:
        ``(sim, ind)``: float32 similarities and int64 global y indices, one
        row per x row, columns sorted by decreasing similarity. There are
        ``k`` columns, or fewer when the y side has fewer than ``k`` rows.

    Raises:
        ImportError: if faiss is not installed.
        ValueError: if either shard list is empty.
    """
    if not has_faiss:
        raise ImportError("Please install Faiss")
    if not x_batches_f or not y_batches_f:
        raise ValueError("knnGPU_sharded needs at least one x and one y shard")

    sims = []
    inds = []
    xfrom = 0
    for x_batch_f in x_batches_f:
        x_batch = load_batch(x_batch_f, dim)
        xto = xfrom + x_batch.shape[0]
        yfrom = 0
        bsims, binds = [], []
        for y_batch_f in y_batches_f:
            y_batch = load_batch(y_batch_f, dim)
            neighbor_size = min(k, y_batch.shape[0])
            yto = yfrom + y_batch.shape[0]
            print("{}-{} -> {}-{}".format(xfrom, xto, yfrom, yto))
            idx = faiss.IndexFlatIP(dim)
            idx = faiss.index_cpu_to_all_gpus(idx)
            idx.add(y_batch)
            bsim, bind = idx.search(x_batch, neighbor_size)

            bsims.append(bsim)
            # Shift shard-local indices to global y positions.
            binds.append(bind + yfrom)
            yfrom = yto
            # Release the index and the shard before loading the next one.
            del idx
            del y_batch
        bsims = np.concatenate(bsims, axis=1)
        binds = np.concatenate(binds, axis=1)
        # Never ask for more columns than were actually retrieved.
        top_k = min(k, bsims.shape[1])
        order = np.argsort(-bsims, axis=1)[:, :top_k]
        sims.append(np.take_along_axis(bsims, order, axis=1).astype(np.float32))
        inds.append(np.take_along_axis(binds, order, axis=1).astype(np.int64))
        xfrom = xto
        del x_batch
    sim = np.concatenate(sims, axis=0)
    ind = np.concatenate(inds, axis=0)
    return sim, ind


def score(sim, fwd_mean, bwd_mean, margin):
    """Apply ``margin`` to ``sim`` and the mean of the two neighbourhood means."""
    return margin(sim, (fwd_mean + bwd_mean) / 2)


def score_candidates(
    sim_mat, candidate_inds, fwd_mean, bwd_mean, margin, verbose=False
):
    """Score every (row, candidate) pair with the margin criterion.

    Args:
        sim_mat: similarities, same shape as ``candidate_inds``.
        candidate_inds: for each row, indices of its candidates on the
            other side.
        fwd_mean: mean neighbour similarity per row of ``sim_mat``.
        bwd_mean: mean neighbour similarity per candidate index.
        margin: callable ``margin(sim, mean)``.
        verbose: unused; accepted for backward compatibility.

    Returns:
        Array of scores with the shape of ``candidate_inds``.
    """
    print(" - scoring {:d} candidates".format(sim_mat.shape[0]))
    scores = np.zeros(candidate_inds.shape)
    for i in range(scores.shape[0]):
        for j in range(scores.shape[1]):
            k = int(candidate_inds[i, j])
            scores[i, j] = score(sim_mat[i, j], fwd_mean[i], bwd_mean[k], margin)
    return scores


def load_text(files):
    """Read all lines of ``files`` (in order), stripped of surrounding whitespace."""
    all_sentences = []
    for fi in files:
        with open(fi) as sentence_fi:
            all_sentences.extend(line.strip() for line in sentence_fi)
    print(f"Read {len(all_sentences)} sentences")
    return all_sentences


def _sentence_text(entry):
    """Return the text column of an ``id<TAB>text`` entry ('' when missing)."""
    return entry.partition("\t")[2]


def _build_parser():
    """Build the command-line parser of the mining script."""
    parser = argparse.ArgumentParser(description="Mine bitext")
    parser.add_argument("--src-lang", help="Source language")
    parser.add_argument("--tgt-lang", help="Target language")
    parser.add_argument(
        "--dict-path", help="Path to dictionary file", default="dict.txt"
    )
    parser.add_argument(
        "--spm-path", help="Path to SPM model file", default="sentence.bpe.model"
    )
    parser.add_argument("--dim", type=int, default=1024, help="Embedding dimension")
    parser.add_argument("--mem", type=int, default=5, help="Memory in GB")
    parser.add_argument("--src-dir", help="Source directory")
    parser.add_argument("--tgt-dir", help="Target directory")
    parser.add_argument("--output", help="Output path")
    parser.add_argument(
        "--neighborhood",
        type=int,
        default=4,
        help="Number of nearest neighbors retrieved per sentence",
    )
    parser.add_argument(
        "--threshold", type=float, default=1.06, help="Threshold on mined bitext"
    )
    parser.add_argument(
        "--valid-size",
        type=int,
        default=2000,
        help="Number of sentences used for validation set",
    )
    parser.add_argument(
        "--min-count",
        type=int,
        default=50000,
        help="Min num sentences used for each language",
    )
    return parser


def _write_mined_pairs(
    directory, src_lang, tgt_lang, indices, scores, x_sentences, y_sentences,
    threshold, min_count,
):
    """Write the selected pairs to ``all.<lang>`` / ``all.scores``; return their count.

    Pairs are visited by decreasing score; each source and target sentence is
    used at most once. A pair is kept when its score exceeds ``threshold`` or
    while fewer than ``min_count`` pairs have been written.
    """
    out_kwargs = dict(mode="w", encoding="utf-8", errors="surrogateescape")
    seen_src, seen_trg = set(), set()
    count = 0
    with open(f"{directory}/all.{src_lang}", **out_kwargs) as src_out, open(
        f"{directory}/all.{tgt_lang}", **out_kwargs
    ) as tgt_out, open(f"{directory}/all.scores", **out_kwargs) as scores_out:
        for i in np.argsort(-scores):
            src_ind, trg_ind = indices[i]
            if src_ind in seen_src or trg_ind in seen_trg:
                continue
            seen_src.add(src_ind)
            seen_trg.add(trg_ind)
            if scores[i] > threshold or count < min_count:
                # Entries are "id<TAB>text"; testing the whole entry would
                # always succeed, so look at the text column itself.
                if _sentence_text(x_sentences[src_ind]) and _sentence_text(
                    y_sentences[trg_ind]
                ):
                    print(scores[i], file=scores_out)
                    print(x_sentences[src_ind], file=src_out)
                    print(y_sentences[trg_ind], file=tgt_out)
                    count += 1
                else:
                    print(f"Ignoring sentence: {x_sentences[src_ind]}")
    return count


def _split_train_valid(directory, src_lang, tgt_lang, valid_size):
    """Strip the id column from ``all.*``; first ``valid_size`` pairs go to valid.*."""
    # Same encoding as the files written by _write_mined_pairs.
    io_kwargs = dict(encoding="utf-8", errors="surrogateescape")
    with open(f"{directory}/all.{src_lang}", **io_kwargs) as all_s, open(
        f"{directory}/all.{tgt_lang}", **io_kwargs
    ) as all_t, open(
        f"{directory}/valid.{src_lang}", "w", **io_kwargs
    ) as valid_s, open(
        f"{directory}/valid.{tgt_lang}", "w", **io_kwargs
    ) as valid_t, open(
        f"{directory}/train.{src_lang}", "w", **io_kwargs
    ) as train_s, open(
        f"{directory}/train.{tgt_lang}", "w", **io_kwargs
    ) as train_t:
        for count, (s_line, t_line) in enumerate(zip(all_s, all_t)):
            # partition keeps everything after the first tab, including the
            # trailing newline and any further tabs inside the sentence.
            s_text = s_line.partition("\t")[2]
            t_text = t_line.partition("\t")[2]
            if count >= valid_size:
                train_s.write(s_text)
                train_t.write(t_text)
            else:
                valid_s.write(s_text)
                valid_t.write(t_text)


def _mine(args):
    """Run margin-based bitext mining for the parsed command-line ``args``."""
    import os  # local import: the module header does not import os

    x_batches_f, x_sents_f = get_batches(args.src_dir, args.src_lang)
    y_batches_f, y_sents_f = get_batches(args.tgt_dir, args.tgt_lang)

    def margin(a, b):
        return a / b

    y2x_sim, y2x_ind = knnGPU_sharded(
        y_batches_f, x_batches_f, args.dim, args.neighborhood, direction="y2x"
    )
    x2y_sim, x2y_ind = knnGPU_sharded(
        x_batches_f, y_batches_f, args.dim, args.neighborhood, direction="x2y"
    )

    x2y_mean = x2y_sim.mean(axis=1)
    y2x_mean = y2x_sim.mean(axis=1)
    fwd_scores = score_candidates(x2y_sim, x2y_ind, x2y_mean, y2x_mean, margin)
    bwd_scores = score_candidates(y2x_sim, y2x_ind, y2x_mean, x2y_mean, margin)
    fwd_best = x2y_ind[np.arange(x2y_sim.shape[0]), fwd_scores.argmax(axis=1)]
    bwd_best = y2x_ind[np.arange(y2x_sim.shape[0]), bwd_scores.argmax(axis=1)]
    # One (source index, target index) row per forward and backward best match.
    indices = np.stack(
        (
            np.concatenate((np.arange(x2y_ind.shape[0]), bwd_best)),
            np.concatenate((fwd_best, np.arange(y2x_ind.shape[0]))),
        ),
        axis=1,
    )
    scores = np.concatenate((fwd_scores.max(axis=1), bwd_scores.max(axis=1)))

    x_sentences = load_text(x_sents_f)
    y_sentences = load_text(y_sents_f)

    threshold = args.threshold
    directory = args.output
    # Create the output directory without going through a shell command line.
    os.makedirs(directory, exist_ok=True)
    count = _write_mined_pairs(
        directory,
        args.src_lang,
        args.tgt_lang,
        indices,
        scores,
        x_sentences,
        y_sentences,
        threshold,
        args.min_count,
    )
    print(f"Found {count} pairs for threshold={threshold}")
    _split_train_valid(directory, args.src_lang, args.tgt_lang, args.valid_size)


if __name__ == "__main__":
    _mine(_build_parser().parse_args())
${ENCODER_SAVE_DIR}/${source_lang} ## Save encoder outputs for target sentences python $SAVE_ENCODER \ ${INPUT_DIR} \ --path ${MODEL} \ --lang-pairs ${source_lang}-${target_lang} \ --lang-dict ${LANG_DICT} \ --task translation_multi_simple_epoch \ --gen-subset ${SPLIT} \ --bpe 'sentencepiece' \ -t ${source_lang} -s ${target_lang} \ --sentencepiece-model ${SPM} \ --remove-bpe 'sentencepiece' \ --beam 1 \ --lang-tok-style mbart \ --encoder-save-dir ${ENCODER_SAVE_DIR}/${target_lang} ## Mining python mining/mine.py \ --src-lang ${source_lang} \ --tgt-lang ${target_lang} \ --dim 1024 \ --mem 10 \ --neighborhood 4 \ --src-dir ${ENCODER_SAVE_DIR}/${source_lang} \ --tgt-dir ${ENCODER_SAVE_DIR}/${target_lang} \ --output $SAVE_DIR \ --threshold ${THRESHOLD} \ --min-count ${MIN_COUNT} \ --valid-size 100 \ --dict-path ${DICT} \ --spm-path ${SPM} \ ## Process and binarize mined data python $SPM_ENCODE \ --model ${SPM} \ --output_format=piece \ --inputs mining/${source_lang}_${target_lang}_mined/train.${source_lang} mining/${source_lang}_${target_lang}_mined/train.${target_lang} \ --outputs mining/${source_lang}_${target_lang}_mined/train.bpe.${source_lang} mining/${source_lang}_${target_lang}_mined/train.bpe.${target_lang} python $SPM_ENCODE \ --model ${SPM} \ --output_format=piece \ --inputs mining/${source_lang}_${target_lang}_mined/valid.${source_lang} mining/${source_lang}_${target_lang}_mined/valid.${target_lang} \ --outputs mining/${source_lang}_${target_lang}_mined/valid.bpe.${source_lang} mining/${source_lang}_${target_lang}_mined/valid.bpe.${target_lang} fairseq-preprocess \ --source-lang ${source_lang} \ --target-lang ${target_lang} \ --trainpref mining/${source_lang}_${target_lang}_mined/train.bpe \ --validpref mining/${source_lang}_${target_lang}_mined/valid.bpe \ --destdir mining/${source_lang}_${target_lang}_mined \ --srcdict ${DICT} \ --joined-dictionary \ --workers 8 ================================================ FILE: examples/criss/save_encoder.py 
def get_avg_pool(
    models, sample, prefix_tokens, src_dict, remove_bpe, has_langtok=False
):
    """Average the encoder states of each sentence over its non-padded positions.

    Args:
        models: models wrapped in an ``EnsembleModel``; only the first
            model's encoder output is used.
        sample: batch dict; every entry of ``sample["net_input"]`` except
            ``prev_output_tokens`` is fed to the encoder.
        prefix_tokens: unused; accepted for backward compatibility.
        src_dict: unused; accepted for backward compatibility.
        remove_bpe: unused; accepted for backward compatibility.
        has_langtok: when true, the first position (the language token) is
            left out of the average.

    Returns:
        A float32 numpy array with one pooled vector per sentence.
    """
    model = EnsembleModel(models)

    # model.forward normally channels prev_output_tokens into the decoder
    # separately, but SequenceGenerator directly calls model.encoder
    encoder_input = {
        k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
    }

    # compute the encoder output for each beam
    encoder_outs = model.forward_encoder(encoder_input)
    np_encoder_outs = encoder_outs[0].encoder_out.cpu().numpy().astype(np.float32)
    # 1.0 on real tokens, 0.0 on padding.
    encoder_mask = 1 - encoder_outs[0].encoder_padding_mask.cpu().numpy().astype(
        np.float32
    )
    # Transpose the mask and add a trailing axis so it broadcasts against
    # the encoder states along their first (position) axis.
    encoder_mask = np.expand_dims(encoder_mask.T, axis=2)
    if has_langtok:
        # Drop the first position from BOTH arrays. The states were
        # previously indexed with [1, ...], which selected a single position
        # instead of removing the language token, so the "average" collapsed
        # to that one position.
        encoder_mask = encoder_mask[1:, :, :]
        np_encoder_outs = np_encoder_outs[1:, :, :]
    masked_encoder_outs = encoder_mask * np_encoder_outs
    avg_pool = (masked_encoder_outs / encoder_mask.sum(axis=0)).sum(axis=0)
    return avg_pool
def _write_shard(save_dir, lang, shard_id, pooled_chunks, sentences):
    """Write one shard: raw float32 embeddings plus the matching sentence lines."""
    embeddings = np.concatenate(pooled_chunks)
    # The embeddings are raw bytes, so the file is opened in binary mode.
    with open(f"{save_dir}/all_avg_pool.{lang}.{shard_id}", "wb") as avg_pool_file:
        embeddings.tofile(avg_pool_file)
    with open(f"{save_dir}/sentences.{lang}.{shard_id}", "w") as sentence_file:
        sentence_file.writelines(f"{line}\n" for line in sentences)


def main(args):
    """Encode ``args.gen_subset`` and save average-pooled sentence embeddings.

    For every batch the encoder output is pooled with ``get_avg_pool``. The
    pooled vectors go to ``all_avg_pool.<source_lang>.<shard>`` (raw
    float32) and the ``id<TAB>sentence`` lines to
    ``sentences.<source_lang>.<shard>`` inside ``args.encoder_save_dir``. A
    new shard is started once at least 1,000,000 vectors are buffered.

    Returns:
        None
    """
    assert args.path is not None, "--path required for generation!"
    assert (
        not args.sampling or args.nbest == args.beam
    ), "--sampling requires --nbest to be equal to --beam"
    assert (
        args.replace_unk is None or args.raw_text
    ), "--replace-unk requires a raw text dataset (--raw-text)"

    args.beam = 1
    utils.import_user_module(args)

    if args.max_tokens is None:
        args.max_tokens = 12000
    print(args)
    use_cuda = torch.cuda.is_available() and not args.cpu

    # Load dataset splits
    task = tasks.setup_task(args)
    task.load_dataset(args.gen_subset)

    # Set dictionaries
    try:
        src_dict = getattr(task, "source_dictionary", None)
    except NotImplementedError:
        src_dict = None
    tgt_dict = task.target_dictionary

    # Load ensemble
    print("| loading model(s) from {}".format(args.path))
    # NOTE(review): eval() executes the --model-overrides string as Python;
    # only safe while that argument comes from a trusted command line.
    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.path.split(":"),
        arg_overrides=eval(args.model_overrides),
        task=task,
    )

    # Optimize ensemble for generation
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.replace_unk)

    # Load dataset (possibly sharded)
    itr = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_positions=utils.resolve_max_positions(
            task.max_positions(),
        ),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
    ).next_epoch_itr(shuffle=False)

    num_sentences = 0
    source_sentences = []
    shard_id = 0
    # Pooled batches are buffered and concatenated once per shard rather
    # than after every batch.
    pooled_chunks = []
    pooled_rows = 0
    encoder_has_langtok = (
        safe_hasattr(task.args, "encoder_langtok")
        and task.args.encoder_langtok is not None
        and safe_hasattr(task.args, "lang_tok_replacing_bos_eos")
        and not task.args.lang_tok_replacing_bos_eos
    )
    with progress_bar.build_progress_bar(args, itr) as t:
        for sample in t:
            if sample is None:
                print("Skipping None")
                continue
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if "net_input" not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample["target"][:, : args.prefix_size]

            with torch.no_grad():
                avg_pool = get_avg_pool(
                    models,
                    sample,
                    prefix_tokens,
                    src_dict,
                    args.post_process,
                    has_langtok=encoder_has_langtok,
                )
            pooled_chunks.append(avg_pool)
            pooled_rows += avg_pool.shape[0]

            if not isinstance(sample["id"], list):
                sample_ids = sample["id"].tolist()
            else:
                sample_ids = sample["id"]
            for i, sample_id in enumerate(sample_ids):
                # Remove padding
                src_tokens = utils.strip_pad(
                    sample["net_input"]["src_tokens"][i, :], tgt_dict.pad()
                )

                # Either retrieve the original sentences or regenerate them from tokens.
                if align_dict is not None:
                    src_str = task.dataset(args.gen_subset).src.get_original_text(
                        sample_id
                    )
                elif src_dict is not None:
                    src_str = src_dict.string(src_tokens, args.post_process)
                else:
                    src_str = ""

                if not args.quiet and src_dict is not None:
                    print("S-{}\t{}".format(sample_id, src_str))

                source_sentences.append(f"{sample_id}\t{src_str}")

            num_sentences += sample["nsentences"]
            if pooled_rows >= 1000000:
                _write_shard(
                    args.encoder_save_dir,
                    args.source_lang,
                    shard_id,
                    pooled_chunks,
                    source_sentences,
                )
                pooled_chunks = []
                pooled_rows = 0
                source_sentences = []
                shard_id += 1

    # Flush whatever is left after the last full shard.
    if pooled_chunks:
        _write_shard(
            args.encoder_save_dir,
            args.source_lang,
            shard_id,
            pooled_chunks,
            source_sentences,
        )
    return None
DIM = 1024


def _unit_rows(embs):
    """Stack the vectors of ``embs`` (id -> vector) into L2-normalized rows."""
    # np.stack needs a real sequence; a dict view is rejected by current NumPy.
    mat = np.stack(list(embs.values()), axis=0)
    return mat / np.linalg.norm(mat, axis=1, keepdims=True)


def compute_dist(source_embs, target_embs, k=5, return_sim_mat=False):
    """Rank target sentences by cosine similarity for every source sentence.

    Args:
        source_embs: mapping of sentence id -> embedding vector.
        target_embs: mapping of sentence id -> embedding vector.
        k: number of nearest targets to return per source sentence.
        return_sim_mat: return the full similarity matrix instead.

    Returns:
        The ``(num_source, num_target)`` cosine-similarity matrix when
        ``return_sim_mat`` is true; otherwise a dict mapping each source id
        to the ids of its ``k`` most similar targets, best first.
    """
    target_ids = list(target_embs)
    sim_mat = _unit_rows(source_embs).dot(_unit_rows(target_embs).T)
    if return_sim_mat:
        return sim_mat

    neighbors_map = {}
    for i, sentence_id in enumerate(source_embs):
        idx = np.argsort(sim_mat[i, :])[::-1][:k]
        neighbors_map[sentence_id] = [target_ids[tid] for tid in idx]
    return neighbors_map


def load_embeddings(directory, LANGS):
    """Load every embedding shard and its sentence file for each language.

    Shards are read from ``<directory>/<lang>/all_avg_pool.<lang>.<shard>``
    (raw float32, ``DIM`` values per row) together with
    ``<directory>/<lang>/sentences.<lang>.<shard>`` (``id<TAB>sentence`` per
    line, same row order).

    Returns:
        ``(sentence_embeddings, sentence_texts)``: two dicts keyed by
        language, each mapping sentence id to its vector / text.
    """
    sentence_embeddings = {}
    sentence_texts = {}
    for lang in LANGS:
        sentence_embeddings[lang] = {}
        sentence_texts[lang] = {}
        lang_dir = f"{directory}/{lang}"
        embedding_files = glob.glob(f"{lang_dir}/all_avg_pool.{lang}.*")
        for embed_file in embedding_files:
            shard_id = embed_file.split(".")[-1]
            embeddings = np.fromfile(embed_file, dtype=np.float32)
            num_rows = embeddings.shape[0] // DIM
            embeddings = embeddings.reshape((num_rows, DIM))

            with open(f"{lang_dir}/sentences.{lang}.{shard_id}") as sentence_file:
                for idx, line in enumerate(sentence_file):
                    # partition tolerates an empty sentence and tabs inside
                    # the sentence, where a two-way unpack of split() raised.
                    sentence_id, _, sentence = line.strip().partition("\t")
                    sentence_texts[lang][sentence_id] = sentence
                    sentence_embeddings[lang][sentence_id] = embeddings[idx, :]

    return sentence_embeddings, sentence_texts


def compute_accuracy(directory, LANGS):
    """Print the top-1 retrieval accuracy table and save it to ``<directory>/accuracy``.

    For every (source, target) language pair the accuracy is the number of
    source sentences whose nearest target carries the same sentence id,
    divided by the number of target sentences.
    """
    sentence_embeddings, _sentence_texts = load_embeddings(directory, LANGS)

    top1_str = " ".join(LANGS) + "\n"
    for source_lang in LANGS:
        top1_str += f"{source_lang} "
        for target_lang in LANGS:
            neighbors_map = compute_dist(
                sentence_embeddings[source_lang], sentence_embeddings[target_lang]
            )
            top1 = sum(
                1
                for sentence_id, neighbors in neighbors_map.items()
                if sentence_id == neighbors[0]
            )
            n = len(sentence_embeddings[target_lang])
            top1_str += f"{top1/n} "
        top1_str += "\n"

    print(top1_str)
    # Use a context manager so the report file is flushed and closed.
    with open(f"{directory}/accuracy", "w") as accuracy_file:
        print(top1_str, file=accuracy_file)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Analyze encoder outputs")
    parser.add_argument(
        "directory",
        help="Directory containing one sub-directory of encoder outputs per language",
    )
    parser.add_argument(
        "--langs", required=True, help="Comma-separated list of langs"
    )
    args = parser.parse_args()
    langs = args.langs.split(",")
    compute_accuracy(args.directory, langs)
# source_lang=kk_KZ target_lang=en_XX MODEL=criss_checkpoints/criss.3rd.pt SPM=criss_checkpoints/sentence.bpe.model SPLIT=test LANG_DICT=criss_checkpoints/lang_dict.txt ENCODER_ANALYSIS=sentence_retrieval/encoder_analysis.py SAVE_ENCODER=save_encoder.py ENCODER_SAVE_ROOT=sentence_embeddings/$MODEL DATA_DIR=data_tmp INPUT_DIR=$DATA_DIR/${source_lang}-${target_lang}-tatoeba ENCODER_SAVE_DIR=${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang} mkdir -p $ENCODER_SAVE_DIR/${target_lang} mkdir -p $ENCODER_SAVE_DIR/${source_lang} # Save encoder outputs for source sentences python $SAVE_ENCODER \ ${INPUT_DIR} \ --path ${MODEL} \ --task translation_multi_simple_epoch \ --lang-dict ${LANG_DICT} \ --gen-subset ${SPLIT} \ --bpe 'sentencepiece' \ --lang-pairs ${source_lang}-${target_lang} \ -s ${source_lang} -t ${target_lang} \ --sentencepiece-model ${SPM} \ --remove-bpe 'sentencepiece' \ --beam 1 \ --lang-tok-style mbart \ --encoder-save-dir ${ENCODER_SAVE_DIR}/${source_lang} # Save encoder outputs for target sentences python $SAVE_ENCODER \ ${INPUT_DIR} \ --path ${MODEL} \ --lang-dict ${LANG_DICT} \ --task translation_multi_simple_epoch \ --gen-subset ${SPLIT} \ --bpe 'sentencepiece' \ --lang-pairs ${target_lang}-${source_lang} \ -t ${source_lang} -s ${target_lang} \ --sentencepiece-model ${SPM} \ --remove-bpe 'sentencepiece' \ --beam 1 \ --lang-tok-style mbart \ --encoder-save-dir ${ENCODER_SAVE_DIR}/${target_lang} # Analyze sentence retrieval accuracy python $ENCODER_ANALYSIS --langs "${source_lang},${target_lang}" ${ENCODER_SAVE_DIR} ================================================ FILE: examples/criss/unsupervised_mt/eval.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
# SRC=si_LK TGT=en_XX MODEL=criss_checkpoints/criss.3rd.pt MULTIBLEU=mosesdecoder/scripts/generic/multi-bleu.perl MOSES=mosesdecoder REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl GEN_TMP_DIR=gen_tmp LANG_DICT=criss_checkpoints/lang_dict.txt if [ ! -d "mosesdecoder" ]; then git clone https://github.com/moses-smt/mosesdecoder fi mkdir -p $GEN_TMP_DIR fairseq-generate data_tmp/${SRC}-${TGT}-flores \ --task translation_multi_simple_epoch \ --max-tokens 2000 \ --path ${MODEL} \ --skip-invalid-size-inputs-valid-test \ --beam 5 --lenpen 1.0 --gen-subset test \ --remove-bpe=sentencepiece \ --source-lang ${SRC} --target-lang ${TGT} \ --decoder-langtok --lang-pairs 'en_XX-ar_AR,en_XX-de_DE,en_XX-es_XX,en_XX-fr_XX,en_XX-hi_IN,en_XX-it_IT,en_XX-ja_XX,en_XX-ko_KR,en_XX-nl_XX,en_XX-ru_RU,en_XX-zh_CN,en_XX-tr_TR,en_XX-vi_VN,en_XX-ro_RO,en_XX-my_MM,en_XX-ne_NP,en_XX-si_LK,en_XX-cs_CZ,en_XX-lt_LT,en_XX-kk_KZ,en_XX-gu_IN,en_XX-fi_FI,en_XX-et_EE,en_XX-lv_LV,ar_AR-en_XX,cs_CZ-en_XX,de_DE-en_XX,es_XX-en_XX,et_EE-en_XX,fi_FI-en_XX,fr_XX-en_XX,gu_IN-en_XX,hi_IN-en_XX,it_IT-en_XX,ja_XX-en_XX,kk_KZ-en_XX,ko_KR-en_XX,lt_LT-en_XX,lv_LV-en_XX,my_MM-en_XX,ne_NP-en_XX,nl_XX-en_XX,ro_RO-en_XX,ru_RU-en_XX,si_LK-en_XX,tr_TR-en_XX,vi_VN-en_XX,zh_CN-en_XX,ar_AR-es_XX,es_XX-ar_AR,ar_AR-hi_IN,hi_IN-ar_AR,ar_AR-zh_CN,zh_CN-ar_AR,cs_CZ-es_XX,es_XX-cs_CZ,cs_CZ-hi_IN,hi_IN-cs_CZ,cs_CZ-zh_CN,zh_CN-cs_CZ,de_DE-es_XX,es_XX-de_DE,de_DE-hi_IN,hi_IN-de_DE,de_DE-zh_CN,zh_CN-de_DE,es_XX-hi_IN,hi_IN-es_XX,es_XX-zh_CN,zh_CN-es_XX,et_EE-es_XX,es_XX-et_EE,et_EE-hi_IN,hi_IN-et_EE,et_EE-zh_CN,zh_CN-et_EE,fi_FI-es_XX,es_XX-fi_FI,fi_FI-hi_IN,hi_IN-fi_FI,fi_FI-zh_CN,zh_CN-fi_FI,fr_XX-es_XX,es_XX-fr_XX,fr_XX-hi_IN,hi_IN-fr_XX,fr_XX-zh_CN,zh_CN-fr_XX,gu_IN-es_XX,es_XX-gu_IN,gu_IN-hi_IN,hi_IN-gu_IN,gu_IN-zh_CN
,zh_CN-gu_IN,hi_IN-zh_CN,zh_CN-hi_IN,it_IT-es_XX,es_XX-it_IT,it_IT-hi_IN,hi_IN-it_IT,it_IT-zh_CN,zh_CN-it_IT,ja_XX-es_XX,es_XX-ja_XX,ja_XX-hi_IN,hi_IN-ja_XX,ja_XX-zh_CN,zh_CN-ja_XX,kk_KZ-es_XX,es_XX-kk_KZ,kk_KZ-hi_IN,hi_IN-kk_KZ,kk_KZ-zh_CN,zh_CN-kk_KZ,ko_KR-es_XX,es_XX-ko_KR,ko_KR-hi_IN,hi_IN-ko_KR,ko_KR-zh_CN,zh_CN-ko_KR,lt_LT-es_XX,es_XX-lt_LT,lt_LT-hi_IN,hi_IN-lt_LT,lt_LT-zh_CN,zh_CN-lt_LT,lv_LV-es_XX,es_XX-lv_LV,lv_LV-hi_IN,hi_IN-lv_LV,lv_LV-zh_CN,zh_CN-lv_LV,my_MM-es_XX,es_XX-my_MM,my_MM-hi_IN,hi_IN-my_MM,my_MM-zh_CN,zh_CN-my_MM,ne_NP-es_XX,es_XX-ne_NP,ne_NP-hi_IN,hi_IN-ne_NP,ne_NP-zh_CN,zh_CN-ne_NP,nl_XX-es_XX,es_XX-nl_XX,nl_XX-hi_IN,hi_IN-nl_XX,nl_XX-zh_CN,zh_CN-nl_XX,ro_RO-es_XX,es_XX-ro_RO,ro_RO-hi_IN,hi_IN-ro_RO,ro_RO-zh_CN,zh_CN-ro_RO,ru_RU-es_XX,es_XX-ru_RU,ru_RU-hi_IN,hi_IN-ru_RU,ru_RU-zh_CN,zh_CN-ru_RU,si_LK-es_XX,es_XX-si_LK,si_LK-hi_IN,hi_IN-si_LK,si_LK-zh_CN,zh_CN-si_LK,tr_TR-es_XX,es_XX-tr_TR,tr_TR-hi_IN,hi_IN-tr_TR,tr_TR-zh_CN,zh_CN-tr_TR,vi_VN-es_XX,es_XX-vi_VN,vi_VN-hi_IN,hi_IN-vi_VN,vi_VN-zh_CN,zh_CN-vi_VN' \ --lang-dict ${LANG_DICT} --lang-tok-style 'mbart' --sampling-method 'temperature' --sampling-temperature '1.0' > $GEN_TMP_DIR/${SRC}_${TGT}.gen cat $GEN_TMP_DIR/${SRC}_${TGT}.gen | grep -P "^T-" | cut -f2 | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l ${TGT:0:2} | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape ${TGT:0:2} > $GEN_TMP_DIR/${SRC}_${TGT}.hyp cat $GEN_TMP_DIR/${SRC}_${TGT}.gen | grep -P "^H-" | cut -f3 | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l ${TGT:0:2} | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape ${TGT:0:2} > $GEN_TMP_DIR/${SRC}_${TGT}.ref ${MULTIBLEU} $GEN_TMP_DIR/${SRC}_${TGT}.ref < $GEN_TMP_DIR/${SRC}_${TGT}.hyp ================================================ FILE: examples/cross_lingual_language_model/README.md ================================================ # Cross-Lingual Language Model Pre-training Below are some details for training Cross-Lingual Language Models (XLM) - similar to the ones presented in [Lample & Conneau, 
2019](https://arxiv.org/pdf/1901.07291.pdf) - in Fairseq. The current implementation only supports the Masked Language Model (MLM) from the paper above. ## Downloading and Tokenizing Monolingual Data Pointers to the monolingual data from Wikipedia used for training the XLM-style MLM model, as well as details on processing it (tokenization and BPE), can be found in the [XLM Github Repository](https://github.com/facebookresearch/XLM#download--preprocess-monolingual-data). Let's assume the following for the code snippets in later sections to work - Processed data is in the folder: monolingual_data/processed - Each language has 3 files for train, test and validation. For example we have the following files for English: train.en, valid.en, test.en - We are training a model for 5 languages: Arabic (ar), German (de), English (en), Hindi (hi) and French (fr) - The vocabulary file is monolingual_data/processed/vocab_mlm ## Fairseq Pre-processing and Binarization Pre-process and binarize the data with the MaskedLMDictionary and cross_lingual_lm task ```bash # Ensure the output directory exists DATA_DIR=monolingual_data/fairseq_processed mkdir -p "$DATA_DIR" for lg in ar de en hi fr do fairseq-preprocess \ --task cross_lingual_lm \ --srcdict monolingual_data/processed/vocab_mlm \ --only-source \ --trainpref monolingual_data/processed/train \ --validpref monolingual_data/processed/valid \ --testpref monolingual_data/processed/test \ --destdir monolingual_data/fairseq_processed \ --workers 20 \ --source-lang $lg # Since we only have a source language, the output file has a None for the # target language. Remove this for stage in train test valid do sudo mv "$DATA_DIR/$stage.$lg-None.$lg.bin" "$DATA_DIR/$stage.$lg.bin" sudo mv "$DATA_DIR/$stage.$lg-None.$lg.idx" "$DATA_DIR/$stage.$lg.idx" done done ``` ## Train a Cross-lingual Language Model similar to the XLM MLM model Use the following command to train the model on 5 languages.
``` fairseq-train \ --task cross_lingual_lm monolingual_data/fairseq_processed \ --save-dir checkpoints/mlm \ --max-update 2400000 --save-interval 1 --no-epoch-checkpoints \ --arch xlm_base \ --optimizer adam --lr-scheduler reduce_lr_on_plateau \ --lr-shrink 0.5 --lr 0.0001 --stop-min-lr 1e-09 \ --dropout 0.1 \ --criterion legacy_masked_lm_loss \ --max-tokens 2048 --tokens-per-sample 256 --attention-dropout 0.1 \ --dataset-impl lazy --seed 0 \ --masked-lm-only \ --monolingual-langs 'ar,de,en,hi,fr' --num-segment 5 \ --ddp-backend=legacy_ddp ``` Some Notes: - Using tokens_per_sample greater than 256 can cause OOM (out-of-memory) issues. Usually since MLM packs in streams of text, this parameter doesn't need much tuning. - The Evaluation workflow for computing MLM Perplexity on test data is in progress. - Finetuning this model on a downstream task is something which is not currently available. ================================================ FILE: examples/data2vec/README.md ================================================ # data2vec 2.0 data2vec 2.0 improves the training efficiency of the original data2vec algorithm. We make the following improvements for efficiency considerations - we forward only the unmasked timesteps through the encoder, we use convolutional decoder and we use multimasking to amortize the compute overhead of the teacher model. You can find details in the paper [Efficient Self-supervised Learning with Contextualized Target Representations for Vision, Speech and Language](https://arxiv.org/abs/2212.07525) and our [blog post](https://ai.facebook.com/blog/ai-self-supervised-learning-data2vec/). 
## Pretrained and finetuned models ### Vision | Model | Finetuning split | Link |---|---|--- data2vec ViT-B | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_imagenet.pt) data2vec ViT-B | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_imagenet_ft.pt) data2vec ViT-L | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_imagenet.pt) data2vec ViT-L | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_imagenet_ft.pt) data2vec ViT-H | No fine-tuning | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/huge_imagenet.pt) data2vec ViT-H | Imagenet-1K | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/huge_imagenet_ft.pt) Vision models only are license under CC-BY-NC. ### Speech | Model | Finetuning split | Dataset | Link |---|---|---|--- data2vec Base | No fine-tuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_libri.pt) data2vec Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/base_libri_960h.pt) data2vec Large | No fine-tuning | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_vox.pt) data2vec Large | 960 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/large_vox_960h.pt) ### NLP | Model | Fine-tuning data | Dataset | Link | Dict | BPE |---|---|---|---|---|--- data2vec Base | No fine-tuning | Books + Wiki | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec2/nlp_base.pt) | [dict](https://dl.fbaipublicfiles.com/fairseq/data2vec2/dict.txt) | [encoder](https://dl.fbaipublicfiles.com/fairseq/data2vec2/encoder.json) / [vocab](https://dl.fbaipublicfiles.com/fairseq/data2vec2/vocab.bpe) [//]: # (## Data Preparation) [//]: # () 
[//]: # (### Vision) [//]: # (add details) [//]: # (### Speech) [//]: # (add details) [//]: # () [//]: # (### NLP) [//]: # (add details) ## Commands to train different models using data2vec 2.0 ### Vision Commands to pretrain different model configurations ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ --config-name base_images_only_task task.data=/path/to/dir ``` ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ --config-name large_images_only_task task.data=/path/to/dir ``` ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ --config-name huge_images14_only_task task.data=/path/to/dir ``` Commands to finetune different model configurations ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \ --config-name mae_imagenet_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model ``` ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \ --config-name mae_imagenet_large_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model ``` ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/vision/finetuning \ --config-name mae_imagenet_huge_clean task.data=/path/to/dir model.model_path=/path/to/pretrained/model ``` ### Speech ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ --config-name base_audio_only_task task.data=/path/to/manifests ``` ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ --config-name large_audio_only_task task.data=/path/to/manifests ``` Finetuning: ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/wav2vec/config/finetuning --config-name vox_10h \ task.data=/path/to/manifests 
model.w2v_path=/path/to/pretrained/model common.user_dir=examples/data2vec ``` Replace vox_10h with the right config depending on your model and fine-tuning split. See examples/wav2vec/config/finetuning for all available configs. ### NLP Commands to pretrain ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2 \ --config-name base_text_only_task task.data=/path/to/file ``` Commands to fine-tune all GLUE tasks ```shell script $ task=cola # choose from [cola|qnli|mrpc|rte|sst_2|mnli|qqp|sts_b] $ lr=1e-5 # sweep [1e-5|2e-5|4e-5|6e-5] for each task $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/v2/text_finetuning \ --config-name $task task.data=/path/to/file model.model_path=/path/to/pretrained/model "optimization.lr=[${lr}]" ``` # data2vec data2vec is a framework for self-supervised representation learning for images, speech, and text as described in [data2vec: A General Framework for Self-supervised Learning in Speech, Vision and Language (Baevski et al., 2022)](https://ai.facebook.com/research/data2vec-a-general-framework-for-self-supervised-learning-in-speech-vision-and-language). The algorithm uses the same learning mechanism for different modalities. ## Pre-trained models ### Vision Code and pre-trained models for data2vec visions can be found [here](https://github.com/facebookresearch/data2vec_vision/tree/main/beit). 
### Speech | Model | Finetuning split | Dataset | Link |---|---|---|--- data2vec Base | No fine-tuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls.pt) data2vec Base | 10 minutes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_10m.pt) data2vec Base | 100 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_100h.pt) data2vec Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/audio_base_ls_960h.pt) data2vec Large | No fine-tuning | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_pretrained.pt) data2vec Large | 10 minutes | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_10m.pt) data2vec Large | 100 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_100h.pt) data2vec Large | 960 hours | [Libri-light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/vox_960h.pt) --- ### NLP Model | Fine-tuning data | Dataset | Link |---|---|---|---| data2vec Base | No fine-tuning | Books + Wiki | [download](https://dl.fbaipublicfiles.com/fairseq/data2vec/nlp_base.pt) ## Training a new speech model with the CLI tools Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate file 10 to 30 seconds in length) ### Prepare training data manifest: First, install the `soundfile` library: ```shell script pip install soundfile ``` Next, run: ```shell script $ python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext 
--valid-percent $valid ``` $ext should be set to flac, wav, or whatever format your dataset happens to use that soundfile can read. $valid should be set to some reasonable percentage (like 0.01) of training data to use for validation. To use a pre-defined validation set (like dev-other from librispeech), set it to 0 and then overwrite valid.tsv with a separately pre-processed manifest file. ### Train a data2vec Base model: This configuration was used for the base model trained on the Librispeech dataset in the data2vec paper. Note that the input is expected to be single channel, sampled at 16 kHz. ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/audio/pretraining \ --config-name base_librispeech task.data=/path/to/manifests common.user_dir=examples/data2vec ``` Note: you can simulate 16 GPUs by using k GPUs and adding command line parameters `distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 16/k ### Fine-tune a pre-trained model with CTC: Fine-tuning a model requires parallel audio and label files, as well as a vocabulary file in fairseq format. A letter vocabulary can be downloaded [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt). An example [script](../wav2vec/libri_labels.py) that generates labels for the Librispeech dataset from the tsv file produced by wav2vec_manifest.py can be used as follows: ```shell script split=train $ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $split ``` Fine-tuning on 100h of Librispeech with letter targets: ```shell script $ fairseq-hydra-train \ distributed_training.distributed_port=$PORT \ task.data=/path/to/data \ model.w2v_path=/path/to/model.pt \ --config-dir /path/to/fairseq-py/examples/wav2vec/config/finetuning \ --config-name base_100h common.user_dir=examples/data2vec ``` There are other config files in the config/finetuning directory that can be used to fine-tune on other splits.
You can specify the right config via the `--config-name` parameter. Decoding with a language model during training requires flashlight [python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)). If you want to use a language model, add `+criterion.wer_args='[/path/to/kenlm, /path/to/lexicon, 2, -1]'` to the command line. ### Evaluating a CTC model: Evaluating a CTC model with a language model requires [flashlight python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) to be installed. The Fairseq transformer language model used in the wav2vec 2.0 paper can be obtained from the [wav2letter model repository](https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019). Be sure to upper-case the language model vocab after downloading it. The letter dictionary for pre-trained models can be found [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt). Next, run the evaluation command: ```shell script python examples/speech_recognition/new/infer.py --config-dir examples/speech_recognition/new/conf \ --config-name infer task=audio_finetuning task.data=/path/to/manifests common.user_dir=examples/data2vec \ task.labels=ltr decoding.type=kenlm \ decoding.lmweight=${lmweight} decoding.wordscore=${wordscore} decoding.silweight=${silscore} \ decoding.lexicon=/path/to/lexicon \ decoding.lmpath=/path/to/lm decoding.unique_wer_file=True \ dataset.gen_subset=dev_clean,dev_other,test_clean,test_other \ common_eval.path=/path/to/checkpoint.pt decoding.beam=1500 distributed_training.distributed_world_size=${num_gpus} ``` To get raw numbers, use decoding.type=viterbi and omit the lexicon. To use the transformer language model, use decoding.type=fairseqlm.
## Training a new NLP model with the CLI tools Please follow the [RoBERTa](../roberta/README.md) instructions to preprocess your data. To train a data2vec model, run: ```shell script $ python fairseq_cli/hydra_train.py -m --config-dir examples/data2vec/config/text/pretraining \ --config-name base task.data=/path/to/data common.user_dir=examples/data2vec ``` As for speech models, you can simulate 16 GPUs by using the update_freq parameter. ### Finetuning data2vec-text on GLUE Please use a command similar to this: ```shell $ python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task task.data=$data_path checkpoint.restore_file=/path/to/pretrained/model.pt ``` ================================================ FILE: examples/data2vec/__init__.py ================================================ ================================================ FILE: examples/data2vec/config/audio/classification/base_classification.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 all_gather_list_size: 70000 tensorboard_logdir: tb min_loss_scale: 1e-6 checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: mAP maximize_best_checkpoint_metric: true task: _name: audio_classification data: ??? normalize: true labels: lbl dataset: num_workers: 6 max_tokens: 2560000 skip_invalid_size_inputs_valid_test: true valid_subset: eval validate_interval: 5 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: model can_sum: false log_keys: - _predictions - _targets optimization: max_update: 30000 lr: [0.00006] # scratch 53-5 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 5000 model: _name: audio_classification model_path: ???
apply_mask: true mask_prob: 0.6 mask_length: 5 # scratch 1 mask_channel_prob: 0 mask_channel_length: 64 layerdrop: 0.1 dropout: 0.1 activation_dropout: 0.1 attention_dropout: 0.2 feature_grad_mult: 0 # scratch 1 label_mixup: true source_mixup: 0.5 prediction_mode: lin_softmax # scratch average_sigmoid ================================================ FILE: examples/data2vec/config/audio/classification/run_config/slurm_1.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/classification/run_config/slurm_1g.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow sweep: dir: 
/checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 1 tasks_per_node: 1 mem_gb: 100 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/classification/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/audioset.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: /private/home/abaevski/data/audioset max_sample_size: 320000 min_sample_size: 32000 normalize: true 
dataset: num_workers: 6 max_tokens: 3400000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 24 ddp_backend: legacy_ddp criterion: _name: model log_keys: - ema_decay - target_var - pred_var # - avg_self_attn # - weights optimization: max_update: 200000 lr: [0.0005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 10000 model: _name: data2vec_audio extractor_mode: layer_norm encoder_layerdrop: 0.05 dropout_input: 0.0 dropout_features: 0.0 feature_grad_mult: 1.0 encoder_embed_dim: 768 mask_prob: 0.65 mask_length: 10 loss_beta: 0 loss_scale: null instance_norm_target_layer: true layer_norm_targets: true average_top_k_layers: 12 self_attn_norm_type: deepnorm final_norm_type: deepnorm pos_conv_depth: 5 conv_pos: 95 ema_decay: 0.999 ema_end_decay: 0.9999 ema_anneal_end_step: 30000 ema_transformer_only: true ema_layers_only: false require_same_masks: true mask_dropout: 0 ================================================ FILE: examples/data2vec/config/audio/pretraining/base_librispeech.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 320000 min_sample_size: 32000 normalize: true dataset: num_workers: 6 max_tokens: 3800000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: legacy_ddp criterion: _name: model log_keys: - ema_decay - target_var - pred_var optimization: max_update: 400000 lr: [0.0005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: tri_stage phase_ratio: [0.03,0.9,0.07] model: _name: data2vec_audio extractor_mode: layer_norm encoder_layerdrop: 0.05 dropout_input: 0.0 dropout_features: 0.0 feature_grad_mult: 1.0 encoder_embed_dim: 768 mask_prob: 0.65 mask_length: 10 loss_beta: 0 loss_scale: null instance_norm_target_layer: true average_top_k_layers: 8 pos_conv_depth: 5 conv_pos: 95 ema_decay: 0.999 ema_end_decay: 0.9999 ema_anneal_end_step: 30000 ema_transformer_only: true ema_layers_only: true require_same_masks: true mask_dropout: 0 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_1.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - 
common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_1_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - 
checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - task.post_save_script - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_3.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - 
task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_4.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_4_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - 
model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - task.post_save_script - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_6_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 6 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/audio/pretraining/run_config/slurm_8_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - 
distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 8 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/text/pretraining/base.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: no_epoch_checkpoints: true save_interval_updates: 50000 keep_interval_updates: 1 distributed_training: distributed_world_size: 16 ddp_backend: legacy_ddp task: _name: masked_lm data: ??? 
sample_break_mode: complete_doc tokens_per_sample: 512 include_target_tokens: true random_token_prob: 0 leave_unmasked_prob: 0 mask_prob: 0.35 mask_multiple_length: 4 criterion: model dataset: max_tokens: 8192 ignore_unused_valid_subsets: true skip_invalid_size_inputs_valid_test: true optimizer: _name: adam weight_decay: 0.01 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: cosine warmup_updates: 10000 optimization: clip_norm: 5 lr: [0.0002] max_update: 1000000 update_freq: [1] model: _name: data2vec_text head_layers: 2 average_top_k_layers: 10 layer_norm_target_layer: true loss_scale: 1 ema_decay: 0.999 ema_end_decay: 0.9999 ema_anneal_end_step: 300000 loss_beta: 4 ema_transformer_layers_only: true transformer: dropout: 0.1 attention_dropout: 0.1 layernorm_embedding: true activation_fn: gelu no_scale_embedding: true max_source_positions: 512 encoder: embed_dim: 768 ffn_embed_dim: 3072 layers: 12 attention_heads: 12 normalize_before: false learned_pos: true layerdrop: 0 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_1_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: '_' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: 
dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir}/submitit timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec max_num_timeout: 30 exclude: a100-st-p4d24xlarge-471 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: '_' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - 
checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir}/submitit timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec max_num_timeout: 30 exclude: a100-st-p4d24xlarge-471 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_3.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_4.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - 
checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_4_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: '_' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir}/submitit timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec max_num_timeout: 30 exclude: a100-st-p4d24xlarge-471 distributed_training: distributed_world_size: 32 ddp_backend: legacy_ddp ================================================ FILE: examples/data2vec/config/text/pretraining/run_config/slurm_8_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: '_' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - 
distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir}/submitit timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 8 name: pt partition: wav2vec max_num_timeout: 30 exclude: a100-st-p4d24xlarge-471 distributed_training: distributed_world_size: 64 ddp_backend: legacy_ddp ================================================ FILE: examples/data2vec/config/v2/base_audio_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: false user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: /private/home/abaevski/data/librispeech/full max_sample_size: 320000 min_sample_size: 32000 normalize: true precompute_mask_config: {} dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 8 ddp_backend: legacy_ddp criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 400000 lr: [0.00075] debug_param_names: true optimizer: _name: adam adam_betas: [ 0.9,0.98 ] adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 8000 model: _name: data2vec_multi loss_beta: 0 loss_scale: null depth: 12 embed_dim: 768 
clone_batch: 8 ema_decay: 0.999 ema_end_decay: 0.99999 ema_anneal_end_step: 75000 ema_encoder_only: false average_top_k_layers: 8 instance_norm_target_layer: true layer_norm_target_layer: false layer_norm_targets: false layerdrop: 0.05 norm_eps: 1e-5 supported_modality: AUDIO modalities: audio: feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' conv_pos_depth: 5 conv_pos_width: 95 conv_pos_groups: 16 prenet_depth: 0 mask_prob: 0.5 mask_prob_adjust: 0.05 inverse_mask: false mask_length: 5 mask_noise_std: 0.01 mask_dropout: 0 add_masks: false ema_local_encoder: false use_alibi_encoder: true prenet_layerdrop: 0.05 prenet_dropout: 0.1 learned_alibi_scale: true learned_alibi_scale_per_head: true decoder: input_dropout: 0.1 decoder_dim: 384 decoder_groups: 16 decoder_kernel: 7 decoder_layers: 4 ================================================ FILE: examples/data2vec/config/v2/base_images_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: mae_image_pretraining data: /datasets01/imagenet_full_size/061417/ rebuild_batches: true local_cache_path: /scratch/cache_abaevski/imagenet key: source precompute_mask_config: {} dataset: num_workers: 10 batch_size: 16 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 375300 lr: [ 0.001 ] debug_param_names: true clip_norm: 4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 1e-3 optimizer: _name: adam adam_betas: 
[0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 50040 lr_scheduler: pass_through model: _name: data2vec_multi ema_decay: 0.9998 ema_end_decay: 0.99999 ema_anneal_end_step: 100000 instance_norm_target_layer: true layer_norm_target_layer: false layer_norm_targets: true end_of_block_targets: false depth: 10 average_top_k_layers: 10 clone_batch: 16 norm_eps: 1e-6 min_target_var: 0 min_pred_var: 0 encoder_dropout: 0 post_mlp_drop: 0 attention_dropout: 0 activation_dropout: 0 supported_modality: IMAGE cls_loss: 0.01 ema_encoder_only: false modalities: image: inverse_mask: true mask_prob: 0.8 mask_prob_adjust: 0.07 mask_length: 3 mask_noise_std: 0.01 prenet_depth: 2 ema_local_encoder: true num_extra_tokens: 1 init_extra_token_zero: false use_alibi_encoder: false decoder: decoder_dim: 768 decoder_groups: 16 decoder_kernel: 3 decoder_layers: 6 input_dropout: 0 ================================================ FILE: examples/data2vec/config/v2/base_text_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: no_epoch_checkpoints: true save_interval_updates: 50000 keep_interval_updates: 1 distributed_training: distributed_world_size: 16 ddp_backend: legacy_ddp task: _name: masked_lm data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin sample_break_mode: none tokens_per_sample: 512 include_target_tokens: true random_token_prob: 0 leave_unmasked_prob: 0 include_index: True skip_masking: True d2v2_multi: True criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct dataset: batch_size: 4 ignore_unused_valid_subsets: true skip_invalid_size_inputs_valid_test: true disable_validation: true optimization: clip_norm: 1 lr: [0.0002] max_update: 1000000 update_freq: [1] optimizer: _name: composite dynamic_groups: true 
groups: default: lr_float: 0.0002 optimizer: _name: adam adam_betas: [0.9,0.98] adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 4000 lr_scheduler: pass_through model: _name: data2vec_multi loss_beta: 0 loss_scale: 1 depth: 12 embed_dim: 768 clone_batch: 8 ema_decay: 0.9999 ema_end_decay: 0.99999 ema_anneal_end_step: 100000 ema_encoder_only: true average_top_k_layers: 12 layer_norm_target_layer: false instance_norm_target_layer: true batch_norm_target_layer: false instance_norm_targets: false layer_norm_targets: false layerdrop: 0 norm_eps: 1e-5 supported_modality: TEXT modalities: text: mask_prob: 0.48 mask_length: 1 mask_noise_std: 0.01 prenet_depth: 0 decoder: input_dropout: 0.1 decoder_dim: 768 decoder_groups: 1 decoder_kernel: 9 decoder_layers: 5 decoder_residual: false projection_layers: 2 projection_ratio: 2.0 ================================================ FILE: examples/data2vec/config/v2/huge_images14_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: mae_image_pretraining data: /datasets01/imagenet_full_size/061417/ rebuild_batches: true local_cache_path: /scratch/cache_abaevski/imagenet key: source precompute_mask_config: {} dataset: num_workers: 10 batch_size: 8 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 32 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 500000 lr: [ 0.0004 ] debug_param_names: true clip_norm: 4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 4e-4 optimizer: 
_name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 50040 lr_scheduler: pass_through model: _name: data2vec_multi ema_decay: 0.9998 ema_end_decay: 1 ema_anneal_end_step: 300000 instance_norm_target_layer: true layer_norm_target_layer: false layer_norm_targets: true end_of_block_targets: false depth: 32 embed_dim: 1280 num_heads: 16 average_top_k_layers: 24 clone_batch: 16 norm_eps: 1e-6 min_target_var: 0 min_pred_var: 0 encoder_dropout: 0 post_mlp_drop: 0 attention_dropout: 0 activation_dropout: 0 supported_modality: IMAGE cls_loss: 0.01 ema_encoder_only: false modalities: image: patch_size: 14 inverse_mask: true mask_prob: 0.75 mask_prob_adjust: 0.1 mask_length: 3 mask_noise_std: 0.01 prenet_depth: 0 ema_local_encoder: true num_extra_tokens: 1 init_extra_token_zero: false use_alibi_encoder: false embed_dim: 1280 decoder: decoder_dim: 1024 decoder_groups: 16 decoder_kernel: 5 decoder_layers: 3 final_layer_norm: false input_dropout: 0 ================================================ FILE: examples/data2vec/config/v2/huge_images_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: mae_image_pretraining data: /datasets01/imagenet_full_size/061417/ rebuild_batches: true local_cache_path: /scratch/cache_abaevski/imagenet key: source precompute_mask_config: {} dataset: num_workers: 10 batch_size: 8 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 375300 lr: [ 0.0004 ] 
debug_param_names: true clip_norm: 4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 4e-4 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 50040 lr_scheduler: pass_through model: _name: data2vec_multi ema_decay: 0.9998 ema_end_decay: 0.99995 ema_anneal_end_step: 150000 instance_norm_target_layer: true layer_norm_target_layer: false layer_norm_targets: true end_of_block_targets: false depth: 32 embed_dim: 1280 num_heads: 16 average_top_k_layers: 24 clone_batch: 16 norm_eps: 1e-6 min_target_var: 0 min_pred_var: 0 encoder_dropout: 0 post_mlp_drop: 0 attention_dropout: 0 activation_dropout: 0 supported_modality: IMAGE cls_loss: 0.01 ema_encoder_only: false modalities: image: inverse_mask: true mask_prob: 0.75 mask_prob_adjust: 0.1 mask_length: 3 mask_noise_std: 0.01 prenet_depth: 0 ema_local_encoder: true num_extra_tokens: 1 init_extra_token_zero: false use_alibi_encoder: false embed_dim: 1280 decoder: decoder_dim: 1024 decoder_groups: 16 decoder_kernel: 5 decoder_layers: 3 input_dropout: 0 ================================================ FILE: examples/data2vec/config/v2/large_audio_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: /fsx-wav2vec/abaevski/data/librivox/no_silence max_sample_size: 320000 min_sample_size: 32000 normalize: true precompute_mask_config: {} dataset: num_workers: 8 max_tokens: 320000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 48 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - 
target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 600000 debug_param_names: true clip_norm: 1 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 0.0004 optimizer: _name: adam adam_betas: [0.9,0.98] adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 10000 lr_scheduler: pass_through model: _name: data2vec_multi loss_beta: 0 loss_scale: null depth: 16 embed_dim: 1024 num_heads: 16 clone_batch: 12 ema_decay: 0.9997 ema_end_decay: 1 ema_anneal_end_step: 300000 ema_encoder_only: false average_top_k_layers: 16 instance_norm_target_layer: true layer_norm_target_layer: false layer_norm_targets: false layerdrop: 0 norm_eps: 1e-5 supported_modality: AUDIO modalities: audio: feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]' conv_pos_depth: 5 conv_pos_width: 95 conv_pos_groups: 16 prenet_depth: 8 mask_prob: 0.55 mask_prob_adjust: 0.1 inverse_mask: false mask_length: 5 mask_noise_std: 0.01 mask_dropout: 0 add_masks: false ema_local_encoder: false use_alibi_encoder: true prenet_layerdrop: 0 prenet_dropout: 0.1 learned_alibi_scale: true learned_alibi_scale_per_head: true decoder: input_dropout: 0.1 decoder_dim: 768 decoder_groups: 16 decoder_kernel: 7 decoder_layers: 4 ================================================ FILE: examples/data2vec/config/v2/large_images_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: mae_image_pretraining data: /datasets01/imagenet_full_size/061417/ rebuild_batches: true local_cache_path: /scratch/cache_abaevski/imagenet key: source precompute_mask_config: {} dataset: num_workers: 10 batch_size: 8 
skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 375300 lr: [ 0.0004 ] debug_param_names: true clip_norm: 4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 4e-4 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 50040 lr_scheduler: pass_through model: _name: data2vec_multi ema_decay: 0.9998 ema_end_decay: 0.99999 ema_anneal_end_step: 150000 instance_norm_target_layer: true layer_norm_target_layer: false layer_norm_targets: true end_of_block_targets: false depth: 24 embed_dim: 1024 num_heads: 16 average_top_k_layers: 18 clone_batch: 16 norm_eps: 1e-6 min_target_var: 0 min_pred_var: 0 encoder_dropout: 0 post_mlp_drop: 0 attention_dropout: 0 activation_dropout: 0 supported_modality: IMAGE cls_loss: 0.01 ema_encoder_only: false modalities: image: inverse_mask: true mask_prob: 0.75 mask_prob_adjust: 0.1 mask_length: 3 mask_noise_std: 0.01 prenet_depth: 0 ema_local_encoder: true num_extra_tokens: 1 init_extra_token_zero: false use_alibi_encoder: false embed_dim: 1024 decoder: decoder_dim: 1024 decoder_groups: 16 decoder_kernel: 5 decoder_layers: 3 input_dropout: 0 ================================================ FILE: examples/data2vec/config/v2/large_text_only_task.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb min_loss_scale: 1e-6 fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: save_interval_updates: 50000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: masked_lm data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin sample_break_mode: none tokens_per_sample: 512 
include_target_tokens: true random_token_prob: 0 leave_unmasked_prob: 0 include_index: True skip_masking: True d2v2_multi: True dataset: batch_size: 2 ignore_unused_valid_subsets: true skip_invalid_size_inputs_valid_test: true disable_validation: true distributed_training: distributed_world_size: 32 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct optimization: max_update: 600000 clip_norm: 1 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 0.0001 optimizer: _name: adam adam_betas: [0.9,0.98] adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 4000 lr_scheduler: pass_through model: _name: data2vec_multi loss_beta: 0 loss_scale: 1 depth: 24 num_heads: 16 embed_dim: 1024 clone_batch: 8 ema_decay: 0.9999 ema_end_decay: 0.99999 ema_anneal_end_step: 100000 ema_encoder_only: true average_top_k_layers: 24 layer_norm_target_layer: true instance_norm_target_layer: false batch_norm_target_layer: false instance_norm_targets: true layer_norm_targets: false layerdrop: 0 norm_eps: 1e-5 supported_modality: TEXT modalities: text: mask_prob: 0.5 mask_length: 1 mask_noise_std: 0.01 prenet_depth: 0 decoder: input_dropout: 0.1 decoder_dim: 768 decoder_groups: 1 decoder_kernel: 9 decoder_layers: 5 decoder_residual: false projection_layers: 2 projection_ratio: 2.0 ================================================ FILE: examples/data2vec/config/v2/large_text_only_task_pgrp_1M.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb fp16_no_flatten_grads: true user_dir: ${env:PWD}/examples/data2vec checkpoint: no_epoch_checkpoints: true save_interval_updates: 50000 keep_interval_updates: 1 distributed_training: distributed_world_size: 32 ddp_backend: legacy_ddp task: _name: masked_lm data: /fsx-wav2vec/abaevski/data/nlp/bookwiki_aml-full-mmap2-bin sample_break_mode: none 
tokens_per_sample: 512 include_target_tokens: true random_token_prob: 0 leave_unmasked_prob: 0 include_index: True skip_masking: True d2v2_multi: True criterion: _name: model log_keys: - ema_decay - target_var - pred_var - model_norm - ema_norm - masked_pct dataset: batch_size: 2 ignore_unused_valid_subsets: true skip_invalid_size_inputs_valid_test: true disable_validation: true optimization: clip_norm: 1 lr: [3e-4] max_update: 1000000 update_freq: [1] optimizer: _name: composite groups: default: lr_float: 1e-4 optimizer: _name: adam adam_betas: [0.9,0.98] adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 4000 decoder: lr_float: 1e-4 optimizer: _name: adam adam_betas: [0.9,0.98] adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 4000 lr_scheduler: pass_through model: _name: data2vec_multi loss_beta: 4 loss_scale: 1 depth: 24 num_heads: 16 embed_dim: 1024 clone_batch: 8 ema_decay: 0.9999 ema_end_decay: 0.99999 ema_anneal_end_step: 100000 ema_encoder_only: true average_top_k_layers: 24 layer_norm_target_layer: true instance_norm_target_layer: false batch_norm_target_layer: false instance_norm_targets: true layer_norm_targets: false layerdrop: 0 norm_eps: 1e-5 supported_modality: TEXT decoder_group: true modalities: text: mask_prob: 0.5 mask_length: 1 mask_noise_std: 0.01 prenet_depth: 0 decoder: input_dropout: 0.1 decoder_dim: 768 decoder_groups: 1 decoder_kernel: 9 decoder_layers: 5 decoder_residual: false projection_layers: 2 projection_ratio: 2.0 ================================================ FILE: examples/data2vec/config/v2/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_1.yaml 
================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_1_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.local_cache_path - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: 
examples/data2vec/config/v2/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.local_cache_path - task.data - task.post_save_script - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir - model.model_path sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 12 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec 
max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_3.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_4.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: 
volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_4_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - task.post_save_script - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 12 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_6_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 12 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 6 name: ${env:PREFIX}_${hydra.job.config_name} partition: 
wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_8.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 8 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/run_config/slurm_8_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 12 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 8 name: 
${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/v2/text_finetuning/cola.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: mcc maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction report_mcc: True dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 320 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 5336 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? ================================================ FILE: examples/data2vec/config/v2/text_finetuning/mnli.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? 
init_token: 0 separator_token: 2 num_classes: 3 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 valid_subset: valid,valid1 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 7432 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 123873 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? ================================================ FILE: examples/data2vec/config/v2/text_finetuning/mrpc.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: acc_and_f1 maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction report_acc_and_f1: True dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 137 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 2296 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? 
================================================ FILE: examples/data2vec/config/v2/text_finetuning/qnli.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 1986 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 33112 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? ================================================ FILE: examples/data2vec/config/v2/text_finetuning/qqp.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? 
init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: acc_and_f1 maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction report_acc_and_f1: True dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 28318 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 113272 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? ================================================ FILE: examples/data2vec/config/v2/text_finetuning/rte.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 122 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 2036 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? 
================================================ FILE: examples/data2vec/config/v2/text_finetuning/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/data2vec/config/v2/text_finetuning/sst_2.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 1256 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 20935 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? ================================================ FILE: examples/data2vec/config/v2/text_finetuning/sts_b.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 user_dir: ${env:PWD}/examples/data2vec task: _name: sentence_prediction data: ??? 
init_token: 0 separator_token: 2 num_classes: 1 max_positions: 512 d2v2_multi: True checkpoint: best_checkpoint_metric: pearson_and_spearman maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 criterion: _name: sentence_prediction regression_target: true report_pearson_and_spearman: True dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 num_workers: 1 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 214 optimization: clip_norm: 0.0 lr: [4e-05] max_update: 3598 max_epoch: 10 model: _name: data2vec_text_classification model_path: ??? ================================================ FILE: examples/data2vec/config/vision/finetuning/imagenet.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true task: _name: image_classification data: /datasets01/imagenet_full_size/061417 dataset: num_workers: 6 batch_size: 64 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 valid_subset: val distributed_training: distributed_world_size: 8 ddp_backend: c10d criterion: _name: model log_keys: - correct optimization: max_update: 100000 lr: [0.0005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 10000 model: _name: data2vec_image_classification model_path: ???
================================================ FILE: examples/data2vec/config/vision/finetuning/mae_imagenet_clean.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb fp16_no_flatten_grads: true checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true task: _name: mae_image_classification data: /datasets01/imagenet_full_size/061417 dataset: num_workers: 6 batch_size: 32 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 2 valid_subset: val distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - correct optimization: max_update: 250200 lr: [0.001] optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 0.001 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 16000 min_lr: 1e-6 lr_scheduler: pass_through model: _name: mae_image_classification mixup: 0.7 mixup_prob: 0.9 model_path: ??? 
================================================ FILE: examples/data2vec/config/vision/finetuning/mae_imagenet_huge_clean.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb fp16_no_flatten_grads: true checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true task: _name: mae_image_classification data: /datasets01/imagenet_full_size/061417 dataset: num_workers: 6 batch_size: 32 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 2 valid_subset: val distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - correct optimization: max_update: 125200 lr: [0.0005] clip_norm: 4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 0.0005 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 16000 min_lr: 1e-20 lr_scheduler: pass_through model: _name: mae_image_classification mixup: 0.7 mixup_prob: 0.9 layer_decay: 0.75 drop_path_rate: 0.2 model_path: ??? 
================================================ FILE: examples/data2vec/config/vision/finetuning/mae_imagenet_large_clean.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb fp16_no_flatten_grads: true checkpoint: save_interval: 1 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true task: _name: mae_image_classification data: /datasets01/imagenet_full_size/061417 dataset: num_workers: 6 batch_size: 32 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 2 valid_subset: val distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - correct optimization: max_update: 125200 lr: [0.0005] clip_norm: 4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 0.0005 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 16000 min_lr: 1e-7 lr_scheduler: pass_through model: _name: mae_image_classification mixup: 0.7 mixup_prob: 0.9 layer_decay: 0.75 drop_path_rate: 0.2 model_path: ??? 
================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_1.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_1_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - 
common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir - task.local_cache_path sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - 
checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir - task.local_cache_path - model.model_path sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_3.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_4.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - 
model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_4_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_6_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size 
- model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 6 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/finetuning/run_config/slurm_8_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 8 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/base_imagenet.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 
no_epoch_checkpoints: true task: _name: image_pretraining data: /datasets01/imagenet_full_size/061417/ dataset: num_workers: 6 batch_size: 64 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model log_keys: - ema_decay - target_var - pred_var optimization: max_update: 400000 lr: [0.0005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: cosine warmup_updates: 10000 model: _name: data2vec_vision ================================================ FILE: examples/data2vec/config/vision/pretraining/base_imagenet_d2v1.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: image_pretraining data: /datasets01/imagenet_full_size/061417 dataset: num_workers: 6 batch_size: 128 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 2 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: legacy_ddp criterion: _name: model log_keys: - ema_decay - target_var - pred_var optimization: max_update: 375300 #300*1251 lr: [0.0005] clip_norm: 3.0 optimizer: _name: adam adam_betas: (0.9,0.999) adam_eps: 1e-08 weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 12510 # it should be 10 epochs model: _name: data2vec_vision attention_dropout: 0.05 ema_decay: 0.999 ema_end_decay: 0.9998 layer_norm_targets: True average_top_k_layers: 6 loss_beta: 2.0 drop_path: 0.25 ================================================ FILE: examples/data2vec/config/vision/pretraining/base_mae_imagenet.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb 
fp16_no_flatten_grads: true checkpoint: save_interval: 5 save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: mae_image_pretraining data: /datasets01/imagenet_full_size/061417/ rebuild_batches: true dataset: num_workers: 6 batch_size: 64 skip_invalid_size_inputs_valid_test: true required_batch_size_multiple: 1 disable_validation: true distributed_training: distributed_world_size: 16 ddp_backend: c10d criterion: _name: model optimization: max_update: 375300 lr: [0.0006] optimizer: _name: composite groups: with_decay: lr_float: 6e-4 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0.05 lr_scheduler: _name: cosine warmup_updates: 50040 no_decay: lr_float: 6e-4 optimizer: _name: adam adam_betas: [0.9,0.95] weight_decay: 0 lr_scheduler: _name: cosine warmup_updates: 50040 lr_scheduler: pass_through model: _name: mae ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_1.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: 
/checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_1_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - 
common.log_interval - common.user_dir - task.local_cache_path sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir - task.local_cache_path sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_3.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - 
checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_4.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_4_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - 
next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_6_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 6 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/config/vision/pretraining/run_config/slurm_8_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - 
model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 8 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/data2vec/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .image_dataset import ImageDataset from .path_dataset import PathDataset from .mae_image_dataset import MaeImageDataset from .mae_finetuning_image_dataset import MaeFinetuningImageDataset __all__ = [ "ImageDataset", "MaeImageDataset", "MaeFinetuningImageDataset", "PathDataset", ] ================================================ FILE: examples/data2vec/data/add_class_target_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class AddClassTargetDataset(BaseWrapperDataset):
    """Wrap a dataset and attach a classification target to every item.

    Each item returned by the wrapped dataset gets a ``"label"`` entry:

    * ``multi_class=True``: a float vector of length ``num_classes`` with
      ``1.0`` at the position of every label of the item.
    * ``multi_class=False``: ``torch.tensor`` of the item's single label.

    When ``label_indices`` is given, raw labels are first translated through
    it (``label_indices[raw_label]``) before being used.
    """

    def __init__(
        self,
        dataset,
        labels,
        multi_class,
        num_classes=None,
        label_indices=None,
        add_to_input=True,
    ):
        super().__init__(dataset)

        self.labels = labels
        self.label_indices = label_indices
        self.multi_class = multi_class
        self.add_to_input = add_to_input

        # In multi-class mode the size of the target vector defaults to the
        # number of entries in the label mapping, which must then be present.
        if multi_class and num_classes is None:
            assert self.label_indices is not None
            num_classes = len(self.label_indices)
        self.num_classes = num_classes

    def __getitem__(self, index):
        """Return the wrapped item with its ``"label"`` entry filled in."""
        item = self.dataset[index]
        raw = self.labels[index]
        mapping = self.label_indices

        if not self.multi_class:
            item["label"] = torch.tensor(raw if mapping is None else mapping[raw])
            return item

        multi_hot = torch.zeros(self.num_classes)
        item["label"] = multi_hot
        for lbl in raw:
            position = lbl if mapping is None else mapping[lbl]
            multi_hot[position] = 1.0
        return item

    def collater(self, samples):
        """Collate with the wrapped dataset, then stack the matching labels.

        Only samples whose ``"id"`` appears in the collated ``"id"`` tensor
        contribute a label. With ``add_to_input`` set, the stacked labels are
        also stored under ``net_input["label"]``.
        """
        batch = self.dataset.collater(samples)
        if len(batch) == 0:
            return batch

        kept_ids = set(batch["id"].tolist())
        labels = [sample["label"] for sample in samples if sample["id"] in kept_ids]
        batch["label"] = torch.stack(labels, dim=0)

        if self.add_to_input:
            batch["net_input"]["label"] = batch["label"]
        return batch
class ImageDataset(FairseqDataset, VisionDataset):
    """Dataset of image files discovered by walking a directory tree.

    Files are kept when their lower-cased extension, as returned by
    ``os.path.splitext`` (i.e. including the leading dot), is in
    ``extensions``. With ``load_classes`` set, every immediate subdirectory
    of ``root`` is a class; classes are numbered in sorted-name order and
    each file gets the index of the class directory it was found under.
    """

    def __init__(
        self,
        root: str,
        extensions: Set[str],
        load_classes: bool,
        transform: Optional[Callable] = None,
        shuffle=True,
    ):
        FairseqDataset.__init__(self)
        VisionDataset.__init__(self, root=root, transform=transform)

        self.shuffle = shuffle
        # Used by __getitem__ when no transform was supplied.
        self.tensor_transform = ToTensor()

        self.classes = None
        self.labels = None
        if load_classes:
            class_names = sorted(
                entry.name for entry in os.scandir(root) if entry.is_dir()
            )
            self.classes = {name: idx for idx, name in enumerate(class_names)}
            logger.info(f"loaded {len(self.classes)} classes")
            self.labels = []

        def iter_images(top):
            # Deterministic traversal: directories and file names are sorted.
            for dirpath, _, filenames in sorted(os.walk(top, followlinks=True)):
                for filename in sorted(filenames):
                    if os.path.splitext(filename)[-1].lower() in extensions:
                        yield os.path.join(dirpath, filename)

        logger.info(f"finding images in {root}")
        if self.classes is None:
            self.files = list(iter_images(root))
        else:
            self.files = []
            self.labels = []
            for class_name, class_idx in self.classes.items():
                class_files = list(iter_images(os.path.join(root, class_name)))
                self.files.extend(class_files)
                self.labels.extend([class_idx] * len(class_files))

        logger.info(f"loaded {len(self.files)} examples")

    def __getitem__(self, index):
        """Load image ``index`` as RGB and return it with its id (and label)."""
        from PIL import Image

        with open(self.files[index], "rb") as handle:
            img = Image.open(handle).convert("RGB")

        if self.transform is not None:
            img = self.transform(img)
            # A user-supplied transform must end by producing a tensor.
            assert torch.is_tensor(img)
        else:
            img = self.tensor_transform(img)

        sample = {"id": index, "img": img}
        if self.labels is not None:
            sample["label"] = self.labels[index]
        return sample

    def __len__(self):
        return len(self.files)

    def collater(self, samples):
        """Stack images into one batch; labels, if present, go to ``net_input``."""
        if len(samples) == 0:
            return {}

        net_input = {"img": torch.stack([s["img"] for s in samples], dim=0)}
        batch = {
            "id": torch.LongTensor([s["id"] for s in samples]),
            "net_input": net_input,
        }

        if "label" in samples[0]:
            net_input["label"] = torch.LongTensor([s["label"] for s in samples])

        return batch

    def num_tokens(self, index):
        return 1

    def size(self, index):
        return 1

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        if self.shuffle:
            return np.random.permutation(len(self))
        return np.arange(len(self))
class MaeFinetuningImageDataset(FairseqDataset):
    """Labelled image-folder dataset for fine-tuning.

    Images are read from ``os.path.join(root, split)`` through
    ``datasets.ImageFolder``. The transform comes from ``build_transform``
    and the file loader from ``caching_loader``, which returns the default
    torchvision loader unchanged when ``local_cache_path`` is ``None``.
    """

    def __init__(
        self,
        root: str,
        split: str,
        is_train: bool,
        input_size,
        color_jitter=None,
        aa="rand-m9-mstd0.5-inc1",
        reprob=0.25,
        remode="pixel",
        recount=1,
        local_cache_path=None,
        shuffle=True,
    ):
        FairseqDataset.__init__(self)

        self.shuffle = shuffle

        transform = build_transform(
            is_train=is_train,
            input_size=input_size,
            color_jitter=color_jitter,
            aa=aa,
            reprob=reprob,
            remode=remode,
            recount=recount,
        )
        image_loader = caching_loader(
            local_cache_path, datasets.folder.default_loader
        )

        self.dataset = datasets.ImageFolder(
            os.path.join(root, split), loader=image_loader, transform=transform
        )

        logger.info(f"loaded {len(self.dataset)} examples")

    def __getitem__(self, index):
        """Return the transformed image and its class index for ``index``."""
        image, target = self.dataset[index]
        return {"id": index, "img": image, "label": target}

    def __len__(self):
        return len(self.dataset)

    def collater(self, samples):
        """Stack images under ``net_input["imgs"]``; labels under ``"labels"``."""
        if len(samples) == 0:
            return {}

        net_input = {"imgs": torch.stack([s["img"] for s in samples], dim=0)}
        batch = {
            "id": torch.LongTensor([s["id"] for s in samples]),
            "net_input": net_input,
        }

        if "label" in samples[0]:
            net_input["labels"] = torch.LongTensor([s["label"] for s in samples])

        return batch

    def num_tokens(self, index):
        return 1

    def size(self, index):
        return 1

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        if self.shuffle:
            return np.random.permutation(len(self))
        return np.arange(len(self))
def _cached_path(cache_root, path):
    """Return the location of ``path`` mirrored underneath ``cache_root``."""
    # Strip the leading separator so an absolute ``path`` nests under the
    # cache directory, and join with a separator so a cache root without a
    # trailing slash (the "/scratch/<n>" fallback) is not glued onto the
    # first component of a relative ``path``.
    return os.path.join(cache_root, path.lstrip(os.sep))


def load(path, loader, cache):
    """Load ``path`` through a local file cache, falling back to the source.

    The file is copied to its mirrored location under ``cache`` (unless it is
    already there) and ``loader`` is called on the copy. Up to three attempts
    are made, two seconds apart:

    * attempt 1 reuses an existing cached copy if there is one;
    * attempt 2 always re-copies the file;
    * the final attempt bypasses the cache and loads ``path`` directly.

    If an error message contains "Errno 13", the cache root is switched to a
    randomly named directory under ``/scratch`` and remembered on
    ``caching_loader.cache_root``, which overrides ``cache`` on later calls.

    Args:
        path: source file path.
        loader: callable taking a file path and returning the loaded object.
        cache: cache directory that mirrors the source paths.

    Returns:
        Whatever ``loader`` returns.

    Raises:
        Exception: re-raises the error of the final attempt.
    """
    # A previous permission failure may have redirected the cache.
    if hasattr(caching_loader, "cache_root"):
        cache = caching_loader.cache_root

    cached_path = _cached_path(cache, path)

    num_tries = 3
    last_try = num_tries - 1
    for curr_try in range(num_tries):
        try:
            if curr_try == last_try:
                # Last resort: read straight from the original location.
                return loader(path)
            if not os.path.exists(cached_path) or curr_try > 0:
                os.makedirs(os.path.dirname(cached_path), exist_ok=True)
                copyfile(path, cached_path)
                os.chmod(cached_path, 0o777)
            return loader(cached_path)
        except Exception as e:
            logger.warning(str(e))
            if "Errno 13" in str(e):
                caching_loader.cache_root = f"/scratch/{random.randint(0, 69420)}"
                logger.warning(f"setting cache root to {caching_loader.cache_root}")
                cached_path = _cached_path(caching_loader.cache_root, path)
            if curr_try == last_try:
                raise
            time.sleep(2)


def caching_loader(cache_root: str, loader):
    """Wrap ``loader`` so that files are read through a cache directory.

    Args:
        cache_root: cache directory; ``None`` disables caching and the
            special value ``"slurm_tmpdir"`` uses ``$SLURM_TMPDIR``.
        loader: callable taking a file path and returning the loaded object.

    Returns:
        ``loader`` itself when ``cache_root`` is ``None``, otherwise a
        callable taking the source path and delegating to :func:`load`.

    Raises:
        KeyError: ``"slurm_tmpdir"`` was requested but the variable is unset.
    """
    if cache_root is None:
        return loader

    if cache_root == "slurm_tmpdir":
        cache_root = os.environ["SLURM_TMPDIR"]
        assert len(cache_root) > 0

    if not cache_root.endswith("/"):
        cache_root += "/"

    return partial(load, loader=loader, cache=cache_root)


class RandomResizedCropAndInterpolationWithTwoPic:
    """Crop the given PIL Image to random size and aspect ratio with random interpolation.

    A crop of random size (default: of 0.08 to 1.0) of the original size and a
    random aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio
    is made. This crop is finally resized to given size. This is popularly
    used to train the Inception networks.

    Args:
        size: expected output size; an int is expanded to ``(size, size)``
        second_size: optional size of a second output produced from the same
            crop; when given, calling the transform returns a pair of images
        scale: range of size of the origin size cropped
        ratio: range of aspect ratio of the origin aspect ratio cropped
        interpolation: ``"bicubic"``, ``"lanczos"`` or ``"hamming"``; any
            other name selects bilinear. ``"random"`` picks bilinear or
            bicubic at random on every call.
        second_interpolation: interpolation name for the second output, or
            ``None``
    """

    def __init__(
        self,
        size,
        second_size=None,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4.0, 4.0 / 3.0),
        interpolation="bilinear",
        second_interpolation="lanczos",
    ):
        if isinstance(size, tuple):
            self.size = size
        else:
            self.size = (size, size)

        if second_size is not None:
            if isinstance(second_size, tuple):
                self.second_size = second_size
            else:
                self.second_size = (second_size, second_size)
        else:
            self.second_size = None

        if (scale[0] > scale[1]) or (ratio[0] > ratio[1]):
            logger.warning("range should be of kind (min, max)")

        if interpolation == "random":
            from PIL import Image

            # __call__ chooses one of these for every image.
            self.interpolation = (Image.BILINEAR, Image.BICUBIC)
        else:
            self.interpolation = self._pil_interp(interpolation)

        self.second_interpolation = (
            self._pil_interp(second_interpolation)
            if second_interpolation is not None
            else None
        )
        self.scale = scale
        self.ratio = ratio

    def _pil_interp(self, method):
        """Translate an interpolation name into the matching PIL constant."""
        from PIL import Image

        if method == "bicubic":
            return Image.BICUBIC
        elif method == "lanczos":
            return Image.LANCZOS
        elif method == "hamming":
            return Image.HAMMING
        else:
            # default bilinear, do we want to allow nearest?
            return Image.BILINEAR

    @staticmethod
    def get_params(img, scale, ratio):
        """Get parameters for ``crop`` for a random sized crop.

        Up to ten random (area, aspect ratio) candidates are drawn and the
        first one that fits inside the image is used. If none fits, a centred
        crop is returned whose aspect ratio is clamped into ``ratio``.

        Args:
            img (PIL Image): Image to be cropped; only ``img.size`` is read,
                with ``size[0]`` bounding ``w`` and ``size[1]`` bounding ``h``.
            scale (tuple): range of size of the origin size cropped
            ratio (tuple): range of aspect ratio of the origin aspect ratio cropped

        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for a random
                sized crop.
        """
        width, height = img.size[0], img.size[1]
        area = width * height
        # Aspect ratios are sampled uniformly in log space; the bounds do not
        # change between attempts.
        log_ratio = (math.log(ratio[0]), math.log(ratio[1]))

        for _ in range(10):
            target_area = random.uniform(*scale) * area
            aspect_ratio = math.exp(random.uniform(*log_ratio))

            w = int(round(math.sqrt(target_area * aspect_ratio)))
            h = int(round(math.sqrt(target_area / aspect_ratio)))

            if w <= width and h <= height:
                i = random.randint(0, height - h)
                j = random.randint(0, width - w)
                return i, j, h, w

        # Fallback to central crop
        in_ratio = width / height
        if in_ratio < min(ratio):
            w = width
            h = int(round(w / min(ratio)))
        elif in_ratio > max(ratio):
            h = height
            w = int(round(h * max(ratio)))
        else:  # whole image
            w = width
            h = height
        i = (height - h) // 2
        j = (width - w) // 2
        return i, j, h, w

    def __call__(self, img):
        """
        Args:
            img (PIL Image): Image to be cropped and resized.

        Returns:
            PIL Image: Randomly cropped and resized image. When
            ``second_size`` was given, a tuple of two images produced from
            the same crop: the first resized to ``size``, the second resized
            to ``second_size`` with ``second_interpolation``.
        """
        import torchvision.transforms.functional as F

        i, j, h, w = self.get_params(img, self.scale, self.ratio)
        if isinstance(self.interpolation, (tuple, list)):
            interpolation = random.choice(self.interpolation)
        else:
            interpolation = self.interpolation

        if self.second_size is None:
            return F.resized_crop(img, i, j, h, w, self.size, interpolation)
        else:
            return F.resized_crop(
                img, i, j, h, w, self.size, interpolation
            ), F.resized_crop(
                img, i, j, h, w, self.second_size, self.second_interpolation
            )
self.shuffle = shuffle self.key = key loader = caching_loader(local_cache_path, datasets.folder.default_loader) self.transform_source = None self.transform_target = None if target_transform: self.transform_source = transforms.ColorJitter(0.4, 0.4, 0.4) self.transform_target = transforms.ColorJitter(0.4, 0.4, 0.4) if no_transform: if input_size <= 224: crop_pct = 224 / 256 else: crop_pct = 1.0 size = int(input_size / crop_pct) self.transform_train = transforms.Compose( [ transforms.Resize(size, interpolation=3), transforms.CenterCrop(input_size), ] ) self.transform_train = transforms.Resize((input_size, input_size)) elif beit_transforms: beit_transform_list = [] if not target_transform: beit_transform_list.append(transforms.ColorJitter(0.4, 0.4, 0.4)) beit_transform_list.extend( [ transforms.RandomHorizontalFlip(p=0.5), RandomResizedCropAndInterpolationWithTwoPic( size=input_size, second_size=None, interpolation="bicubic", second_interpolation=None, ), ] ) self.transform_train = transforms.Compose(beit_transform_list) else: self.transform_train = transforms.Compose( [ transforms.RandomResizedCrop( input_size, scale=(0.2, 1.0), interpolation=3 ), # 3 is bicubic transforms.RandomHorizontalFlip(), ] ) self.final_transform = transforms.Compose( [ transforms.ToTensor(), transforms.Normalize( mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] ), ] ) if dataset_type == "imagefolder": self.dataset = datasets.ImageFolder( os.path.join(root, split), loader=loader ) elif dataset_type == "path": self.dataset = PathDataset( root, loader, None, None, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], ) else: raise Exception(f"invalid dataset type {dataset_type}") logger.info( f"initial transform: {self.transform_train}, " f"source transform: {self.transform_source}, " f"target transform: {self.transform_target}, " f"final transform: {self.final_transform}" ) logger.info(f"loaded {len(self.dataset)} examples") self.is_compute_mask = compute_mask self.patches = (input_size 
def __getitem__(self, index):
    """Load one image and build its sample dictionary.

    The image is read from ``self.dataset``, passed through
    ``self.transform_train`` and then through ``self.final_transform``.
    When ``self.transform_source`` / ``self.transform_target`` are set, each
    is applied to the same transformed image to produce the model input and
    a separate ``"target"`` entry.

    Returns:
        dict with ``"id"``, the image under ``self.key``, optionally
        ``"target"`` and, when ``self.is_compute_mask`` is true,
        ``"precomputed_mask"``.
    """
    raw_img, _ = self.dataset[index]
    img = self.transform_train(raw_img)

    # Source first, then target: both views start from the same `img`.
    source = (
        None
        if self.transform_source is None
        else self.final_transform(self.transform_source(img))
    )
    target = (
        None
        if self.transform_target is None
        else self.final_transform(self.transform_target(img))
    )
    if source is None:
        # No dedicated source view: the plain transformed image is the input.
        source = self.final_transform(img)

    sample = {"id": index, self.key: source}
    if target is not None:
        sample["target"] = target

    if not self.is_compute_mask:
        return sample

    # Arguments shared by the 1d and 2d mask generators. Note that
    # require_same_masks is passed as True here, independently of the
    # `require_same_masks` value stored on the dataset.
    shared_mask_args = dict(
        shape=(self.clone_batch, self.patches),
        mask_prob=self.mask_prob,
        mask_length=self.mask_length,
        mask_prob_adjust=self.mask_prob_adjust,
        inverse_mask=self.inverse_mask,
        require_same_masks=True,
    )
    if self.mask_length == 1:
        sample["precomputed_mask"] = compute_block_mask_1d(**shared_mask_args)
    else:
        sample["precomputed_mask"] = compute_block_mask_2d(
            **shared_mask_args,
            expand_adjcent=self.expand_adjacent,
            mask_dropout=self.mask_dropout,
            non_overlapping=self.non_overlapping,
        )
    return sample
def ordered_indices(self):
    """Return an ordered list of indices. Batches will be constructed based
    on this order.

    The result is a random permutation of ``range(len(self))`` when
    ``self.shuffle`` is true, otherwise the indices in ascending order.
    """
    count = len(self)
    return np.random.permutation(count) if self.shuffle else np.arange(count)
def update_checkpoint(model_dict, prefix, is_nested):
    """Rename the keys of ``model_dict`` in place and return it.

    Renaming rules, later rules overriding earlier ones for the same key:

    * exact keys ``cls_token`` / ``patch_embed`` / ``mask_token`` get fixed
      new names (with a ``model.`` lead for the first two when ``is_nested``);
    * keys starting with ``patch_embed.proj``, ``lm_head``, ``fc_norm`` or
      ``head`` have that text replaced (no ``prefix`` is added);
    * keys containing ``mlp.fc1`` / ``mlp.fc2`` have it replaced by
      ``mlp.0`` / ``mlp.2`` and are prepended with ``prefix``;
    * when ``prefix`` is non-empty, every remaining key is prepended with it.

    Args:
        model_dict: mapping of parameter names to values; mutated in place.
        prefix: string prepended to keys as described above.
        is_nested: whether the ``model.`` lead is used for the embedding keys.

    Returns:
        The same ``model_dict`` object.
    """
    lead = "model." if is_nested else ""
    renames = {
        "cls_token": lead + "cls_emb",
        "patch_embed": lead + "patch_embed",
        "mask_token": "mask_emb",
    }
    head_rules = (
        ("patch_embed.proj", lead + "patch_embed.conv"),
        ("lm_head", "final_proj"),
        ("fc_norm", "fc_norm"),
        ("head", "head"),
    )
    infix_rules = (
        ("mlp.fc1", "mlp.0"),
        ("mlp.fc2", "mlp.2"),
    )

    # Snapshot of the keys: the dict is only mutated in the last loop.
    original_keys = list(model_dict)

    for key in original_keys:
        for old, new in head_rules:
            if key.startswith(old):
                renames[key] = key.replace(old, new)
        for old, new in infix_rules:
            if old in key:
                renames[key] = prefix + key.replace(old, new)

    if prefix != "":
        for key in original_keys:
            # keys that already have a rule keep it; the rest get the prefix
            renames.setdefault(key, prefix + key)

    for key in original_keys:
        new_key = renames.get(key, key)
        if new_key != key:
            model_dict[new_key] = model_dict.pop(key)
    return model_dict
args.inception_norms: cfg.task.normalization_mean = [0.5, 0.5, 0.5] cfg.task.normalization_std = [0.5, 0.5, 0.5] cfg.model = Data2VecVisionConfig( _name="data2vec_vision", ) cfg = OmegaConf.create(cfg) state = { "cfg": OmegaConf.to_container(cfg, resolve=True, enum_to_str=True), "model": cp["model"], "best_loss": None, "optimizer": None, "extra_state": {}, } model = Data2VecVisionModel(cfg.model) model.load_state_dict( update_checkpoint(state["model"], prefix="encoder.", is_nested=False), strict=True, ) else: raise Exception("unsupported type " + args.type) print(state["cfg"], state.keys()) torch.save(state, args.output) if __name__ == "__main__": main() ================================================ FILE: examples/data2vec/models/__init__.py ================================================ ================================================ FILE: examples/data2vec/models/audio_classification.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@dataclass
class AudioClassificationConfig(FairseqDataclass):
    """Configuration for the ``audio_classification`` model.

    Most dropout / masking fields are forwarded as argument overrides when
    the pretrained checkpoint at ``model_path`` is loaded; the remaining
    fields configure the classification head, mixup and memory options.
    """

    model_path: str = field(
        default=MISSING, metadata={"help": "path to wav2vec 2.0 model"}
    )
    no_pretrained_weights: bool = field(
        default=False, metadata={"help": "if true, does not load pretrained weights"}
    )
    dropout_input: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the input (after feat extr)"},
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout after transformer and before final projection"},
    )
    dropout: float = field(
        default=0.0, metadata={"help": "dropout probability inside wav2vec 2.0 model"}
    )
    attention_dropout: float = field(
        default=0.0,
        metadata={
            "help": "dropout probability for attention weights inside wav2vec 2.0 model"
        },
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "dropout probability after activation in FFN inside wav2vec 2.0 model"
        },
    )

    # masking
    apply_mask: bool = field(
        default=False, metadata={"help": "apply masking during fine-tuning"}
    )
    mask_length: int = field(
        default=10, metadata={"help": "repeat the mask indices multiple times"}
    )
    mask_prob: float = field(
        default=0.5,
        metadata={
            "help": "probability of replacing a token with mask (normalized by length)"
        },
    )
    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose masks"}
    )
    mask_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    # The flag is named `no_*`: setting it disables overlap, so the help text
    # is phrased accordingly.
    no_mask_overlap: bool = field(
        default=False, metadata={"help": "whether to prevent masks from overlapping"}
    )
    mask_min_space: Optional[int] = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )
    require_same_masks: bool = field(
        default=True,
        metadata={
            "help": "whether the number of masked timesteps must be the same across all "
            "examples in a batch"
        },
    )
    mask_dropout: float = field(
        default=0.0,
        metadata={"help": "percent of masks to unmask for each sample"},
    )

    # channel masking
    mask_channel_length: int = field(
        default=10, metadata={"help": "length of the mask for features (channels)"}
    )
    mask_channel_prob: float = field(
        default=0.0, metadata={"help": "probability of replacing a feature with 0"}
    )
    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static",
        metadata={"help": "how to choose mask length for channel masking"},
    )
    mask_channel_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_channel_overlap: bool = field(
        default=False,
        metadata={"help": "whether to prevent channel masks from overlapping"},
    )
    freeze_finetune_updates: int = field(
        default=0, metadata={"help": "don't finetune wav2vec for this many updates"}
    )
    feature_grad_mult: float = field(
        default=0.0, metadata={"help": "reset feature grad mult in wav2vec 2.0 to this"}
    )
    layerdrop: float = field(
        default=0.0, metadata={"help": "probability of dropping a layer in wav2vec 2.0"}
    )
    mask_channel_min_space: Optional[int] = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )
    mask_channel_before: bool = False
    # interpolated from the task configuration
    normalize: bool = II("task.normalize")
    data: str = II("task.data")
    # this holds the loaded wav2vec args
    d2v_args: Any = None
    offload_activations: bool = field(
        default=False, metadata={"help": "offload_activations"}
    )
    min_params_to_wrap: int = field(
        default=int(1e8),
        metadata={
            "help": "minimum number of params for a layer to be wrapped with FSDP() when "
            "training with --ddp-backend=fully_sharded. Smaller values will "
            "improve memory efficiency, but may make torch.distributed "
            "communication less efficient due to smaller input sizes. This option "
            "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
            "--offload-activations are passed."
        },
    )
    checkpoint_activations: bool = field(
        default=False,
        metadata={"help": "recompute activations and save memory for extra compute"},
    )
    ddp_backend: str = II("distributed_training.ddp_backend")

    # classification head: how frame-level outputs become clip-level scores
    prediction_mode: str = "lin_softmax"
    # when set (non-empty), used instead of prediction_mode outside training
    eval_prediction_mode: Optional[str] = None
    conv_kernel: int = -1
    conv_stride: int = 1
    two_convs: bool = False
    extreme_factor: float = 1.0

    conv_feature_layers: Optional[str] = field(
        default=None,
        metadata={
            "help": "string describing convolutional feature extraction layers in form of a python list that contains "
            "[(dim, kernel_size, stride), ...]"
        },
    )

    # source mixup is active only when source_mixup >= 0 and mixup_prob > 0
    mixup_prob: float = 1.0
    source_mixup: float = -1
    same_mixup: bool = True
    label_mixup: bool = False

    gain_mode: str = "none"
cfg.mask_channel_length, "mask_channel_prob": cfg.mask_channel_prob, "mask_channel_before": cfg.mask_channel_before, "mask_channel_selection": cfg.mask_channel_selection, "mask_channel_other": cfg.mask_channel_other, "no_mask_channel_overlap": cfg.no_mask_channel_overlap, "encoder_layerdrop": cfg.layerdrop, "feature_grad_mult": cfg.feature_grad_mult, "checkpoint_activations": cfg.checkpoint_activations, "offload_activations": cfg.offload_activations, "min_params_to_wrap": cfg.min_params_to_wrap, "mixup": -1, } if cfg.conv_feature_layers is not None: arg_overrides["conv_feature_layers"] = cfg.conv_feature_layers if cfg.d2v_args is None: state = checkpoint_utils.load_checkpoint_to_cpu( cfg.model_path, arg_overrides ) d2v_args = state.get("cfg", None) if d2v_args is None: d2v_args = convert_namespace_to_omegaconf(state["args"]) d2v_args.criterion = None d2v_args.lr_scheduler = None cfg.d2v_args = d2v_args logger.info(d2v_args) else: state = None d2v_args = cfg.d2v_args model_normalized = d2v_args.task.get( "normalize", d2v_args.model.get("normalize", False) ) assert cfg.normalize == model_normalized, ( "Fine-tuning works best when data normalization is the same. 
" "Please check that --normalize is set or unset for both pre-training and here" ) if hasattr(cfg, "checkpoint_activations") and cfg.checkpoint_activations: with open_dict(d2v_args): d2v_args.model.checkpoint_activations = cfg.checkpoint_activations d2v_args.task.data = cfg.data task = tasks.setup_task(d2v_args.task) model = task.build_model(d2v_args.model, from_checkpoint=True) model.remove_pretraining_modules() if state is not None and not cfg.no_pretrained_weights: self.load_model_weights(state, model, cfg) d = d2v_args.model.encoder_embed_dim self.d2v_model = model self.final_dropout = nn.Dropout(cfg.final_dropout) self.freeze_finetune_updates = cfg.freeze_finetune_updates self.num_updates = 0 for p in self.parameters(): p.param_group = "pretrained" if cfg.prediction_mode == "proj_avg_proj": self.proj = nn.Linear(d, d * 2) self.proj2 = nn.Linear(d * 2, num_classes) for p in self.proj.parameters(): p.param_group = "projection" for p in self.proj2.parameters(): p.param_group = "projection" elif self.cfg.prediction_mode == "summary_proj": self.proj = nn.Linear(d // 3, num_classes) for p in self.proj.parameters(): p.param_group = "projection" elif self.cfg.conv_kernel > 1 and not self.cfg.two_convs: self.proj = nn.Sequential( TransposeLast(), nn.Conv1d(d, num_classes, kernel_size=self.cfg.conv_kernel, stride=self.cfg.conv_stride), TransposeLast(), ) for p in self.proj.parameters(): p.param_group = "projection" elif self.cfg.conv_kernel > 0 and self.cfg.two_convs: self.proj = nn.Sequential( TransposeLast(), nn.Conv1d(d, d, kernel_size=self.cfg.conv_kernel, stride=self.cfg.conv_stride), TransposeLast(), nn.GELU(), nn.Linear(d, num_classes), ) for p in self.proj.parameters(): p.param_group = "projection" else: self.proj = nn.Linear(d, num_classes) for p in self.proj.parameters(): p.param_group = "projection" def upgrade_state_dict_named(self, state_dict, name): super().upgrade_state_dict_named(state_dict, name) return state_dict @classmethod def build_model(cls, cfg: 
def load_model_weights(self, state, model, cfg):
    """Load the pretrained weights in ``state["model"]`` into ``model``.

    Args:
        state: checkpoint dictionary; only ``state["model"]`` is read.
        model: the module receiving the weights.
        cfg: configuration; ``cfg.ddp_backend`` selects the loading strategy.

    With ``ddp_backend == "fully_sharded"`` every encoder layer is loaded
    separately and strictly, then all non-layer weights are loaded
    non-strictly. Otherwise the ``"_ema"`` entry (if any) is removed from
    ``state["model"]`` and the whole dictionary is loaded non-strictly.
    """
    weights = state["model"]

    if cfg.ddp_backend == "fully_sharded":
        from fairseq.distributed import FullyShardedDataParallel

        for name, module in model.named_modules():
            if "encoder.layers" in name and len(name.split(".")) == 3:
                # Only for layers, we do a special handling and load the weights one by one
                # We dont load all weights together as that wont be memory efficient and may
                # cause oom
                scope = name + "."
                layer_weights = {
                    k.replace(scope, ""): v for k, v in weights.items() if scope in k
                }
                assert isinstance(module, FullyShardedDataParallel)
                with module.summon_full_params():
                    module.load_state_dict(layer_weights, strict=True)
                module._reset_lazy_init()

        # Once layers are loaded, filter them out and load everything else.
        # Raw string: "\d" in a normal string literal is an invalid escape
        # sequence. Dots are escaped so they only match a literal ".".
        layer_key = re.compile(r"encoder\.layers\.\d+\.")
        remaining = {k: v for k, v in weights.items() if not layer_key.match(k)}
        model.load_state_dict(remaining, strict=False)
    else:
        # "_ema" is dropped before loading (mutates state["model"])
        weights.pop("_ema", None)
        model.load_state_dict(weights, strict=False)
# adapted from https://github.com/mil-tokyo/bc_learning_sound/blob/master/utils.py
def compute_gain_torch(self, sound, fs=16_000, min_db=-80.0, mode="A_weighting"):
    """Compute a per-frame gain (in dB) of ``sound`` with torch ops.

    The last dimension of ``sound`` is cut into frames of ``n_fft`` samples
    with a hop of ``n_fft // 2`` (``n_fft`` is 2048 for 16 kHz and 4096 for
    44.1 kHz), and one gain value is produced per frame.

    Args:
        sound: tensor whose last dimension is framed.
        fs: sampling rate; only 16000 and 44100 are accepted.
        min_db: lower bound (dB) for the returned gain and, in
            ``"A_weighting"`` mode, for the per-frequency weights.
        mode: ``"RMSE"`` (mean of squared samples per frame) or
            ``"A_weighting"`` (weighted sum of the windowed power spectrum).

    Returns:
        Tensor of shape ``sound.shape[:-1] + (num_frames,)``.

    Raises:
        Exception: for an unsupported ``fs`` or ``mode``.
    """
    if fs == 16000:
        n_fft = 2048
    elif fs == 44100:
        n_fft = 4096
    else:
        raise Exception("Invalid fs {}".format(fs))

    if mode == "A_weighting":
        if not hasattr(self, "a_weight"):
            self.a_weight = {}
        # The cached tensor depends on min_db (clamp below) and lives on a
        # specific device, so both belong in the cache key alongside fs.
        cache_key = (fs, min_db, sound.device)
        if cache_key not in self.a_weight:
            freq = np.linspace(0, fs // 2, n_fft // 2 + 1)
            freq_sq = freq ** 2
            freq_sq[0] = 1.0  # avoid log10(0) for the DC bin
            weight = 2.0 + 20.0 * (
                2 * np.log10(12194)
                + 2 * np.log10(freq_sq)
                - np.log10(freq_sq + 12194 ** 2)
                - np.log10(freq_sq + 20.6 ** 2)
                - 0.5 * np.log10(freq_sq + 107.7 ** 2)
                - 0.5 * np.log10(freq_sq + 737.9 ** 2)
            )
            weight = np.maximum(weight, min_db)
            # dB -> linear power scale
            self.a_weight[cache_key] = torch.from_numpy(
                np.power(10, weight / 10)
            ).to(device=sound.device)

    # (..., T) -> (..., num_frames, n_fft)
    sound = sound.unfold(-1, n_fft, n_fft // 2)

    if mode == "RMSE":
        sound = sound ** 2
        g = sound.mean(-1)
    elif mode == "A_weighting":
        w = torch.hann_window(n_fft, device=sound.device) * sound
        spec = torch.fft.rfft(w)
        power_spec = spec.abs() ** 2
        a_weighted_spec = power_spec * self.a_weight[cache_key]
        g = a_weighted_spec.sum(-1)
    else:
        raise Exception("Invalid mode {}".format(mode))

    # floor the linear gain at min_db before converting to dB
    gain = g.clamp(min=10 ** (min_db / 10))
    gain_db = 10 * torch.log10(gain)
    return gain_db
label=None, **kwargs): if self.cfg.source_mixup >= 0 and self.training and self.cfg.mixup_prob > 0: with torch.no_grad(): mixed_source = source mix_mask = None if self.cfg.mixup_prob < 1: mix_mask = ( torch.empty((source.size(0),), device=source.device) .bernoulli_(self.cfg.mixup_prob) .bool() ) mixed_source = source[mix_mask] r = ( torch.FloatTensor( 1 if self.cfg.same_mixup else mixed_source.size(0) ) .uniform_(max(1e-6, self.cfg.source_mixup), 1) .to(dtype=source.dtype, device=source.device) ) mixup_perm = torch.randperm(source.size(0)) s2 = source[mixup_perm] if self.cfg.gain_mode == "none": p = r.unsqueeze(-1) if mix_mask is not None: s2 = s2[mix_mask] else: if self.cfg.gain_mode == "naive_rms": G1 = source.pow(2).mean(dim=-1).sqrt() else: G1, _ = self.compute_gain_torch( source, mode=self.cfg.gain_mode ).max(-1) G1 = G1.to(dtype=source.dtype) G2 = G1[mixup_perm] if mix_mask is not None: G1 = G1[mix_mask] G2 = G2[mix_mask] s2 = s2[mix_mask] p = 1 / (1 + 10 ** ((G1 - G2) / 20) * (1 - r) / r) p = p.unsqueeze(-1) mixed = (p * mixed_source) + (1 - p) * s2 if mix_mask is None: source = mixed / torch.sqrt(p ** 2 + (1 - p) ** 2) else: source[mix_mask] = mixed / torch.sqrt(p ** 2 + (1 - p) ** 2) if label is not None and self.cfg.label_mixup: r = r.unsqueeze(-1) if mix_mask is None: label = label * r + (1 - r) * label[mixup_perm] else: label[mix_mask] = ( label[mix_mask] * r + (1 - r) * label[mixup_perm][mix_mask] ) d2v_args = { "source": source, "padding_mask": padding_mask, "mask": self.apply_mask and self.training, } ft = self.freeze_finetune_updates <= self.num_updates with torch.no_grad() if not ft else contextlib.ExitStack(): res = self.d2v_model.extract_features(**d2v_args) x = res["x"] padding_mask = res["padding_mask"] if padding_mask is not None: x[padding_mask] = 0 x = self.final_dropout(x) if self.training or ( self.cfg.eval_prediction_mode is None or self.cfg.eval_prediction_mode == "" ): prediction_mode = self.cfg.prediction_mode else: prediction_mode = 
self.cfg.eval_prediction_mode if prediction_mode == "average_before": x = x.mean(dim=1) if prediction_mode != "summary_mha" and prediction_mode != "summary_proj" and prediction_mode != "cls": x = self.proj(x) logits = True if prediction_mode == "lin_softmax": x = F.logsigmoid(x.float()) x = torch.logsumexp(x + x, dim=1) - torch.logsumexp(x, dim=1) x = x.clamp(max=0) x = x - torch.log(-(torch.expm1(x))) elif prediction_mode == "extremized_odds": x = x.float().sum(dim=1) x = x * self.cfg.extreme_factor elif prediction_mode == "average_before": x = x.float() elif prediction_mode == "average": x = x.float().mean(dim=1) elif prediction_mode == "average_sigmoid": x = torch.sigmoid(x.float()) x = x.mean(dim=1) logits = False elif prediction_mode == "max": x, _ = x.float().max(dim=1) elif prediction_mode == "max_sigmoid": x = torch.sigmoid(x.float()) x, _ = x.float().max(dim=1) logits = False elif prediction_mode == "proj_avg_proj": x = x.mean(dim=1) x = self.proj2(x) elif prediction_mode == "summary_mha" or prediction_mode == "summary_proj": x = self.d2v_model.summary( x, padding_mask, proj=prediction_mode == "summary_proj" ) x = x.type_as(source) x = self.proj(x) elif prediction_mode == "cls": x = x[:,0] x = self.proj(x) else: raise Exception(f"unknown prediction mode {prediction_mode}") if label is None: return torch.sigmoid(x) if logits else x x = torch.nan_to_num(x) if logits: loss = F.binary_cross_entropy_with_logits( x, label.float(), reduction="none" ) else: loss = F.binary_cross_entropy(x, label.float(), reduction="none") result = { "losses": { "main": loss, }, "sample_size": label.sum(), } if not self.training: result["_predictions"] = torch.sigmoid(x) if logits else x result["_targets"] = label return result ================================================ FILE: examples/data2vec/models/data2vec2.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
@dataclass
class D2vModalitiesConfig(FairseqDataclass):
    """Groups the per-modality (audio / image / text) encoder configs."""

    # default_factory builds a fresh config object for every instance.
    # Using a config instance directly as the default shares one object
    # between all instances, and `dataclasses` on Python 3.11+ rejects
    # unhashable defaults with "mutable default ... use default_factory"
    # (presumably the case here: these configs look like regular, non-frozen
    # dataclasses -- TODO confirm).
    audio: D2vAudioConfig = field(default_factory=D2vAudioConfig)
    image: D2vImageConfig = field(default_factory=D2vImageConfig)
    text: D2vTextConfig = field(default_factory=D2vTextConfig)
def make_modality_encoder(
    self,
    cfg: D2vModalityConfig,
    embed_dim: int,
    make_block: Callable[[float], nn.ModuleList],
    norm_layer: Callable[[int], nn.LayerNorm],
    layer_norm_first: bool,
    alibi_biases,
    task,
) -> ModalitySpecificEncoder:
    """Instantiate the encoder class that matches ``cfg.type``.

    ``Modality.AUDIO``, ``Modality.IMAGE`` and ``Modality.TEXT`` map to
    ``AudioEncoder``, ``ImageEncoder`` and ``TextEncoder``. For text, a
    ``task.text_task`` attribute (when present) replaces ``task``. All
    remaining arguments are forwarded unchanged to the encoder constructor.

    Raises:
        Exception: if ``cfg.type`` is none of the three modalities.
    """
    encoder_by_modality = (
        (Modality.AUDIO, AudioEncoder),
        (Modality.IMAGE, ImageEncoder),
        (Modality.TEXT, TextEncoder),
    )
    for modality, candidate in encoder_by_modality:
        if cfg.type == modality:
            enc_cls = candidate
            break
    else:
        raise Exception(f"unsupported modality {cfg.type}")

    if enc_cls is TextEncoder and hasattr(task, "text_task"):
        # the text encoder is handed the nested text task when there is one
        task = task.text_task

    return enc_cls(
        cfg,
        embed_dim,
        make_block,
        norm_layer,
        layer_norm_first,
        alibi_biases,
        task,
    )
def _init_weights(self, m):
    """Initialise a single module ``m``.

    * ``nn.Linear``: Xavier-uniform weight, zero bias (when a bias exists).
    * ``nn.LayerNorm`` (or apex ``FusedLayerNorm`` when importable): weight
      set to 1 and bias set to 0, for whichever of the two exists.

    Other module types are left untouched.
    """
    try:
        from apex.normalization import FusedLayerNorm

        fused_cls = FusedLayerNorm
    except Exception:
        # apex is optional; any failure while importing it falls back to the
        # plain LayerNorm. `Exception` (not a bare except) keeps
        # KeyboardInterrupt / SystemExit propagating.
        fused_cls = nn.LayerNorm

    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
    elif isinstance(m, (nn.LayerNorm, fused_cls)):
        if m.bias is not None:
            nn.init.constant_(m.bias, 0)
        if m.weight is not None:
            nn.init.constant_(m.weight, 1.0)
def forward(
    self,
    source,
    target=None,
    id=None,
    mode=None,
    padding_mask=None,
    mask=True,
    features_only=False,
    force_remove_masked=False,
    remove_extra_tokens=True,
    precomputed_mask=None,
):
    """Run the student encoder and, unless ``features_only``, compute losses.

    Args:
        source: input handed to the modality encoder selected by ``mode``.
            It is also the teacher input when ``target`` is None and the
            teacher owns a local encoder, and the input of ``patchify``
            when ``cfg.recon_loss > 0``.
        target: optional alternative teacher input. Only accepted when the
            teacher runs its own local encoder
            (``modality_cfg.ema_local_encoder``); the other teacher paths
            assert that it is None.
        id: optional sample ids. When given, a ``MaskSeed`` built from
            ``cfg.seed``, ``self.num_updates`` and these ids is forwarded to
            the modality encoder.
        mode: a ``Modality`` or its name; defaults to
            ``cfg.supported_modality`` (which must then be set).
        padding_mask: forwarded to the modality encoder (student and teacher).
        mask: forwarded to the student modality encoder.
        features_only: when True, return encoder features right after the
            shared blocks and skip decoders, teacher and losses.
        force_remove_masked: forces ``remove_masked=True`` for the modality
            encoder even when ``features_only`` is set.
        remove_extra_tokens: with ``features_only``, strip the first
            ``modality_cfg.num_extra_tokens`` positions from ``x`` and from
            the returned padding mask.
        precomputed_mask: forwarded to the modality encoder.

    Returns:
        With ``features_only``: a dict with keys ``x``, ``padding_mask``,
        ``layer_results`` and ``mask``.
        Otherwise: a dict with ``losses`` (per-loss tensors),
        ``sample_size``, ``masked_pct``, ``pred_var*`` / ``target_var*``
        statistics, the entries of ``self.ema.logs`` and ``ema_decay``.

    Raises:
        Exception: after more than 5000 updates, when the target statistic
            falls below ``cfg.min_target_var`` or a prediction statistic
            falls below ``cfg.min_pred_var``.
    """
    if mode is None:
        assert self.cfg.supported_modality is not None
        mode = self.cfg.supported_modality

    if isinstance(mode, Modality):
        mode = mode.name

    feature_extractor = self.modality_encoders[mode]

    mask_seeds = None
    if id is not None:
        mask_seeds = MaskSeed(seed=self.cfg.seed, update=self.num_updates, ids=id)

    extractor_out = feature_extractor(
        source,
        padding_mask,
        mask,
        remove_masked=not features_only or force_remove_masked,
        clone_batch=self.cfg.clone_batch if not features_only else 1,
        mask_seeds=mask_seeds,
        precomputed_mask=precomputed_mask,
    )

    x = extractor_out["x"]
    encoder_mask = extractor_out["encoder_mask"]
    masked_padding_mask = extractor_out["padding_mask"]
    masked_alibi_bias = extractor_out.get("alibi_bias", None)
    alibi_scale = extractor_out.get("alibi_scale", None)

    if self.dropout_input is not None:
        x = self.dropout_input(x)

    # ---- student: shared transformer blocks (with optional layerdrop) ----
    layer_results = []
    for i, blk in enumerate(self.blocks):
        if (
            not self.training
            or self.cfg.layerdrop == 0
            or (np.random.random() > self.cfg.layerdrop)
        ):
            ab = masked_alibi_bias
            if ab is not None and alibi_scale is not None:
                # One scale per block when the leading dim is > 1, otherwise
                # a single scale shared by all blocks.
                scale = (
                    alibi_scale[i]
                    if alibi_scale.size(0) > 1
                    else alibi_scale.squeeze(0)
                )
                ab = ab * scale.type_as(ab)

            x, lr = blk(
                x,
                padding_mask=masked_padding_mask,
                alibi_bias=ab,
            )
            if features_only:
                layer_results.append(lr)

    if self.norm is not None:
        x = self.norm(x)

    if features_only:
        if remove_extra_tokens:
            x = x[:, feature_extractor.modality_cfg.num_extra_tokens :]
            if masked_padding_mask is not None:
                masked_padding_mask = masked_padding_mask[
                    :, feature_extractor.modality_cfg.num_extra_tokens :
                ]

        return {
            "x": x,
            "padding_mask": masked_padding_mask,
            "layer_results": layer_results,
            "mask": encoder_mask,
        }

    # ---- student: decoder predictions (shared and/or modality specific) ----
    xs = []

    if self.shared_decoder is not None:
        dx = self.forward_decoder(
            x,
            feature_extractor,
            self.shared_decoder,
            encoder_mask,
        )
        xs.append(dx)
    if feature_extractor.decoder is not None:
        dx = self.forward_decoder(
            x,
            feature_extractor,
            feature_extractor.decoder,
            encoder_mask,
        )
        xs.append(dx)

    assert len(xs) > 0

    # ---- teacher: keep the EMA copy on the student's device / dtype ----
    p = next(self.ema.model.parameters())
    device = x.device
    dtype = x.dtype
    ema_device = p.device
    ema_dtype = p.dtype

    if not self.cfg.ema_same_dtype:
        dtype = ema_dtype

    if ema_device != device or ema_dtype != dtype:
        logger.info(f"adjusting ema dtype to {dtype} and device to {device}")
        self.ema.model = self.ema.model.to(dtype=dtype, device=device)
        ema_dtype = dtype

        def to_device(d):
            # Recursively move the (possibly nested) fp32 shadow params.
            for k, p in d.items():
                if isinstance(d[k], dict):
                    to_device(d[k])
                else:
                    d[k] = p.to(device=device)

        to_device(self.ema.fp32_params)
    tm = self.ema.model

    with torch.no_grad():
        tm.eval()

        if self.cfg.ema_encoder_only:
            # The teacher holds only the transformer blocks, so the student's
            # local features are contextualized by the student's encoder.
            assert target is None
            ema_input = extractor_out["local_features"]
            ema_input = feature_extractor.contextualized_features(
                ema_input.to(dtype=ema_dtype),
                padding_mask,
                mask=False,
                remove_masked=False,
            )
            ema_blocks = tm
        else:
            ema_blocks = tm.blocks
            if feature_extractor.modality_cfg.ema_local_encoder:
                inp = (
                    target.to(dtype=ema_dtype)
                    if target is not None
                    else source.to(dtype=ema_dtype)
                )
                ema_input = tm.modality_encoders[mode](
                    inp,
                    padding_mask,
                    mask=False,
                    remove_masked=False,
                )
            else:
                assert target is None
                ema_input = extractor_out["local_features"]
                ema_feature_enc = tm.modality_encoders[mode]
                ema_input = ema_feature_enc.contextualized_features(
                    ema_input.to(dtype=ema_dtype),
                    padding_mask,
                    mask=False,
                    remove_masked=False,
                )

        ema_padding_mask = ema_input["padding_mask"]
        ema_alibi_bias = ema_input.get("alibi_bias", None)
        ema_alibi_scale = ema_input.get("alibi_scale", None)
        ema_input = ema_input["x"]

        y = []
        extra_tokens = feature_extractor.modality_cfg.num_extra_tokens
        for i, blk in enumerate(ema_blocks):
            ab = ema_alibi_bias
            # Guard on the teacher's own scale, which is the tensor indexed
            # below (the student's ``alibi_scale`` used to be tested here).
            if ab is not None and ema_alibi_scale is not None:
                scale = (
                    ema_alibi_scale[i]
                    if ema_alibi_scale.size(0) > 1
                    else ema_alibi_scale.squeeze(0)
                )
                ab = ab * scale.type_as(ab)

            ema_input, lr = blk(
                ema_input,
                padding_mask=ema_padding_mask,
                alibi_bias=ab,
            )
            y.append(lr[:, extra_tokens:])

    # ---- targets and losses ----
    y = self.make_targets(y, self.average_top_k_layers)
    orig_targets = y

    if self.cfg.clone_batch > 1:
        y = y.repeat_interleave(self.cfg.clone_batch, 0)

    masked = encoder_mask.mask.unsqueeze(-1)
    masked_b = encoder_mask.mask.bool()
    y = y[masked_b]

    if xs[0].size(1) == masked_b.size(1):
        xs = [x[masked_b] for x in xs]
    else:
        xs = [x.reshape(-1, x.size(-1)) for x in xs]

    sample_size = masked.sum().long()

    result = {
        "losses": {},
        "sample_size": sample_size,
    }

    if self.cfg.cls_loss > 0:
        assert extra_tokens > 0
        cls_target = orig_targets.mean(dim=1)
        if self.cfg.clone_batch > 1:
            cls_target = cls_target.repeat_interleave(self.cfg.clone_batch, 0)
        cls_pred = x[:, extra_tokens - 1]
        result["losses"]["cls"] = self.d2v_loss(cls_pred, cls_target) * (
            self.cfg.cls_loss * sample_size
        )

    if self.cfg.recon_loss > 0:

        with torch.no_grad():
            # Per-patch normalized input is the reconstruction target.
            target = feature_extractor.patchify(source)
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.0e-6) ** 0.5

            if self.cfg.clone_batch > 1:
                target = target.repeat_interleave(self.cfg.clone_batch, 0)

            if masked_b is not None:
                target = target[masked_b]

        recon = xs[0]
        if self.recon_proj is not None:
            recon = self.recon_proj(recon)

        result["losses"]["recon"] = (
            self.d2v_loss(recon, target.float()) * self.cfg.recon_loss
        )

    if self.cfg.d2v_loss > 0:
        for i, x in enumerate(xs):
            reg_loss = self.d2v_loss(x, y)
            n = f"{mode}_regression_{i}" if len(xs) > 1 else f"{mode}_regression"
            result["losses"][n] = reg_loss * self.cfg.d2v_loss

    # ---- logging statistics and collapse detection ----
    suffix = "" if len(self.modalities) == 1 else f"_{mode}"
    with torch.no_grad():
        if encoder_mask is not None:
            result["masked_pct"] = 1 - (
                encoder_mask.ids_keep.size(1) / encoder_mask.ids_restore.size(1)
            )
        for i, x in enumerate(xs):
            n = f"pred_var{suffix}_{i}" if len(xs) > 1 else f"pred_var{suffix}"
            result[n] = self.compute_var(x.float())
        if self.ema is not None:
            for k, v in self.ema.logs.items():
                result[k] = v

        y = y.float()
        result[f"target_var{suffix}"] = self.compute_var(y)

        if self.num_updates > 5000:
            if result[f"target_var{suffix}"] < self.cfg.min_target_var:
                logger.error(
                    f"target var is {result[f'target_var{suffix}'].item()} < {self.cfg.min_target_var}, exiting ({mode})"
                )
                raise Exception(
                    f"target var is {result[f'target_var{suffix}'].item()} < {self.cfg.min_target_var}, exiting ({mode})"
                )

            for k in result.keys():
                if k.startswith("pred_var") and result[k] < self.cfg.min_pred_var:
                    logger.error(
                        f"{k} is {result[k].item()} < {self.cfg.min_pred_var}, exiting ({mode})"
                    )
                    raise Exception(
                        f"{k} is {result[k].item()} < {self.cfg.min_pred_var}, exiting ({mode})"
                    )

        result["ema_decay"] = self.ema.get_decay() * 1000

    return result
@staticmethod
def compute_var(y):
    """Return the mean over features of ``sqrt(var + 1e-6)`` of ``y``.

    ``y`` is flattened to 2-D, keeping its last dimension, and the variance
    is taken over the rows. When ``torch.distributed`` is initialized, the
    row count, the per-feature sum and the per-feature sum of squares are
    all-reduced first, so the statistic covers the rows of every worker.

    Args:
        y: tensor whose last dimension is the feature dimension.

    Returns:
        A 0-dim tensor.
    """
    y = y.view(-1, y.size(-1))
    if dist.is_initialized():
        # Build the row count on y's device instead of forcing ``.cuda()``,
        # so CPU process groups work and no cross-device arithmetic occurs.
        zc = torch.tensor(y.size(0), device=y.device)
        zs = y.sum(dim=0)
        zss = (y**2).sum(dim=0)

        dist.all_reduce(zc)
        dist.all_reduce(zs)
        dist.all_reduce(zss)

        # Unbiased variance from the aggregated moments.
        var = zss / (zc - 1) - (zs**2) / (zc * (zc - 1))
        return torch.sqrt(var + 1e-6).mean()
    else:
        return torch.sqrt(y.var(dim=0) + 1e-6).mean()
import logging import math from dataclasses import dataclass, field from typing import Optional from omegaconf import II import torch import torch.nn as nn import torch.nn.functional as F import torch.distributed as dist from fairseq.modules import EMAModule, EMAModuleConfig from fairseq.data.data_utils import compute_mask_indices from fairseq.models import BaseFairseqModel, register_model from fairseq.models.wav2vec import ( ConvFeatureExtractionModel, Wav2Vec2Config, TransformerEncoder, ) from fairseq.modules import ( GradMultiply, LayerNorm, ) from fairseq.utils import index_put logger = logging.getLogger(__name__) @dataclass class Data2VecAudioConfig(Wav2Vec2Config): loss_beta: float = field( default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} ) loss_scale: Optional[float] = field( default=None, metadata={ "help": "scale the reconstruction loss by this constant. if None then scales by 1/sqrt(dim)" }, ) average_top_k_layers: int = field( default=8, metadata={"help": "how many layers to average"} ) layer_norm_target_layer: bool = False instance_norm_target_layer: bool = False instance_norm_targets: bool = False layer_norm_targets: bool = False batch_norm_target_layer: bool = False group_norm_target_layer: bool = False ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) ema_end_decay: float = field( default=0.9999, metadata={"help": "final ema decay rate"} ) # when to finish annealing ema decay rate ema_anneal_end_step: int = II("optimization.max_update") ema_transformer_only: bool = field( default=True, metadata={"help": "whether to momentum update only the transformer"}, ) ema_layers_only: bool = field( default=True, metadata={"help": "whether to momentum update only the transformer layers"}, ) max_update: int = II("optimization.max_update") min_target_var: float = field( default=0.1, metadata={"help": "stop training if target var falls below this"} ) min_pred_var: float = field( default=0.01, 
def get_annealed_rate(start, end, curr_step, total_steps):
    """Linearly interpolate between ``start`` and ``end``.

    Returns ``start`` at ``curr_step == 0`` and ``end`` at
    ``curr_step == total_steps``. The value is not clamped outside that
    range.
    """
    remaining_fraction = 1 - curr_step / total_steps
    span = end - start
    return end - span * remaining_fraction
def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
    """Restore the EMA teacher from ``state_dict`` before the regular load.

    ``state_dict()`` stores the teacher's fp32 parameters under
    ``prefix + "_ema"``. That entry is not a parameter or buffer of this
    module, so it is always removed before delegating to the parent
    implementation:

    * teacher present: the entry must exist; it is restored into
      ``self.ema`` and then removed;
    * teacher absent (e.g. after ``remove_pretraining_modules``): a stale
      entry is dropped, so strict loading does not report it as an
      unexpected key.
    """
    k = prefix + "_ema"
    if self.ema is not None:
        assert k in state_dict
        self.ema.restore(state_dict[k], True)
        del state_dict[k]
    elif k in state_dict:
        del state_dict[k]
    return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
    """
    Map raw input lengths to the lengths produced by the conv feature extractor.

    Each entry of ``cfg.conv_feature_layers`` contributes
    ``floor((length - kernel_size) / stride + 1)``, where ``kernel_size`` and
    ``stride`` are the entry's elements 1 and 2. The result is cast to
    ``torch.long``.
    """
    # NOTE(review): eval() runs on a config string; only use trusted configs.
    layer_specs = eval(self.cfg.conv_feature_layers)

    lengths = input_lengths
    for spec in layer_specs:
        kernel_size, stride = spec[1], spec[2]
        lengths = torch.floor((lengths - kernel_size) / stride + 1)

    return lengths.to(torch.long)
self.feature_extractor(features) features = features.transpose(1, 2) features = self.layer_norm(features) orig_padding_mask = padding_mask if padding_mask is not None and padding_mask.any(): input_lengths = (1 - padding_mask.long()).sum(-1) # apply conv formula to get real output_lengths output_lengths = self._get_feat_extract_output_lengths(input_lengths) padding_mask = torch.zeros( features.shape[:2], dtype=features.dtype, device=features.device ) # these two operations makes sure that all values # before the output lengths indices are attended to padding_mask[ ( torch.arange(padding_mask.shape[0], device=padding_mask.device), output_lengths - 1, ) ] = 1 padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() else: padding_mask = None if self.post_extract_proj is not None: features = self.post_extract_proj(features) pre_encoder_features = None if self.cfg.ema_transformer_only: pre_encoder_features = features.clone() features = self.dropout_input(features) if mask: x, mask_indices = self.apply_mask( features, padding_mask, mask_indices=mask_indices, mask_channel_indices=mask_channel_indices, ) else: x = features mask_indices = None x, layer_results = self.encoder( x, padding_mask=padding_mask, layer=layer, ) if features_only: return { "x": x, "padding_mask": padding_mask, "layer_results": layer_results, } result = { "losses": {}, } with torch.no_grad(): self.ema.model.eval() if self.cfg.ema_transformer_only: y, layer_results = self.ema.model.extract_features( pre_encoder_features, padding_mask=padding_mask, min_layer=self.cfg.encoder_layers - self.average_top_k_layers, ) y = { "x": y, "padding_mask": padding_mask, "layer_results": layer_results, } else: y = self.ema.model.extract_features( source=source, padding_mask=orig_padding_mask, mask=False, ) target_layer_results = [l[2] for l in y["layer_results"]] permuted = False if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: target_layer_results = [ tl.permute(1, 2, 0) for 
tl in target_layer_results # TBC -> BCT ] permuted = True if self.cfg.batch_norm_target_layer: target_layer_results = [ F.batch_norm( tl.float(), running_mean=None, running_var=None, training=True ) for tl in target_layer_results ] if self.cfg.instance_norm_target_layer: target_layer_results = [ F.instance_norm(tl.float()) for tl in target_layer_results ] if permuted: target_layer_results = [ tl.transpose(1, 2) for tl in target_layer_results # BCT -> BTC ] if self.cfg.group_norm_target_layer: target_layer_results = [ F.layer_norm(tl.float(), tl.shape[-2:]) for tl in target_layer_results ] if self.cfg.layer_norm_target_layer: target_layer_results = [ F.layer_norm(tl.float(), tl.shape[-1:]) for tl in target_layer_results ] y = sum(target_layer_results) / len(target_layer_results) if self.cfg.layer_norm_targets: y = F.layer_norm(y.float(), y.shape[-1:]) if self.cfg.instance_norm_targets: y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) if not permuted: y = y.transpose(0, 1) y = y[mask_indices] x = x[mask_indices] x = self.final_proj(x) sz = x.size(-1) if self.loss_beta == 0: loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) else: loss = F.smooth_l1_loss( x.float(), y.float(), reduction="none", beta=self.loss_beta ).sum(dim=-1) if self.loss_scale is not None: scale = self.loss_scale else: scale = 1 / math.sqrt(sz) result["losses"]["regression"] = loss.sum() * scale if "sample_size" not in result: result["sample_size"] = loss.numel() with torch.no_grad(): result["target_var"] = self.compute_var(y) result["pred_var"] = self.compute_var(x.float()) if self.num_updates > 5000 and result["target_var"] < self.cfg.min_target_var: logger.error( f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting" ) raise Exception( f"target var is {result['target_var'].item()} < {self.cfg.min_target_var}, exiting" ) if self.num_updates > 5000 and result["pred_var"] < self.cfg.min_pred_var: logger.error( f"pred var is 
@staticmethod
def compute_var(y):
    """Return the mean over features of ``sqrt(var + 1e-6)`` of ``y``.

    ``y`` is flattened to 2-D, keeping its last dimension, and the variance
    is computed across rows. With an initialized ``torch.distributed``
    process group, the row count, per-feature sum and per-feature sum of
    squares are all-reduced first, so every worker's rows contribute.

    Args:
        y: tensor whose last dimension is the feature dimension.

    Returns:
        A 0-dim tensor.
    """
    y = y.view(-1, y.size(-1))
    if dist.is_initialized():
        # Keep the counter on y's device rather than calling ``.cuda()``:
        # that call fails without CUDA and mixes devices for CPU tensors.
        zc = torch.tensor(y.size(0), device=y.device)
        zs = y.sum(dim=0)
        zss = (y ** 2).sum(dim=0)

        dist.all_reduce(zc)
        dist.all_reduce(zs)
        dist.all_reduce(zss)

        # Unbiased variance reconstructed from the aggregated moments.
        var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1))
        return torch.sqrt(var + 1e-6).mean()
    else:
        return torch.sqrt(y.var(dim=0) + 1e-6).mean()
# The code in this file is adapted from the BeiT implementation which can be found here: # https://github.com/microsoft/unilm/tree/master/beit import logging from dataclasses import dataclass from typing import Any from omegaconf import II, MISSING import torch import torch.nn as nn import torch.nn.functional as F from fairseq import checkpoint_utils, tasks from fairseq.dataclass import FairseqDataclass from fairseq.models import BaseFairseqModel, register_model logger = logging.getLogger(__name__) @dataclass class Data2VecImageClassificationConfig(FairseqDataclass): model_path: str = MISSING no_pretrained_weights: bool = False num_classes: int = 1000 mixup: float = 0.8 cutmix: float = 1.0 label_smoothing: float = 0.1 pretrained_model_args: Any = None data: str = II("task.data") @register_model( "data2vec_image_classification", dataclass=Data2VecImageClassificationConfig ) class Data2VecImageClassificationModel(BaseFairseqModel): def __init__(self, cfg: Data2VecImageClassificationConfig): super().__init__() self.cfg = cfg if cfg.pretrained_model_args is None: state = checkpoint_utils.load_checkpoint_to_cpu(cfg.model_path, {}) pretrained_args = state.get("cfg", None) pretrained_args.criterion = None pretrained_args.lr_scheduler = None cfg.pretrained_model_args = pretrained_args logger.info(pretrained_args) else: state = None pretrained_args = cfg.pretrained_model_args pretrained_args.task.data = cfg.data task = tasks.setup_task(pretrained_args.task) model = task.build_model(pretrained_args.model, from_checkpoint=True) model.remove_pretraining_modules() self.model = model if state is not None and not cfg.no_pretrained_weights: self.load_model_weights(state, model, cfg) self.fc_norm = nn.LayerNorm(pretrained_args.model.embed_dim) self.head = nn.Linear(pretrained_args.model.embed_dim, cfg.num_classes) self.head.weight.data.mul_(1e-3) self.head.bias.data.mul_(1e-3) self.mixup_fn = None if cfg.mixup > 0 or cfg.cutmix > 0: from timm.data import Mixup self.mixup_fn = 
def load_model_weights(self, state, model, cfg):
    """Strictly load ``state["model"]`` into ``model``, ignoring EMA state.

    The ``"_ema"`` entry, if present, is removed from ``state["model"]`` in
    place before loading. ``cfg`` is accepted but not used.
    """
    weights = state["model"]
    weights.pop("_ema", None)
    model.load_state_dict(weights, strict=True)
from dataclasses import dataclass, field from typing import Optional import logging import math import torch import torch.nn as nn import torch.nn.functional as F from omegaconf import II from fairseq.dataclass import FairseqDataclass from fairseq.modules import EMAModule, EMAModuleConfig from fairseq.models import ( FairseqEncoder, FairseqEncoderModel, register_model, ) from fairseq.models.roberta.model import RobertaLMHead, RobertaClassificationHead from fairseq.models.transformer import TransformerEncoder, TransformerConfig from fairseq.modules.transformer_sentence_encoder import init_bert_params logger = logging.getLogger(__name__) @dataclass class Data2VecTextConfig(FairseqDataclass): max_positions: int = II("task.tokens_per_sample") head_layers: int = 1 transformer: TransformerConfig = TransformerConfig() load_checkpoint_heads: bool = field( default=False, metadata={"help": "(re-)register and load heads when loading checkpoints"}, ) loss_beta: float = field( default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} ) loss_scale: Optional[float] = field( default=None, metadata={ "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" }, ) average_top_k_layers: int = field( default=8, metadata={"help": "how many layers to average"} ) layer_norm_target_layer: bool = False instance_norm_target_layer: bool = False batch_norm_target_layer: bool = False instance_norm_targets: bool = False layer_norm_targets: bool = False ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) ema_end_decay: float = field( default=0.9999, metadata={"help": "final ema decay rate"} ) # when to finish annealing ema decay rate ema_anneal_end_step: int = II("optimization.max_update") ema_transformer_layers_only: bool = field( default=True, metadata={"help": "whether to momentum update only the transformer layers"}, ) def get_annealed_rate(start, end, curr_step, total_steps): r = end - start pct_remaining = 1 - curr_step / total_steps return end - r * pct_remaining @register_model("data2vec_text", dataclass=Data2VecTextConfig) class Data2VecTextModel(FairseqEncoderModel): def __init__(self, cfg: Data2VecTextConfig, encoder): super().__init__(encoder) self.cfg = cfg # We follow BERT's random weight initialization self.apply(init_bert_params) self.classification_heads = nn.ModuleDict() @classmethod def build_model(cls, cfg, task): """Build a new model instance.""" encoder = Data2VecTextEncoder(cfg, task.source_dictionary, task.cfg.data) return cls(cfg, encoder) def forward( self, src_tokens, target_tokens=None, features_only=False, return_all_hiddens=False, classification_head_name=None, **kwargs, ): if classification_head_name is not None: features_only = True res = self.encoder( src_tokens, target_tokens, features_only, return_all_hiddens, **kwargs ) if isinstance(res, tuple): x, extra = res else: return res if classification_head_name is not None: x = self.classification_heads[classification_head_name](x) return x, extra def get_normalized_probs(self, net_output, log_probs, sample=None): """Get normalized probabilities (or log probs) from a net's output.""" 
logits = net_output[0].float() if log_probs: return F.log_softmax(logits, dim=-1) else: return F.softmax(logits, dim=-1) def register_classification_head( self, name, num_classes=None, inner_dim=None, **kwargs ): """Register a classification head.""" if name in self.classification_heads: prev_num_classes = self.classification_heads[name].out_proj.out_features prev_inner_dim = self.classification_heads[name].dense.out_features if num_classes != prev_num_classes or inner_dim != prev_inner_dim: logger.warning( 're-registering head "{}" with num_classes {} (prev: {}) ' "and inner_dim {} (prev: {})".format( name, num_classes, prev_num_classes, inner_dim, prev_inner_dim ) ) self.classification_heads[name] = RobertaClassificationHead( input_dim=self.cfg.transformer.encoder.embed_dim, inner_dim=inner_dim or self.cfg.transformer.encoder.embed_dim, num_classes=num_classes, activation_fn="tanh", pooler_dropout=0, ) @property def supported_targets(self): return {"self"} def upgrade_state_dict_named(self, state_dict, name): prefix = name + "." if name != "" else "" # rename decoder -> encoder before upgrading children modules for k in list(state_dict.keys()): if k.startswith(prefix + "decoder"): new_k = prefix + "encoder" + k[len(prefix + "decoder") :] state_dict[new_k] = state_dict[k] del state_dict[k] # rename emb_layer_norm -> layernorm_embedding for k in list(state_dict.keys()): if ".emb_layer_norm." in k: new_k = k.replace(".emb_layer_norm.", ".layernorm_embedding.") state_dict[new_k] = state_dict[k] del state_dict[k] if self.encoder.regression_head is not None: if ".lm_head." in k: new_k = k.replace(".lm_head.", ".regression_head.") state_dict[new_k] = state_dict[k] del state_dict[k] else: if ".regression_head." in k: del state_dict[k] # upgrade children modules super().upgrade_state_dict_named(state_dict, name) # Handle new classification heads present in the state dict. 
current_head_names = ( [] if not hasattr(self, "classification_heads") or self.classification_heads is None else self.classification_heads.keys() ) keys_to_delete = [] for k in state_dict.keys(): if not k.startswith(prefix + "classification_heads."): continue head_name = k[len(prefix + "classification_heads.") :].split(".")[0] num_classes = state_dict[ prefix + "classification_heads." + head_name + ".out_proj.weight" ].size(0) inner_dim = state_dict[ prefix + "classification_heads." + head_name + ".dense.weight" ].size(0) if self.cfg.load_checkpoint_heads: if head_name not in current_head_names: self.register_classification_head(head_name, num_classes, inner_dim) else: if head_name not in current_head_names: logger.warning( "deleting classification head ({}) from checkpoint " "not present in current model: {}".format(head_name, k) ) keys_to_delete.append(k) elif ( num_classes != self.classification_heads[head_name].out_proj.out_features or inner_dim != self.classification_heads[head_name].dense.out_features ): logger.warning( "deleting classification head ({}) from checkpoint " "with different dimensions than current model: {}".format( head_name, k ) ) keys_to_delete.append(k) for k in keys_to_delete: del state_dict[k] # Copy any newly-added classification heads into the state dict # with their current weights. if ( hasattr(self, "classification_heads") and self.classification_heads is not None and len(self.classification_heads) > 0 ): cur_state = self.classification_heads.state_dict() for k, v in cur_state.items(): if prefix + "classification_heads." + k not in state_dict: logger.info("Overwriting " + prefix + "classification_heads." + k) state_dict[prefix + "classification_heads." + k] = v for k in list(state_dict.keys()): if k.startswith(prefix + "encoder.lm_head.") or k.startswith( prefix + "encoder.emb_head." 
): del state_dict[k] self.encoder.lm_head = None if self.encoder.target_model is None: for k in list(state_dict.keys()): if k.startswith(prefix + "encoder.target_model."): del state_dict[k] if (self.encoder.ema is None) and (prefix + "encoder._ema" in state_dict): del state_dict[prefix + "encoder._ema"] def remove_pretraining_modules(self, last_layer=None): self.encoder.lm_head = None self.encoder.regression_head = None self.encoder.ema = None self.classification_heads = None if last_layer is not None: self.encoder.sentence_encoder.layers = nn.ModuleList( l for i, l in enumerate(self.encoder.sentence_encoder.layers) if i <= last_layer ) self.encoder.sentence_encoder.layer_norm = None class Data2VecTextEncoder(FairseqEncoder): def __init__(self, cfg: Data2VecTextConfig, dictionary, task_data): super().__init__(dictionary) self.cfg = cfg embed_tokens = self.build_embedding( len(dictionary), cfg.transformer.encoder.embed_dim, dictionary.pad() ) self.sentence_encoder = self.build_encoder(cfg, dictionary, embed_tokens) self.mask_idx = dictionary.index("") assert self.mask_idx != dictionary.unk(), dictionary.symbols self.ema = None self.average_top_k_layers = cfg.average_top_k_layers self.loss_scale = cfg.loss_scale assert self.cfg.head_layers >= 1 embed_dim = cfg.transformer.encoder.embed_dim curr_dim = embed_dim projs = [] for i in range(self.cfg.head_layers - 1): next_dim = embed_dim * 2 if i == 0 else curr_dim projs.append(nn.Linear(curr_dim, next_dim)) projs.append(nn.GELU()) curr_dim = next_dim projs.append(nn.Linear(curr_dim, embed_dim)) self.regression_head = nn.Sequential(*projs) self.num_updates = 0 def build_embedding(self, vocab_size, embedding_dim, padding_idx): return nn.Embedding(vocab_size, embedding_dim, padding_idx) def build_encoder(self, cfg, dictionary, embed_tokens): encoder = TransformerEncoder(cfg.transformer, dictionary, embed_tokens, return_fc=True) encoder.apply(init_bert_params) return encoder def build_lm_head(self, embed_dim, output_dim, 
activation_fn, weight): return RobertaLMHead(embed_dim, output_dim, activation_fn, weight) def make_ema_teacher(self): ema_config = EMAModuleConfig( ema_decay=self.cfg.ema_decay, ema_fp32=True, ) skip_keys = set() if self.cfg.ema_transformer_layers_only: for k, _ in self.sentence_encoder.embed_positions.named_parameters(): skip_keys.add(f"embed_tokens.{k}") for k, _ in self.sentence_encoder.embed_positions.named_parameters(): skip_keys.add(f"embed_positions.{k}") if self.sentence_encoder.layernorm_embedding is not None: for ( k, _, ) in self.sentence_encoder.layernorm_embedding.named_parameters(): skip_keys.add(f"layernorm_embedding.{k}") if self.sentence_encoder.layer_norm is not None: for k, _ in self.sentence_encoder.layer_norm.named_parameters(): skip_keys.add(f"layernorm_embedding.{k}") self.ema = EMAModule( self.sentence_encoder, ema_config, skip_keys=skip_keys, ) def set_num_updates(self, num_updates): super().set_num_updates(num_updates) if self.ema is None and self.regression_head is not None: logger.info(f"making ema teacher") self.make_ema_teacher() elif self.training and self.ema is not None: if self.cfg.ema_decay != self.cfg.ema_end_decay: if num_updates >= self.cfg.ema_anneal_end_step: decay = self.cfg.ema_end_decay else: decay = get_annealed_rate( self.cfg.ema_decay, self.cfg.ema_end_decay, num_updates, self.cfg.ema_anneal_end_step, ) self.ema.set_decay(decay) if self.ema.get_decay() < 1: self.ema.step(self.sentence_encoder) def state_dict(self, destination=None, prefix="", keep_vars=False): state = super().state_dict(destination, prefix, keep_vars) if self.ema is not None: state[prefix + "_ema"] = self.ema.fp32_params return state def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): if self.ema is not None: k = prefix + "_ema" assert k in state_dict self.ema.restore(state_dict[k], True) del state_dict[k] return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) def forward( self, src_tokens, target_tokens=None, 
features_only=False, return_all_hiddens=False, masked_tokens=None, **unused, ): """ Args: src_tokens (LongTensor): input tokens of shape `(batch, src_len)` features_only (bool, optional): skip LM head and just return features. If True, the output will be of shape `(batch, src_len, embed_dim)`. return_all_hiddens (bool, optional): also return all of the intermediate hidden states (default: False). Returns: tuple: - the LM output of shape `(batch, src_len, vocab)` - a dictionary of additional data, where 'inner_states' is a list of hidden states. Note that the hidden states have shape `(src_len, batch, vocab)`. """ x, extra = self.extract_features( src_tokens, return_all_hiddens=return_all_hiddens ) if features_only: return x, extra assert target_tokens is not None with torch.no_grad(): # use EMA parameter as the teacher self.ema.model.eval() encoder_out = self.ema.model( target_tokens, return_all_hiddens=True, ) y = encoder_out["fc_results"] y = y[-self.average_top_k_layers :] permuted = False if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: y = [tl.permute(1, 2, 0) for tl in y] # TBC -> BCT permuted = True if self.cfg.batch_norm_target_layer: y = [ F.batch_norm( tl.float(), running_mean=None, running_var=None, training=True ) for tl in y ] if self.cfg.instance_norm_target_layer: y = [F.instance_norm(tl.float()) for tl in y] if permuted: y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC if self.cfg.layer_norm_target_layer: y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] y = sum(y) / len(y) if not permuted: y = y.transpose(0, 1) if self.cfg.layer_norm_targets: y = F.layer_norm(y.float(), y.shape[-1:]) if self.cfg.instance_norm_targets: y = F.instance_norm(y.transpose(1, 2)).transpose(1, 2) masked_indices = src_tokens.eq(self.mask_idx) x = x[masked_indices] y = y[masked_indices] x = self.regression_head(x) sz = x.size(-1) if self.cfg.loss_beta == 0: loss = F.mse_loss(x.float(), y.float(), reduction="none").sum(dim=-1) else: loss = 
F.smooth_l1_loss( x.float(), y.float(), reduction="none", beta=self.cfg.loss_beta ).sum(dim=-1) result = { "losses": { "main": loss.sum() / math.sqrt(sz) if self.loss_scale <= 0 else loss.sum() * self.loss_scale, }, "sample_size": loss.numel(), } # logging other values other_logs = { "ema_decay": self.ema.get_decay() * 1000 } result["logs"] = other_logs return result def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs): encoder_out = self.sentence_encoder( src_tokens, return_all_hiddens=return_all_hiddens, token_embeddings=kwargs.get("token_embeddings", None), ) # T x B x C -> B x T x C features = encoder_out["encoder_out"][0].transpose(0, 1) inner_states = encoder_out["encoder_states"] if return_all_hiddens else None return features, { "inner_states": inner_states, "encoder_embedding": encoder_out["encoder_embedding"][0], } def output_layer(self, features, masked_tokens=None, **unused): return self.lm_head(features, masked_tokens) def max_positions(self): """Maximum output length supported by the encoder.""" return self.cfg.max_positions ================================================ FILE: examples/data2vec/models/data2vec_text_classification.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
# The code in this file is adapted from the BeiT implementation which can be found here:
# https://github.com/microsoft/unilm/tree/master/beit

import logging

from dataclasses import dataclass
from typing import Any

from omegaconf import II, MISSING

import torch
import torch.nn as nn
import torch.nn.functional as F

from fairseq import checkpoint_utils, tasks

from fairseq.dataclass import FairseqDataclass
from fairseq.models import BaseFairseqModel, register_model
from fairseq.models.roberta.model import RobertaClassificationHead

from examples.data2vec.data.modality import Modality


logger = logging.getLogger(__name__)


@dataclass
class Data2VecTextClassificationConfig(FairseqDataclass):
    # Options for fine-tuning a pretrained model on text classification.

    # the five options below are forwarded to RobertaClassificationHead
    pooler_dropout: float = 0.0
    pooler_activation_fn: str = "tanh"
    quant_noise_pq: int = 0
    quant_noise_pq_block_size: int = 8
    spectral_norm_classification_head: bool = False

    # path of the pretrained checkpoint; required (no default)
    model_path: str = MISSING
    # build the architecture from the checkpoint config but skip loading its weights
    no_pretrained_weights: bool = False

    # config of the pretrained model; filled in from the checkpoint on first
    # construction when left as None
    pretrained_model_args: Any = None


@register_model(
    "data2vec_text_classification", dataclass=Data2VecTextClassificationConfig
)
class Data2VecTextClassificationModel(BaseFairseqModel):
    """Pretrained backbone with named RoBERTa-style classification heads."""

    def __init__(self, cfg: Data2VecTextClassificationConfig):
        """Rebuild the pretrained model and optionally load its weights.

        Note that ``cfg.pretrained_model_args`` is written back into ``cfg``
        when it was None on entry. No head is registered here.
        """
        super().__init__()
        self.cfg = cfg

        if cfg.pretrained_model_args is None:
            state = checkpoint_utils.load_checkpoint_to_cpu(cfg.model_path, {})
            # NOTE(review): if the checkpoint has no "cfg" entry this is None and
            # the attribute assignments below raise AttributeError.
            pretrained_args = state.get("cfg", None)
            pretrained_args.criterion = None
            pretrained_args.lr_scheduler = None
            cfg.pretrained_model_args = pretrained_args

            logger.info(pretrained_args)
        else:
            # args were supplied by the caller; no checkpoint is read here, so
            # no weights are loaded below
            state = None
            pretrained_args = cfg.pretrained_model_args

        task = tasks.setup_task(pretrained_args.task)
        model = task.build_model(pretrained_args.model, from_checkpoint=True)

        model.remove_pretraining_modules()

        self.model = model

        if state is not None and not cfg.no_pretrained_weights:
            self.load_model_weights(state, model, cfg)

        self.classification_heads = nn.ModuleDict()

    def load_model_weights(self, state, model, cfg):
        """Load ``state["model"]`` into ``model`` strictly after pruning keys.

        Every key that starts with "shared_decoder" or "_ema", or contains
        "decoder" anywhere, is deleted from ``state["model"]`` in place first.
        ``cfg`` is not used.
        """
        for k in list(state["model"].keys()):
            if (
                k.startswith("shared_decoder") or k.startswith("_ema") or "decoder" in k
            ):
                logger.info(f"Deleting {k} from checkpoint")
                del state["model"][k]
        model.load_state_dict(state["model"], strict=True)

    @classmethod
    def build_model(cls, cfg: Data2VecTextClassificationConfig, task=None):
        """Build a new model instance."""

        return cls(cfg)

    def register_classification_head(
        self, name, num_classes=None, inner_dim=None, **kwargs
    ):
        """Register a classification head.

        An existing head with the same name is replaced; a warning is logged
        when its dimensions differ from the requested ones. ``inner_dim``
        falls back to the pretrained model's ``embed_dim`` when falsy.
        """
        if name in self.classification_heads:
            prev_num_classes = self.classification_heads[name].out_proj.out_features
            prev_inner_dim = self.classification_heads[name].dense.out_features
            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
                logger.warning(
                    're-registering head "{}" with num_classes {} (prev: {}) '
                    "and inner_dim {} (prev: {})".format(
                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
                    )
                )
        embed_dim = self.cfg.pretrained_model_args.model.embed_dim
        self.classification_heads[name] = RobertaClassificationHead(
            input_dim=embed_dim,
            inner_dim=inner_dim or embed_dim,
            num_classes=num_classes,
            activation_fn=self.cfg.pooler_activation_fn,
            pooler_dropout=self.cfg.pooler_dropout,
            q_noise=self.cfg.quant_noise_pq,
            qn_block_size=self.cfg.quant_noise_pq_block_size,
            do_spectral_norm=self.cfg.spectral_norm_classification_head,
        )

    def forward(
        self,
        source,
        id,
        padding_mask,
        features_only=True,
        remove_extra_tokens=True,
        classification_head_name=None,
    ):
        """Encode ``source`` in text mode without masking and classify it.

        Returns ``(logits, encoder_out)`` where ``logits`` is the named head
        applied to ``encoder_out["x"]``.
        """
        encoder_out = self.model(
            source,
            id=id,
            mode=Modality.TEXT,
            padding_mask=padding_mask,
            mask=False,
            features_only=features_only,
            remove_extra_tokens=remove_extra_tokens
        )
        # NOTE(review): the default classification_head_name=None is not a
        # registered head name, so callers must always pass one.
        logits = self.classification_heads[classification_head_name](encoder_out["x"])
        return logits, encoder_out


================================================
FILE: examples/data2vec/models/data2vec_vision.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # The code in this file is adapted from the BeiT implementation which can be found here: # https://github.com/microsoft/unilm/tree/master/beit import logging import math import numpy as np import random from dataclasses import dataclass, field from typing import Optional from omegaconf import II import torch import torch.nn as nn import torch.nn.functional as F import torch.distributed as dist from fairseq.modules import EMAModule, EMAModuleConfig from fairseq.dataclass import FairseqDataclass from fairseq.models import BaseFairseqModel, register_model logger = logging.getLogger(__name__) @dataclass class Data2VecVisionConfig(FairseqDataclass): layer_scale_init_value: float = field( default=1e-4, metadata={"help": "rescale layer outputs, 0 to disable"} ) num_mask_patches: int = field( default=75, metadata={"help": "number of the visual tokens/patches need be masked"}, ) min_mask_patches_per_block: int = 16 max_mask_patches_per_block: int = 196 image_size: int = 224 patch_size: int = 16 in_channels: int = 3 shared_rel_pos_bias: bool = True drop_path: float = 0.1 attention_dropout: float = 0.0 depth: int = 12 embed_dim: int = 768 num_heads: int = 12 mlp_ratio: int = 4 loss_beta: float = field( default=0, metadata={"help": "beta for smooth l1 loss. 0 means use l2 loss"} ) loss_scale: Optional[float] = field( default=None, metadata={ "help": "scale the reconstruction loss by this constant. 
if None then scales by 1/sqrt(dim)" }, ) average_top_k_layers: int = field( default=8, metadata={"help": "how many layers to average"} ) end_of_block_targets: bool = True layer_norm_target_layer: bool = False instance_norm_target_layer: bool = False batch_norm_target_layer: bool = False instance_norm_targets: bool = False layer_norm_targets: bool = False ema_decay: float = field(default=0.999, metadata={"help": "initial ema decay rate"}) ema_end_decay: float = field( default=0.9999, metadata={"help": "final ema decay rate"} ) # when to finish annealing ema decay rate ema_anneal_end_step: int = II("optimization.max_update") ema_transformer_only: bool = field( default=True, metadata={"help": "whether to momentum update only the transformer layers"}, ) def get_annealed_rate(start, end, curr_step, total_steps): r = end - start pct_remaining = 1 - curr_step / total_steps return end - r * pct_remaining @register_model("data2vec_vision", dataclass=Data2VecVisionConfig) class Data2VecVisionModel(BaseFairseqModel): def __init__(self, cfg: Data2VecVisionConfig): super().__init__() self.cfg = cfg self.ema = None self.average_top_k_layers = cfg.average_top_k_layers self.loss_beta = cfg.loss_beta self.loss_scale = ( cfg.loss_scale if cfg.loss_scale is not None else 1 / math.sqrt(cfg.embed_dim) ) self.patch_embed = PatchEmbed( img_size=cfg.image_size, patch_size=cfg.patch_size, in_chans=cfg.in_channels, embed_dim=cfg.embed_dim, ) patch_size = self.patch_embed.patch_size self.window_size = ( cfg.image_size // patch_size[0], cfg.image_size // patch_size[1], ) self.cls_emb = nn.Parameter(torch.FloatTensor(1, 1, cfg.embed_dim)) self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, cfg.embed_dim)) nn.init.trunc_normal_(self.cls_emb, 0.02) nn.init.trunc_normal_(self.mask_emb, 0.02) self.encoder = TransformerEncoder(cfg, self.patch_embed.patch_shape) self.final_proj = nn.Linear(cfg.embed_dim, cfg.embed_dim) self.num_updates = 0 def make_ema_teacher(self): ema_config = EMAModuleConfig( 
ema_decay=self.cfg.ema_decay, ema_fp32=True, ) self.ema = EMAModule( self.encoder if self.cfg.ema_transformer_only else self, ema_config, ) def set_num_updates(self, num_updates): super().set_num_updates(num_updates) if self.ema is None and self.final_proj is not None: logger.info(f"making ema teacher") self.make_ema_teacher() elif self.training and self.ema is not None: if self.cfg.ema_decay != self.cfg.ema_end_decay: if num_updates >= self.cfg.ema_anneal_end_step: decay = self.cfg.ema_end_decay else: decay = get_annealed_rate( self.cfg.ema_decay, self.cfg.ema_end_decay, num_updates, self.cfg.ema_anneal_end_step, ) self.ema.set_decay(decay) if self.ema.get_decay() < 1: self.ema.step(self.encoder if self.cfg.ema_transformer_only else self) self.num_updates = num_updates def state_dict(self, destination=None, prefix="", keep_vars=False): state = super().state_dict(destination, prefix, keep_vars) if self.ema is not None: state[prefix + "_ema"] = self.ema.fp32_params return state def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs): if self.ema is not None: k = prefix + "_ema" assert k in state_dict self.ema.restore(state_dict[k], True) del state_dict[k] return super()._load_from_state_dict(state_dict, prefix, *args, **kwargs) @classmethod def build_model(cls, cfg: Data2VecVisionConfig, task=None): """Build a new model instance.""" return cls(cfg) def make_mask(self, bsz, num_masks, min_masks, max_masks): height, width = self.window_size masks = np.zeros(shape=(bsz, height, width), dtype=np.int) for i in range(bsz): mask = masks[i] mask_count = 0 min_aspect = 0.3 max_aspect = 1 / min_aspect log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) def _mask(mask, max_mask_patches): delta = 0 for attempt in range(10): target_area = random.uniform(min_masks, max_mask_patches) aspect_ratio = math.exp(random.uniform(*log_aspect_ratio)) h = int(round(math.sqrt(target_area * aspect_ratio))) w = int(round(math.sqrt(target_area / aspect_ratio))) if w < 
width and h < height: top = random.randint(0, height - h) left = random.randint(0, width - w) num_masked = mask[top : top + h, left : left + w].sum() # Overlap if 0 < h * w - num_masked <= max_mask_patches: for i in range(top, top + h): for j in range(left, left + w): if mask[i, j] == 0: mask[i, j] = 1 delta += 1 if delta > 0: break return delta while mask_count < num_masks: max_mask_patches = min(num_masks - mask_count, max_masks) delta = _mask(mask, max_mask_patches) if delta == 0: break else: mask_count += delta return torch.from_numpy(masks) def forward( self, img, mask: bool = True, layer_results: bool = False, ): x = self.patch_embed(img) batch_size, seq_len, _ = x.size() if mask: mask_indices = self.make_mask( img.size(0), self.cfg.num_mask_patches, self.cfg.min_mask_patches_per_block, self.cfg.max_mask_patches_per_block, ) bool_mask = mask_indices.view(mask_indices.size(0), -1).bool() else: mask_indices = bool_mask = None cls_tokens = self.cls_emb.expand(batch_size, -1, -1) x = torch.cat((cls_tokens, x), dim=1) if self.ema is not None: with torch.no_grad(): self.ema.model.eval() if self.cfg.ema_transformer_only: y = self.ema.model( x, layer_results="end" if self.cfg.end_of_block_targets else "fc", ) else: y = self.ema.model( img, mask=False, layer_results=True, ) y = y[-self.cfg.average_top_k_layers :] permuted = False if self.cfg.instance_norm_target_layer or self.cfg.batch_norm_target_layer: y = [tl.transpose(1, 2) for tl in y] # BTC -> BCT permuted = True if self.cfg.batch_norm_target_layer: y = [ F.batch_norm( tl.float(), running_mean=None, running_var=None, training=True ) for tl in y ] if self.cfg.instance_norm_target_layer: y = [F.instance_norm(tl.float()) for tl in y] if permuted: y = [tl.transpose(1, 2) for tl in y] # BCT -> BTC if self.cfg.layer_norm_target_layer: y = [F.layer_norm(tl.float(), tl.shape[-1:]) for tl in y] y = sum(y) / len(y) if self.cfg.layer_norm_targets: y = F.layer_norm(y.float(), y.shape[-1:]) if self.cfg.instance_norm_targets: 
y = F.instance_norm(y.float().transpose(1, 2)).transpose(1, 2) y = y[bool_mask].float() if mask_indices is not None: mask_token = self.mask_emb.expand(batch_size, seq_len, -1) w = mask_indices.view(mask_indices.size(0), -1, 1).type_as(mask_token) x[:, 1:] = x[:, 1:] * (1 - w) + mask_token * w if layer_results: enc_layer_results = "end" if self.cfg.end_of_block_targets else "fc" else: enc_layer_results = None x = self.encoder(x, layer_results=enc_layer_results) if layer_results or mask_indices is None: return x x = x[bool_mask].float() if self.loss_beta == 0: loss = F.mse_loss(x, y, reduction="none").sum(dim=-1) else: loss = F.smooth_l1_loss(x, y, reduction="none", beta=self.loss_beta).sum( dim=-1 ) if self.loss_scale > 0: loss = loss * self.loss_scale result = { "losses": {"regression": loss.sum()}, "sample_size": loss.numel(), "target_var": self.compute_var(y), "pred_var": self.compute_var(x), "ema_decay": self.ema.get_decay() * 1000, } return result @staticmethod def compute_var(y): y = y.view(-1, y.size(-1)) if dist.is_initialized(): zc = torch.tensor(y.size(0)).cuda() zs = y.sum(dim=0) zss = (y ** 2).sum(dim=0) dist.all_reduce(zc) dist.all_reduce(zs) dist.all_reduce(zss) var = zss / (zc - 1) - (zs ** 2) / (zc * (zc - 1)) return torch.sqrt(var + 1e-6).mean() else: return torch.sqrt(y.var(dim=0) + 1e-6).mean() def remove_pretraining_modules(self, last_layer=None): self.final_proj = None self.ema = None self.encoder.norm = nn.Identity() self.mask_emb = None if last_layer is not None: self.encoder.layers = nn.ModuleList( l for i, l in enumerate(self.encoder.layers) if i <= last_layer ) class PatchEmbed(nn.Module): """Image to Patch Embedding""" def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): super().__init__() if isinstance(img_size, int): img_size = img_size, img_size if isinstance(patch_size, int): patch_size = patch_size, patch_size num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) self.patch_shape = 
(img_size[0] // patch_size[0], img_size[1] // patch_size[1]) self.img_size = img_size self.patch_size = patch_size self.num_patches = num_patches self.conv = nn.Conv2d( in_chans, embed_dim, kernel_size=patch_size, stride=patch_size ) def forward(self, x): # BCHW -> BTC x = self.conv(x).flatten(2).transpose(1, 2) return x class Attention(nn.Module): def __init__( self, dim, num_heads=8, qkv_bias=True, attn_drop=0.0, proj_drop=0.0, window_size=None, attn_head_dim=None, ): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads if attn_head_dim is not None: head_dim = attn_head_dim all_head_dim = head_dim * self.num_heads self.scale = head_dim ** -0.5 self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) if qkv_bias: self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) else: self.q_bias = None self.v_bias = None if window_size: self.window_size = window_size self.num_relative_distance = (2 * window_size[0] - 1) * ( 2 * window_size[1] - 1 ) + 3 self.relative_position_bias_table = nn.Parameter( torch.zeros(self.num_relative_distance, num_heads) ) # 2*Wh-1 * 2*Ww-1, nH # cls to token & token 2 cls & cls to cls # get pair-wise relative position index for each token inside the window coords_h = torch.arange(window_size[0]) coords_w = torch.arange(window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = ( coords_flatten[:, :, None] - coords_flatten[:, None, :] ) # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute( 1, 2, 0 ).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 relative_position_index = torch.zeros( size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype, ) relative_position_index[1:, 1:] = relative_coords.sum(-1) 
# Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 self.register_buffer("relative_position_index", relative_position_index) else: self.window_size = None self.relative_position_bias_table = None self.relative_position_index = None self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(all_head_dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x, rel_pos_bias=None): B, N, C = x.shape qkv_bias = None if self.q_bias is not None: qkv_bias = torch.cat( ( self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias, ) ) # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) q, k, v = ( qkv[0], qkv[1], qkv[2], ) # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = q @ k.transpose(-2, -1) if self.relative_position_bias_table is not None: assert 1==2 relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1, ) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1 ).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) print("attn.size() :", attn.size()) print("rel_pos_bias.size() :", rel_pos_bias.size()) if rel_pos_bias is not None: attn = attn + rel_pos_bias attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, -1) x = self.proj(x) x = self.proj_drop(x) return x class RelativePositionBias(nn.Module): def __init__(self, window_size, num_heads): super().__init__() self.window_size = window_size self.num_relative_distance = (2 * window_size[0] 
- 1) * ( 2 * window_size[1] - 1 ) + 3 self.relative_position_bias_table = nn.Parameter( torch.zeros(self.num_relative_distance, num_heads) ) # get pair-wise relative position index for each token inside the window coords_h = torch.arange(window_size[0]) coords_w = torch.arange(window_size[1]) coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww relative_coords = ( coords_flatten[:, :, None] - coords_flatten[:, None, :] ) # 2, Wh*Ww, Wh*Ww relative_coords = relative_coords.permute( 1, 2, 0 ).contiguous() # Wh*Ww, Wh*Ww, 2 relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 relative_coords[:, :, 1] += window_size[1] - 1 relative_coords[:, :, 0] *= 2 * window_size[1] - 1 relative_position_index = torch.zeros( size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype ) relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww relative_position_index[0, 0:] = self.num_relative_distance - 3 relative_position_index[0:, 0] = self.num_relative_distance - 2 relative_position_index[0, 0] = self.num_relative_distance - 1 self.register_buffer("relative_position_index", relative_position_index) def forward(self): relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1) ].view( self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1, ) # Wh*Ww,Wh*Ww,nH print("self.window_size :", self.window_size) print("self.num_relative_distance :", self.num_relative_distance) print("self.relative_position_index :", self.relative_position_index.size(), self.relative_position_index) print("relative_position_bias.size(), relative_position_bias :",relative_position_bias.size(), relative_position_bias) print("self.relative_position_bias_table.size(), self.relative_position_bias_table :",self.relative_position_bias_table.size(), self.relative_position_bias_table) return 
class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        # ``None`` is the constructor default and means "never drop"; it used
        # to reach ``1 - None`` and raise TypeError in training mode.
        if not self.drop_prob or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        # one Bernoulli draw per sample, broadcast over all remaining dims
        # (work with diff dim tensors, not just 2D ConvNets)
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
        random_tensor.floor_()  # 1 with probability keep_prob, else 0
        # rescale survivors so the expected value is unchanged
        return x.div(keep_prob) * random_tensor

    def extra_repr(self) -> str:
        return "p={}".format(self.drop_prob)


class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each on a residual
    branch with optional drop-path and optional learnable per-channel scale.

    ``forward`` returns both the block output and the (drop-path'ed) MLP
    branch output.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        init_values=None,
        window_size=None,
    ):
        """
        Args:
            dim: embedding width.
            num_heads: number of attention heads.
            mlp_ratio: hidden width of the MLP as a multiple of ``dim``.
            drop: dropout after the attention projection and after the MLP.
            attn_drop: dropout on the attention weights.
            drop_path: stochastic-depth rate for both residual branches.
            init_values: initial value of the per-channel residual scales;
                ``None`` or a non-positive value disables them.
            window_size: forwarded to ``Attention``.
        """
        super().__init__()

        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size,
        )

        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = nn.LayerNorm(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)

        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_hidden_dim),
            nn.GELU(),
            nn.Linear(mlp_hidden_dim, dim),
            nn.Dropout(drop),
        )

        # The default ``init_values=None`` used to raise TypeError on ``> 0``.
        if init_values is not None and init_values > 0:
            self.gamma_1 = nn.Parameter(
                init_values * torch.ones((dim)), requires_grad=True
            )
            self.gamma_2 = nn.Parameter(
                init_values * torch.ones((dim)), requires_grad=True
            )
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x, rel_pos_bias=None):
        """Return ``(x, fc_feature)`` where ``fc_feature`` is the MLP branch
        output that was added to ``x``."""
        if self.gamma_1 is None:
            x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
            fc_feature = self.drop_path(self.mlp(self.norm2(x)))
            x = x + fc_feature
        else:
            x = x + self.drop_path(
                self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
            )
            fc_feature = self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
            x = x + fc_feature
        return x, fc_feature
class TransformerEncoder(nn.Module):
    """Stack of ``Block`` layers with an optional relative position bias
    shared by all layers and a final LayerNorm."""

    def __init__(self, cfg: Data2VecVisionConfig, patch_shape):
        """
        Args:
            cfg: model configuration; reads ``shared_rel_pos_bias``,
                ``num_heads``, ``drop_path``, ``depth``, ``embed_dim``,
                ``attention_dropout`` and ``layer_scale_init_value``.
            patch_shape: window size passed to the position-bias modules.
        """
        super().__init__()

        self.rel_pos_bias = None
        if cfg.shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(
                window_size=patch_shape, num_heads=cfg.num_heads
            )

        # stochastic depth decay rule: rate grows linearly with depth
        dpr = [x.item() for x in torch.linspace(0, cfg.drop_path, cfg.depth)]

        self.blocks = nn.ModuleList(
            Block(
                dim=cfg.embed_dim,
                num_heads=cfg.num_heads,
                attn_drop=cfg.attention_dropout,
                drop_path=dpr[i],
                init_values=cfg.layer_scale_init_value,
                # each block owns a bias table only when none is shared
                window_size=patch_shape if not cfg.shared_rel_pos_bias else None,
            )
            for i in range(cfg.depth)
        )

        self.norm = nn.LayerNorm(cfg.embed_dim)

        self.apply(self.init_weights)
        self.fix_init_weight()

    def init_weights(self, m):
        """Truncated-normal init for Linear/Conv2d weights, zero biases and
        unit LayerNorm weights."""
        std = 0.02
        if isinstance(m, nn.Linear):
            nn.init.trunc_normal_(m.weight, std=std)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            nn.init.trunc_normal_(m.weight, std=std)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)

    def fix_init_weight(self):
        """Divide each block's output projections by sqrt(2 * layer_number),
        with layers numbered from 1."""

        def rescale(param, layer_id):
            param.div_(math.sqrt(2.0 * layer_id))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight.data, layer_id + 1)
            rescale(layer.mlp[2].weight.data, layer_id + 1)

    def extract_features(self, x, layer_results):
        """Run all blocks.

        Returns the normalised final output when ``layer_results`` is falsy.
        Otherwise returns a list with one entry per block: block outputs for
        ``"end"``, MLP-branch outputs for ``"fc"``.
        """
        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None

        z = []
        for blk in self.blocks:
            x, fc_feature = blk(x, rel_pos_bias=rel_pos_bias)
            if layer_results == "end":
                z.append(x)
            elif layer_results == "fc":
                z.append(fc_feature)

        return z if layer_results else self.norm(x)

    def forward(self, x, layer_results=None):
        """Same as ``extract_features`` but with the first token of every
        returned tensor dropped."""
        x = self.extract_features(x, layer_results=layer_results)
        if layer_results:
            return [z[:, 1:] for z in x]

        x = x[:, 1:]
        return x
@dataclass
class MaeConfig(FairseqDataclass):
    # --- patch embedding / encoder ---
    input_size: int = 224  # image side length passed to PatchEmbed
    in_chans: int = 3
    patch_size: int = 16
    embed_dim: int = 768
    depth: int = 12  # number of encoder blocks
    num_heads: int = 12

    # --- decoder ---
    decoder_embed_dim: int = 512
    decoder_depth: int = 8
    decoder_num_heads: int = 16

    mlp_ratio: int = 4
    norm_eps: float = 1e-6  # eps of the LayerNorm layers
    drop_path_rate: float = 0.0  # maximum stochastic-depth rate (linear schedule)

    mask_ratio: float = 0.75  # fraction of patches removed before the encoder
    norm_pix_loss: bool = True  # normalise each target patch by its mean/var

    # --- encoder block selection (checked in this order by MaeModel) ---
    w2v_block: bool = False
    alt_block: bool = False
    alt_block2: bool = False
    alt_attention: bool = False
    block_dropout: float = 0
    attention_dropout: float = 0
    activation_dropout: float = 0
    layer_norm_first: bool = False
    fused_ln: bool = True
    end_of_block_targets: bool = True  # AltBlock returns block output, not FFN output

    no_decoder_embed: bool = False  # skip the encoder->decoder linear projection
    no_decoder_pos_embed: bool = False

    mask_noise_std: float = 0  # a learned mask token is only created when <= 0
    single_qkv: bool = False
    use_rel_pos_bias: bool = False
    no_cls: bool = False  # run without a CLS token

    # Read by MaeModel when ``alt_block`` and ``use_rel_pos_bias`` are both
    # set; it was never declared, so that configuration raised on attribute
    # access. False keeps the per-block bias tables.
    shared_rel_pos_bias: bool = False
def modify_relative_position_bias(orig_bias, bsz, mask):
    """Expand a relative position bias over the batch and drop masked tokens.

    Args:
        orig_bias: bias of shape heads x seq_len x seq_len, where seq_len
            includes the CLS token.
        bsz: batch size.
        mask: ``None`` or a bsz x (seq_len - 1) tensor whose non-zero entries
            mark tokens to remove; the CLS token is always kept. Every sample
            must keep the same number of tokens, otherwise the reshapes fail.

    Returns:
        bsz x heads x kept_len x kept_len tensor (kept_len == seq_len when
        ``mask`` is ``None``).
    """
    # heads x seq_len x seq_len => bsz x heads x seq_len x seq_len
    batched_bias = orig_bias.unsqueeze(0).repeat(bsz, 1, 1, 1)
    if mask is None:
        return batched_bias

    num_heads, _, seq_len = orig_bias.shape  # includes CLS token

    # prepend an always-unmasked column for the CLS token
    cls_column = torch.zeros(bsz, 1, dtype=mask.dtype, device=mask.device)
    keep = ~torch.cat((cls_column, mask), dim=1).bool()  # bsz x seq_len
    keep = keep.unsqueeze(1).repeat(1, num_heads, 1)  # bsz x heads x seq_len

    # first discard the rows of masked tokens ...
    kept_rows = batched_bias.masked_select(keep.unsqueeze(-1)).view(
        bsz, num_heads, -1, seq_len
    )
    kept_len = kept_rows.size(-2)

    # ... then the columns
    kept = kept_rows.masked_select(keep.unsqueeze(-2))
    return kept.view(bsz, num_heads, kept_len, kept_len)
class AltBlock(nn.Module):
    """Transformer block supporting pre-norm and post-norm layouts and three
    interchangeable attention implementations.

    ``forward`` returns ``(x, t)``: the block output and a second tensor that
    is the raw MLP output when ``ffn_targets`` is set, else the block output.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        use_rel_pos_bias=False,
        window_size=None,
        alt_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        from timm.models.vision_transformer import Attention, DropPath, Mlp

        self.norm1 = norm_layer(dim)
        self.use_rel_pos_bias = use_rel_pos_bias

        # arguments common to every attention flavour
        shared_attn_args = dict(
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )
        if use_rel_pos_bias:
            # local implementation that understands relative position bias
            self.attn = AltAttention(dim, window_size=window_size, **shared_attn_args)
        elif alt_attention:
            from .multi.modules import AltAttention as AltAttention2

            self.attn = AltAttention2(dim, **shared_attn_args)
        else:
            self.attn = Attention(dim, **shared_attn_args)

        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=int(dim * mlp_ratio),
            act_layer=act_layer,
            drop=drop,
        )

    def forward(self, x, rel_pos_bias=None, pos_mask=None):
        # only the bias-aware attention accepts the extra arguments
        extra = {}
        if self.use_rel_pos_bias:
            extra = {"rel_pos_bias": rel_pos_bias, "pos_mask": pos_mask}

        if self.layer_norm_first:
            # pre-norm: normalise the input of each sub-layer
            x = x + self.drop_path(self.attn(self.norm1(x), **extra))
            ffn_out = self.mlp(self.norm2(x))
            x = x + self.drop_path(ffn_out)
        else:
            # post-norm: normalise after each residual addition
            x = x + self.drop_path(self.attn(x, **extra))
            residual = self.norm1(x)
            ffn_out = self.mlp(residual)
            x = self.norm2(residual + self.drop_path(ffn_out))

        target = ffn_out if self.ffn_targets else x
        return x, target
class AltAttention(nn.Module):
    """Multi-head self-attention with q/v-only biases and an optional
    per-layer relative position bias that can be pruned with a token mask."""

    def __init__(
        self,
        dim,
        num_heads=8,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
        window_size=None,
        attn_head_dim=None,
    ):
        super().__init__()
        self.num_heads = num_heads
        head_dim = attn_head_dim if attn_head_dim is not None else dim // num_heads
        all_head_dim = head_dim * self.num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # fused projection without bias; q and v biases are kept separately
        self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
        if qkv_bias:
            self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
            self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            win_h, win_w = window_size[0], window_size[1]
            num_tokens = win_h * win_w + 1
            self.window_size = window_size
            # 2*Wh-1 * 2*Ww-1 offsets, plus 3 entries for
            # cls to token & token 2 cls & cls to cls
            self.num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
            self.relative_position_bias_table = nn.Parameter(
                torch.zeros(self.num_relative_distance, num_heads)
            )

            # pair-wise relative offsets between all tokens of the window
            grid = torch.stack(
                torch.meshgrid([torch.arange(win_h), torch.arange(win_w)])
            )  # 2, Wh, Ww
            flat = torch.flatten(grid, 1)  # 2, Wh*Ww
            offsets = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
            offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
            offsets[:, :, 0] += win_h - 1  # shift to start from 0
            offsets[:, :, 1] += win_w - 1
            offsets[:, :, 0] *= 2 * win_w - 1

            index = torch.zeros(size=(num_tokens,) * 2, dtype=offsets.dtype)
            index[1:, 1:] = offsets.sum(-1)  # Wh*Ww, Wh*Ww
            index[0, 0:] = self.num_relative_distance - 3
            index[0:, 0] = self.num_relative_distance - 2
            index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index", index)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(all_head_dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None, pos_mask=None):
        """
        Args:
            x: input of shape (B, N, C).
            rel_pos_bias: optional bias added to the attention logits.
            pos_mask: optional mask forwarded to
                ``modify_relative_position_bias`` for the per-layer bias.
        """
        batch, seq_len, _ = x.shape

        if self.q_bias is None:
            fused_bias = None
        else:
            # keys have no bias: pad their slot with zeros
            fused_bias = torch.cat(
                (
                    self.q_bias,
                    torch.zeros_like(self.v_bias, requires_grad=False),
                    self.v_bias,
                )
            )

        qkv = F.linear(input=x, weight=self.qkv.weight, bias=fused_bias)
        qkv = qkv.reshape(batch, seq_len, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # make torchscript happy (cannot use tensor as tuple)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q * self.scale) @ k.transpose(-2, -1)

        if self.relative_position_bias_table is not None:
            side = self.window_size[0] * self.window_size[1] + 1
            own_bias = (
                self.relative_position_bias_table[self.relative_position_index.view(-1)]
                .view(side, side, -1)  # Wh*Ww+1, Wh*Ww+1, nH
                .permute(2, 0, 1)
                .contiguous()  # nH, Wh*Ww+1, Wh*Ww+1
            )
            attn = attn + modify_relative_position_bias(own_bias, x.size(0), pos_mask)

        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = self.attn_drop(attn.softmax(dim=-1))

        out = (attn @ v).transpose(1, 2).reshape(batch, seq_len, -1)
        return self.proj_drop(self.proj(out))
class RelativePositionBias(nn.Module):
    """Learned relative position bias over a (Wh, Ww) window plus one extra
    leading token; three table rows are reserved for pairs involving it."""

    def __init__(self, window_size, num_heads):
        super().__init__()
        win_h, win_w = window_size[0], window_size[1]
        self.window_size = window_size
        self.num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros(self.num_relative_distance, num_heads)
        )
        self.register_buffer(
            "relative_position_index",
            self._build_index(win_h, win_w, self.num_relative_distance),
        )

    @staticmethod
    def _build_index(win_h, win_w, num_relative_distance):
        """Map every ordered token pair to a row of the bias table."""
        # pair-wise relative position of each token inside the window
        grid = torch.stack(
            torch.meshgrid([torch.arange(win_h), torch.arange(win_w)])
        )  # 2, Wh, Ww
        flat = torch.flatten(grid, 1)  # 2, Wh*Ww
        offsets = flat[:, :, None] - flat[:, None, :]  # 2, Wh*Ww, Wh*Ww
        offsets = offsets.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        offsets[:, :, 0] += win_h - 1  # shift to start from 0
        offsets[:, :, 1] += win_w - 1
        offsets[:, :, 0] *= 2 * win_w - 1

        index = torch.zeros(size=(win_h * win_w + 1,) * 2, dtype=offsets.dtype)
        index[1:, 1:] = offsets.sum(-1)  # Wh*Ww, Wh*Ww
        index[0, 0:] = num_relative_distance - 3
        index[0:, 0] = num_relative_distance - 2
        index[0, 0] = num_relative_distance - 1
        return index

    def forward(self):
        """Return the bias as a (num_heads, Wh*Ww+1, Wh*Ww+1) tensor."""
        side = self.window_size[0] * self.window_size[1] + 1
        flat_index = self.relative_position_index.view(-1)
        bias = self.relative_position_bias_table[flat_index].view(
            side, side, -1
        )  # Wh*Ww+1, Wh*Ww+1, nH
        return bias.permute(2, 0, 1).contiguous()
def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
    """Build a fixed 2-D sine/cosine positional embedding for a square grid.

    grid_size: int of the grid height and width
    return:
    pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
    """
    grid_h = np.arange(grid_size, dtype=np.float32)
    grid_w = np.arange(grid_size, dtype=np.float32)
    grid = np.meshgrid(grid_w, grid_h)  # here w goes first
    grid = np.stack(grid, axis=0)

    grid = grid.reshape([2, 1, grid_size, grid_size])
    pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
    if cls_token:
        # the class token gets an all-zero embedding in row 0
        pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
    return pos_embed


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """Encode the two coordinate planes of ``grid`` with half of the
    dimensions each and concatenate them.

    grid: array whose first axis has the two coordinate planes
    return: (H*W, embed_dim)
    """
    assert embed_dim % 2 == 0

    # use half of dimensions to encode each coordinate plane
    emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0])  # (H*W, D/2)
    emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1])  # (H*W, D/2)

    emb = np.concatenate([emb_h, emb_w], axis=1)  # (H*W, D)
    return emb


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    embed_dim: output dimension for each position
    pos: a list of positions to be encoded: size (M,)
    out: (M, D)
    """
    assert embed_dim % 2 == 0
    # ``np.float`` was only an alias of the builtin ``float`` and was removed
    # in NumPy 1.24; ``np.float64`` is the same dtype on every version.
    omega = np.arange(embed_dim // 2, dtype=np.float64)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000 ** omega  # (D/2,)

    pos = pos.reshape(-1)  # (M,)
    out = np.einsum("m,d->md", pos, omega)  # (M, D/2), outer product

    emb_sin = np.sin(out)  # (M, D/2)
    emb_cos = np.cos(out)  # (M, D/2)

    emb = np.concatenate([emb_sin, emb_cos], axis=1)  # (M, D)
    return emb
def interpolate_pos_embed(model, checkpoint_model):
    """Resize ``checkpoint_model["pos_embed"]`` in place to ``model``'s grid.

    Does nothing when the checkpoint has no ``"pos_embed"`` entry or when the
    checkpoint grid already has the model's side length. Leading extra tokens
    are copied unchanged; only the square grid of patch positions is
    bicubically interpolated.

    Args:
        model: object exposing ``patch_embed.num_patches`` and ``pos_embed``.
        checkpoint_model: mutable mapping of parameter names to tensors.
    """
    if "pos_embed" not in checkpoint_model:
        return

    ckpt_embed = checkpoint_model["pos_embed"]
    embed_dim = ckpt_embed.shape[-1]
    num_patches = model.patch_embed.num_patches
    # tokens that are not patches, e.g. a class token
    num_extra = model.pos_embed.shape[-2] - num_patches

    # height (== width) of the checkpoint grid and of the model grid
    old_side = int((ckpt_embed.shape[-2] - num_extra) ** 0.5)
    new_side = int(num_patches ** 0.5)
    if old_side == new_side:
        return

    print(
        "Position interpolate from %dx%d to %dx%d"
        % (old_side, old_side, new_side, new_side)
    )

    # class_token and dist_token are kept unchanged
    extra_part = ckpt_embed[:, :num_extra]
    # only the position tokens are interpolated
    grid_part = ckpt_embed[:, num_extra:]
    grid_part = grid_part.reshape(-1, old_side, old_side, embed_dim)
    grid_part = torch.nn.functional.interpolate(
        grid_part.permute(0, 3, 1, 2),  # channels-first layout for interpolate
        size=(new_side, new_side),
        mode="bicubic",
        align_corners=False,
    )
    grid_part = grid_part.permute(0, 2, 3, 1).flatten(1, 2)

    checkpoint_model["pos_embed"] = torch.cat((extra_part, grid_part), dim=1)
@register_model("mae", dataclass=MaeConfig)
class MaeModel(BaseFairseqModel):
    """Masked autoencoder: a ViT encoder that only sees the unmasked patches
    and a lighter decoder that reconstructs the pixels of the masked ones."""

    def __init__(self, cfg: MaeConfig):
        super().__init__()
        self.cfg = cfg

        self.mask_ratio = cfg.mask_ratio
        has_cls = not cfg.no_cls

        # --------------------------------------------------------------------------
        # MAE encoder specifics
        self.patch_embed = PatchEmbed(
            cfg.input_size, cfg.patch_size, cfg.in_chans, cfg.embed_dim
        )
        num_patches = self.patch_embed.num_patches

        self.cls_token = (
            nn.Parameter(torch.zeros(1, 1, cfg.embed_dim)) if has_cls else None
        )
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + int(has_cls), cfg.embed_dim),
            requires_grad=False,
        )  # fixed sin-cos embedding

        norm_layer = partial(nn.LayerNorm, eps=cfg.norm_eps)

        dpr = [
            x.item() for x in torch.linspace(0, cfg.drop_path_rate, cfg.depth)
        ]  # stochastic depth decay rule

        def make_block(drop_path):
            """Build one encoder block of the flavour selected in ``cfg``."""
            if cfg.w2v_block:
                return TransformerSentenceEncoderLayer(
                    embedding_dim=cfg.embed_dim,
                    ffn_embedding_dim=cfg.embed_dim * cfg.mlp_ratio,
                    num_attention_heads=cfg.num_heads,
                    dropout=cfg.block_dropout,
                    attention_dropout=cfg.attention_dropout,
                    activation_dropout=cfg.activation_dropout,
                    activation_fn="gelu",
                    layer_norm_first=cfg.layer_norm_first,
                    drop_path=drop_path,
                    norm_eps=1e-6,
                    single_qkv=cfg.single_qkv,
                    fused_ln=cfg.fused_ln,
                )
            elif cfg.alt_block:
                window_size = (
                    cfg.input_size // self.patch_embed.patch_size[0],
                    cfg.input_size // self.patch_embed.patch_size[1],
                )
                # ``shared_rel_pos_bias`` may be absent from the config;
                # absent means the blocks own their bias tables.
                per_block_bias = cfg.use_rel_pos_bias and not getattr(
                    cfg, "shared_rel_pos_bias", False
                )
                return AltBlock(
                    cfg.embed_dim,
                    cfg.num_heads,
                    cfg.mlp_ratio,
                    qkv_bias=True,
                    qk_scale=None,
                    norm_layer=norm_layer,
                    drop_path=drop_path,
                    layer_norm_first=cfg.layer_norm_first,
                    ffn_targets=not cfg.end_of_block_targets,
                    use_rel_pos_bias=cfg.use_rel_pos_bias,
                    window_size=window_size if per_block_bias else None,
                    alt_attention=cfg.alt_attention,
                )
            elif cfg.alt_block2:
                from .multi.modules import AltBlock as AltBlock2

                return AltBlock2(
                    cfg.embed_dim,
                    cfg.num_heads,
                    cfg.mlp_ratio,
                    qkv_bias=True,
                    qk_scale=None,
                    norm_layer=norm_layer,
                    drop_path=drop_path,
                    layer_norm_first=cfg.layer_norm_first,
                    ffn_targets=not cfg.end_of_block_targets,
                )
            else:
                return Block(
                    cfg.embed_dim,
                    cfg.num_heads,
                    cfg.mlp_ratio,
                    qkv_bias=True,
                    qk_scale=None,
                    norm_layer=norm_layer,
                    drop_path=drop_path,
                )

        self.blocks = nn.ModuleList([make_block(dpr[i]) for i in range(cfg.depth)])
        self.norm = norm_layer(cfg.embed_dim)
        # --------------------------------------------------------------------------

        # --------------------------------------------------------------------------
        # MAE decoder specifics
        # without the projection the decoder works at the encoder width
        decoder_in_dim = (
            cfg.decoder_embed_dim if not cfg.no_decoder_embed else cfg.embed_dim
        )

        self.decoder_embed = (
            nn.Linear(cfg.embed_dim, cfg.decoder_embed_dim, bias=True)
            if not cfg.no_decoder_embed
            else None
        )

        self.mask_token = (
            nn.Parameter(torch.zeros(1, 1, decoder_in_dim))
            if cfg.mask_noise_std <= 0
            else None
        )

        # Sized like ``pos_embed``: the CLS slot exists only when there is a
        # CLS token. It used to be ``num_patches + 1`` unconditionally, which
        # made the copy in ``initialize_weights`` fail for ``no_cls``.
        self.decoder_pos_embed = (
            nn.Parameter(
                torch.zeros(1, num_patches + int(has_cls), decoder_in_dim),
                requires_grad=False,
            )
            if not cfg.no_decoder_pos_embed
            else None
        )

        self.decoder_blocks = nn.ModuleList(
            [
                Block(
                    cfg.decoder_embed_dim,
                    cfg.decoder_num_heads,
                    cfg.mlp_ratio,
                    qkv_bias=True,
                    qk_scale=None,
                    norm_layer=norm_layer,
                )
                for _ in range(cfg.decoder_depth)
            ]
        )

        self.decoder_norm = norm_layer(cfg.decoder_embed_dim)
        self.decoder_pred = nn.Linear(
            cfg.decoder_embed_dim, cfg.patch_size ** 2 * cfg.in_chans, bias=True
        )  # decoder to patch
        # --------------------------------------------------------------------------

        self.norm_pix_loss = cfg.norm_pix_loss

        self.initialize_weights()

        # tag 1-D parameters and biases so the optimizer can skip weight decay
        for pn, p in self.named_parameters():
            if len(p.shape) == 1 or pn.endswith(".bias"):
                p.param_group = "no_decay"
            else:
                p.param_group = "with_decay"

    def initialize_weights(self):
        """Fill the fixed sin-cos position tables and initialise all layers."""
        # initialize (and freeze) pos_embed by sin-cos embedding
        grid_side = int(self.patch_embed.num_patches ** 0.5)
        pos_embed = get_2d_sincos_pos_embed(
            self.pos_embed.shape[-1],
            grid_side,
            cls_token=not self.cfg.no_cls,
        )
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        if self.decoder_pos_embed is not None:
            decoder_pos_embed = get_2d_sincos_pos_embed(
                self.decoder_pos_embed.shape[-1],
                grid_side,
                cls_token=not self.cfg.no_cls,
            )
            self.decoder_pos_embed.data.copy_(
                torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)
            )

        # initialize patch_embed like nn.Linear (instead of nn.Conv2d)
        w = self.patch_embed.proj.weight.data
        torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        if self.cls_token is not None:
            torch.nn.init.normal_(self.cls_token, std=0.02)

        if self.mask_token is not None:
            torch.nn.init.normal_(self.mask_token, std=0.02)

        # initialize nn.Linear and nn.LayerNorm
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            # we use xavier_uniform following official JAX ViT:
            torch.nn.init.xavier_uniform_(m.weight)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm) or isinstance(m, FusedLayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def patchify(self, imgs):
        """
        imgs: (N, C, H, W) with H == W divisible by the patch size
        x: (N, L, patch_size**2 * C)
        """
        p = self.patch_embed.patch_size[0]
        assert imgs.shape[2] == imgs.shape[3] and imgs.shape[2] % p == 0

        # take the channel count from the input rather than assuming RGB
        c = imgs.shape[1]
        h = w = imgs.shape[2] // p
        x = imgs.reshape(shape=(imgs.shape[0], c, h, p, w, p))
        x = torch.einsum("nchpwq->nhwpqc", x)
        x = x.reshape(shape=(imgs.shape[0], h * w, p ** 2 * c))
        return x

    def unpatchify(self, x):
        """
        x: (N, L, patch_size**2 * C) with L a perfect square
        imgs: (N, C, H, W)
        """
        p = self.patch_embed.patch_size[0]
        h = w = int(x.shape[1] ** 0.5)
        assert h * w == x.shape[1]

        c = x.shape[2] // (p * p)
        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum("nhwpqc->nchpwq", x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * p))
        return imgs

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence

        Returns ``(x_kept, mask, ids_restore)``: the kept tokens, a [N, L]
        mask (0 is keep, 1 is remove) and the permutation that undoes the
        shuffle.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(
            noise, dim=1
        )  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore  # x_masked is actually unmasked x

    @classmethod
    def build_model(cls, cfg: MaeConfig, task=None):
        """Build a new model instance."""

        return cls(cfg)

    def forward_encoder(self, x, mask_ratio):
        """Embed, mask and encode images.

        Returns ``(latent, mask, ids_restore)``; the last two are ``None``
        when ``mask_ratio`` is not positive.
        """
        # embed patches
        x = self.patch_embed(x)

        # Add the patch position embedding. Row 0 of ``pos_embed`` belongs to
        # the CLS token only when one exists; without it the table has exactly
        # one row per patch and must not be sliced.
        if self.cls_token is not None:
            x = x + self.pos_embed[:, 1:, :]
        else:
            x = x + self.pos_embed

        # masking: length -> length * mask_ratio
        if mask_ratio > 0:
            x, mask, ids_restore = self.random_masking(x, mask_ratio)
        else:
            mask = ids_restore = None

        # append cls token
        if self.cls_token is not None:
            cls_token = self.cls_token + self.pos_embed[:, :1, :]
            cls_tokens = cls_token.expand(x.shape[0], -1, -1)
            x = torch.cat((cls_tokens, x), dim=1)

        # apply Transformer blocks
        for blk in self.blocks:
            x = blk(x)
            # AltBlock-style blocks return (output, target); keep the output
            if isinstance(x, tuple):
                x = x[0]

        if self.norm is not None:
            x = self.norm(x)

        return x, mask, ids_restore

    def forward_decoder(self, x, ids_restore):
        """Re-insert mask tokens at the removed positions and predict pixels.

        Returns per-patch predictions with the CLS position (if any) removed.
        """
        has_cls = self.cls_token is not None

        # embed tokens
        if self.decoder_embed is not None:
            x = self.decoder_embed(x)

        # append mask tokens to sequence; ``x`` carries one extra token only
        # when a CLS token is present
        # NOTE(review): ``mask_token`` is None when cfg.mask_noise_std > 0 and
        # this call then fails; the intended noise path is not visible here.
        mask_tokens = self.mask_token.repeat(
            x.shape[0], ids_restore.shape[1] + int(has_cls) - x.shape[1], 1
        )
        if has_cls:
            x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        else:
            x_ = torch.cat([x, mask_tokens], dim=1)

        x_ = torch.gather(
            x_, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, x.shape[2])
        )  # unshuffle

        if has_cls:
            x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token
        else:
            # previously the restored sequence was dropped in this case
            x = x_

        # add pos embed
        if self.decoder_pos_embed is not None:
            x = x + self.decoder_pos_embed

        # apply Transformer blocks
        for blk in self.decoder_blocks:
            x = blk(x)
        x = self.decoder_norm(x)

        # predictor projection
        x = self.decoder_pred(x)

        if has_cls:
            # remove cls token
            x = x[:, 1:, :]

        return x

    def forward_loss(self, imgs, pred, mask):
        """
        imgs: [N, C, H, W]
        pred: [N, L, p*p*C]
        mask: [N, L], 0 is keep, 1 is remove,

        Returns ``(summed loss over removed patches, number of removed patches)``.
        """
        target = self.patchify(imgs)
        if self.norm_pix_loss:
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.0e-6) ** 0.5

        loss = (pred - target) ** 2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch

        loss = (loss * mask).sum()
        return loss, mask.sum()

    def forward(self, imgs, predictions_only=False):
        """Return encoder features when ``predictions_only`` is set (no
        masking), otherwise a dict with the reconstruction loss."""
        latent, mask, ids_restore = self.forward_encoder(
            imgs, self.mask_ratio if not predictions_only else 0
        )

        if predictions_only:
            return latent

        pred = self.forward_decoder(latent, ids_restore)  # [N, L, p*p*C]
        loss, sample_size = self.forward_loss(imgs, pred, mask)

        result = {
            "losses": {"regression": loss},
            "sample_size": sample_size,
        }
        return result

    def remove_pretraining_modules(self):
        """Drop the decoder (and, for pre-norm encoders, the final norm)."""
        self.decoder_embed = None
        self.decoder_blocks = None
        self.decoder_norm = None
        self.decoder_pos_embed = None
        self.decoder_pred = None
        self.mask_token = None
        if self.cfg.layer_norm_first:
            self.norm = None
class PredictionMode(Enum):
    """How the token sequence is reduced to one vector before the head."""

    MEAN_POOLING = auto()
    CLS_TOKEN = auto()
    LIN_SOFTMAX = auto()


@dataclass
class MaeImageClassificationConfig(FairseqDataclass):
    # --- pretrained backbone ---
    model_path: str = MISSING  # checkpoint to load the pretrained model from
    no_pretrained_weights: bool = False  # build the model but skip loading weights
    linear_classifier: bool = False  # freeze the backbone, train only the head

    num_classes: int = 1000

    # --- mixup / cutmix (passed to timm.data.Mixup) ---
    mixup: float = 0.8
    cutmix: float = 1.0
    label_smoothing: float = 0.1

    drop_path_rate: float = 0.1
    layer_decay: float = 0.65  # base of the per-layer learning-rate scale

    mixup_prob: float = 1.0
    mixup_switch_prob: float = 0.5
    mixup_mode: str = "batch"

    pretrained_model_args: Any = None  # filled from the checkpoint when None

    data: str = II("task.data")

    norm_eps: Optional[float] = None  # overrides the pretrained value when set

    remove_alibi: bool = False

    # regularization overwrites
    encoder_dropout: float = 0
    post_mlp_drop: float = 0
    attention_dropout: float = 0
    activation_dropout: float = 0.0
    dropout_input: float = 0.0
    layerdrop: float = 0.0

    prenet_layerdrop: float = 0
    prenet_dropout: float = 0

    use_fc_norm: bool = True  # LayerNorm on the pooled features before the head
    prediction_mode: PredictionMode = PredictionMode.MEAN_POOLING

    no_decay_blocks: bool = True
def get_layer_id_for_vit(name, num_layers):
    """
    Map a parameter name to the layer id used for layer-wise lr decay.
    Following BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L33

    Embedding parameters map to 0, ``blocks.<i>.*`` to ``i + 1``,
    ``rel_pos_bias*`` to ``num_layers - 1`` and everything else to
    ``num_layers``.
    """
    is_embedding = name in ("cls_token", "pos_embed") or name.startswith(
        "patch_embed"
    )
    if is_embedding:
        return 0
    if name.startswith("rel_pos_bias"):
        return num_layers - 1
    if name.startswith("blocks"):
        # names look like "blocks.<index>.<rest>"
        block_index = name.split(".")[1]
        return int(block_index) + 1
    return num_layers
cfg.post_mlp_drop pretrained_args.model["attention_dropout"] = cfg.attention_dropout pretrained_args.model["activation_dropout"] = cfg.activation_dropout pretrained_args.model["dropout_input"] = cfg.dropout_input pretrained_args.model["layerdrop"] = cfg.layerdrop pretrained_args.model["modalities"]["image"][ "prenet_layerdrop" ] = cfg.prenet_layerdrop pretrained_args.model["modalities"]["image"][ "prenet_dropout" ] = cfg.prenet_dropout else: # not d2v multi with open_dict(pretrained_args): pretrained_args.model["drop_path_rate"] = cfg.drop_path_rate pretrained_args.model["block_dropout"] = cfg.encoder_dropout pretrained_args.model["attention_dropout"] = cfg.attention_dropout pretrained_args.model["activation_dropout"] = cfg.activation_dropout task = tasks.setup_task(pretrained_args.task) model = task.build_model(pretrained_args.model, from_checkpoint=True) self.d2v_multi = "data2vec_multi" in pretrained_args.model._name self.linear_classifier = cfg.linear_classifier self.model = model if state is not None and not cfg.no_pretrained_weights: interpolate_pos_embed(model, state) if "modality_encoders.IMAGE.positional_encoder.pos_embed" in state["model"]: state["model"][ "modality_encoders.IMAGE.positional_encoder.positions" ] = state["model"][ "modality_encoders.IMAGE.positional_encoder.pos_embed" ] del state["model"][ "modality_encoders.IMAGE.positional_encoder.pos_embed" ] if "modality_encoders.IMAGE.encoder_mask" in state["model"]: del state["model"]["modality_encoders.IMAGE.encoder_mask"] model.load_state_dict(state["model"], strict=True) if self.d2v_multi: model.remove_pretraining_modules(modality="image") else: model.remove_pretraining_modules() if self.linear_classifier: model.requires_grad_(False) self.fc_norm = None if self.cfg.use_fc_norm: self.fc_norm = nn.LayerNorm(pretrained_args.model.embed_dim, eps=1e-6) nn.init.constant_(self.fc_norm.bias, 0) nn.init.constant_(self.fc_norm.weight, 1.0) self.head = nn.Linear(pretrained_args.model.embed_dim, 
cfg.num_classes) nn.init.trunc_normal_(self.head.weight, std=0.02) nn.init.constant_(self.head.bias, 0) self.mixup_fn = None if cfg.mixup > 0 or cfg.cutmix > 0: from timm.data import Mixup self.mixup_fn = Mixup( mixup_alpha=cfg.mixup, cutmix_alpha=cfg.cutmix, cutmix_minmax=None, prob=cfg.mixup_prob, switch_prob=cfg.mixup_switch_prob, mode=cfg.mixup_mode, label_smoothing=cfg.label_smoothing, num_classes=cfg.num_classes, ) if self.model.norm is not None: for pn, p in self.model.norm.named_parameters(): if len(p.shape) == 1 or pn.endswith(".bias"): p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} if self.fc_norm is not None: for pn, p in self.fc_norm.named_parameters(): if len(p.shape) == 1 or pn.endswith(".bias"): p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} for pn, p in self.head.named_parameters(): if len(p.shape) == 1 or pn.endswith(".bias"): p.optim_overrides = {"optimizer": {"weight_decay_scale": 0}} if self.d2v_multi: mod_encs = list(model.modality_encoders.values()) assert len(mod_encs) == 1, len(mod_encs) blocks = list(mod_encs[0].context_encoder.blocks) + list(model.blocks) else: blocks = model.blocks num_layers = len(blocks) + 1 layer_scales = list( cfg.layer_decay ** (num_layers - i) for i in range(num_layers + 1) ) if self.d2v_multi: for n, p in self.model.named_parameters(): optimizer_override_dict = {} if len(p.shape) == 1 or n.endswith(".bias"): optimizer_override_dict["weight_decay_scale"] = 0 p.optim_overrides = {"optimizer": optimizer_override_dict} if cfg.layer_decay > 0: for i, b in enumerate(blocks): lid = i + 1 if layer_scales[lid] == 1.0: continue for n, p in b.named_parameters(): optim_override = getattr(p, "optim_overrides", {}) if "optimizer" not in optim_override: optim_override["optimizer"] = {} if cfg.no_decay_blocks: optim_override["optimizer"]["lr_scale"] = layer_scales[lid] p.optim_overrides = optim_override else: optim_override["optimizer"] = { "lr_scale": layer_scales[lid] } p.optim_overrides = 
def forward(self, imgs, labels=None):
    """Run the backbone, pool its output, classify, and optionally score.

    Args:
        imgs: batch of input images handed to ``self.model_forward``.
        labels: optional targets. When omitted, only the logits are returned.

    Returns:
        The raw logits when ``labels`` is ``None``; otherwise a dict with
        ``"losses"`` (un-reduced loss under the ``"regression"`` key),
        ``"sample_size"`` (batch size) and, outside training, ``"correct"``
        (number of arg-max predictions equal to ``labels``).

    Raises:
        Exception: if ``cfg.prediction_mode`` is not one of the handled modes.
    """
    mixup_active = self.training and self.mixup_fn is not None
    if mixup_active and labels is not None:
        # mixup/cutmix replaces both the images and the targets
        imgs, labels = self.mixup_fn(imgs, labels)

    if not self.linear_classifier:
        feats = self.model_forward(imgs)
    else:
        # linear probing: no gradients flow through the backbone
        with torch.no_grad():
            feats = self.model_forward(imgs)

    mode = self.cfg.prediction_mode
    if mode == PredictionMode.MEAN_POOLING:
        pooled = feats.mean(dim=1)
    elif mode == PredictionMode.CLS_TOKEN:
        pooled = feats[:, 0]
    elif mode == PredictionMode.LIN_SOFTMAX:
        in_dtype = feats.dtype
        # computed in fp32 in log space, then cast back to the input dtype
        log_p = F.logsigmoid(feats.float())
        pooled = torch.logsumexp(log_p + log_p, dim=1) - torch.logsumexp(
            log_p + 1e-6, dim=1
        )
        pooled = pooled.clamp(max=0)
        pooled = pooled - torch.log(-(torch.expm1(pooled)))
        pooled = torch.nan_to_num(pooled, nan=0, posinf=0, neginf=0)
        pooled = pooled.to(dtype=in_dtype)
    else:
        raise Exception(f"unknown prediction mode {self.cfg.prediction_mode.name}")

    if self.fc_norm is not None:
        pooled = self.fc_norm(pooled)
    logits = self.head(pooled)

    if labels is None:
        return logits

    if mixup_active:
        # labels are soft targets produced by mixup, so use the explicit
        # soft-target cross entropy instead of F.cross_entropy
        loss = -labels * F.log_softmax(logits.float(), dim=-1)
    else:
        smoothing = self.cfg.label_smoothing if self.training else 0
        loss = F.cross_entropy(
            logits.float(),
            labels,
            label_smoothing=smoothing,
            reduction="none",
        )

    result = {
        "losses": {"regression": loss},
        "sample_size": imgs.size(0),
    }

    if not self.training:
        with torch.no_grad():
            result["correct"] = (logits.argmax(-1) == labels).sum()

    return result
class AudioEncoder(ModalitySpecificEncoder):
    """Modality-specific encoder for raw audio.

    Builds a convolutional feature extractor, a linear feature projection, a
    stack of grouped-convolution positional layers and a small transformer
    prenet, then hands all of them to ``ModalitySpecificEncoder``.
    """

    modality_cfg: D2vAudioConfig

    def __init__(
        self,
        modality_cfg: D2vAudioConfig,
        embed_dim: int,
        make_block: Callable[[float], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases: Dict,
        task: Optional[FairseqTask],
    ):
        """Assemble the audio sub-modules and initialise the base encoder.

        Args:
            modality_cfg: audio configuration (conv spec, positional conv
                settings, prenet depth/dropout, optional decoder config).
            embed_dim: width of the features produced for the context encoder.
            make_block: factory called with a drop-path rate to build one
                prenet block.
            norm_layer: factory building the prenet norm for a given width.
            layer_norm_first: forwarded to ``BlockEncoder``; when False a norm
                layer is created for the prenet.
            alibi_biases: dict bound into ``get_alibi_bias`` as its cache.
            task: not used in this constructor.
        """

        # NOTE(review): eval() executes the config string as Python. The default
        # spec uses list `+` / `*`, so ast.literal_eval cannot replace it as-is;
        # only load configs from trusted sources.
        self.feature_enc_layers = eval(modality_cfg.feature_encoder_spec)
        # first element of the last (dim, kernel_size, stride) tuple
        feature_embed_dim = self.feature_enc_layers[-1][0]

        local_encoder = ConvFeatureExtractionModel(
            conv_layers=self.feature_enc_layers,
            dropout=0.0,
            mode=modality_cfg.extractor_mode,
            conv_bias=False,
        )

        # transpose, normalise over the conv channel dim, project to embed_dim
        project_features = nn.Sequential(
            TransposeLast(),
            nn.LayerNorm(feature_embed_dim),
            nn.Linear(feature_embed_dim, embed_dim),
        )

        num_pos_layers = modality_cfg.conv_pos_depth
        # kernel size per positional layer, never smaller than 3
        k = max(3, modality_cfg.conv_pos_width // num_pos_layers)

        positional_encoder = nn.Sequential(
            TransposeLast(),
            *[
                nn.Sequential(
                    nn.Conv1d(
                        embed_dim,
                        embed_dim,
                        kernel_size=k,
                        padding=k // 2,
                        groups=modality_cfg.conv_pos_groups,
                    ),
                    SamePad(k),
                    TransposeLast(),
                    LayerNorm(embed_dim, elementwise_affine=False),
                    TransposeLast(),
                    nn.GELU(),
                )
                for _ in range(num_pos_layers)
            ],
            TransposeLast(),
        )

        if modality_cfg.conv_pos_pre_ln:
            # optionally normalise the input before the positional convolutions
            positional_encoder = nn.Sequential(LayerNorm(embed_dim), positional_encoder)

        # linearly spaced drop-path rate, one value per prenet block
        dpr = np.linspace(
            modality_cfg.start_drop_path_rate,
            modality_cfg.end_drop_path_rate,
            modality_cfg.prenet_depth,
        )
        context_encoder = BlockEncoder(
            nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)),
            norm_layer(embed_dim) if not layer_norm_first else None,
            layer_norm_first,
            modality_cfg.prenet_layerdrop,
            modality_cfg.prenet_dropout,
        )

        decoder = (
            Decoder1d(modality_cfg.decoder, embed_dim)
            if modality_cfg.decoder is not None
            else None
        )

        # bind the shared cache so the base class can call it without it
        alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases)

        super().__init__(
            modality_cfg=modality_cfg,
            embed_dim=embed_dim,
            local_encoder=local_encoder,
            project_features=project_features,
            fixed_positional_encoder=None,
            relative_positional_encoder=positional_encoder,
            context_encoder=context_encoder,
            decoder=decoder,
            get_alibi_bias=alibi_bias_fn,
        )

    def convert_padding_mask(self, x, padding_mask):
        """Convert an input-level padding mask to the feature resolution of ``x``.

        Args:
            x: extracted features; only ``x.shape[:2]``, its dtype and device
                are used.
            padding_mask: mask over the raw input where non-zero marks
                padding, or ``None``.

        Returns:
            A boolean mask of shape ``x.shape[:2]``, or ``None`` when
            ``padding_mask`` is ``None``.
        """

        def get_feat_extract_output_lengths(input_lengths: torch.LongTensor):
            """
            Computes the output length of the convolutional layers
            """

            def _conv_out_length(input_length, kernel_size, stride):
                return torch.floor((input_length - kernel_size) / stride + 1)

            # apply every conv layer's (kernel_size, stride) in order
            for i in range(len(self.feature_enc_layers)):
                input_lengths = _conv_out_length(
                    input_lengths,
                    self.feature_enc_layers[i][1],
                    self.feature_enc_layers[i][2],
                )

            return input_lengths.to(torch.long)

        if padding_mask is not None:
            # number of non-padded input positions per row
            input_lengths = (1 - padding_mask.long()).sum(-1)
            # apply conv formula to get real output_lengths
            output_lengths = get_feat_extract_output_lengths(input_lengths)

            if padding_mask.any():
                padding_mask = torch.zeros(x.shape[:2], dtype=x.dtype, device=x.device)

                # these two operations make sure that all values
                # before the output lengths indices are attended to
                padding_mask[
                    (
                        torch.arange(padding_mask.shape[0], device=padding_mask.device),
                        output_lengths - 1,
                    )
                ] = 1
                padding_mask = (
                    1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])
                ).bool()
            else:
                # nothing was padded: return an all-False mask
                padding_mask = torch.zeros(
                    x.shape[:2], dtype=torch.bool, device=x.device
                )

        return padding_mask

    def reset_parameters(self):
        """Re-initialise the linear feature projection and the decoder, if any."""
        super().reset_parameters()
        for mod in self.project_features.children():
            if isinstance(mod, nn.Linear):
                mod.reset_parameters()
        if self.decoder is not None:
            self.decoder.reset_parameters()
def __init__(
    self,
    modality_cfg: D2vModalityConfig,
    embed_dim: int,
    local_encoder: nn.Module,
    project_features: nn.Module,
    fixed_positional_encoder: Optional[nn.Module],
    relative_positional_encoder: Optional[nn.Module],
    context_encoder: nn.Module,
    decoder: nn.Module,
    get_alibi_bias: Optional[Callable[[int, int, str, str], torch.Tensor]],
):
    """Store the sub-modules and create the extra-token / alibi parameters.

    Args:
        modality_cfg: modality configuration (extra tokens, alibi options, ...).
        embed_dim: feature width used for the extra tokens.
        local_encoder: module turning raw input into local features.
        project_features: module applied to the local encoder output.
        fixed_positional_encoder: optional encoder added before masking.
        relative_positional_encoder: optional encoder applied after masking.
        context_encoder: prenet run on the (possibly masked) features.
        decoder: decoder module, may be ``None``.
        get_alibi_bias: callable producing the alibi bias; ignored unless
            ``modality_cfg.use_alibi_encoder`` is set.
    """
    super().__init__()

    self.modality_cfg = modality_cfg
    self.local_encoder = local_encoder
    self.project_features = project_features
    self.fixed_positional_encoder = fixed_positional_encoder
    self.relative_positional_encoder = relative_positional_encoder
    self.context_encoder = context_encoder

    self.decoder = decoder
    self.get_alibi_bias = get_alibi_bias if modality_cfg.use_alibi_encoder else None

    self.local_grad_mult = self.modality_cfg.local_grad_mult

    self.extra_tokens = None
    if modality_cfg.num_extra_tokens > 0:
        self.extra_tokens = nn.Parameter(
            torch.zeros(1, modality_cfg.num_extra_tokens, embed_dim)
        )
        if not modality_cfg.init_extra_token_zero:
            nn.init.normal_(self.extra_tokens)
        elif self.extra_tokens.size(1) > 1:
            # keep the first extra token at zero, randomise the others
            nn.init.normal_(self.extra_tokens[:, 1:])

    self.alibi_scale = None
    if self.get_alibi_bias is not None:
        # shape: (layers or 1, 1, heads or 1, 1, 1)
        num_scale_layers = (
            (modality_cfg.prenet_depth + modality_cfg.model_depth)
            if modality_cfg.learned_alibi_scale_per_layer
            else 1
        )
        num_scale_heads = (
            self.modality_cfg.num_alibi_heads
            if modality_cfg.learned_alibi_scale_per_head
            else 1
        )
        self.alibi_scale = nn.Parameter(
            torch.full(
                (num_scale_layers, 1, num_scale_heads, 1, 1),
                modality_cfg.alibi_scale,
                dtype=torch.float,
            ),
            requires_grad=modality_cfg.learned_alibi_scale,
        )

    if modality_cfg.learned_alibi and self.get_alibi_bias is not None:
        assert modality_cfg.alibi_max_pos is not None
        # The bias factory takes no `scale` argument; passing one raised a
        # TypeError, so the learned-alibi path could never be constructed.
        alibi_bias = self.get_alibi_bias(
            batch_size=1,
            time_steps=modality_cfg.alibi_max_pos,
            heads=modality_cfg.num_alibi_heads,
            dtype=torch.float,
            device="cpu",
        )
        # clone so the trainable parameter does not alias the shared bias cache
        self.alibi_bias = nn.Parameter(alibi_bias.clone())
        # `_learned_alibi_bias` requires `scale`, but later calls never pass
        # it (scaling is applied separately via `alibi_scale`), so bind the
        # neutral value here.
        self.get_alibi_bias = partial(
            _learned_alibi_bias, alibi_bias=self.alibi_bias, scale=1.0
        )
def contextualized_features(
    self,
    x,
    padding_mask,
    mask,
    remove_masked,
    clone_batch: int = 1,
    mask_seeds: Optional[torch.Tensor] = None,
    precomputed_mask=None,
):
    """Add positions, optionally mask, and run the context encoder.

    Args:
        x: local features; unpacked as a 3-D ``(B, T, C)`` tensor.
        padding_mask: input padding mask or ``None``; converted with
            ``convert_padding_mask`` first.
        mask: whether to compute a mask for ``x``.
        remove_masked: when masking, feed only the unmasked positions to
            the context encoder.
        clone_batch: number of times each sample is repeated before masking.
        mask_seeds: optional seed record; accessed via ``.seed``, ``.update``
            and ``.ids`` (i.e. used like a ``MaskSeed``, despite the
            annotation).
        precomputed_mask: forwarded to ``compute_mask``.

    Returns:
        dict with keys ``x``, ``local_features``, ``padding_mask``,
        ``alibi_bias``, ``alibi_scale`` and ``encoder_mask``.
    """

    if padding_mask is not None:
        padding_mask = self.convert_padding_mask(x, padding_mask)

    local_features = x
    if mask and clone_batch == 1:
        # keep an independent copy of the pre-masking features
        local_features = local_features.clone()

    orig_B, orig_T, _ = x.shape
    pre_mask_B = orig_B
    mask_info = None

    x_pos = None
    if self.fixed_positional_encoder is not None:
        x = x + self.fixed_positional_encoder(x, padding_mask)

    if mask:
        if clone_batch > 1:
            x = x.repeat_interleave(clone_batch, 0)
            if mask_seeds is not None:
                # give every clone (except the first) its own id offset so
                # the clones do not share a mask seed
                clone_hash = [
                    int(hash((mask_seeds.seed, ind)) % 1e10)
                    for ind in range(clone_batch - 1)
                ]
                clone_hash = torch.tensor([0] + clone_hash).long().view(1, -1)

                id = mask_seeds.ids
                id = id.repeat_interleave(clone_batch, 0)
                id = id.view(-1, clone_batch) + clone_hash.to(id)
                id = id.view(-1)
                mask_seeds = MaskSeed(
                    seed=mask_seeds.seed, update=mask_seeds.update, ids=id
                )
            if padding_mask is not None:
                padding_mask = padding_mask.repeat_interleave(clone_batch, 0)

        # the mask is applied to x when a relative positional encoder will
        # consume x, or when masked positions are kept in the sequence
        x, mask_info = self.compute_mask(
            x,
            padding_mask,
            mask_seed=mask_seeds,
            apply=self.relative_positional_encoder is not None or not remove_masked,
            precomputed_mask=precomputed_mask,
        )

    if self.relative_positional_encoder is not None:
        x_pos = self.relative_positional_encoder(x)

    masked_padding_mask = padding_mask
    if mask and remove_masked:
        # continue with the kept positions only
        x = mask_info.x_unmasked
        if x_pos is not None:
            x = x + gather_unmasked(x_pos, mask_info)

        if padding_mask is not None and padding_mask.any():
            masked_padding_mask = gather_unmasked_mask(padding_mask, mask_info)
            if not masked_padding_mask.any():
                masked_padding_mask = None
        else:
            masked_padding_mask = None

    elif x_pos is not None:
        x = x + x_pos

    alibi_bias = None
    alibi_scale = self.alibi_scale

    if self.get_alibi_bias is not None:
        # built for the pre-clone batch size and the unmasked length
        alibi_bias = self.get_alibi_bias(
            batch_size=pre_mask_B,
            time_steps=orig_T,
            heads=self.modality_cfg.num_alibi_heads,
            dtype=torch.float32,
            device=x.device,
        )

        if alibi_scale is not None:
            alibi_scale = alibi_scale.clamp_min(0)
            if alibi_scale.size(0) == 1:
                # a single shared scale is folded into the bias right away
                alibi_bias = alibi_bias * alibi_scale.squeeze(0).type_as(alibi_bias)
                alibi_scale = None

        if clone_batch > 1:
            alibi_bias = alibi_bias.repeat_interleave(clone_batch, 0)

        if mask_info is not None and remove_masked:
            # restrict the bias to the kept positions
            alibi_bias = masked_alibi(alibi_bias, mask_info)

    if self.extra_tokens is not None:
        num = self.extra_tokens.size(1)
        # prepend the extra tokens and pad mask / bias on the left to match
        x = torch.cat([self.extra_tokens.expand(x.size(0), -1, -1), x], dim=1)
        if masked_padding_mask is not None:
            # B x T
            masked_padding_mask = F.pad(masked_padding_mask, (num, 0))
        if alibi_bias is not None:
            # B x H x T x T
            alibi_bias = F.pad(alibi_bias, (num, 0, num, 0))

    # the prenet receives the first `prenet_depth` per-layer scales
    x = self.context_encoder(
        x,
        masked_padding_mask,
        alibi_bias,
        alibi_scale[: self.modality_cfg.prenet_depth]
        if alibi_scale is not None
        else None,
    )

    return {
        "x": x,
        "local_features": local_features,
        "padding_mask": masked_padding_mask,
        "alibi_bias": alibi_bias,
        # remaining per-layer scales, past the prenet's share
        "alibi_scale": alibi_scale[self.modality_cfg.prenet_depth :]
        if alibi_scale is not None and alibi_scale.size(0) > 1
        else alibi_scale,
        "encoder_mask": mask_info,
    }
def get_annealed_rate(start, end, curr_step, total_steps):
    """Linearly interpolate from ``start`` to ``end`` over ``total_steps``.

    Returns ``end`` once ``curr_step`` has reached ``total_steps``.
    """
    finished = curr_step >= total_steps
    if not finished:
        span = end - start
        remaining_fraction = 1 - curr_step / total_steps
        # step back from `end` by the share of the span still to be covered
        return end - span * remaining_fraction
    return end
def get_alibi(
    max_positions: int,
    attention_heads: int,
    dims: int = 1,
    distance: str = "manhattan",
):
    """Build the per-head ALiBi bias for 1-D sequences or 2-D square grids.

    Args:
        max_positions: number of positions; for ``dims == 2`` it must be a
            perfect square (positions are the cells of an ``n x n`` grid,
            flattened row by row).
        attention_heads: number of heads, one slope per head.
        dims: 1 for sequence distance, 2 for grid distance.
        distance: ``"manhattan"`` or ``"euclidean"``; only used when
            ``dims == 2``.

    Returns:
        Tensor of shape ``(attention_heads, max_positions, max_positions)``
        holding ``-slope * distance`` (zero on the diagonal).

    Raises:
        Exception: for an unsupported ``dims`` or, with ``dims == 2``, an
            unsupported ``distance``.
    """

    def get_slopes(n):
        def get_slopes_power_of_2(n):
            start = 2 ** (-(2 ** -(math.log2(n) - 3)))
            ratio = start
            return [start * ratio**i for i in range(n)]

        # In the paper, we only train models that have 2^a heads for some
        # a. This function has some good properties that only occur when
        # the input is a power of 2. To maintain that even when the number
        # of heads is not a power of 2, we use this workaround.
        if math.log2(n).is_integer():
            return get_slopes_power_of_2(n)
        closest_power_of_2 = 2 ** math.floor(math.log2(n))
        return (
            get_slopes_power_of_2(closest_power_of_2)
            + get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
        )

    slopes = torch.Tensor(get_slopes(attention_heads))

    if dims == 1:
        # Prepare the alibi position linear bias. Note that wav2vec2 is a
        # non-autoregressive model, so we want a symmetric mask with 0 on the
        # diagonal and otherwise linearly decreasing values.
        positions = torch.arange(max_positions)
        pos_bias = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1)) * -1
    elif dims == 2:
        n = math.sqrt(max_positions)
        assert n.is_integer(), n
        n = int(n)

        # Previously an unknown distance only surfaced as a NameError deep
        # inside the fill loop; fail with a clear message instead.
        if distance not in ("manhattan", "euclidean"):
            raise Exception(f"unsupported alibi distance: {distance}")

        # Flat index i * n + j corresponds to grid cell (i, j).
        axis = torch.arange(n)
        first = axis.repeat_interleave(n)  # i for every flat index
        second = axis.repeat(n)  # j for every flat index
        d_first = first.unsqueeze(1) - first.unsqueeze(0)
        d_second = second.unsqueeze(1) - second.unsqueeze(0)

        out_dtype = torch.get_default_dtype()
        if distance == "manhattan":
            pos_bias = (-(d_first.abs() + d_second.abs())).to(out_dtype)
        else:
            # take the square root in float64 to match math.sqrt, then cast
            pos_bias = (-torch.sqrt((d_first**2 + d_second**2).double())).to(
                out_dtype
            )
    else:
        raise Exception(f"unsupported number of alibi dims: {dims}")

    alibi_bias = slopes.unsqueeze(1).unsqueeze(1) * pos_bias.unsqueeze(0).expand(
        attention_heads, -1, -1
    )

    return alibi_bias
def masked_alibi(alibi_bias, mask_info):
    """Restrict an alibi bias to the positions listed in ``mask_info.ids_keep``.

    The same kept indices are selected first along the second-to-last
    dimension and then along the last dimension of ``alibi_bias``.
    """
    num_heads = alibi_bias.size(1)
    full_len = mask_info.ids_restore.size(1)

    # drop the feature dimension of ids_keep, keeping a broadcastable head dim
    keep = mask_info.ids_keep.unsqueeze(1)[..., 0]
    row_index = keep.unsqueeze(-1)
    col_index = keep.unsqueeze(-2)

    rows_selected = alibi_bias.gather(
        -2, row_index.expand(-1, num_heads, -1, full_len)
    )
    return rows_selected.gather(
        -1, col_index.expand(-1, num_heads, rows_selected.size(-2), -1)
    )
D2vImageConfig, embed_dim: int, make_block: Callable[[float, Optional[int], Optional[int]], nn.ModuleList], norm_layer: Callable[[int], nn.LayerNorm], layer_norm_first: bool, alibi_biases: Dict, task: Optional[FairseqTask], ): img_size = to_2tuple(modality_cfg.input_size) patch_size = to_2tuple(modality_cfg.patch_size) num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) local_encoder = PatchEmbed( modality_cfg.input_size, modality_cfg.patch_size, modality_cfg.in_chans, modality_cfg.embed_dim, ) w = local_encoder.proj.weight.data torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1])) if modality_cfg.embed_dim != embed_dim: local_encoder = nn.Sequential( local_encoder, nn.Linear(modality_cfg.embed_dim, embed_dim), ) project_features = nn.Identity() pos_embed = nn.Parameter( torch.zeros(1, num_patches, embed_dim), requires_grad=False ) side_n = int(num_patches ** 0.5) emb = get_2d_sincos_pos_embed( pos_embed.shape[-1], side_n, cls_token=False, ) pos_embed.data.copy_(torch.from_numpy(emb).float().unsqueeze(0)) fixed_positional_encoder = ( FixedPositionalEncoder(pos_embed) if modality_cfg.fixed_positions else None ) dpr = np.linspace( modality_cfg.start_drop_path_rate, modality_cfg.end_drop_path_rate, modality_cfg.prenet_depth, ) context_encoder = BlockEncoder( nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)), norm_layer(embed_dim) if not layer_norm_first else None, layer_norm_first, modality_cfg.prenet_layerdrop, modality_cfg.prenet_dropout, ) if modality_cfg.transformer_decoder: if modality_cfg.enc_dec_transformer: decoder = EncDecTransformerDecoder(modality_cfg.decoder, embed_dim) else: dec_enc = BlockEncoder( nn.ModuleList( make_block(0, modality_cfg.decoder.decoder_dim, 8) for _ in range(modality_cfg.decoder.decoder_layers) ), None, layer_norm_first, 0, 0, ) decoder = TransformerDecoder(modality_cfg.decoder, embed_dim, dec_enc) else: decoder = ( Decoder2d(modality_cfg.decoder, embed_dim, side_n, side_n) if 
@torch.no_grad()
def patchify(self, imgs):
    """Split images into flattened, non-overlapping square patches.

    imgs: (N, C, H, W)
    x: (N, L, patch_size**2 * C) with L = (H // patch_size) * (W // patch_size)

    The channel count and both spatial sizes are read from ``imgs`` instead
    of assuming 3 channels and a square image, so inputs with another number
    of channels (see ``in_chans`` in the image config) are handled too. For
    3-channel square images the result is unchanged. ``H`` and ``W`` must be
    multiples of the patch size, otherwise the reshape fails.
    """
    p = self.modality_cfg.patch_size
    n, c = imgs.shape[0], imgs.shape[1]
    h = imgs.shape[2] // p
    w = imgs.shape[3] // p

    x = imgs.reshape(shape=(n, c, h, p, w, p))
    # move the channel axis last so each patch is laid out as (p, p, C)
    x = torch.einsum("nchpwq->nhwpqc", x)
    x = x.reshape(shape=(n, h * w, p ** 2 * c))

    return x
def decoder_input(self, x, mask_info):
    """Prepare decoder inputs; returns ``(q, kv)`` for the enc-dec decoder.

    Unless both ``transformer_decoder`` and ``enc_dec_transformer`` are set,
    this defers to the base-class implementation. Otherwise the queries are
    the fixed positional embeddings of the masked positions and the
    keys/values are the encoder outputs without the extra tokens.
    """
    cfg = self.modality_cfg
    if not (cfg.transformer_decoder and cfg.enc_dec_transformer):
        return super().decoder_input(x, mask_info)

    drop_p = cfg.decoder.input_dropout
    if drop_p > 0:
        x = F.dropout(x, drop_p, training=self.training, inplace=True)

    # encoder outputs with the leading extra tokens stripped
    kv = x[:, cfg.num_extra_tokens :]

    assert self.fixed_positional_encoder is not None
    batch = x.size(0)
    positions = self.fixed_positional_encoder(x, None)
    positions = positions.expand(batch, -1, -1)

    is_masked = mask_info.mask.bool()
    if cfg.decoder.add_positions_all:
        # add the positions of the visible tokens to the keys/values
        visible_pos = positions[~is_masked]
        kv = kv + visible_pos.view(kv.shape)

    masked_pos = positions[is_masked]
    q = masked_pos.view(batch, -1, x.size(-1))

    return q, kv
class BlockEncoder(nn.Module):
    """
    Runs a stack of blocks with optional layer norm, input dropout,
    layerdrop and a per-layer scaled alibi bias.

    Each block is called as ``blk(x, padding_mask, alibi_bias)`` and must
    return a pair whose first element is the new ``x``.
    """

    def __init__(self, blocks, norm_layer, layer_norm_first, layerdrop, dropout):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.layerdrop = layerdrop

        self.blocks = blocks
        self.norm = norm_layer
        self.dropout = nn.Dropout(dropout, inplace=True)

    def forward(self, x, padding_mask, alibi_bias, alibi_scale):
        has_norm = self.norm is not None

        # post-norm configuration: normalize before the stack
        if has_norm and not self.layer_norm_first:
            x = self.norm(x)

        x = self.dropout(x)

        for layer_idx, block in enumerate(self.blocks):
            # layerdrop is only active in training and with a non-zero rate
            if (
                self.training
                and self.layerdrop != 0
                and not (np.random.random() > self.layerdrop)
            ):
                continue

            bias = alibi_bias
            if bias is not None and alibi_scale is not None:
                # one scale per layer when several are given, else a shared one
                if alibi_scale.size(0) > 1:
                    layer_scale = alibi_scale[layer_idx]
                else:
                    layer_scale = alibi_scale.squeeze(0)
                bias = bias * layer_scale.type_as(bias)

            x, _ = block(x, padding_mask, bias)

        # pre-norm configuration: normalize after the stack
        if has_norm and self.layer_norm_first:
            x = self.norm(x)

        return x
class Decoder2d(DecoderBase):
    """
    Convolutional decoder over a fixed 2-d grid of tokens.

    The (B, T, C) input is folded into a (B, C, h_size, w_size) map, run
    through ``cfg.decoder_layers`` grouped-convolution stages and projected
    back to ``input_dim`` features per token.
    """

    def __init__(self, cfg: D2vDecoderConfig, input_dim, h_size, w_size):
        super().__init__(cfg)

        self.h_size = h_size
        self.w_size = w_size

        def conv_stage(channels_in):
            # conv, SamePad2d, LayerNorm wrapped in two TransposeLast calls, GELU
            return nn.Sequential(
                nn.Conv2d(
                    channels_in,
                    cfg.decoder_dim,
                    kernel_size=cfg.decoder_kernel,
                    padding=cfg.decoder_kernel // 2,
                    groups=cfg.decoder_groups,
                ),
                SamePad2d(cfg.decoder_kernel),
                TransposeLast(tranpose_dim=-3),
                LayerNorm(cfg.decoder_dim, elementwise_affine=False),
                TransposeLast(tranpose_dim=-3),
                nn.GELU(),
            )

        # only the first stage sees the encoder width
        stages = []
        for depth in range(cfg.decoder_layers):
            stages.append(conv_stage(cfg.decoder_dim if depth > 0 else input_dim))
        self.blocks = nn.Sequential(*stages)

        self.proj = nn.Linear(cfg.decoder_dim, input_dim)

    def forward(self, x, mask_info):
        bsz, num_tokens, channels = x.shape

        # (B, T, C) -> (B, C, H, W)
        x = x.transpose(1, 2).reshape(bsz, channels, self.h_size, self.w_size)

        skip = x
        for depth, stage in enumerate(self.blocks):
            # the stage output (plus residual, when applicable) feeds the next skip
            skip = x = self.add_residual(stage(x), skip, depth, mask_info)

        # (B, C', H, W) -> (B, T, C')
        x = x.reshape(bsz, -1, num_tokens).transpose(1, 2)

        return self.proj(x)
class AltBlock(nn.Module):
    """
    Transformer block built from AltAttention and a timm ``Mlp``.

    ``forward`` returns a pair ``(x, t)``: ``x`` is the block output and ``t``
    is the raw MLP output when ``ffn_targets`` is set, otherwise it is ``x``.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.0,
        qkv_bias=False,
        qk_scale=None,
        drop=0.0,
        attn_drop=0.0,
        mlp_drop=0.0,
        post_mlp_drop=0.0,
        drop_path=0.0,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        layer_norm_first=True,
        ffn_targets=False,
        cosine_attention=False,
    ):
        super().__init__()

        self.layer_norm_first = layer_norm_first
        self.ffn_targets = ffn_targets

        # imported here so timm is only needed once a block is constructed
        from timm.models.vision_transformer import DropPath, Mlp

        self.norm1 = norm_layer(dim)
        self.attn = AltAttention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            cosine_attention=cosine_attention,
        )

        # stochastic depth is skipped entirely when the rate is zero
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=mlp_drop,
        )
        self.post_mlp_dropout = nn.Dropout(post_mlp_drop, inplace=False)

    def forward(self, x, padding_mask=None, alibi_bias=None):
        if self.layer_norm_first:
            # pre-norm: normalize, attend, add the residual
            x = x + self.drop_path(self.attn(self.norm1(x), padding_mask, alibi_bias))
            # NOTE(review): r is bound to the MLP output, not to the
            # activations that entered the MLP, so the sum below adds the MLP
            # output to its own dropped-out copy and the attention residual
            # is not carried forward. Confirm this is intended before
            # changing it; existing checkpoints may rely on it.
            r = x = self.mlp(self.norm2(x))
            t = x
            x = r + self.drop_path(self.post_mlp_dropout(x))
            if not self.ffn_targets:
                t = x
        else:
            # post-norm: attend, add the residual, then normalize
            x = x + self.drop_path(self.attn(x, padding_mask, alibi_bias))
            # r keeps the normalized attention output as the MLP residual
            r = x = self.norm1(x)
            x = self.mlp(x)
            t = x
            x = self.norm2(r + self.drop_path(self.post_mlp_dropout(x)))
            if not self.ffn_targets:
                t = x

        return x, t
def forward(self, x, padding_mask=None, alibi_bias=None):
    """
    Multi-head self-attention over ``x`` of shape (B, N, C).

    ``alibi_bias``, when given, is added in place to the attention logits of
    the first ``alibi_bias.size(1)`` heads. ``padding_mask``, when given and
    not all-False, is broadcast over heads and query positions and the
    selected logits are set to -inf (presumably (B, N) with True marking
    padded keys -- verify against the caller).
    """
    batch, seq_len, channels = x.shape
    head_dim = channels // self.num_heads

    # qkv x B x H x L x D
    packed = self.qkv(x).reshape(batch, seq_len, 3, self.num_heads, head_dim)
    packed = packed.permute(2, 0, 3, 1, 4)
    # index instead of unpacking the tensor: keeps torchscript happy
    queries = packed[0]
    keys = packed[1]
    values = packed[2]

    out_dtype = queries.dtype

    if not self.cosine_attention:
        scores = (queries * self.scale) @ keys.transpose(-2, -1)
    else:
        # cosine attention with a learned, clamped temperature
        unit_q = F.normalize(queries, dim=-1)
        unit_k = F.normalize(keys, dim=-1)
        scores = unit_q @ unit_k.transpose(-2, -1)
        max_log_scale = torch.log(torch.tensor(1.0 / 0.01))
        scores = scores * torch.clamp(self.logit_scale, max=max_log_scale).exp()

    if alibi_bias is not None:
        scores = scores.type_as(alibi_bias)
        scores[:, : alibi_bias.size(1)] += alibi_bias

    if padding_mask is not None and padding_mask.any():
        key_is_pad = padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
        scores = scores.masked_fill(key_is_pad, float("-inf"))

    # softmax in float32, then back to the dtype of the projections
    probs = scores.softmax(dim=-1, dtype=torch.float32).to(dtype=out_dtype)
    probs = self.attn_drop(probs)

    # B x H x L x D -> B x L x C
    merged = (probs @ values).transpose(1, 2).reshape(batch, seq_len, channels)
    return self.proj_drop(self.proj(merged))
def forward(self, q, kv, padding_mask=None, alibi_bias=None):
    """
    Multi-head cross-attention: queries come from ``q`` (B, N, C), keys and
    values are both projected from ``kv``.

    ``alibi_bias`` is added in place to the logits of the first
    ``alibi_bias.size(1)`` heads; ``padding_mask`` (when given and not
    all-False) is broadcast over heads and query positions and the selected
    logits are set to -inf.
    """
    bsz, tgt_len, q_dim = q.shape
    per_head = q_dim // self.num_heads

    # B x H x L x D
    q = self.q_proj(q)
    q = q.reshape(bsz, tgt_len, self.num_heads, per_head).permute(0, 2, 1, 3)

    # kv x B x H x L x D
    kv = self.kv_proj(kv)
    kv = kv.reshape(bsz, -1, 2, self.num_heads, per_head).permute(2, 0, 3, 1, 4)
    # make torchscript happy (cannot use tensor as tuple)
    k = kv[0]
    v = kv[1]

    dtype = q.dtype

    if self.cosine_attention:
        # cosine attention
        logits = F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)
        ceiling = torch.log(torch.tensor(1.0 / 0.01))
        temperature = torch.clamp(self.logit_scale, max=ceiling).exp()
        logits = logits * temperature
    else:
        q = q * self.scale
        logits = q @ k.transpose(-2, -1)

    if alibi_bias is not None:
        logits = logits.type_as(alibi_bias)
        logits[:, : alibi_bias.size(1)] += alibi_bias

    apply_padding = padding_mask is not None and padding_mask.any()
    if apply_padding:
        blocked = padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool)
        logits = logits.masked_fill(blocked, float("-inf"))

    weights = logits.softmax(dim=-1, dtype=torch.float32).to(dtype=dtype)
    weights = self.attn_drop(weights)

    # B x H x L x D -> B x L x C
    x = (weights @ v).transpose(1, 2)
    x = x.reshape(bsz, tgt_len, q_dim)
    x = self.proj(x)
    return self.proj_drop(x)
class EncDecTransformerDecoder(nn.Module):
    """
    Cross-attention decoder: queries are projected to ``cfg.decoder_dim``,
    refined by ``cfg.decoder_layers`` EncDecBlock layers that attend over
    ``kv``, and projected back to ``input_dim``.
    """

    def __init__(self, cfg: D2vDecoderConfig, input_dim):
        super().__init__()

        self.input_proj = nn.Linear(input_dim, cfg.decoder_dim)

        layers = []
        for depth in range(cfg.decoder_layers):
            layers.append(
                EncDecBlock(
                    q_dim=cfg.decoder_dim,
                    kv_dim=input_dim,
                    num_heads=8,
                    mlp_ratio=4.0,
                    qkv_bias=True,
                    qk_scale=None,
                    drop=0.0,
                    attn_drop=0.0,
                    mlp_drop=0.0,
                    post_mlp_drop=0.0,
                    drop_path=0.0,
                    act_layer=nn.GELU,
                    norm_layer=nn.LayerNorm,
                    layer_norm_first=False,
                    cosine_attention=False,
                    # the very first layer runs without the query residual
                    first_residual=depth > 0,
                )
            )
        self.blocks = nn.Sequential(*layers)

        self.proj = nn.Linear(cfg.decoder_dim, input_dim)

    def reset_parameters(self):
        from fairseq.modules.transformer_sentence_encoder import init_bert_params

        self.apply(init_bert_params)

    def forward(self, x, kv):
        hidden = self.input_proj(x)
        # every layer attends over the same, unmodified kv
        for layer in self.blocks:
            hidden = layer(hidden, kv)
        return self.proj(hidden)
class TextEncoder(ModalitySpecificEncoder):
    """
    Modality-specific encoder for token inputs: a TextLocalEncoder for
    embeddings, a BlockEncoder prenet and an optional Decoder1d.
    """

    modality_cfg: D2vTextConfig

    def __init__(
        self,
        modality_cfg: D2vTextConfig,
        embed_dim: int,
        make_block: Callable[[float], nn.ModuleList],
        norm_layer: Callable[[int], nn.LayerNorm],
        layer_norm_first: bool,
        alibi_biases: Dict,
        task: Optional[FairseqTask],
    ):
        # NOTE(review): task is annotated Optional but dereferenced
        # unconditionally; passing None fails here with an AttributeError.
        # These plain attributes are assigned before super().__init__() runs.
        self.pad_idx = task.source_dictionary.pad()
        self.vocab_size = len(task.source_dictionary)

        local_encoder = TextLocalEncoder(
            vocab_size=self.vocab_size,
            embed_dim=embed_dim,
            max_source_positions=modality_cfg.max_source_positions,
            pad_idx=self.pad_idx,
            no_scale_embedding=modality_cfg.no_scale_embedding,
            layernorm_embedding=modality_cfg.layernorm_embedding,
            dropout=modality_cfg.dropout,
            no_token_positional_embeddings=modality_cfg.no_token_positional_embeddings,
            learned_pos=modality_cfg.learned_pos,
        )
        # linearly spaced drop-path rates, one per prenet block
        dpr = np.linspace(
            modality_cfg.start_drop_path_rate,
            modality_cfg.end_drop_path_rate,
            modality_cfg.prenet_depth,
        )
        # the norm layer and the prenet dropout are only used when the prenet
        # actually has blocks
        context_encoder = BlockEncoder(
            nn.ModuleList(make_block(dpr[i]) for i in range(modality_cfg.prenet_depth)),
            norm_layer(embed_dim)
            if not layer_norm_first and modality_cfg.prenet_depth > 0
            else None,
            layer_norm_first,
            modality_cfg.prenet_layerdrop,
            modality_cfg.prenet_dropout if modality_cfg.prenet_depth > 0 else 0.0,
        )
        decoder = (
            Decoder1d(modality_cfg.decoder, embed_dim)
            if modality_cfg.decoder is not None
            else None
        )

        alibi_bias_fn = partial(get_alibi_bias, alibi_biases=alibi_biases)

        super().__init__(
            modality_cfg=modality_cfg,
            embed_dim=embed_dim,
            local_encoder=local_encoder,
            project_features=nn.Identity(),
            fixed_positional_encoder=None,
            relative_positional_encoder=None,
            context_encoder=context_encoder,
            decoder=decoder,
            get_alibi_bias=alibi_bias_fn,
        )

    def reset_parameters(self):
        super().reset_parameters()

    def convert_padding_mask(self, x, padding_mask):
        """
        Bring ``padding_mask`` to the sequence length of ``x``.

        The mask is returned untouched when it is None or already matches
        ``x`` along dim 1. Otherwise it is grouped into chunks of
        ``self.downsample`` positions; a chunk counts as padding only if all
        of its positions are padding.
        """
        if padding_mask is None or padding_mask.size(1) == x.size(1):
            return padding_mask

        # NOTE(review): self.downsample is not assigned anywhere in this
        # class; presumably the base class provides it -- verify.
        diff = self.downsample - padding_mask.size(1) % self.downsample
        # diff equals self.downsample when the length is already a multiple,
        # in which case nothing is padded
        if 0 < diff < self.downsample:
            padding_mask = F.pad(padding_mask, (0, diff), value=True)

        padding_mask = padding_mask.view(padding_mask.size(0), -1, self.downsample)
        padding_mask = padding_mask.all(-1)
        if padding_mask.size(1) > x.size(1):
            padding_mask = padding_mask[:, : x.size(1)]

        assert x.size(1) == padding_mask.size(
            1
        ), f"{x.size(1), padding_mask.size(1), diff, self.downsample}"

        return padding_mask
def get_alibi(
    max_positions: int,
    attention_heads: int,
):
    """
    Build a symmetric ALiBi bias of shape
    (attention_heads, max_positions, max_positions).

    Entry (h, i, j) is ``-slope[h] * |i - j|``: zero on the diagonal and
    linearly decreasing with distance, since the model is not autoregressive.
    """

    def geometric_slopes(count):
        # closed form used when the head count is a power of two
        base = 2 ** (-(2 ** -(math.log2(count) - 3)))
        return [base * base ** k for k in range(count)]

    def head_slopes(count):
        # The paper only trains models with 2^a heads, where the slopes have
        # some nice properties. For other head counts, take the slopes of the
        # closest smaller power of two and fill up with every other slope of
        # the next larger one.
        if math.log2(count).is_integer():
            return geometric_slopes(count)
        lower = 2 ** math.floor(math.log2(count))
        filler = head_slopes(2 * lower)[0::2][: count - lower]
        return geometric_slopes(lower) + filler

    slopes = torch.Tensor(head_slopes(attention_heads))

    positions = torch.arange(max_positions)
    neg_distance = torch.abs(positions.unsqueeze(0) - positions.unsqueeze(1)) * -1

    per_head = neg_distance.unsqueeze(0).expand(attention_heads, -1, -1)
    return slopes.unsqueeze(1).unsqueeze(1) * per_head


def masked_alibi(alibi_bias, mask_indices, orig_B, orig_T):
    """
    Restrict an alibi bias to the positions selected by ``mask_indices``.

    The bias is viewed as (orig_B, H, orig_T, orig_T); rows and then columns
    where ``mask_indices`` is True are kept and the result is returned as
    (orig_B * H, M, M). The views require every sample to select the same
    number M of positions.
    """
    bias = alibi_bias.view(orig_B, -1, orig_T, orig_T)
    num_heads = bias.size(1)
    # broadcast the per-sample selection over the head dimension
    selection = mask_indices.unsqueeze(1)

    rows = bias.masked_select(selection.unsqueeze(-1))
    rows = rows.view(orig_B, num_heads, -1, orig_T)
    kept = rows.size(-2)

    cols = rows.masked_select(selection.unsqueeze(-2))
    return cols.view(-1, kept, kept)
def get_parser():
    """Build the command-line parser for the audioset label converter."""
    parser = argparse.ArgumentParser(description="convert audioset labels")
    # fmt: off
    parser.add_argument('in_file', help='audioset csv file to convert')
    parser.add_argument('--manifest', required=True, metavar='PATH', help='wav2vec-like manifest')
    parser.add_argument('--descriptors', required=True, metavar='PATH', help='path to label descriptor file')
    parser.add_argument('--output', required=True, metavar='PATH', help='where to output converted labels')
    # fmt: on

    return parser


def main():
    """
    Write one label line per manifest entry.

    The descriptor file (first line skipped) maps the second comma-separated
    column to the first. The input csv provides, per clip id, a start, an end
    and a list of labels. For every manifest entry (first line skipped) the
    clip id is the file name without extension, and the output line is
    ``start<TAB>end<TAB>comma-joined descriptor indices``.

    Raises:
        ValueError: on a malformed or duplicate descriptor line, or an input
            csv line with fewer than three fields.
        KeyError: if a label is missing from the descriptors or a manifest
            entry has no labels in the input csv.
    """
    parser = get_parser()
    args = parser.parse_args()

    # label (second descriptor column) -> index (first descriptor column)
    label_descriptors = {}
    with open(args.descriptors, "r") as ldf:
        next(ldf, None)  # skip the header line; tolerate an empty file
        for line in ldf:
            if line.strip() == "":
                continue

            items = line.split(",")
            # explicit checks instead of assert, which is stripped under -O
            if len(items) <= 2:
                raise ValueError(f"malformed descriptor line: {line!r}")
            idx = items[0]
            lbl = items[1]
            if lbl in label_descriptors:
                raise ValueError(f"duplicate label in descriptors: {lbl}")
            label_descriptors[lbl] = idx

    labels = {}
    with open(args.in_file, "r") as ifd:
        for line in ifd:
            stripped = line.strip()
            # blank lines used to crash with an IndexError further down
            if stripped == "" or stripped.startswith("#"):
                continue

            items = line.rstrip().split(",")
            if len(items) < 3:
                raise ValueError(f"malformed label line: {line!r}")
            clip_id = items[0].strip()
            start = items[1].strip()
            end = items[2].strip()
            # the label list is quoted as a whole, so quotes only show up on
            # its first and last item
            lbls = [label_descriptors[it.strip(' "')] for it in items[3:]]
            labels[clip_id] = [start, end, ",".join(lbls)]

    with open(args.manifest, "r") as mf, open(args.output, "w") as of:
        next(mf, None)  # skip the first manifest line; tolerate an empty file
        for line in mf:
            if line.strip() == "":
                continue
            # only the first column is needed, however many columns follow
            path = line.rstrip("\n").split("\t")[0]
            clip_id = os.path.splitext(os.path.basename(path))[0]
            if clip_id not in labels:
                raise KeyError(f"no labels found for manifest entry: {clip_id}")
            print("\t".join(labels[clip_id]), file=of)


if __name__ == "__main__":
    main()
#!/bin/bash

# Submit examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh as a
# single-node, single-GPU slurm job, logging to <dir>/log.
#
# usage: finetune_all_fair_aws_local_lr_nodep.sh <dir>

set -eu

dir="${1:?usage: $0 <dir>}"

echo "dir: $dir"

mkdir -p "$dir/log"

# Collect the options in an array so that a <dir> containing spaces or glob
# characters reaches sbatch as a single argument.
sbatch_args=(-p wav2vec --nodes=1 --ntasks-per-node=1)
sbatch_args+=(--gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00)
sbatch_args+=(-o "$dir/log/decode_sweep_%A.out")
sbatch_args+=(-e "$dir/log/decode_sweep_%A.err")

sbatch "${sbatch_args[@]}" examples/data2vec/scripts/multi/finetune_all_fair_local_lr.sh "$dir"
#!/bin/bash

# Submit examples/data2vec/scripts/text/finetune_all_char_fair_local_lr.sh as
# a single-node, single-GPU slurm job that starts only after slurm job
# <job_id> finished successfully. Logs go to <dir>/log.
#
# usage: finetune_all_char_fair_aws_local_lr.sh <job_id> <task_id> <dir>

set -eu

job_id="${1:?usage: $0 <job_id> <task_id> <dir>}"
task_id="${2:?usage: $0 <job_id> <task_id> <dir>}"
dir="${3:?usage: $0 <job_id> <task_id> <dir>}"

echo "job_id: $job_id, task_id: $task_id, dir: $dir"

mkdir -p "$dir/log"

# Collect the options in an array so that a <dir> containing spaces or glob
# characters reaches sbatch as a single argument.
sbatch_args=(-p wav2vec --nodes=1 --ntasks-per-node=1)
sbatch_args+=(--gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00)
sbatch_args+=(-d "afterok:$job_id" -o "$dir/log/ft_%A.out")
sbatch_args+=(-e "$dir/log/ft_%A.err")

sbatch "${sbatch_args[@]}" examples/data2vec/scripts/text/finetune_all_char_fair_local_lr.sh "$dir"
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ checkpoint.restore_file="$cp" +hydra.launcher.additional_parameters.dependency="afterok:$job_id" hydra.sweep.dir="$dir/finetune/$task" & done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_fair_aws.sh ================================================ #!/usr/bin/env zsh job_id=$1 task_id=$2 dir="$3" cp="$dir/checkpoints/checkpoint_last.pt" echo "job_id: $job_id, task_id: $task_id, dir: $dir" declare -A tasks tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" for task data_path in ${(kv)tasks}; do PYTHONPATH=. 
#!/bin/bash

# Submit examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh as a
# single-node, single-GPU slurm job that starts only after slurm job <job_id>
# finished successfully. Logs go to <dir>/log.
#
# usage: finetune_all_fair_aws_local_lr.sh <job_id> <task_id> <dir>

set -eu

job_id="${1:?usage: $0 <job_id> <task_id> <dir>}"
task_id="${2:?usage: $0 <job_id> <task_id> <dir>}"
dir="${3:?usage: $0 <job_id> <task_id> <dir>}"

echo "job_id: $job_id, task_id: $task_id, dir: $dir"

mkdir -p "$dir/log"

# Collect the options in an array so that a <dir> containing spaces or glob
# characters reaches sbatch as a single argument.
sbatch_args=(-p wav2vec --nodes=1 --ntasks-per-node=1)
sbatch_args+=(--gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00)
sbatch_args+=(-d "afterok:$job_id" -o "$dir/log/decode_sweep_%A.out")
sbatch_args+=(-e "$dir/log/decode_sweep_%A.err")

sbatch "${sbatch_args[@]}" examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh "$dir"
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ checkpoint.restore_file="$cp" +hydra.launcher.additional_parameters.dependency="afterok:$job_id" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" & done done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh ================================================ #!/usr/bin/env zsh dir="$1" cp="$dir/checkpoints/checkpoint_last.pt" echo "dir: $dir" declare -A tasks tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" lrs=(5e-6 8e-6 1e-5 2e-5) for task data_path in ${(kv)tasks}; do for lr in $lrs; do echo $lr $task PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" \ python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task +run_config=local task.data="$data_path" common.log_interval=200 dataset.num_workers=1 \ checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" done done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_fair_nodep.sh ================================================ #!/usr/bin/env zsh dir="$1" cp="$dir/checkpoints/checkpoint_last.pt" echo "dir: $dir" declare -A tasks tasks[cola]="/private/home/jgu/data/GLUE/CoLA-bin" tasks[qnli]="/private/home/jgu/data/GLUE/QNLI-bin" tasks[mrpc]="/private/home/jgu/data/GLUE/MRPC-bin" tasks[rte]="/private/home/jgu/data/GLUE/RTE-bin" tasks[sst_2]="/private/home/jgu/data/GLUE/SST-2-bin" for task data_path in ${(kv)tasks}; do PYTHONPATH=. PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune/$task" & done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_fair_nodep_aws.sh ================================================ #!/usr/bin/env zsh dir="$1" cp="$dir/checkpoints/checkpoint_last.pt" echo "dir: $dir" declare -A tasks tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" for task data_path in ${(kv)tasks}; do PYTHONPATH=. 
#!/bin/bash

# Submit examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh as a
# single-node, single-GPU slurm job, logging to <dir>/log.
#
# usage: finetune_all_fair_nodep_aws_local_lr.sh <dir>

set -eu

dir="${1:?usage: $0 <dir>}"

echo "dir: $dir"

mkdir -p "$dir/log"

# Collect the options in an array so that a <dir> containing spaces or glob
# characters reaches sbatch as a single argument.
sbatch_args=(-p wav2vec --nodes=1 --ntasks-per-node=1)
sbatch_args+=(--gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00)
sbatch_args+=(-o "$dir/log/decode_sweep_%A.out")
sbatch_args+=(-e "$dir/log/decode_sweep_%A.err")

sbatch "${sbatch_args[@]}" examples/data2vec/scripts/text/finetune_all_fair_local_lr.sh "$dir"
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" & done done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_fair_nodep_aws_lr_nopos.sh ================================================ #!/usr/bin/env zsh dir="$1" cp="$dir/checkpoints/checkpoint_last.pt" echo "dir: $dir" declare -A tasks tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" for task data_path in ${(kv)tasks}; do for lr in 5e-6 8e-6 1e-5 2e-5 5e-5 8e-5 1e-4 2e-4; do PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g_aws task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" +model.encoder_learned_pos=False & done done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_large_fair_aws_local_lr.sh ================================================ #!/bin/bash set -eu job_id="$1" task_id="$2" dir="$3" echo "job_id: $job_id, task_id: $task_id, dir: $dir" mkdir -p "$dir/log" sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" sbatch_args="$sbatch_args -d afterok:$job_id -o $dir/log/decode_sweep_%A.out" sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh $dir ================================================ FILE: examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh ================================================ #!/usr/bin/env zsh dir="$1" cp="$dir/checkpoints/checkpoint_last.pt" echo "dir: $dir" declare -A tasks tasks[cola]="/fsx-wav2vec/abaevski/data/nlp/GLUE/CoLA-bin" tasks[qnli]="/fsx-wav2vec/abaevski/data/nlp/GLUE/QNLI-bin" tasks[mrpc]="/fsx-wav2vec/abaevski/data/nlp/GLUE/MRPC-bin" tasks[rte]="/fsx-wav2vec/abaevski/data/nlp/GLUE/RTE-bin" tasks[sst_2]="/fsx-wav2vec/abaevski/data/nlp/GLUE/SST-2-bin" lrs=(5e-6 8e-6 1e-5 2e-5) for task data_path in ${(kv)tasks}; do for lr in $lrs; do echo $lr $task PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" \ python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task +run_config=local task.data="$data_path" common.log_interval=200 dataset.num_workers=1 \ checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_lr/$task/$lr" "optimization.lr=[${lr}]" \ model._name=roberta_large done done ================================================ FILE: examples/data2vec/scripts/text/finetune_all_large_fair_nodep_aws_local_lr.sh ================================================ #!/bin/bash set -eu dir="$1" echo "dir: $dir" mkdir -p "$dir/log" sbatch_args="-p wav2vec --nodes=1 --ntasks-per-node=1" sbatch_args="$sbatch_args --gpus-per-node=1 --cpus-per-task=8 --mem=0 --time=24:00:00" sbatch_args="$sbatch_args -o $dir/log/decode_sweep_%A.out" sbatch_args="$sbatch_args -e $dir/log/decode_sweep_%A.err" sbatch $sbatch_args examples/data2vec/scripts/text/finetune_all_large_fair_local_lr.sh $dir ================================================ FILE: examples/data2vec/scripts/text/finetune_sst2_qnli_sweep_fair_nodep.sh ================================================ #!/usr/bin/env zsh dir="$1" cp="$dir/checkpoints/checkpoint_last.pt" echo "dir: $dir" declare -A tasks tasks[qnli]="/private/home/jgu/data/GLUE/QNLI-bin" tasks[sst_2]="/private/home/jgu/data/GLUE/SST-2-bin" lrs="5e-6 1e-5 2e-5 5e-5 1e-4 2e-4 5e-4 1e-3" for task data_path in ${(kv)tasks}; do for lr in $(echo "$lrs"); do PYTHONPATH=. 
PREFIX="${PREFIX}" SUFFIX="" nohup python fairseq_cli/hydra_train.py -m --config-dir examples/roberta/config/finetuning \ --config-name $task hydra/launcher=submitit_slurm +run_config=slurm_1g task.data="$data_path" hydra.launcher.name=finetune_${task}_${PREFIX} \ checkpoint.restore_file="$cp" hydra.sweep.dir="$dir/finetune_sweep/$task/lr_$lr" "optimization.lr=[${lr}]" & done done ================================================ FILE: examples/data2vec/scripts/text/glue.py ================================================ from valids import parser, main as valids_main import os.path as osp args = parser.parse_args() args.target = "valid_accuracy" args.best_biggest = True args.best = True args.last = 0 args.path_contains = None res = valids_main(args, print_output=False) grouped = {} for k, v in res.items(): k = osp.dirname(k) run = osp.dirname(k) task = osp.basename(k) val = v["valid_accuracy"] if run not in grouped: grouped[run] = {} grouped[run][task] = val for run, tasks in grouped.items(): print(run) avg = sum(float(v) for v in tasks.values()) / len(tasks) avg_norte = sum(float(v) for k,v in tasks.items() if k != 'rte') / (len(tasks) -1) try: print(f"{tasks['cola']}\t{tasks['qnli']}\t{tasks['mrpc']}\t{tasks['rte']}\t{tasks['sst_2']}\t{avg:.2f}\t{avg_norte:.2f}") except: print(tasks) print() ================================================ FILE: examples/data2vec/scripts/text/glue_lr.py ================================================ import os.path as osp import re from collections import defaultdict from valids import parser, main as valids_main TASK_TO_METRIC = { "cola": "mcc", "qnli": "accuracy", "mrpc": "acc_and_f1", "rte": "accuracy", "sst_2": "accuracy", "mnli": "accuracy", "qqp": "acc_and_f1", "sts_b": "pearson_and_spearman", } TASKS = ["cola", "qnli", "mrpc", "rte", "sst_2", "mnli", "qqp", "sts_b"] def get_best_stat_str(task_vals, show_subdir): task_to_best_val = {} task_to_best_dir = {} for task, subdir_to_val in task_vals.items(): 
task_to_best_val[task] = max(subdir_to_val.values()) task_to_best_dir[task] = max(subdir_to_val.keys(), key=lambda x: subdir_to_val[x]) # import pdb; pdb.set_trace() N1 = len(task_to_best_val) N2 = len([k for k in task_to_best_val if k != "rte"]) avg1 = sum(task_to_best_val.values()) / N1 avg2 = sum(v for task, v in task_to_best_val.items() if task != "rte") / N2 try: msg = "" for task in TASKS: dir = task_to_best_dir.get(task, 'null') val = task_to_best_val.get(task, -100) msg += f"({dir}, {val})\t" if show_subdir else f"{val}\t" msg += f"{avg1:.2f}\t{avg2:.2f}" except Exception as e: msg = str(e) msg += str(sorted(task_vals.items())) return msg def get_all_stat_str(task_vals): msg = "" for task in [task for task in TASKS if task in task_vals]: msg += f"=== {task}\n" for subdir in sorted(task_vals[task].keys()): msg += f"\t{subdir}\t{task_vals[task][subdir]}\n" return msg def get_tabular_stat_str(task_vals): """assume subdir is /run_*/0""" msg = "" for task in [task for task in TASKS if task in task_vals]: msg += f"=== {task}\n" param_to_runs = defaultdict(dict) for subdir in task_vals[task]: match = re.match("(.*)/(run_.*)/0", subdir) assert match, "subdir" param, run = match.groups() param_to_runs[param][run] = task_vals[task][subdir] params = sorted(param_to_runs, key=lambda x: float(x)) runs = sorted(set(run for runs in param_to_runs.values() for run in runs)) msg += ("runs:" + "\t".join(runs) + "\n") msg += ("params:" + "\t".join(params) + "\n") for param in params: msg += "\t".join([str(param_to_runs[param].get(run, None)) for run in runs]) msg += "\n" # for subdir in sorted(task_vals[task].keys()): # msg += f"\t{subdir}\t{task_vals[task][subdir]}\n" return msg def main(): parser.add_argument("--show_glue", action="store_true", help="show glue metric for each task instead of accuracy") parser.add_argument("--print_mode", default="best", help="best|all|tabular") parser.add_argument("--show_subdir", action="store_true", help="print the subdir that has the best 
results for each run") parser.add_argument("--override_target", default="valid_accuracy", help="override target") args = parser.parse_args() args.target = args.override_target args.best_biggest = True args.best = True args.last = 0 args.path_contains = None res = valids_main(args, print_output=False) grouped_acc = {} grouped_met = {} # use official metric for each task for path, v in res.items(): path = "/".join([args.base, path]) path = re.sub("//*", "/", path) match = re.match("(.*)finetune[^/]*/([^/]*)/(.*)", path) if not match: continue run, task, subdir = match.groups() if run not in grouped_acc: grouped_acc[run] = {} grouped_met[run] = {} if task not in grouped_acc[run]: grouped_acc[run][task] = {} grouped_met[run][task] = {} if v is not None: grouped_acc[run][task][subdir] = float(v.get("valid_accuracy", -100)) grouped_met[run][task][subdir] = float(v.get(f"valid_{TASK_TO_METRIC[task]}", -100)) else: print(f"{path} has None return") header = "\t".join(TASKS) for run in sorted(grouped_acc): print(run) if args.print_mode == "all": if args.show_glue: print("===== GLUE =====") print(get_all_stat_str(grouped_met[run])) else: print("===== ACC =====") print(get_all_stat_str(grouped_acc[run])) elif args.print_mode == "best": print(f" {header}") if args.show_glue: print(f"GLEU: {get_best_stat_str(grouped_met[run], args.show_subdir)}") else: print(f"ACC: {get_best_stat_str(grouped_acc[run], args.show_subdir)}") elif args.print_mode == "tabular": if args.show_glue: print("===== GLUE =====") print(get_tabular_stat_str(grouped_met[run])) else: print("===== ACC =====") print(get_tabular_stat_str(grouped_acc[run])) else: raise ValueError(args.print_mode) print() if __name__ == "__main__": main() ================================================ FILE: examples/data2vec/scripts/text/unprocess_data.py ================================================ import json import os import tqdm from fairseq.data import Dictionary, data_utils def load_dictionary(dict_path): return 
Dictionary.load(dict_path) def load_dataset(split_path, src_dict): dataset = data_utils.load_indexed_dataset( split_path, src_dict, combine=False, # set to true for loading `train*` ) if dataset is None: raise FileNotFoundError(f"Dataset not found: {split_path}") return dataset def load_bpe(enc_path): with open(enc_path) as f: bpe2idx = json.load(f) idx2bpe = {v: k for k, v in bpe2idx.items()} return bpe2idx, idx2bpe def detokenize(tokens, src_dict, idx2bpe): raw_inds = map(int, src_dict.string(tokens).split()) raw_chrs = "".join([idx2bpe[raw_ind] for raw_ind in raw_inds]) raw_chrs = raw_chrs.replace("\u0120", " ") return raw_chrs def _main(src_root, src_dict_path, src_bpe_path, src_splits, tgt_root, tgt_splits): src_dict = load_dictionary(src_dict_path) bpe2idx, idx2bpe = load_bpe(src_bpe_path) assert len(src_splits) == len(tgt_splits) for src_split, tgt_split in zip(src_splits, tgt_splits): src_dataset = load_dataset(f"{src_root}/{src_split}", src_dict) tgt_path = f"{tgt_root}/{tgt_split}.txt" print(f"processing {src_split} (dump to {tgt_path})...") os.makedirs(os.path.dirname(tgt_path), exist_ok=True) with open(tgt_path, "w") as f: for tokens in tqdm.tqdm(src_dataset): raw_str = detokenize(tokens, src_dict, idx2bpe) f.write(raw_str + "\n") def main_pt(): src_root = "/datasets01/bookwiki_CC-NEWS_openwebtext_stories-mmap2-bin/121219/bookwiki_CC-NEWS_openwebtext_stories-mmap2-bin" src_dict_path = f"{src_root}/dict.txt" src_bpe_path = f"{src_root}/encoder.json" src_splits = [ "bookwiki_aml-mmap2-bin/shard0/train", "bookwiki_aml-mmap2-bin/shard1/train", "bookwiki_aml-mmap2-bin/shard2/train", "bookwiki_aml-mmap2-bin/shard3/train", "bookwiki_aml-mmap2-bin/shard4/train", "bookwiki_aml-mmap2-bin/valid/valid", ] tgt_root = "/checkpoint/wnhsu/data/data2vec2/data/text/bookwiki_aml-full-mmap2-txt" tgt_splits = [ "train0", "train1", "train2", "train3", "train4", "valid", ] _main(src_root, src_dict_path, src_bpe_path, src_splits, tgt_root, tgt_splits) def main_ft(): src_root = 
"/fsx-wav2vec/wnhsu/data/data2vec2/data/text/GLUE" src_dict_path = f"{src_root}/dict.txt" src_bpe_path = f"{src_root}/encoder.json" src_splits = [ "CoLA-bin/input0/train", "CoLA-bin/input0/valid", "CoLA-bin/input0/test", "MNLI-bin/input0/train", "MNLI-bin/input0/valid", "MNLI-bin/input0/test", "MNLI-bin/input0/test1", "MNLI-bin/input1/train", "MNLI-bin/input1/valid", "MNLI-bin/input1/test", "MNLI-bin/input1/test1", "MRPC-bin/input0/train", "MRPC-bin/input0/valid", "MRPC-bin/input0/test", "MRPC-bin/input1/train", "MRPC-bin/input1/valid", "MRPC-bin/input1/test", "QNLI-bin/input0/train", "QNLI-bin/input0/valid", "QNLI-bin/input0/test", "QNLI-bin/input1/train", "QNLI-bin/input1/valid", "QNLI-bin/input1/test", "QQP-bin/input0/train", "QQP-bin/input0/valid", "QQP-bin/input0/test", "QQP-bin/input1/train", "QQP-bin/input1/valid", "QQP-bin/input1/test", "RTE-bin/input0/train", "RTE-bin/input0/valid", "RTE-bin/input0/test", "RTE-bin/input1/train", "RTE-bin/input1/valid", "RTE-bin/input1/test", "SST-2-bin/input0/train", "SST-2-bin/input0/valid", "SST-2-bin/input0/test", "STS-B-bin/input0/train", "STS-B-bin/input0/valid", "STS-B-bin/input0/test", "STS-B-bin/input1/train", "STS-B-bin/input1/valid", "STS-B-bin/input1/test", ] tgt_root = "/fsx-wav2vec/wnhsu/data/data2vec2/data/text/GLUE_chr" tgt_splits = [ "CoLA-bin/input0/train", "CoLA-bin/input0/valid", "CoLA-bin/input0/test", "MNLI-bin/input0/train", "MNLI-bin/input0/valid", "MNLI-bin/input0/test", "MNLI-bin/input0/test1", "MNLI-bin/input1/train", "MNLI-bin/input1/valid", "MNLI-bin/input1/test", "MNLI-bin/input1/test1", "MRPC-bin/input0/train", "MRPC-bin/input0/valid", "MRPC-bin/input0/test", "MRPC-bin/input1/train", "MRPC-bin/input1/valid", "MRPC-bin/input1/test", "QNLI-bin/input0/train", "QNLI-bin/input0/valid", "QNLI-bin/input0/test", "QNLI-bin/input1/train", "QNLI-bin/input1/valid", "QNLI-bin/input1/test", "QQP-bin/input0/train", "QQP-bin/input0/valid", "QQP-bin/input0/test", "QQP-bin/input1/train", "QQP-bin/input1/valid", 
"QQP-bin/input1/test", "RTE-bin/input0/train", "RTE-bin/input0/valid", "RTE-bin/input0/test", "RTE-bin/input1/train", "RTE-bin/input1/valid", "RTE-bin/input1/test", "SST-2-bin/input0/train", "SST-2-bin/input0/valid", "SST-2-bin/input0/test", "STS-B-bin/input0/train", "STS-B-bin/input0/valid", "STS-B-bin/input0/test", "STS-B-bin/input1/train", "STS-B-bin/input1/valid", "STS-B-bin/input1/test", ] _main(src_root, src_dict_path, src_bpe_path, src_splits, tgt_root, tgt_splits) if __name__ == "__main__": main_pt() main_ft() ================================================ FILE: examples/data2vec/scripts/text/valids.py ================================================ import os, argparse, re, json, copy, math from collections import OrderedDict import numpy as np parser = argparse.ArgumentParser(description='Process some integers.') parser.add_argument('base', help='base log path') parser.add_argument('--file_name', default='train.log', help='the log file name') parser.add_argument('--target', default='valid_loss', help='target metric') parser.add_argument('--last', type=int, default=999999999, help='print last n matches') parser.add_argument('--last_files', type=int, default=None, help='print last x files') parser.add_argument('--everything', action='store_true', help='print everything instead of only last match') parser.add_argument('--path_contains', help='only consider matching file pattern') parser.add_argument('--group_on', help='if set, groups by this metric and shows table of differences') parser.add_argument('--epoch', help='epoch for comparison', type=int) parser.add_argument('--skip_empty', action='store_true', help='skip empty results') parser.add_argument('--skip_containing', help='skips entries containing this attribute') parser.add_argument('--unique_epochs', action='store_true', help='only consider the last line fore each epoch') parser.add_argument('--best', action='store_true', help='print the last best result') parser.add_argument('--avg_params', 
help='average these params through entire log') parser.add_argument('--extract_prev', help='extracts this metric from previous line') parser.add_argument('--remove_metric', help='extracts this metric from previous line') parser.add_argument('--compact', action='store_true', help='if true, just prints checkpoint best val') parser.add_argument('--hydra', action='store_true', help='if true, uses hydra param conventions') parser.add_argument('--best_biggest', action='store_true', help='if true, best is the biggest number, not smallest') parser.add_argument('--key_len', type=int, default=10, help='max length of key') parser.add_argument('--best_only', action='store_true', help='if set, only prints the best value') parser.add_argument('--flat', action='store_true', help='just print the best results') def main(args, print_output): ret = {} entries = [] def extract_metric(s, metric): try: j = json.loads(s) except: return None if args.epoch is not None and ('epoch' not in j or j['epoch'] != args.epoch): return None return j[metric] if metric in j else None def extract_params(s): s = s.replace(args.base, '', 1) if args.path_contains is not None: s = s.replace(args.path_contains, '', 1) if args.hydra: num_matches = re.findall(r'(?:/|__)([^/:]+):(\d+\.?\d*)', s) # str_matches = re.findall(r'(?:/|__)([^/:]+):([^\.]*[^\d\.]+)(?:/|__)', s) str_matches = re.findall(r'(?:/|__)?((?:(?!(?:\:|__)).)+):([^\.]*[^\d\.]+\d*)(?:/|__)', s) lr_matches = re.findall(r'optimization.(lr):\[([\d\.,]+)\]', s) task_matches = re.findall(r'.*/(\d+)$', s) else: num_matches = re.findall(r'\.?([^\.]+?)(\d+(e\-\d+)?(?:\.\d+)?)(\.|$)', s) str_matches = re.findall(r'[/\.]([^\.]*[^\d\.]+\d*)(?=\.)', s) lr_matches = [] task_matches = [] cp_matches = re.findall(r'checkpoint(?:_\d+)?_(\d+).pt', s) items = OrderedDict() for m in str_matches: if isinstance(m, tuple): if 'checkpoint' not in m[0]: items[m[0]] = m[1] else: items[m] = '' for m in num_matches: items[m[0]] = m[1] for m in lr_matches: items[m[0]] = 
m[1] for m in task_matches: items["hydra_task"] = m for m in cp_matches: items['checkpoint'] = m return items abs_best = None sources = [] for root, _, files in os.walk(args.base): if args.path_contains is not None and not args.path_contains in root: continue for f in files: if f.endswith(args.file_name): sources.append((root, f)) if args.last_files is not None: sources = sources[-args.last_files:] for root, file in sources: with open(os.path.join(root, file), 'r') as fin: found = [] avg = {} prev = None for line in fin: line = line.rstrip() if line.find(args.target) != -1 and ( args.skip_containing is None or line.find(args.skip_containing) == -1): try: idx = line.index("{") line = line[idx:] line_json = json.loads(line) except: continue if prev is not None: try: prev.update(line_json) line_json = prev except: pass if args.target in line_json: found.append(line_json) if args.avg_params: avg_params = args.avg_params.split(',') for p in avg_params: m = extract_metric(line, p) if m is not None: prev_v, prev_c = avg.get(p, (0, 0)) avg[p] = prev_v + float(m), prev_c + 1 if args.extract_prev: try: prev = json.loads(line) except: pass best = None if args.best: curr_best = None for i in range(len(found)): cand_best = found[i][args.target] if args.target in found[i] else None def cmp(a, b): a = float(a) b = float(b) if args.best_biggest: return a > b return a < b if cand_best is not None and not math.isnan(float(cand_best)) and ( curr_best is None or cmp(cand_best, curr_best)): curr_best = cand_best if abs_best is None or cmp(curr_best, abs_best): abs_best = curr_best best = found[i] if args.unique_epochs or args.epoch: last_found = [] last_epoch = None for i in reversed(range(len(found))): epoch = found[i]['epoch'] if args.epoch and args.epoch != epoch: continue if epoch != last_epoch: last_epoch = epoch last_found.append(found[i]) found = list(reversed(last_found)) if len(found) == 0: if print_output and (args.last_files is not None or not args.skip_empty): # 
print(root.split('/')[-1]) print(root[len(args.base):]) print('Nothing') else: if not print_output: ret[root[len(args.base):]] = best continue if args.compact: # print('{}\t{}'.format(root.split('/')[-1], curr_best)) print('{}\t{}'.format(root[len(args.base)+1:], curr_best)) continue if args.group_on is None and not args.best_only: # print(root.split('/')[-1]) print(root[len(args.base):]) if not args.everything: if best is not None and args.group_on is None and not args.best_only and not args.flat: print(best, '(best)') if args.group_on is None and args.last and not args.best_only and not args.flat: for f in found[-args.last:]: if args.extract_prev is not None: try: print('{}\t{}'.format(f[args.extract_prev], f[args.target])) except Exception as e: print('Exception!', e) else: print(f) try: metric = found[-1][args.target] if not args.best or best is None else best[args.target] except: print(found[-1]) raise if metric is not None: entries.append((extract_params(root), metric)) else: for f in found: print(f) if not args.group_on and print_output: print() if len(avg) > 0: for k, (v, c) in avg.items(): print(f'{k}: {v/c}') if args.best_only: print(abs_best) if args.flat: print("\t".join(m for _, m in entries)) if args.group_on is not None: by_val = OrderedDict() for e, m in entries: k = args.group_on if k not in e: m_keys = [x for x in e.keys() if x.startswith(k)] if len(m_keys) == 0: val = "False" else: assert len(m_keys) == 1 k = m_keys[0] val = m_keys[0] else: val = e[args.group_on] if val == "": val = "True" scrubbed_entry = copy.deepcopy(e) if k in scrubbed_entry: del scrubbed_entry[k] if args.remove_metric and args.remove_metric in scrubbed_entry: val += '_' + scrubbed_entry[args.remove_metric] del scrubbed_entry[args.remove_metric] by_val.setdefault(tuple(scrubbed_entry.items()), dict())[val] = m distinct_vals = set() for v in by_val.values(): distinct_vals.update(v.keys()) try: distinct_vals = {int(d) for d in distinct_vals} except: print(distinct_vals) print() 
print("by_val", len(by_val)) for k,v in by_val.items(): print(k, '=>', v) print() # , by_val, entries) raise from natsort import natsorted svals = list(map(str, natsorted(distinct_vals))) print('{}\t{}'.format(args.group_on, '\t'.join(svals))) sums = OrderedDict({n:[] for n in svals}) for k, v in by_val.items(): kstr = '.'.join(':'.join(x) for x in k) vstr = '' for mv in svals: x = v[mv] if mv in v else '' vstr += '\t{}'.format(round(x, 5) if isinstance(x, float) else x) try: sums[mv].append(float(x)) except: pass print('{}{}'.format(kstr[:args.key_len], vstr)) if any(len(x) > 0 for x in sums.values()): print('min:', end='') for v in sums.values(): min = np.min(v) print(f'\t{round(min, 5)}', end='') print() print('max:', end='') for v in sums.values(): max = np.max(v) print(f'\t{round(max, 5)}', end='') print() print('avg:', end='') for v in sums.values(): mean = np.mean(v) print(f'\t{round(mean, 5)}', end='') print() print('median:', end='') for v in sums.values(): median = np.median(v) print(f'\t{round(median, 5)}', end='') print() return ret if __name__ == "__main__": args = parser.parse_args() main(args, print_output=True) ================================================ FILE: examples/data2vec/tasks/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from .image_pretraining import ImagePretrainingTask, ImagePretrainingConfig from .image_classification import ImageClassificationTask, ImageClassificationConfig from .mae_image_pretraining import MaeImagePretrainingTask, MaeImagePretrainingConfig __all__ = [ "ImageClassificationTask", "ImageClassificationConfig", "ImagePretrainingTask", "ImagePretrainingConfig", "MaeImagePretrainingTask", "MaeImagePretrainingConfig", ] ================================================ FILE: examples/data2vec/tasks/audio_classification.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. import logging import os import numpy as np import math import torch from sklearn import metrics as sklearn_metrics from dataclasses import dataclass from fairseq.tasks.audio_pretraining import AudioPretrainingTask, AudioPretrainingConfig from fairseq.tasks import register_task from fairseq.logging import metrics from ..data.add_class_target_dataset import AddClassTargetDataset logger = logging.getLogger(__name__) @dataclass class AudioClassificationConfig(AudioPretrainingConfig): label_descriptors: str = "label_descriptors.csv" labels: str = "lbl" @register_task("audio_classification", dataclass=AudioClassificationConfig) class AudioClassificationTask(AudioPretrainingTask): """ """ cfg: AudioClassificationConfig def __init__( self, cfg: AudioClassificationConfig, ): super().__init__(cfg) self.state.add_factory("labels", self.load_labels) def load_labels(self): labels = {} path = os.path.join(self.cfg.data, self.cfg.label_descriptors) with open(path, "r") as ldf: for line in ldf: if line.strip() == "": continue items = line.split(",") idx = items[0] lbl = items[1] assert lbl not in labels, lbl labels[lbl] = idx return 
labels @property def labels(self): return self.state.labels def load_dataset( self, split: str, task_cfg: AudioClassificationConfig = None, **kwargs ): super().load_dataset(split, task_cfg, **kwargs) task_cfg = task_cfg or self.cfg data_path = self.cfg.data label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}") skipped_indices = getattr(self.datasets[split], "skipped_indices", set()) labels = [] with open(label_path, "r") as f: for i, line in enumerate(f): if i not in skipped_indices: lbl_items = line.rstrip().split("\t") labels.append([int(x) for x in lbl_items[2].split(",")]) assert len(labels) == len(self.datasets[split]), ( f"labels length ({len(labels)}) and dataset length " f"({len(self.datasets[split])}) do not match" ) self.datasets[split] = AddClassTargetDataset( self.datasets[split], labels, multi_class=True, add_to_input=True, num_classes=len(self.labels), ) def calculate_stats(self, output, target): classes_num = target.shape[-1] stats = [] # Accuracy, only used for single-label classification such as esc-50, not for multiple label one such as AudioSet # acc = sklearn_metrics.accuracy_score(np.argmax(target, 1), np.argmax(output, 1)) # Class-wise statistics for k in range(classes_num): # Average precision avg_precision = sklearn_metrics.average_precision_score( target[:, k], output[:, k], average=None ) dict = { "AP": avg_precision, } # # AUC # try: # auc = sklearn_metrics.roc_auc_score(target[:, k], output[:, k], average=None) # except: # auc = 0 # # # Precisions, recalls # (precisions, recalls, thresholds) = sklearn_metrics.precision_recall_curve( # target[:, k], output[:, k] # ) # # # FPR, TPR # (fpr, tpr, thresholds) = sklearn_metrics.roc_curve(target[:, k], output[:, k]) # # save_every_steps = 1000 # Sample statistics to reduce size # dict = { # "precisions": precisions[0::save_every_steps], # "recalls": recalls[0::save_every_steps], # "AP": avg_precision, # "fpr": fpr[0::save_every_steps], # "fnr": 1.0 - tpr[0::save_every_steps], # 
"auc": auc, # # note acc is not class-wise, this is just to keep consistent with other metrics # "acc": acc, # } stats.append(dict) return stats def valid_step(self, sample, model, criterion): loss, sample_size, logging_output = super().valid_step(sample, model, criterion) return loss, sample_size, logging_output def reduce_metrics(self, logging_outputs, criterion): super().reduce_metrics(logging_outputs, criterion) if "_predictions" in logging_outputs[0]: metrics.log_concat_tensor( "_predictions", torch.cat([l["_predictions"].cpu() for l in logging_outputs], dim=0), ) metrics.log_concat_tensor( "_targets", torch.cat([l["_targets"].cpu() for l in logging_outputs], dim=0), ) def compute_stats(meters): if meters["_predictions"].tensor.shape[0] < 100: return 0 stats = self.calculate_stats( meters["_predictions"].tensor, meters["_targets"].tensor ) return np.nanmean([stat["AP"] for stat in stats]) metrics.log_derived("mAP", compute_stats) ================================================ FILE: examples/data2vec/tasks/image_classification.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
import os.path as osp import logging from dataclasses import dataclass import torch from torchvision import transforms from fairseq.dataclass import FairseqDataclass from fairseq.tasks import register_task from fairseq.logging import metrics try: from ..data import ImageDataset except: import sys sys.path.append("..") from data import ImageDataset from .image_pretraining import ( ImagePretrainingConfig, ImagePretrainingTask, IMG_EXTENSIONS, ) logger = logging.getLogger(__name__) @dataclass class ImageClassificationConfig(ImagePretrainingConfig): pass @register_task("image_classification", dataclass=ImageClassificationConfig) class ImageClassificationTask(ImagePretrainingTask): cfg: ImageClassificationConfig @classmethod def setup_task(cls, cfg: ImageClassificationConfig, **kwargs): return cls(cfg) def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): data_path = self.cfg.data cfg = task_cfg or self.cfg path_with_split = osp.join(data_path, split) if osp.exists(path_with_split): data_path = path_with_split from timm.data import create_transform if split == "train": # this should always dispatch to transforms_imagenet_train transform = create_transform( input_size=cfg.input_size, is_training=True, auto_augment="rand-m9-mstd0.5-inc1", interpolation="bicubic", re_prob=0.25, re_mode="pixel", re_count=1, mean=cfg.normalization_mean, std=cfg.normalization_std, ) if not cfg.input_size > 32: transform.transforms[0] = transforms.RandomCrop( cfg.input_size, padding=4 ) else: t = [] if cfg.input_size > 32: crop_pct = 1 if cfg.input_size < 384: crop_pct = 224 / 256 size = int(cfg.input_size / crop_pct) t.append( transforms.Resize( size, interpolation=3 ), # to maintain same ratio w.r.t. 
224 images ) t.append(transforms.CenterCrop(cfg.input_size)) t.append(transforms.ToTensor()) t.append( transforms.Normalize(cfg.normalization_mean, cfg.normalization_std) ) transform = transforms.Compose(t) logger.info(transform) self.datasets[split] = ImageDataset( root=data_path, extensions=IMG_EXTENSIONS, load_classes=True, transform=transform, ) for k in self.datasets.keys(): if k != split: assert self.datasets[k].classes == self.datasets[split].classes def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False): model = super().build_model(model_cfg, from_checkpoint) actualized_cfg = getattr(model, "cfg", None) if actualized_cfg is not None: if hasattr(actualized_cfg, "pretrained_model_args"): model_cfg.pretrained_model_args = actualized_cfg.pretrained_model_args return model def reduce_metrics(self, logging_outputs, criterion): super().reduce_metrics(logging_outputs, criterion) if "correct" in logging_outputs[0]: zero = torch.scalar_tensor(0.0) correct = sum(log.get("correct", zero) for log in logging_outputs) metrics.log_scalar_sum("_correct", correct) metrics.log_derived( "accuracy", lambda meters: 100 * meters["_correct"].sum / meters["sample_size"].sum, ) ================================================ FILE: examples/data2vec/tasks/image_pretraining.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
import logging import sys import os.path as osp from dataclasses import dataclass, field from typing import List from omegaconf import MISSING import torch from torchvision import transforms from fairseq.dataclass import FairseqDataclass from fairseq.tasks import FairseqTask, register_task try: from ..data import ImageDataset except: sys.path.append("..") from data import ImageDataset logger = logging.getLogger(__name__) IMG_EXTENSIONS = { ".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp", } @dataclass class ImagePretrainingConfig(FairseqDataclass): data: str = field(default=MISSING, metadata={"help": "path to data directory"}) input_size: int = 224 normalization_mean: List[float] = (0.485, 0.456, 0.406) normalization_std: List[float] = (0.229, 0.224, 0.225) @register_task("image_pretraining", dataclass=ImagePretrainingConfig) class ImagePretrainingTask(FairseqTask): """ """ cfg: ImagePretrainingConfig @classmethod def setup_task(cls, cfg: ImagePretrainingConfig, **kwargs): """Setup the task (e.g., load dictionaries). 
Args: cfg (AudioPretrainingConfig): configuration of this task """ return cls(cfg) def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): data_path = self.cfg.data cfg = task_cfg or self.cfg path_with_split = osp.join(data_path, split) if osp.exists(path_with_split): data_path = path_with_split transform = transforms.Compose( [ transforms.ColorJitter(0.4, 0.4, 0.4), transforms.RandomHorizontalFlip(p=0.5), transforms.RandomResizedCrop( size=cfg.input_size, interpolation=transforms.InterpolationMode.BICUBIC, ), transforms.ToTensor(), transforms.Normalize( mean=torch.tensor(cfg.normalization_mean), std=torch.tensor(cfg.normalization_std), ), ] ) logger.info(transform) self.datasets[split] = ImageDataset( root=data_path, extensions=IMG_EXTENSIONS, load_classes=False, transform=transform, ) @property def source_dictionary(self): return None @property def target_dictionary(self): return None def max_positions(self): """Maximum input length supported by the encoder.""" return sys.maxsize, sys.maxsize ================================================ FILE: examples/data2vec/tasks/mae_image_classification.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
import logging import sys import torch from typing import Optional from dataclasses import dataclass, field from omegaconf import MISSING from fairseq.dataclass import FairseqDataclass from fairseq.tasks import FairseqTask, register_task from fairseq.logging import metrics try: from ..data import MaeFinetuningImageDataset except: sys.path.append("..") from data import MaeFinetuningImageDataset logger = logging.getLogger(__name__) @dataclass class MaeImageClassificationConfig(FairseqDataclass): data: str = field(default=MISSING, metadata={"help": "path to data directory"}) input_size: int = 224 local_cache_path: Optional[str] = None rebuild_batches: bool = True @register_task("mae_image_classification", dataclass=MaeImageClassificationConfig) class MaeImageClassificationTask(FairseqTask): """ """ cfg: MaeImageClassificationConfig @classmethod def setup_task(cls, cfg: MaeImageClassificationConfig, **kwargs): """Setup the task (e.g., load dictionaries). Args: cfg (AudioPretrainingConfig): configuration of this task """ return cls(cfg) def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): data_path = self.cfg.data cfg = task_cfg or self.cfg self.datasets[split] = MaeFinetuningImageDataset( root=data_path, split=split, is_train=split == "train", input_size=cfg.input_size, local_cache_path=cfg.local_cache_path, shuffle=split == "train", ) def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False): model = super().build_model(model_cfg, from_checkpoint) actualized_cfg = getattr(model, "cfg", None) if actualized_cfg is not None: if hasattr(actualized_cfg, "pretrained_model_args"): model_cfg.pretrained_model_args = actualized_cfg.pretrained_model_args return model def reduce_metrics(self, logging_outputs, criterion): super().reduce_metrics(logging_outputs, criterion) if "correct" in logging_outputs[0]: zero = torch.scalar_tensor(0.0) correct = sum(log.get("correct", zero) for log in logging_outputs) 
metrics.log_scalar_sum("_correct", correct) metrics.log_derived( "accuracy", lambda meters: 100 * meters["_correct"].sum / meters["sample_size"].sum, ) @property def source_dictionary(self): return None @property def target_dictionary(self): return None def max_positions(self): """Maximum input length supported by the encoder.""" return sys.maxsize, sys.maxsize ================================================ FILE: examples/data2vec/tasks/mae_image_pretraining.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. import logging import sys from typing import Optional, List from dataclasses import dataclass, field from omegaconf import MISSING, II from fairseq.data import SubsampleDataset from fairseq.dataclass import FairseqDataclass from fairseq.tasks import FairseqTask, register_task try: from ..data import MaeImageDataset except: sys.path.append("..") from data import MaeImageDataset logger = logging.getLogger(__name__) @dataclass class ImageMaskingConfig: patch_size: int = II("model.modalities.image.patch_size") mask_prob: float = II("model.modalities.image.mask_prob") mask_prob_adjust: float = II("model.modalities.image.mask_prob_adjust") mask_length: int = II("model.modalities.image.mask_length") inverse_mask: bool = II("model.modalities.image.inverse_mask") mask_dropout: float = II("model.modalities.image.mask_dropout") clone_batch: int = II("model.clone_batch") expand_adjacent: bool = False non_overlapping: bool = False @dataclass class MaeImagePretrainingConfig(FairseqDataclass): data: str = field(default=MISSING, metadata={"help": "path to data directory"}) multi_data: Optional[List[str]] = None input_size: int = 224 local_cache_path: Optional[str] = None key: str = "imgs" 
beit_transforms: bool = False target_transform: bool = False no_transform: bool = False rebuild_batches: bool = True precompute_mask_config: Optional[ImageMaskingConfig] = None subsample: float = 1 seed: int = II("common.seed") dataset_type: str = "imagefolder" @register_task("mae_image_pretraining", dataclass=MaeImagePretrainingConfig) class MaeImagePretrainingTask(FairseqTask): """ """ cfg: MaeImagePretrainingConfig @classmethod def setup_task(cls, cfg: MaeImagePretrainingConfig, **kwargs): """Setup the task (e.g., load dictionaries). Args: cfg (AudioPretrainingConfig): configuration of this task """ return cls(cfg) def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs): data_path = self.cfg.data cfg = task_cfg or self.cfg compute_mask = cfg.precompute_mask_config is not None mask_args = {} if compute_mask: mask_args = cfg.precompute_mask_config self.datasets[split] = MaeImageDataset( root=data_path if cfg.multi_data is None else cfg.multi_data, split=split, input_size=cfg.input_size, local_cache_path=cfg.local_cache_path, key=cfg.key, beit_transforms=cfg.beit_transforms, target_transform=cfg.target_transform, no_transform=cfg.no_transform, compute_mask=compute_mask, dataset_type=cfg.dataset_type, **mask_args, ) if cfg.subsample < 1: self.datasets[split] = SubsampleDataset( self.datasets[split], cfg.subsample, shuffle=True, seed=cfg.seed, ) @property def source_dictionary(self): return None @property def target_dictionary(self): return None def max_positions(self): """Maximum input length supported by the encoder.""" return sys.maxsize, sys.maxsize ================================================ FILE: examples/data2vec/tasks/multimodal.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. 
# An additional grant of patent rights
# can be found in the PATENTS file in the same directory.

import sys

from dataclasses import dataclass
from typing import Optional, List
from omegaconf import II

from fairseq.data.iterators import GroupedEpochBatchIterator

from fairseq.dataclass import FairseqDataclass
from fairseq.tasks import FairseqTask, register_task
from fairseq.tasks.audio_pretraining import AudioPretrainingConfig, AudioPretrainingTask
from fairseq.tasks.masked_lm import MaskedLMConfig, MaskedLMTask
from .mae_image_pretraining import MaeImagePretrainingConfig, MaeImagePretrainingTask
from examples.data2vec.data.modality import Modality

from fairseq.data.audio.multi_modality_dataset import (
    MultiModalityDataset,
    ModalityDatasetItem,
)


@dataclass
class MultimodalPretrainingConfig(FairseqDataclass):
    # Per-modality sub-task configs; a modality is enabled when its config is
    # not None.
    audio: Optional[AudioPretrainingConfig] = None
    image: Optional[MaeImagePretrainingConfig] = None
    text: Optional[MaskedLMConfig] = None

    # Ratios handed to ``dataset.get_batch_samplers`` (one per loaded modality).
    audio_ratio: float = 1
    image_ratio: float = 1
    text_ratio: float = 1

    # Interpolated from the global dataset / optimization configuration.
    max_tokens: Optional[int] = II("dataset.max_tokens")
    batch_size: Optional[int] = II("dataset.batch_size")
    update_freq: List[int] = II("optimization.update_freq")

    rebuild_batches: bool = True


@register_task("multimodal_pretraining", dataclass=MultimodalPretrainingConfig)
class MultimodalPretrainingTask(FairseqTask):
    """Task that wraps optional audio, image and text sub-tasks.

    Each sub-task is only instantiated when its configuration is present; the
    loaded per-modality datasets are combined into a ``MultiModalityDataset``.
    """

    cfg: MultimodalPretrainingConfig

    def __init__(self, cfg: MultimodalPretrainingConfig):
        super().__init__(cfg)
        self.audio_task = (
            AudioPretrainingTask(cfg.audio) if cfg.audio is not None else None
        )
        self.image_task = (
            MaeImagePretrainingTask(cfg.image) if cfg.image is not None else None
        )
        self.text_task = MaskedLMTask(cfg.text) if cfg.text is not None else None

        # Filled by load_dataset; one ratio per modality that was loaded.
        self.mult_ratios = []

    @classmethod
    def setup_task(cls, cfg: MultimodalPretrainingConfig, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            cfg (MultimodalPretrainingConfig): configuration of this task
        """

        return cls(cfg)

    def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs):
        """Load ``split`` for every enabled sub-task and combine the results.

        ``self.mult_ratios`` is rebuilt on every call, in the order audio,
        image, text, skipping modalities whose sub-task is ``None``.
        NOTE(review): ``task_cfg`` and ``kwargs`` are not read here.
        """
        datasets = []
        self.mult_ratios = []

        def load_ds(task, name, ratio):
            # Load the split on one sub-task and register it with its ratio.
            if task is not None:
                task.load_dataset(split)
                ds = ModalityDatasetItem(
                    datasetname=name,
                    dataset=task.dataset(split),
                    max_positions=task.max_positions(),
                    max_tokens=self.cfg.max_tokens,
                    max_sentences=self.cfg.batch_size,
                )
                datasets.append(ds)
                self.mult_ratios.append(ratio)

        load_ds(self.audio_task, Modality.AUDIO, self.cfg.audio_ratio)
        load_ds(self.image_task, Modality.IMAGE, self.cfg.image_ratio)
        load_ds(self.text_task, Modality.TEXT, self.cfg.text_ratio)

        # At least one modality must be configured.
        assert len(datasets) > 0

        self.datasets[split] = MultiModalityDataset(datasets)

    @property
    def supported_modalities(self):
        """Modalities whose configuration is set, in the order text, audio, image."""
        modalities = []
        if self.cfg.text is not None:
            modalities.append(Modality.TEXT)
        if self.cfg.audio is not None:
            modalities.append(Modality.AUDIO)
        if self.cfg.image is not None:
            modalities.append(Modality.IMAGE)

        return modalities

    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=0,
        data_buffer_size=0,
        disable_iterator_cache=False,
        skip_remainder_batch=False,
        grouped_shuffling=False,
        update_epoch_batch_itr=False,
    ):
        """Return a ``GroupedEpochBatchIterator`` over ``dataset``.

        Batch samplers come from ``dataset.get_batch_samplers`` using the
        ratios recorded by ``load_dataset``. The ``max_tokens``,
        ``max_sentences``, ``max_positions``, ``ignore_invalid_inputs``,
        ``disable_iterator_cache``, ``grouped_shuffling`` and
        ``update_epoch_batch_itr`` arguments are accepted but not read in
        this method.
        """

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        batch_samplers = dataset.get_batch_samplers(
            self.mult_ratios, required_batch_size_multiple, seed
        )

        # return a reusable, sharded iterator
        epoch_iter = GroupedEpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_samplers=batch_samplers,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            mult_rate=max(self.cfg.update_freq),
            buffer_size=data_buffer_size,
            skip_remainder_batch=skip_remainder_batch,
        )
        self.dataset_to_epoch_iter[dataset] = {}  # refresh it every epoch
        return epoch_iter

    @property
    def source_dictionary(self):
        """This task has no source dictionary."""
        return None

    @property
    def target_dictionary(self):
        """This task has no target dictionary."""
        return None

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return sys.maxsize, sys.maxsize


================================================
FILE: examples/discriminative_reranking_nmt/README.md
================================================
# Discriminative Reranking for Neural Machine Translation
https://aclanthology.org/2021.acl-long.563/

This folder contains source code for training DrNMT, a discriminatively trained reranker for neural machine translation.

## Data preparation
1. Follow the instructions under `examples/translation` to build a base MT model. Prepare three files, one with source sentences, one with ground truth target sentences, and one with hypotheses generated from the base MT model. Each line in the file contains one sentence in raw text (i.e. no sentencepiece, etc.). Below is an example of the files with _N_ hypotheses for each source sentence.

```
# Example of the source sentence file: (The file should contain L lines.)

source_sentence_1
source_sentence_2
source_sentence_3
...
source_sentence_L

# Example of the target sentence file: (The file should contain L lines.)

target_sentence_1
target_sentence_2
target_sentence_3
...
target_sentence_L

# Example of the hypotheses file: (The file should contain L*N lines.)

source_sentence_1_hypo_1
source_sentence_1_hypo_2
...
source_sentence_1_hypo_N
source_sentence_2_hypo_1
...
source_sentence_2_hypo_N
...
source_sentence_L_hypo_1
...
source_sentence_L_hypo_N
```

2. Download the [XLMR model](https://github.com/fairinternal/fairseq-py/tree/main/examples/xlmr#pre-trained-models).
```
wget https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz
tar zxvf xlmr.base.tar.gz

# The folder should contain dict.txt, model.pt and sentencepiece.bpe.model.
```

3. Prepare scores and BPE data.
* `N`: Number of hypotheses per each source sentence. We use 50 in the paper.
* `SPLIT`: Name of the data split, i.e. train, valid, test. Use split_name, split_name1, split_name2, ..., if there are multiple datasets for a split, e.g. train, train1, valid, valid1.
* `NUM_SHARDS`: Number of shards. Set this to 1 for non-train splits.
* `METRIC`: The metric for DrNMT to optimize for. We support either `bleu` or `ter`.
```
# For each data split, e.g. train, valid, test, etc., run the following:

SOURCE_FILE=/path/to/source_sentence_file
TARGET_FILE=/path/to/target_sentence_file
HYPO_FILE=/path/to/hypo_file
XLMR_DIR=/path/to/xlmr
OUTPUT_DIR=/path/to/output

python scripts/prep_data.py \
    --input-source ${SOURCE_FILE} \
    --input-target ${TARGET_FILE} \
    --input-hypo ${HYPO_FILE} \
    --output-dir ${OUTPUT_DIR} \
    --split $SPLIT --beam $N \
    --sentencepiece-model ${XLMR_DIR}/sentencepiece.bpe.model \
    --metric $METRIC \
    --num-shards ${NUM_SHARDS}

# The script will create ${OUTPUT_DIR}/$METRIC with ${NUM_SHARDS} splits.
# Under split*/input_src, split*/input_tgt and split*/$METRIC, there will be $SPLIT.bpe and $SPLIT.$METRIC files, respectively.

```

4. Pre-process the data into fairseq format.
```
# use comma to separate if there is more than one train or valid set
for suffix in src tgt ; do
    fairseq-preprocess --only-source \
        --trainpref ${OUTPUT_DIR}/$METRIC/split1/input_${suffix}/train.bpe \
        --validpref ${OUTPUT_DIR}/$METRIC/split1/input_${suffix}/valid.bpe \
        --destdir ${OUTPUT_DIR}/$METRIC/split1/input_${suffix} \
        --workers 60 \
        --srcdict ${XLMR_DIR}/dict.txt
done

for i in `seq 2 ${NUM_SHARDS}`; do
    for suffix in src tgt ; do
        fairseq-preprocess --only-source \
            --trainpref ${OUTPUT_DIR}/$METRIC/split${i}/input_${suffix}/train.bpe \
            --destdir ${OUTPUT_DIR}/$METRIC/split${i}/input_${suffix} \
            --workers 60 \
            --srcdict ${XLMR_DIR}/dict.txt

        ln -s ${OUTPUT_DIR}/$METRIC/split1/input_${suffix}/valid* ${OUTPUT_DIR}/$METRIC/split${i}/input_${suffix}/.
    done

    ln -s ${OUTPUT_DIR}/$METRIC/split1/$METRIC/valid* ${OUTPUT_DIR}/$METRIC/split${i}/$METRIC/.
done
```

## Training

```
EXP_DIR=/path/to/exp

# An example of training the model with the config for De-En experiment in the paper.
# The config uses 16 GPUs and 50 hypotheses.
# For training with fewer GPUs, set
# distributed_training.distributed_world_size=k +optimization.update_freq='[x]' where x = 16/k
# For training with fewer hypotheses, set
# task.mt_beam=N dataset.batch_size=N dataset.required_batch_size_multiple=N

fairseq-hydra-train -m \
    --config-dir config/ --config-name deen \
    task.data=${OUTPUT_DIR}/$METRIC/split1/ \
    task.num_data_splits=${NUM_SHARDS} \
    model.pretrained_model=${XLMR_DIR}/model.pt \
    common.user_dir=${FAIRSEQ_ROOT}/examples/discriminative_reranking_nmt \
    checkpoint.save_dir=${EXP_DIR}

```

## Inference & scoring
Perform DrNMT reranking (fw + reranker score)
1. Tune weights on valid sets.
```
# generate N hypotheses with the base MT model (fw score)
VALID_SOURCE_FILE=/path/to/source_sentences # one sentence per line, converted to the sentencepiece used by the base MT model
VALID_TARGET_FILE=/path/to/target_sentences # one sentence per line in raw text, i.e. no sentencepiece and tokenization
MT_MODEL=/path/to/mt_model
MT_DATA_PATH=/path/to/mt_data

cat ${VALID_SOURCE_FILE} | \
    fairseq-interactive ${MT_DATA_PATH} \
    --max-tokens 4000 --buffer-size 16 \
    --num-workers 32 --path ${MT_MODEL} \
    --beam $N --nbest $N \
    --post-process sentencepiece &> valid-hypo.out

# replace "bleu" with "ter" to optimize for TER
python drnmt_rerank.py \
    ${OUTPUT_DIR}/$METRIC/split1/ \
    --path ${EXP_DIR}/checkpoint_best.pt \
    --in-text valid-hypo.out \
    --results-path ${EXP_DIR} \
    --gen-subset valid \
    --target-text ${VALID_TARGET_FILE} \
    --user-dir ${FAIRSEQ_ROOT}/examples/discriminative_reranking_nmt \
    --bpe sentencepiece \
    --sentencepiece-model ${XLMR_DIR}/sentencepiece.bpe.model \
    --beam $N \
    --batch-size $N \
    --metric bleu \
    --tune
```

2.
Apply best weights on test sets
```
# generate N hypotheses with the base MT model (fw score)
TEST_SOURCE_FILE=/path/to/source_sentences # one sentence per line, converted to the sentencepiece used by the base MT model

cat ${TEST_SOURCE_FILE} | \
    fairseq-interactive ${MT_DATA_PATH} \
    --max-tokens 4000 --buffer-size 16 \
    --num-workers 32 --path ${MT_MODEL} \
    --beam $N --nbest $N \
    --post-process sentencepiece &> test-hypo.out

# replace "bleu" with "ter" to evaluate TER
# Add --target-text for evaluating BLEU/TER,
# otherwise the script will only generate the hypotheses with the highest scores.
python drnmt_rerank.py \
    ${OUTPUT_DIR}/$METRIC/split1/ \
    --path ${EXP_DIR}/checkpoint_best.pt \
    --in-text test-hypo.out \
    --results-path ${EXP_DIR} \
    --gen-subset test \
    --user-dir ${FAIRSEQ_ROOT}/examples/discriminative_reranking_nmt \
    --bpe sentencepiece \
    --sentencepiece-model ${XLMR_DIR}/sentencepiece.bpe.model \
    --beam $N \
    --batch-size $N \
    --metric bleu \
    --fw-weight ${BEST_FW_WEIGHT} \
    --lenpen ${BEST_LENPEN}
```

## Citation
```bibtex
@inproceedings{lee2021discriminative,
  title={Discriminative Reranking for Neural Machine Translation},
  author={Lee, Ann and Auli, Michael and Ranzato, Marc'Aurelio},
  booktitle={ACL},
  year={2021}
}
```


================================================
FILE: examples/discriminative_reranking_nmt/__init__.py
================================================
from . import criterions, models, tasks  # noqa


================================================
FILE: examples/discriminative_reranking_nmt/config/deen.yaml
================================================
# @package _group_

common:
  fp16: true
  log_format: json
  log_interval: 50
  seed: 2

checkpoint:
  no_epoch_checkpoints: true
  best_checkpoint_metric: bleu
  maximize_best_checkpoint_metric: true

task:
  _name: discriminative_reranking_nmt
  data: ???
  num_data_splits: ???
include_src: true mt_beam: 50 eval_target_metric: true target_metric: bleu dataset: batch_size: 50 num_workers: 6 required_batch_size_multiple: 50 valid_subset: ??? criterion: _name: kl_divergence_rereanking target_dist_norm: minmax temperature: 0.5 optimization: max_epoch: 200 lr: [0.00005] update_freq: [32] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 8000 total_num_update: 320000 model: _name: discriminative_nmt_reranker pretrained_model: ??? classifier_dropout: 0.2 distributed_training: ddp_backend: no_c10d distributed_world_size: 16 ================================================ FILE: examples/discriminative_reranking_nmt/criterions/__init__.py ================================================ from .discriminative_reranking_criterion import KLDivergenceRerankingCriterion __all__ = [ "KLDivergenceRerankingCriterion", ] ================================================ FILE: examples/discriminative_reranking_nmt/criterions/discriminative_reranking_criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import math from dataclasses import dataclass, field import torch import torch.nn.functional as F from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import ChoiceEnum, FairseqDataclass _EPSILON = torch.finfo(torch.float32).eps TARGET_DIST_NORM_CHOICES = ChoiceEnum(["none", "minmax"]) @dataclass class KLDivergenceRerankingCriterionConfig(FairseqDataclass): target_dist_norm: TARGET_DIST_NORM_CHOICES = field( default="none", metadata={"help": "method to normalize the range of target scores"}, ) temperature: float = field( default=1.0, metadata={"help": "temperature in softmax for target distributions"}, ) forward_batch_size: int = field( default=32, metadata={ "help": "number of hypotheses per batch for model forward (set a value smaller than --mt-beam to avoid OOM when training with a large beam size)" }, ) @register_criterion( "kl_divergence_rereanking", dataclass=KLDivergenceRerankingCriterionConfig ) class KLDivergenceRerankingCriterion(FairseqCriterion): def __init__( self, task, target_dist_norm, temperature, forward_batch_size, ): super().__init__(task) self.target_dist_norm = target_dist_norm self.temperature = temperature self.forward_batch_size = forward_batch_size def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ sample_size = sample["id"].numel() assert sample_size % self.task.cfg.mt_beam == 0, ( f"sample_size ({sample_size}) cannot be divided by beam size ({self.task.cfg.mt_beam})." f"Please set --required-batch-size-multiple={self.task.cfg.mt_beam}." 
) # split into smaller batches for model forward batch_out = [] for i in range(0, sample_size, self.forward_batch_size): j = min(i + self.forward_batch_size, sample_size) out = model( src_tokens=sample["net_input"]["src_tokens"][i:j, :], src_lengths=sample["net_input"]["src_lengths"][i:j], ) batch_out.append( model.sentence_forward(out, sample["net_input"]["src_tokens"][i:j, :]) ) batch_out = torch.cat(batch_out, dim=0).view( self.task.cfg.mt_beam, sample_size // self.task.cfg.mt_beam, -1 ) # T x B x C if model.joint_classification == "sent": batch_out = model.joint_forward(batch_out) scores = model.classification_forward(batch_out.view(sample_size, 1, -1)).view( -1, self.task.cfg.mt_beam ) # input: B x T x C loss = self.compute_kl_loss( scores, sample["target"][:, 0].view(-1, self.task.cfg.mt_beam) ) sample_size = sample_size // self.task.cfg.mt_beam logging_output = { "loss": loss.detach(), "ntokens": sample["ntokens"], "nsentences": sample_size * self.task.cfg.mt_beam, "sample_size": sample_size, "scores": scores.detach(), } return loss, sample_size, logging_output def compute_kl_loss(self, logits, target): norm_target = target if self.target_dist_norm == "minmax": min_v = torch.min(target, 1, keepdim=True).values max_v = torch.max(target, 1, keepdim=True).values norm_target = (target - min_v) / (max_v - min_v + _EPSILON) target_dist = F.softmax( norm_target / self.temperature, dim=-1, dtype=torch.float32 ) model_dist = F.log_softmax(logits, dim=-1, dtype=torch.float32) loss = -(target_dist * model_dist - target_dist * target_dist.log()).sum() return loss @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) sample_size = utils.item( sum(log.get("sample_size", 0) for log in logging_outputs) ) loss = loss_sum / sample_size / math.log(2) metrics.log_scalar("loss", loss, sample_size, round=3) @staticmethod def 
logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True ================================================ FILE: examples/discriminative_reranking_nmt/drnmt_rerank.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Score raw text with a trained model. """ from collections import namedtuple import logging from multiprocessing import Pool import sys import os import random import numpy as np import sacrebleu import torch from fairseq import checkpoint_utils, options, utils logger = logging.getLogger("fairseq_cli.drnmt_rerank") logger.setLevel(logging.INFO) Batch = namedtuple("Batch", "ids src_tokens src_lengths") pool_init_variables = {} def init_loaded_scores(mt_scores, model_scores, hyp, ref): global pool_init_variables pool_init_variables["mt_scores"] = mt_scores pool_init_variables["model_scores"] = model_scores pool_init_variables["hyp"] = hyp pool_init_variables["ref"] = ref def parse_fairseq_gen(filename, task): source = {} hypos = {} scores = {} with open(filename, "r", encoding="utf-8") as f: for line in f: line = line.strip() if line.startswith("S-"): # source uid, text = line.split("\t", 1) uid = int(uid[2:]) source[uid] = text elif line.startswith("D-"): # hypo uid, score, text = line.split("\t", 2) uid = int(uid[2:]) if uid not in hypos: hypos[uid] = [] scores[uid] = [] hypos[uid].append(text) scores[uid].append(float(score)) else: continue source_out = [source[i] for i in range(len(hypos))] hypos_out = [h for i in range(len(hypos)) for h in hypos[i]] scores_out = [s for i in range(len(scores)) for s in scores[i]] return source_out, hypos_out, scores_out def 
read_target(filename): with open(filename, "r", encoding="utf-8") as f: output = [line.strip() for line in f] return output def make_batches(args, src, hyp, task, max_positions, encode_fn): assert len(src) * args.beam == len( hyp ), f"Expect {len(src) * args.beam} hypotheses for {len(src)} source sentences with beam size {args.beam}. Got {len(hyp)} hypotheses intead." hyp_encode = [ task.source_dictionary.encode_line(encode_fn(h), add_if_not_exist=False).long() for h in hyp ] if task.cfg.include_src: src_encode = [ task.source_dictionary.encode_line( encode_fn(s), add_if_not_exist=False ).long() for s in src ] tokens = [(src_encode[i // args.beam], h) for i, h in enumerate(hyp_encode)] lengths = [(t1.numel(), t2.numel()) for t1, t2 in tokens] else: tokens = [(h,) for h in hyp_encode] lengths = [(h.numel(),) for h in hyp_encode] itr = task.get_batch_iterator( dataset=task.build_dataset_for_inference(tokens, lengths), max_tokens=args.max_tokens, max_sentences=args.batch_size, max_positions=max_positions, ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test, ).next_epoch_itr(shuffle=False) for batch in itr: yield Batch( ids=batch["id"], src_tokens=batch["net_input"]["src_tokens"], src_lengths=batch["net_input"]["src_lengths"], ) def decode_rerank_scores(args): if args.max_tokens is None and args.batch_size is None: args.batch_size = 1 logger.info(args) use_cuda = torch.cuda.is_available() and not args.cpu # Load ensemble logger.info("loading model(s) from {}".format(args.path)) models, _model_args, task = checkpoint_utils.load_model_ensemble_and_task( [args.path], arg_overrides=eval(args.model_overrides), ) for model in models: if args.fp16: model.half() if use_cuda: model.cuda() # Initialize generator generator = task.build_generator(args) # Handle tokenization and BPE tokenizer = task.build_tokenizer(args) bpe = task.build_bpe(args) def encode_fn(x): if tokenizer is not None: x = tokenizer.encode(x) if bpe is not None: x = bpe.encode(x) return x 
max_positions = utils.resolve_max_positions( task.max_positions(), *[model.max_positions() for model in models] ) src, hyp, mt_scores = parse_fairseq_gen(args.in_text, task) model_scores = {} logger.info("decode reranker score") for batch in make_batches(args, src, hyp, task, max_positions, encode_fn): src_tokens = batch.src_tokens src_lengths = batch.src_lengths if use_cuda: src_tokens = src_tokens.cuda() src_lengths = src_lengths.cuda() sample = { "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths}, } scores = task.inference_step(generator, models, sample) for id, sc in zip(batch.ids.tolist(), scores.tolist()): model_scores[id] = sc[0] model_scores = [model_scores[i] for i in range(len(model_scores))] return src, hyp, mt_scores, model_scores def get_score(mt_s, md_s, w1, lp, tgt_len): return mt_s / (tgt_len ** lp) * w1 + md_s def get_best_hyps(mt_scores, md_scores, hypos, fw_weight, lenpen, beam): assert len(mt_scores) == len(md_scores) and len(mt_scores) == len(hypos) hypo_scores = [] best_hypos = [] best_scores = [] offset = 0 for i in range(len(hypos)): tgt_len = len(hypos[i].split()) hypo_scores.append( get_score(mt_scores[i], md_scores[i], fw_weight, lenpen, tgt_len) ) if (i + 1) % beam == 0: max_i = np.argmax(hypo_scores) best_hypos.append(hypos[offset + max_i]) best_scores.append(hypo_scores[max_i]) hypo_scores = [] offset += beam return best_hypos, best_scores def eval_metric(args, hypos, ref): if args.metric == "bleu": score = sacrebleu.corpus_bleu(hypos, [ref]).score else: score = sacrebleu.corpus_ter(hypos, [ref]).score return score def score_target_hypo(args, fw_weight, lp): mt_scores = pool_init_variables["mt_scores"] model_scores = pool_init_variables["model_scores"] hyp = pool_init_variables["hyp"] ref = pool_init_variables["ref"] best_hypos, _ = get_best_hyps( mt_scores, model_scores, hyp, fw_weight, lp, args.beam ) rerank_eval = None if ref: rerank_eval = eval_metric(args, best_hypos, ref) print(f"fw_weight {fw_weight}, lenpen 
def score_target_hypo(args, fw_weight, lp):
    """Evaluate one (fw_weight, lenpen) setting inside a pool worker.

    Reads the scores, hypotheses and references that the pool initializer
    stored in ``pool_init_variables``, reranks with the given weights and
    returns the resulting metric, or ``None`` when no reference is available.
    """
    shared = pool_init_variables
    mt_scores, model_scores, hyp, ref = (
        shared["mt_scores"],
        shared["model_scores"],
        shared["hyp"],
        shared["ref"],
    )
    best_hypos, _ = get_best_hyps(
        mt_scores, model_scores, hyp, fw_weight, lp, args.beam
    )
    if not ref:
        return None

    rerank_eval = eval_metric(args, best_hypos, ref)
    print(f"fw_weight {fw_weight}, lenpen {lp}, eval {rerank_eval}")
    return rerank_eval


def print_result(best_scores, best_hypos, output_file):
    """Write one ``index<TAB>score<TAB>hypothesis`` line per selected hypothesis."""
    for i, pair in enumerate(zip(best_scores, best_hypos)):
        s, h = pair
        print(f"{i}\t{s}\t{h}", file=output_file)


def main(args):
    """Rerank the hypotheses in ``args.in_text`` and print or save the winners.

    With ``--tune`` the forward-score weight and length penalty are chosen by
    random search against ``--target-text``; otherwise ``--fw-weight`` and
    ``--lenpen`` are applied as given.
    """
    utils.import_user_module(args)
    src, hyp, mt_scores, model_scores = decode_rerank_scores(args)

    assert (
        not args.tune or args.target_text is not None
    ), "--target-text has to be set when tuning weights"

    have_reference = bool(args.target_text)
    if have_reference:
        ref = read_target(args.target_text)
        assert len(src) == len(
            ref
        ), f"different numbers of source and target sentences ({len(src)} vs. {len(ref)})"

        # baseline: the first hypothesis of every beam
        first_of_each_beam = [
            hyp[start] for start in range(0, len(hyp), args.beam)
        ]
        orig_eval = eval_metric(args, first_of_each_beam, ref)

    if not args.tune:
        assert (
            args.lenpen is not None and args.fw_weight is not None
        ), "--lenpen and --fw-weight should be set"
        best_fw_weight, best_lenpen = args.fw_weight, args.lenpen
    else:
        logger.info("tune weights for reranking")

        # draw (fw_weight, lenpen) pairs; the weight is always sampled first
        trials = []
        for _ in range(args.num_trials):
            sampled_weight = random.uniform(
                args.lower_bound_fw_weight, args.upper_bound_fw_weight
            )
            sampled_lenpen = random.uniform(
                args.lower_bound_lenpen, args.upper_bound_lenpen
            )
            trials.append([sampled_weight, sampled_lenpen])
        random_params = np.array(trials)

        logger.info("launching pool")
        with Pool(
            32,
            initializer=init_loaded_scores,
            initargs=(mt_scores, model_scores, hyp, ref),
        ) as p:
            jobs = [(args, row[0], row[1]) for row in random_params]
            rerank_scores = p.starmap(score_target_hypo, jobs)

        # BLEU is maximized, TER is minimized
        pick = np.argmax if args.metric == "bleu" else np.argmin
        best_fw_weight, best_lenpen = random_params[pick(rerank_scores)]

    best_hypos, best_scores = get_best_hyps(
        mt_scores, model_scores, hyp, best_fw_weight, best_lenpen, args.beam
    )

    if args.results_path is None:
        print_result(best_scores, best_hypos, sys.stdout)
    else:
        os.makedirs(args.results_path, exist_ok=True)
        output_path = os.path.join(
            args.results_path,
            "generate-{}.txt".format(args.gen_subset),
        )
        with open(output_path, "w", buffering=1, encoding="utf-8") as o:
            print_result(best_scores, best_hypos, o)

    if have_reference:
        rerank_eval = eval_metric(args, best_hypos, ref)
        print(f"before reranking, {args.metric.upper()}:", orig_eval)
        print(
            f"after reranking with fw_weight={best_fw_weight}, lenpen={best_lenpen}, {args.metric.upper()}:",
            rerank_eval,
        )


def cli_main():
    """Build the command-line parser for the reranking script and run ``main``."""
    parser = options.get_generation_parser(interactive=True)

    parser.add_argument(
        "--in-text",
        default=None,
        required=True,
        help="text from fairseq-interactive output, containing source sentences and hypotheses",
    )
    parser.add_argument("--target-text", default=None, help="reference text")
    parser.add_argument("--metric", type=str, choices=["bleu", "ter"], default="bleu")
    parser.add_argument(
        "--tune",
        action="store_true",
        help="if set, tune weights on fw scores and lenpen instead of applying fixed weights for reranking",
    )

    # bounds of the random-search space, registered in their original order
    search_space_options = (
        ("--lower-bound-fw-weight", 0.0, "lower bound of search space"),
        ("--upper-bound-fw-weight", 3, "upper bound of search space"),
        ("--lower-bound-lenpen", 0.0, "lower bound of search space"),
        ("--upper-bound-lenpen", 3, "upper bound of search space"),
    )
    for flag, default_value, help_text in search_space_options:
        parser.add_argument(flag, default=default_value, type=float, help=help_text)

    parser.add_argument(
        "--fw-weight", type=float, default=None, help="weight on the fw model score"
    )
    parser.add_argument(
        "--num-trials",
        default=1000,
        type=int,
        help="number of trials to do for random search",
    )

    args = options.parse_args_and_arch(parser)
    main(args)


if __name__ == "__main__":
    cli_main()
def update_init_roberta_model_state(state):
    """
    Rewrite, in place, the state_dict of a Roberta model so that its keys
    can be used to initialize the weights of the BertRanker.

    LM-head and "version" entries are dropped; every other key loses its
    leading 'encoder.sentence_encoder.' / 'decoder.sentence_encoder.' prefix
    and has '.layernorm_embedding.' renamed to '.emb_layer_norm.'.
    """
    known_prefixes = ("encoder.sentence_encoder.", "decoder.sentence_encoder.")
    # both prefixes have the same length, so one slice offset serves either
    prefix_len = len(known_prefixes[0])

    for old_key in list(state.keys()):
        if ".lm_head." in old_key or "version" in old_key:
            del state[old_key]
            continue

        assert old_key.startswith(
            known_prefixes
        ), f"Cannot recognize parameter name {old_key}"

        # the replace is a no-op for keys without a layernorm_embedding part
        renamed = old_key.replace(".layernorm_embedding.", ".emb_layer_norm.")
        state[renamed[prefix_len:]] = state[old_key]
        del state[old_key]
class BaseRanker(nn.Module):
    """Base class for rerankers; provides segment-label and position helpers."""

    def __init__(self, args, task):
        super().__init__()
        # the dictionary's EOS token is what separates segments in the input
        self.separator_token = task.dictionary.eos()
        self.padding_idx = task.dictionary.pad()

    def forward(self, src_tokens):
        raise NotImplementedError

    def get_segment_labels(self, src_tokens):
        """Return a per-token segment index computed from separator tokens.

        A token's label is the number of separators strictly before it; the
        separator itself keeps the label of the segment it closes. Padding
        positions additionally have 1 subtracted.
        """
        segment_boundary = (src_tokens == self.separator_token).long()
        segment_labels = (
            segment_boundary.cumsum(dim=1)
            - segment_boundary
            - (src_tokens == self.padding_idx).long()
        )

        return segment_labels

    def get_positions(self, src_tokens, segment_labels):
        """Return position ids that restart within every segment after the first.

        Non-padding tokens get ``(offset within segment) + 1 + padding_idx``;
        padding tokens get ``padding_idx``.
        """
        segment_positions = (
            torch.arange(src_tokens.shape[1])
            .to(src_tokens.device)
            .repeat(src_tokens.shape[0], 1)
        )
        segment_boundary = (src_tokens == self.separator_token).long()
        # column index of every separator, flattened over the batch in row order
        # NOTE(review): a separator at column 0 would be dropped by nonzero();
        # presumably inputs always start with BOS -- confirm against the task.
        _, col_idx = (segment_positions * segment_boundary).nonzero(as_tuple=True)
        col_idx = torch.cat([torch.zeros(1).type_as(col_idx), col_idx])
        # offset[b]: number of separators in all rows before row b
        offset = torch.cat(
            [
                torch.zeros(1).type_as(segment_boundary),
                segment_boundary.sum(dim=1).cumsum(dim=0)[:-1],
            ]
        )
        # for tokens outside the first segment, subtract the column of the
        # separator that closed the previous segment
        segment_positions -= col_idx[segment_labels + offset.unsqueeze(1)] * (
            segment_labels != 0
        )

        padding_mask = src_tokens.ne(self.padding_idx)
        segment_positions = (segment_positions + 1) * padding_mask.type_as(
            segment_positions
        ) + self.padding_idx

        return segment_positions


class BertRanker(BaseRanker):
    """Transformer sentence encoder with a scalar classification head.

    The encoder is either initialized from a pretrained checkpoint
    (``args.pretrained_model`` pointing to an existing file) or built from
    scratch out of the hyper-parameters in ``args``. When
    ``args.joint_classification == "sent"`` extra transformer layers
    (``joint_layers``) are created for use by :meth:`joint_forward`.
    """

    def __init__(self, args, task):
        super(BertRanker, self).__init__(args, task)

        init_model = getattr(args, "pretrained_model", "")
        self.joint_layers = nn.ModuleList()
        # Always defined so that joint_forward() works (as a pass-through)
        # even when joint classification is disabled.
        self.joint_layer_norm = None
        if os.path.isfile(init_model):
            print(f"initialize weight from {init_model}")

            from fairseq import hub_utils

            x = hub_utils.from_pretrained(
                os.path.dirname(init_model),
                checkpoint_file=os.path.basename(init_model),
            )

            in_state_dict = x["models"][0].state_dict()
            init_args = x["args"].model

            num_positional_emb = init_args.max_positions + task.dictionary.pad() + 1

            # follow the setup in roberta
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=getattr(
                    args, "encoder_layers", init_args.encoder_layers
                ),
                embedding_dim=init_args.encoder_embed_dim,
                ffn_embedding_dim=init_args.encoder_ffn_embed_dim,
                num_attention_heads=init_args.encoder_attention_heads,
                dropout=init_args.dropout,
                attention_dropout=init_args.attention_dropout,
                activation_dropout=init_args.activation_dropout,
                num_segments=2,  # add language embeddings
                max_seq_len=num_positional_emb,
                offset_positions_by_padding=False,
                encoder_normalize_before=True,
                apply_bert_init=True,
                activation_fn=init_args.activation_fn,
                freeze_embeddings=args.freeze_embeddings,
                n_trans_layers_to_freeze=args.n_trans_layers_to_freeze,
            )

            # still need to learn segment embeddings as we added a second language embedding
            # NOTE(review): the loop below sets requires_grad to False, which
            # reads as the opposite of the comment above -- confirm the intent.
            if args.freeze_embeddings:
                for p in self.model.segment_embeddings.parameters():
                    p.requires_grad = False

            update_init_roberta_model_state(in_state_dict)
            print("loading weights from the pretrained model")
            self.model.load_state_dict(
                in_state_dict, strict=False
            )  # ignore mismatch in language embeddings

            ffn_embedding_dim = init_args.encoder_ffn_embed_dim
            num_attention_heads = init_args.encoder_attention_heads
            dropout = init_args.dropout
            attention_dropout = init_args.attention_dropout
            activation_dropout = init_args.activation_dropout
            activation_fn = init_args.activation_fn

            classifier_embed_dim = getattr(
                args, "embed_dim", init_args.encoder_embed_dim
            )
            # project encoder outputs when the classifier width differs
            if classifier_embed_dim != init_args.encoder_embed_dim:
                self.transform_layer = nn.Linear(
                    init_args.encoder_embed_dim, classifier_embed_dim
                )
        else:
            self.model = TransformerSentenceEncoder(
                padding_idx=task.dictionary.pad(),
                vocab_size=len(task.dictionary),
                num_encoder_layers=args.encoder_layers,
                embedding_dim=args.embed_dim,
                ffn_embedding_dim=args.ffn_embed_dim,
                num_attention_heads=args.attention_heads,
                dropout=args.dropout,
                attention_dropout=args.attention_dropout,
                activation_dropout=args.activation_dropout,
                max_seq_len=task.max_positions()
                if task.max_positions()
                else args.tokens_per_sample,
                num_segments=2,
                offset_positions_by_padding=False,
                encoder_normalize_before=args.encoder_normalize_before,
                apply_bert_init=args.apply_bert_init,
                activation_fn=args.activation_fn,
            )

            classifier_embed_dim = args.embed_dim
            ffn_embedding_dim = args.ffn_embed_dim
            num_attention_heads = args.attention_heads
            dropout = args.dropout
            attention_dropout = args.attention_dropout
            activation_dropout = args.activation_dropout
            activation_fn = args.activation_fn

        self.joint_classification = args.joint_classification
        if args.joint_classification == "sent":
            if args.joint_normalize_before:
                self.joint_layer_norm = LayerNorm(classifier_embed_dim)

            self.joint_layers = nn.ModuleList(
                [
                    TransformerSentenceEncoderLayer(
                        embedding_dim=classifier_embed_dim,
                        ffn_embedding_dim=ffn_embedding_dim,
                        num_attention_heads=num_attention_heads,
                        dropout=dropout,
                        attention_dropout=attention_dropout,
                        activation_dropout=activation_dropout,
                        activation_fn=activation_fn,
                    )
                    for _ in range(args.num_joint_layers)
                ]
            )

        self.classifier = RobertaClassificationHead(
            classifier_embed_dim,
            classifier_embed_dim,
            1,  # num_classes
            "tanh",
            args.classifier_dropout,
        )

    def forward(self, src_tokens, src_lengths):
        """Encode ``src_tokens`` and return the last encoder state as B x T x C."""
        segment_labels = self.get_segment_labels(src_tokens)
        positions = self.get_positions(src_tokens, segment_labels)

        inner_states, _ = self.model(
            tokens=src_tokens,
            segment_labels=segment_labels,
            last_state_only=True,
            positions=positions,
        )

        return inner_states[-1].transpose(0, 1)  # T x B x C -> B x T x C

    def sentence_forward(self, encoder_out, src_tokens=None, sentence_rep="head"):
        """Pool the encoder output into one vector per sentence.

        Args:
            encoder_out: encoder states, B x T x C. Not modified.
            src_tokens: token ids matching ``encoder_out``; required for the
                "meanpool" and "maxpool" representations.
            sentence_rep: "head" takes the first position; "meanpool" and
                "maxpool" pool over non-padding tokens whose segment label is
                non-zero.

        Returns:
            Tensor of shape B x 1 x C (after ``transform_layer`` if present).
        """
        # encoder_out: B x T x C
        if sentence_rep == "head":
            x = encoder_out[:, :1, :]
        else:  # 'meanpool', 'maxpool'
            assert src_tokens is not None, "meanpool requires src_tokens input"
            segment_labels = self.get_segment_labels(src_tokens)
            padding_mask = src_tokens.ne(self.padding_idx)
            encoder_mask = segment_labels * padding_mask.type_as(segment_labels)

            if sentence_rep == "meanpool":
                ntokens = torch.sum(encoder_mask, dim=1, keepdim=True)
                x = torch.sum(
                    encoder_out * encoder_mask.unsqueeze(2), dim=1, keepdim=True
                ) / ntokens.unsqueeze(2).type_as(encoder_out)
            else:  # 'maxpool'
                # Mask on a copy: writing -inf into encoder_out in place would
                # corrupt the caller's tensor (a view of the encoder output).
                masked_out = encoder_out.masked_fill(
                    (encoder_mask == 0).unsqueeze(2), float("-inf")
                )
                x, _ = torch.max(masked_out, dim=1, keepdim=True)

        if hasattr(self, "transform_layer"):
            x = self.transform_layer(x)

        return x  # B x 1 x C

    def joint_forward(self, x):
        """Run the joint layers over ``x`` (T x B x C) and return the result."""
        # x: T x B x C
        if self.joint_layer_norm is not None:
            x = self.joint_layer_norm(x.transpose(0, 1))
            x = x.transpose(0, 1)

        for layer in self.joint_layers:
            x, _ = layer(x, self_attn_padding_mask=None)
        return x

    def classification_forward(self, x):
        """Apply the classification head to ``x`` (B x T x C)."""
        # x: B x T x C
        return self.classifier(x)
@dataclass
class DiscriminativeNMTRerankerConfig(FairseqDataclass):
    """Hyper-parameters of the discriminative NMT reranker model."""

    # path of a pretrained checkpoint; BertRanker only uses it when the file exists
    pretrained_model: str = field(
        default="", metadata={"help": "pretrained model to load"}
    )
    # one of SENTENCE_REP_CHOICES: "head", "meanpool" or "maxpool"
    sentence_rep: SENTENCE_REP_CHOICES = field(
        default="head",
        metadata={
            "help": "method to transform the output of the transformer stack to a sentence-level representation"
        },
    )

    # encoder hyper-parameters (read when the encoder is built from scratch)
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    attention_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability for attention weights"}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN"}
    )
    classifier_dropout: float = field(
        default=0.0, metadata={"help": "classifier dropout probability"}
    )
    embed_dim: int = field(default=768, metadata={"help": "embedding dimension"})
    ffn_embed_dim: int = field(
        default=2048, metadata={"help": "embedding dimension for FFN"}
    )
    encoder_layers: int = field(default=12, metadata={"help": "num encoder layers"})
    attention_heads: int = field(default=8, metadata={"help": "num attention heads"})
    encoder_normalize_before: bool = field(
        default=False, metadata={"help": "apply layernorm before each encoder block"}
    )
    apply_bert_init: bool = field(
        default=False, metadata={"help": "use custom param initialization for BERT"}
    )
    activation_fn: ACTIVATION_FN_CHOICES = field(
        default="relu", metadata={"help": "activation function to use"}
    )
    # freezing options, forwarded to the encoder when loading a pretrained model
    freeze_embeddings: bool = field(
        default=False, metadata={"help": "freeze embeddings in the pretrained model"}
    )
    n_trans_layers_to_freeze: int = field(
        default=0,
        metadata={
            "help": "number of layers to freeze in the pretrained transformer model"
        },
    )

    # joint classification
    # one of JOINT_CLASSIFICATION_CHOICES: "none" or "sent"
    joint_classification: JOINT_CLASSIFICATION_CHOICES = field(
        default="none",
        metadata={"help": "method to compute joint features for classification"},
    )
    num_joint_layers: int = field(
        default=1, metadata={"help": "number of joint layers"}
    )
    joint_normalize_before: bool = field(
        default=False,
        metadata={"help": "apply layer norm on the input to the joint layer"},
    )


@register_model(
    "discriminative_nmt_reranker", dataclass=DiscriminativeNMTRerankerConfig
)
class DiscriminativeNMTReranker(BaseFairseqModel):
    """fairseq model wrapper that delegates every step to a ``BertRanker``."""

    @classmethod
    def build_model(cls, args, task):
        """Build a ``BertRanker`` from ``args``/``task`` and wrap it."""
        model = BertRanker(args, task)
        return DiscriminativeNMTReranker(args, model)

    def __init__(self, args, model):
        super().__init__()

        self.model = model
        # pooling method handed to the ranker's sentence_forward
        self.sentence_rep = args.sentence_rep
        self.joint_classification = args.joint_classification

    def forward(self, src_tokens, src_lengths, **kwargs):
        """Encode the input with the wrapped ranker; extra kwargs are ignored."""
        return self.model(src_tokens, src_lengths)

    def sentence_forward(self, encoder_out, src_tokens):
        """Pool ``encoder_out`` using the configured sentence representation."""
        return self.model.sentence_forward(encoder_out, src_tokens, self.sentence_rep)

    def joint_forward(self, x):
        """Delegate to the wrapped ranker's ``joint_forward``."""
        return self.model.joint_forward(x)

    def classification_forward(self, x):
        """Delegate to the wrapped ranker's ``classification_forward``."""
        return self.model.classification_forward(x)
def read_text_file(filename):
    """Return the lines of a UTF-8 text file with surrounding whitespace stripped."""
    # Explicit UTF-8: the platform default encoding is not reliable for
    # multilingual text.
    with open(filename, "r", encoding="utf-8") as f:
        output = [line.strip() for line in f]

    return output


def get_bleu(in_sent, target_sent):
    """Return sentence-level BLEU statistics of ``in_sent`` as one string.

    The space-separated fields are: score, system length, reference length,
    followed by the n-gram match counts and the n-gram totals.
    """
    bleu = sacrebleu.corpus_bleu([in_sent], [[target_sent]])
    out = " ".join(
        map(str, [bleu.score, bleu.sys_len, bleu.ref_len] + bleu.counts + bleu.totals)
    )
    return out


def get_ter(in_sent, target_sent):
    """Return sentence-level TER statistics of ``in_sent`` as one string.

    The space-separated fields are: score, number of edits, reference length.
    """
    ter = sacrebleu.corpus_ter([in_sent], [[target_sent]])
    out = " ".join(map(str, [ter.score, ter.num_edits, ter.ref_length]))
    return out


def init(sp_model):
    """Load the SentencePiece model into the process-global ``sp``.

    Used as the pool initializer so that every worker owns its own processor.
    """
    global sp
    sp = spm.SentencePieceProcessor()
    sp.Load(sp_model)


def process(source_sent, target_sent, hypo_sent, metric):
    """Tokenize one source and its hypotheses and score every hypothesis.

    Args:
        source_sent: raw source sentence.
        target_sent: raw reference translation.
        hypo_sent: list of raw hypotheses for this source sentence.
        metric: "bleu" selects BLEU statistics; anything else selects TER.

    Returns:
        ``(source_bpe, hypo_bpe, score_str)`` where ``hypo_bpe`` and
        ``score_str`` are lists parallel to ``hypo_sent``.
    """
    source_bpe = " ".join(sp.EncodeAsPieces(source_sent))
    hypo_bpe = [" ".join(sp.EncodeAsPieces(h)) for h in hypo_sent]

    if metric == "bleu":
        score_str = [get_bleu(h, target_sent) for h in hypo_sent]
    else:  # ter
        score_str = [get_ter(h, target_sent) for h in hypo_sent]

    return source_bpe, hypo_bpe, score_str


def main(args):
    """Prepare reranker training data: BPE text plus per-hypothesis metric labels.

    For every shard ``k`` the sentences ``k, k + num_shards, ...`` are written
    to ``<output_dir>/<metric>/split<k+1>/{input_src,input_tgt,<metric>}/``,
    one line per hypothesis (the source line is repeated for each hypothesis).
    """
    assert (
        args.split.startswith("train") or args.num_shards == 1
    ), "--num-shards should be set to 1 for valid and test sets"
    assert (
        args.split.startswith("train")
        or args.split.startswith("valid")
        or args.split.startswith("test")
    ), "--split should be set to train[n]/valid[n]/test[n]"

    source_sents = read_text_file(args.input_source)
    target_sents = read_text_file(args.input_target)

    num_sents = len(source_sents)
    assert num_sents == len(
        target_sents
    ), f"{args.input_source} and {args.input_target} should have the same number of sentences."

    hypo_sents = read_text_file(args.input_hypo)
    assert (
        len(hypo_sents) % args.beam == 0
    ), f"Number of hypotheses ({len(hypo_sents)}) cannot be divided by beam size ({args.beam})."

    # group the flat hypothesis list into one list per source sentence
    hypo_sents = [
        hypo_sents[i : i + args.beam] for i in range(0, len(hypo_sents), args.beam)
    ]
    assert num_sents == len(
        hypo_sents
    ), f"{args.input_hypo} should contain {num_sents * args.beam} hypotheses but only has {len(hypo_sents) * args.beam}. (--beam={args.beam})"

    output_dir = args.output_dir / args.metric
    for ns in range(args.num_shards):
        print(f"processing shard {ns+1}/{args.num_shards}")
        shard_output_dir = output_dir / f"split{ns+1}"
        source_output_dir = shard_output_dir / "input_src"
        hypo_output_dir = shard_output_dir / "input_tgt"
        metric_output_dir = shard_output_dir / args.metric

        source_output_dir.mkdir(parents=True, exist_ok=True)
        hypo_output_dir.mkdir(parents=True, exist_ok=True)
        metric_output_dir.mkdir(parents=True, exist_ok=True)

        if args.n_proc > 1:
            with Pool(
                args.n_proc, initializer=init, initargs=(args.sentencepiece_model,)
            ) as p:
                output = p.starmap(
                    process,
                    [
                        (source_sents[i], target_sents[i], hypo_sents[i], args.metric)
                        for i in range(ns, num_sents, args.num_shards)
                    ],
                )
        else:
            init(args.sentencepiece_model)
            output = [
                process(source_sents[i], target_sents[i], hypo_sents[i], args.metric)
                for i in range(ns, num_sents, args.num_shards)
            ]

        # Outputs are written as UTF-8 to match how the inputs are read.
        with open(
            source_output_dir / f"{args.split}.bpe", "w", encoding="utf-8"
        ) as s_o, open(
            hypo_output_dir / f"{args.split}.bpe", "w", encoding="utf-8"
        ) as h_o, open(
            metric_output_dir / f"{args.split}.{args.metric}", "w", encoding="utf-8"
        ) as m_o:
            for source_bpe, hypo_bpe, score_str in output:
                assert len(hypo_bpe) == len(score_str)
                for h, m in zip(hypo_bpe, score_str):
                    s_o.write(f"{source_bpe}\n")
                    h_o.write(f"{h}\n")
                    m_o.write(f"{m}\n")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-source", type=Path, required=True)
    parser.add_argument("--input-target", type=Path, required=True)
    parser.add_argument("--input-hypo", type=Path, required=True)
    parser.add_argument("--output-dir", type=Path, required=True)
    parser.add_argument("--split", type=str, required=True)
    parser.add_argument("--beam", type=int, required=True)
    parser.add_argument("--sentencepiece-model", type=str, required=True)
    parser.add_argument("--metric", type=str, choices=["bleu", "ter"], default="bleu")
    parser.add_argument("--num-shards", type=int, default=1)
    parser.add_argument("--n-proc", type=int, default=8)
    args = parser.parse_args()

    main(args)
@dataclass
class DiscriminativeRerankingNMTConfig(FairseqDataclass):
    """Configuration of the discriminative reranking NMT task."""

    # root data directory; MISSING makes the option mandatory
    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
    num_data_splits: int = field(
        default=1, metadata={"help": "total number of data splits"}
    )
    no_shuffle: bool = field(
        default=False, metadata={"help": "do not shuffle training data"}
    )
    max_positions: int = field(
        default=512, metadata={"help": "number of positional embeddings to learn"}
    )
    include_src: bool = field(
        default=False, metadata={"help": "include source sentence"}
    )
    mt_beam: int = field(default=50, metadata={"help": "beam size of input hypotheses"})
    eval_target_metric: bool = field(
        default=False,
        metadata={"help": "evaluation with the target metric during validation"},
    )
    # one of TARGET_METRIC_CHOICES: "bleu" or "ter"
    target_metric: TARGET_METRIC_CHOICES = field(
        default="bleu", metadata={"help": "name of the target metric to optimize for"}
    )
    # II(...) interpolates the value from another config key
    train_subset: str = field(
        default=II("dataset.train_subset"),
        metadata={"help": "data subset to use for training (e.g. train, valid, test)"},
    )
    seed: int = field(
        default=II("common.seed"),
        metadata={"help": "pseudo random number generator seed"},
    )
class RerankerScorer(object):
    """Scores the target for a given (source (optional), target) input."""

    def __init__(self, args, mt_beam):
        # ``args`` is accepted for interface compatibility but not used here
        self.mt_beam = mt_beam

    @torch.no_grad()
    def generate(self, models, sample, **kwargs):
        """Score a batch of translations.

        Args:
            models: list holding exactly one reranker model.
            sample: batch dict; only ``sample["net_input"]`` is read.

        Returns:
            The output of the model's classification head for the batch.
        """
        net_input = sample["net_input"]

        assert len(models) == 1, "does not support model ensemble"
        model = models[0]

        bs = net_input["src_tokens"].shape[0]
        assert (
            model.joint_classification == "none" or bs % self.mt_beam == 0
        ), f"invalid batch size ({bs}) for joint classification with beam size ({self.mt_beam})"

        model.eval()
        logits = model(**net_input)

        batch_out = model.sentence_forward(logits, net_input["src_tokens"])
        if model.joint_classification == "sent":
            # NOTE(review): view() reinterprets the B-major rows as
            # (mt_beam, bs // mt_beam); confirm that this grouping matches how
            # hypotheses are laid out in the batch.
            batch_out = model.joint_forward(
                batch_out.view(self.mt_beam, bs // self.mt_beam, -1)
            )
        scores = model.classification_forward(
            batch_out.view(bs, 1, -1)
        )  # input: B x T x C

        return scores


@register_task(
    "discriminative_reranking_nmt", dataclass=DiscriminativeRerankingNMTConfig
)
class DiscriminativeRerankingNMTTask(FairseqTask):
    """
    Translation rerank task.
    The input can be either (src, tgt) sentence pairs or tgt sentence only.
    """

    cfg: DiscriminativeRerankingNMTConfig

    def __init__(self, cfg: DiscriminativeRerankingNMTConfig, data_dictionary=None):
        super().__init__(cfg)
        self.dictionary = data_dictionary
        self._max_positions = cfg.max_positions
        # args.tokens_per_sample = self._max_positions
        # self.num_classes = 1  # for model

    @classmethod
    def load_dictionary(cls, cfg, filename):
        """Load the dictionary from the filename"""
        dictionary = Dictionary.load(filename)
        # The extra symbol keeps the vocabulary aligned with a pretrained XLMR
        # model. The literal was an empty string here, which looks like a
        # stripped "<mask>" token.
        dictionary.add_symbol("<mask>")  # for loading pretrained XLMR model

        return dictionary

    @classmethod
    def setup_task(cls, cfg: DiscriminativeRerankingNMTConfig, **kwargs):
        """Create the task after loading the (joint) dictionary from ``cfg.data``."""
        # load data dictionary (assume joint dictionary)
        data_path = cfg.data
        data_dict = cls.load_dictionary(
            cfg, os.path.join(data_path, "input_src/dict.txt")
        )

        logger.info("[input] src dictionary: {} types".format(len(data_dict)))

        return DiscriminativeRerankingNMTTask(cfg, data_dict)

    def load_dataset(self, split, epoch=0, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test).

        When ``cfg.data`` ends in "1" it is treated as the first of
        ``cfg.num_data_splits`` shards and the shard is chosen from ``epoch``.
        For the training subset all of ``train``, ``train1``, ... that exist
        are concatenated, and whole beams are shuffled unless
        ``cfg.no_shuffle`` is set.
        """
        if self.cfg.data.endswith("1"):
            data_shard = (epoch - 1) % self.cfg.num_data_splits + 1
            data_path = self.cfg.data[:-1] + str(data_shard)
        else:
            data_path = self.cfg.data

        def get_path(data_type, data_split):
            return os.path.join(data_path, str(data_type), data_split)

        def make_dataset(data_type, dictionary, data_split, combine):
            split_path = get_path(data_type, data_split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                dictionary,
                combine=combine,
            )
            return dataset

        def load_split(data_split, metric):
            input_src = None
            if self.cfg.include_src:
                input_src = make_dataset(
                    "input_src", self.dictionary, data_split, combine=False
                )
                assert input_src is not None, "could not find dataset: {}".format(
                    get_path("input_src", data_split)
                )

            input_tgt = make_dataset(
                "input_tgt", self.dictionary, data_split, combine=False
            )
            assert input_tgt is not None, "could not find dataset: {}".format(
                get_path("input_tgt", data_split)
            )

            label_path = f"{get_path(metric, data_split)}.{metric}"
            assert os.path.exists(label_path), f"could not find dataset: {label_path}"

            np_labels = np.loadtxt(label_path)
            # labels are negated for TER (valid_step negates them back)
            if self.cfg.target_metric == "ter":
                np_labels = -np_labels
            label = RawLabelDataset(np_labels)

            return input_src, input_tgt, label

        src_datasets = []
        tgt_datasets = []
        label_datasets = []

        if split == self.cfg.train_subset:
            for k in itertools.count():
                split_k = "train" + (str(k) if k > 0 else "")
                prefix = os.path.join(data_path, "input_tgt", split_k)
                if not indexed_dataset.dataset_exists(prefix, impl=None):
                    if k > 0:
                        break
                    else:
                        raise FileNotFoundError(f"Dataset not found: {prefix}")
                input_src, input_tgt, label = load_split(
                    split_k, self.cfg.target_metric
                )
                src_datasets.append(input_src)
                tgt_datasets.append(input_tgt)
                label_datasets.append(label)
        else:
            input_src, input_tgt, label = load_split(split, self.cfg.target_metric)
            src_datasets.append(input_src)
            tgt_datasets.append(input_tgt)
            label_datasets.append(label)

        if len(tgt_datasets) == 1:
            input_tgt, label = tgt_datasets[0], label_datasets[0]
            if self.cfg.include_src:
                input_src = src_datasets[0]
        else:
            input_tgt = ConcatDataset(tgt_datasets)
            label = ConcatDataset(label_datasets)
            if self.cfg.include_src:
                input_src = ConcatDataset(src_datasets)

        input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)
        if self.cfg.include_src:
            input_src = PrependTokenDataset(input_src, self.dictionary.bos())
            input_src = TruncateDataset(input_src, self.cfg.max_positions)
            src_lengths = NumelDataset(input_src, reduce=False)
            src_tokens = ConcatSentencesDataset(input_src, input_tgt)
        else:
            src_tokens = PrependTokenDataset(input_tgt, self.dictionary.bos())
            src_lengths = NumelDataset(src_tokens, reduce=False)

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens": RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths": src_lengths,
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
            "target": label,
        }

        dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        assert (
            len(dataset) % self.cfg.mt_beam == 0
        ), "dataset size (%d) is not a multiple of beam size (%d)" % (
            len(dataset),
            self.cfg.mt_beam,
        )

        # no need to shuffle valid/test sets
        if not self.cfg.no_shuffle and split == self.cfg.train_subset:

            # need to keep all hypotheses together
            start_idx = np.arange(0, len(dataset), self.cfg.mt_beam)
            with data_utils.numpy_seed(self.cfg.seed + epoch):
                np.random.shuffle(start_idx)

            # expand every shuffled beam start into start, start + 1, ...
            idx = np.arange(0, self.cfg.mt_beam)
            shuffle = np.tile(idx, (len(start_idx), 1)).reshape(-1) + np.tile(
                start_idx, (self.cfg.mt_beam, 1)
            ).transpose().reshape(-1)

            dataset = SortDataset(
                dataset,
                sort_order=[shuffle],
            )

        logger.info(f"Loaded {split} with #samples: {len(dataset)}")

        self.datasets[split] = dataset
        return self.datasets[split]

    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
        """Build an unlabeled dataset from in-memory token tensors.

        When ``cfg.include_src`` is set each entry of ``src_tokens`` must hold
        two items (source first, hypothesis last); otherwise only the last
        item of each entry is used.
        """
        assert not self.cfg.include_src or len(src_tokens[0]) == 2
        input_src = None
        if self.cfg.include_src:
            input_src = TokenBlockDataset(
                [t[0] for t in src_tokens],
                [l[0] for l in src_lengths],
                block_size=None,  # ignored for "eos" break mode
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            )
            input_src = PrependTokenDataset(input_src, self.dictionary.bos())
            input_src = TruncateDataset(input_src, self.cfg.max_positions)

        input_tgt = TokenBlockDataset(
            [t[-1] for t in src_tokens],
            [l[-1] for l in src_lengths],
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode="eos",
        )
        input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)
        if self.cfg.include_src:
            src_tokens = ConcatSentencesDataset(input_src, input_tgt)
            src_lengths = NumelDataset(input_src, reduce=False)
        else:
            input_tgt = PrependTokenDataset(input_tgt, self.dictionary.bos())
            src_tokens = input_tgt
            src_lengths = NumelDataset(src_tokens, reduce=False)

        dataset = {
            "id": IdDataset(),
            "net_input": {
                "src_tokens": RightPadDataset(
                    src_tokens,
                    pad_idx=self.source_dictionary.pad(),
                ),
                "src_lengths": src_lengths,
            },
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        return NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

    def build_model(self, cfg: FairseqDataclass, from_checkpoint: bool = False):
        # ``from_checkpoint`` is accepted but not forwarded to the base class
        return super().build_model(cfg)

    def build_generator(self, args):
        """Return the scorer that stands in for a sequence generator."""
        return RerankerScorer(args, mt_beam=self.cfg.mt_beam)

    def max_positions(self):
        return self._max_positions

    @property
    def source_dictionary(self):
        return self.dictionary

    @property
    def target_dictionary(self):
        return self.dictionary

    def create_dummy_batch(self, device):
        """Return an all-zero batch of ``mt_beam`` rows, used when a sample is None.

        The target has 3 columns for TER and ``EVAL_BLEU_ORDER * 2 + 3`` for
        BLEU, the widths that ``valid_step`` asserts for each metric.
        """
        # The config has no ``eval_ter`` field; the metric is selected by
        # ``target_metric``.
        if self.cfg.target_metric == "ter":
            num_target_cols = 3
        else:
            num_target_cols = EVAL_BLEU_ORDER * 2 + 3
        dummy_target = torch.zeros(self.cfg.mt_beam, num_target_cols).long().to(device)
        return {
            "id": torch.zeros(self.cfg.mt_beam, 1).long().to(device),
            "net_input": {
                "src_tokens": torch.zeros(self.cfg.mt_beam, 4).long().to(device),
                "src_lengths": torch.ones(self.cfg.mt_beam, 1).long().to(device),
            },
            "nsentences": 0,
            "ntokens": 0,
            "target": dummy_target,
        }

    def train_step(
        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
    ):
        if ignore_grad and sample is None:
            sample = self.create_dummy_batch(model.device)

        return super().train_step(
            sample, model, criterion, optimizer, update_num, ignore_grad
        )

    def _sum_best_hypo_stats(self, sample, scores, sample_size):
        """Sum the metric statistics of the top-scored hypothesis of every beam.

        Column 0 of the target (the metric score itself) is dropped.
        """
        max_id = torch.argmax(scores, dim=1)
        # turn the within-beam argmax into a row index of the flat batch
        select_id = max_id + torch.arange(
            0, sample_size * self.cfg.mt_beam, self.cfg.mt_beam
        ).to(max_id.device)
        return sample["target"][select_id, 1:].sum(0).data

    def valid_step(self, sample, model, criterion):
        """Run validation and, if enabled, add target-metric statistics.

        When ``cfg.eval_target_metric`` is set, the sufficient statistics of
        the hypothesis with the highest score in every beam are stored in the
        logging output (under ``_bleu_*`` or ``_ter_*`` keys) so that
        ``reduce_metrics`` can compute corpus-level BLEU or TER.
        """
        if sample is None:
            sample = self.create_dummy_batch(model.device)

        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)

        if not self.cfg.eval_target_metric:
            return loss, sample_size, logging_output

        scores = logging_output["scores"]

        if self.cfg.target_metric == "bleu":
            assert sample["target"].shape[1] == EVAL_BLEU_ORDER * 2 + 3, (
                "target does not contain enough information ("
                + str(sample["target"].shape[1])
                + " columns) for evaluating BLEU"
            )

            bleu_data = self._sum_best_hypo_stats(sample, scores, sample_size)

            logging_output["_bleu_sys_len"] = bleu_data[0]
            logging_output["_bleu_ref_len"] = bleu_data[1]

            for i in range(EVAL_BLEU_ORDER):
                logging_output["_bleu_counts_" + str(i)] = bleu_data[2 + i]
                logging_output["_bleu_totals_" + str(i)] = bleu_data[
                    2 + EVAL_BLEU_ORDER + i
                ]

        elif self.cfg.target_metric == "ter":
            assert sample["target"].shape[1] == 3, (
                "target does not contain enough information ("
                + str(sample["target"].shape[1])
                + " columns) for evaluating TER"
            )

            ter_data = self._sum_best_hypo_stats(sample, scores, sample_size)
            # TER labels are stored negated by load_dataset; undo that here
            logging_output["_ter_num_edits"] = -ter_data[0]
            logging_output["_ter_ref_len"] = -ter_data[1]

        return loss, sample_size, logging_output

    def reduce_metrics(self, logging_outputs, criterion):
        """Aggregate logging outputs and register the derived BLEU/TER metric."""
        super().reduce_metrics(logging_outputs, criterion)

        if not self.cfg.eval_target_metric:
            return

        def sum_logs(key):
            return sum(log.get(key, 0) for log in logging_outputs)

        if self.cfg.target_metric == "bleu":
            counts, totals = [], []
            for i in range(EVAL_BLEU_ORDER):
                counts.append(sum_logs("_bleu_counts_" + str(i)))
                totals.append(sum_logs("_bleu_totals_" + str(i)))

            if max(totals) > 0:
                # log counts as numpy arrays -- log_scalar will sum them correctly
                metrics.log_scalar("_bleu_counts", np.array(counts))
                metrics.log_scalar("_bleu_totals", np.array(totals))
                metrics.log_scalar("_bleu_sys_len", sum_logs("_bleu_sys_len"))
                metrics.log_scalar("_bleu_ref_len", sum_logs("_bleu_ref_len"))

                def compute_bleu(meters):
                    import inspect
                    import sacrebleu

                    # the smoothing keyword is named differently across
                    # sacrebleu versions, so look it up in the signature
                    fn_sig = inspect.getfullargspec(sacrebleu.compute_bleu)[0]
                    if "smooth_method" in fn_sig:
                        smooth = {"smooth_method": "exp"}
                    else:
                        smooth = {"smooth": "exp"}
                    bleu = sacrebleu.compute_bleu(
                        correct=meters["_bleu_counts"].sum,
                        total=meters["_bleu_totals"].sum,
                        sys_len=meters["_bleu_sys_len"].sum,
                        ref_len=meters["_bleu_ref_len"].sum,
                        **smooth,
                    )
                    return round(bleu.score, 2)

                metrics.log_derived("bleu", compute_bleu)
        elif self.cfg.target_metric == "ter":
            num_edits = sum_logs("_ter_num_edits")
            ref_len = sum_logs("_ter_ref_len")

            if ref_len > 0:
                metrics.log_scalar("_ter_num_edits", num_edits)
                metrics.log_scalar("_ter_ref_len", ref_len)

                def compute_ter(meters):
                    score = meters["_ter_num_edits"].sum / meters["_ter_ref_len"].sum
                    return round(score.item(), 2)

                metrics.log_derived("ter", compute_ter)
We rigorously evaluate all components of such a complex system and conclude with an extensive model analysis and ablation study to better emphasize the architectural choices, strengths and weaknesses of the proposed method. Samples and code will be publicly available under the following link: https://speechbot.github.io/emotion. ## Installation First, create a conda virtual environment and activate it: ``` conda create -n emotion python=3.8 -y conda activate emotion ``` Then, clone this repository: ``` git clone https://github.com/facebookresearch/fairseq.git cd fairseq/examples/emotion_conversion git clone https://github.com/felixkreuk/speech-resynthesis ``` Next, download the EmoV discrete tokens: ``` wget https://dl.fbaipublicfiles.com/textless_nlp/emotion_conversion/data.tar.gz # (still in fairseq/examples/emotion_conversion) tar -xzvf data.tar.gz ``` Your `fairseq/examples/emotion_conversion` directory should look like this: ``` drwxrwxr-x 3 felixkreuk felixkreuk 0 Feb 6 2022 data drwxrwxr-x 3 felixkreuk felixkreuk 0 Sep 28 10:41 emotion_models drwxr-xr-x 3 felixkreuk felixkreuk 0 Jun 29 05:43 fairseq_models drwxr-xr-x 3 felixkreuk felixkreuk 0 Sep 28 10:41 preprocess -rw-rw-r-- 1 felixkreuk felixkreuk 11K Dec 5 09:00 README.md -rw-rw-r-- 1 felixkreuk felixkreuk 88 Mar 6 2022 requirements.txt -rw-rw-r-- 1 felixkreuk felixkreuk 13K Jun 29 06:26 synthesize.py ``` Lastly, install fairseq and the other packages: ``` pip install --editable ./ pip install -r examples/emotion_conversion/requirements.txt ``` ## Data preprocessing ### Convert your audio to discrete representations Please follow the steps described [here](https://github.com/pytorch/fairseq/tree/main/examples/hubert/simple_kmeans). To generate the same discrete representations please use the following: 1. [HuBERT checkpoint](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) 2.
k-means model at `data/hubert_base_ls960_layer9_clusters200/data_hubert_base_ls960_layer9_clusters200.bin` ### Construct data splits This step will use the discrete representations from the previous step and split them into train/valid/test sets for 3 tasks: 1. Translation model pre-training (BART language denoising) 2. Translation model training (content units emotion translation mechanism) 3. HiFiGAN model training (for synthesizing audio from discrete representations) Your processed data should be at `data/`: 1. `hubert_base_ls960_layer9_clusters200` - discrete representations extracted using HuBERT layer 9, clustered into 200 clusters. 2. `data.tsv` - a tsv file pointing to the EmoV dataset in your environment (Please edit the first line of this file according to your path). The following command will create the above splits: ``` python examples/emotion_conversion/preprocess/create_core_manifest.py \ --tsv data/data.tsv \ --emov-km data/hubert_base_ls960_layer9_clusters200/data.km \ --km data/hubert_base_ls960_layer9_clusters200/vctk.km \ --dict data/hubert_base_ls960_layer9_clusters200/dict.txt \ --manifests-dir $DATA ``` * Set `$DATA` to the directory that will contain the processed data. ### Extract F0 To train the HiFiGAN vocoder we need to first extract the F0 curves: ``` python examples/emotion_conversion/preprocess/extract_f0.py \ --tsv data/data.tsv \ --extractor pyaapt ``` ## HiFiGAN training Now we are all set to train the HiFiGAN vocoder: ``` python examples/emotion_conversion/speech-resynthesis/train.py --checkpoint_path \ --config examples/emotion_conversion/speech-resynthesis/configs/EmoV/emov_hubert-layer9-cluster200_fixed-spkr-embedder_f0-raw_gst.json ``` ## Translation Pre-training Before translating emotions, we first need to pre-train the translation model as a denoising autoencoder (similarly to BART).
``` python train.py \ $DATA/fairseq-data/emov_multilingual_denoising_cross-speaker_dedup_nonzeroshot/tokenized \ --save-dir \ --tensorboard-logdir \ --langs neutral,amused,angry,sleepy,disgusted,vctk.km \ --dataset-impl mmap \ --task multilingual_denoising \ --arch transformer_small --criterion cross_entropy \ --multilang-sampling-alpha 1.0 --sample-break-mode eos --max-tokens 16384 \ --update-freq 1 --max-update 3000000 \ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.0 \ --optimizer adam --weight-decay 0.01 --adam-eps 1e-06 \ --clip-norm 0.1 --lr-scheduler polynomial_decay --lr 0.0003 \ --total-num-update 3000000 --warmup-updates 10000 --fp16 \ --poisson-lambda 3.5 --mask 0.3 --mask-length span-poisson --replace-length 1 --rotate 0 --mask-random 0.1 --insert 0 --permute-sentences 1.0 \ --skip-invalid-size-inputs-valid-test \ --user-dir examples/emotion_conversion/fairseq_models ``` ## Translation Training Now we are ready to train our emotion translation model: ``` python train.py \ --distributed-world-size 1 \ $DATA/fairseq-data/emov_multilingual_translation_cross-speaker_dedup/tokenized/ \ --save-dir \ --tensorboard-logdir \ --arch multilingual_small --task multilingual_translation \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --lang-pairs neutral-amused,neutral-sleepy,neutral-disgusted,neutral-angry,amused-sleepy,amused-disgusted,amused-neutral,amused-angry,angry-amused,angry-sleepy,angry-disgusted,angry-neutral,disgusted-amused,disgusted-sleepy,disgusted-neutral,disgusted-angry,sleepy-amused,sleepy-neutral,sleepy-disgusted,sleepy-angry \ --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --lr 1e-05 --clip-norm 0 --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.01 --warmup-updates 2000 --lr-scheduler inverse_sqrt \ --max-tokens 4096 --update-freq 1 --max-update 100000 \ --required-batch-size-multiple 8 --fp16 --num-workers 4 \ --seed 2 --log-format json --log-interval 25 --save-interval-updates 1000 \ 
--no-epoch-checkpoints --keep-best-checkpoints 1 --keep-interval-updates 1 \ --finetune-from-model \ --user-dir examples/emotion_conversion/fairseq_models ``` * To share encoders/decoders use the `--share-encoders` and `--share-decoders` flags. * To add source/target emotion tokens use the `--encoder-langtok {'src'|'tgt'}` and `--decoder-langtok` flags. ## F0-predictor Training The following command trains the F0 prediction module: ``` cd examples/emotion_conversion python -m emotion_models.pitch_predictor n_tokens=200 \ train_tsv="$DATA/denoising/emov/train.tsv" \ train_km="$DATA/denoising/emov/train.km" \ valid_tsv="$DATA/denoising/emov/valid.tsv" \ valid_km="$DATA/denoising/emov/valid.km" ``` * See `hydra.run.dir` to configure the directory for saving models. ## Duration-predictor Training The following command trains the duration prediction modules: ``` cd examples/emotion_conversion for emotion in "neutral" "amused" "angry" "disgusted" "sleepy"; do python -m emotion_models.duration_predictor n_tokens=200 substring=$emotion \ train_tsv="$DATA/denoising/emov/train.tsv" \ train_km="$DATA/denoising/emov/train.km" \ valid_tsv="$DATA/denoising/emov/valid.tsv" \ valid_km="$DATA/denoising/emov/valid.km" done ``` * See `hydra.run.dir` to configure the directory for saving models. * After the above command you should have 5 duration models in your checkpoint directory: ``` ❯ ll duration_predictor/ total 21M -rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 amused.ckpt -rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 angry.ckpt -rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 disgusted.ckpt -rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 neutral.ckpt -rw-rw-r-- 1 felixkreuk felixkreuk 4.1M Nov 15 2021 sleepy.ckpt ``` ## Token Generation The following command uses `fairseq-generate` to generate the token sequences based on the source and target emotions.
``` fairseq-generate \ $DATA/fairseq-data/emov_multilingual_translation_cross-speaker_dedup/tokenized/ \ --task multilingual_translation \ --gen-subset test \ --path \ --beam 5 \ --batch-size 4 --max-len-a 1.8 --max-len-b 10 --lenpen 1 --min-len 1 \ --skip-invalid-size-inputs-valid-test --distributed-world-size 1 \ --source-lang neutral --target-lang amused \ --lang-pairs neutral-amused,neutral-sleepy,neutral-disgusted,neutral-angry,amused-sleepy,amused-disgusted,amused-neutral,amused-angry,angry-amused,angry-sleepy,angry-disgusted,angry-neutral,disgusted-amused,disgusted-sleepy,disgusted-neutral,disgusted-angry,sleepy-amused,sleepy-neutral,sleepy-disgusted,sleepy-angry \ --results-path \ --user-dir examples/emotion_conversion/fairseq_models ``` * Modify `--source-lang` and `--target-lang` to control for the source and target emotions. * See [fairseq documentation](https://fairseq.readthedocs.io/en/latest/command_line_tools.html#fairseq-generate) for a full overview of generation parameters (e.g., top-k/top-p sampling). ## Waveform Synthesis Using the output of the above command, the HiFiGAN vocoder, and the prosody prediction modules (F0 and duration) we can now generate the output waveforms: ``` python examples/emotion_conversion/synthesize.py \ --result-path /generate-test.txt \ --data $DATA/fairseq-data/emov_multilingual_translation_cross-speaker_dedup/neutral-amused \ --orig-tsv examples/emotion_conversion/data/data.tsv \ --orig-km examples/emotion_conversion/data/hubert_base_ls960_layer9_clusters200/data.km \ --checkpoint-file /g_00400000 \ --dur-model duration_predictor/ \ --f0-model pitch_predictor/pitch_predictor.ckpt \ -s neutral -t amused \ --outdir ~/tmp/emotion_results/wavs/neutral-amused ``` * Please make sure the source and target emotions here match those of the previous command. # Citation If you find this useful in your research, please use the following BibTeX entry for citation. 
def save_ckpt(model, path, model_class):
    """Serialize a duration predictor to ``path``.

    The checkpoint stores the model weights, the model's ``padding_token``
    and ``model_class``, the config that ``load_ckpt`` feeds to
    ``hydra.utils.instantiate`` to rebuild the model.
    """
    ckpt = {
        "state_dict": model.state_dict(),
        "padding_token": model.padding_token,
        "model_class": model_class,
    }
    torch.save(ckpt, path)


def load_ckpt(path):
    """Rebuild a duration predictor from a checkpoint written by ``save_ckpt``.

    Returns:
        The model, on the CPU and in eval mode.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location="cpu": training runs on "cuda:0", so the stored weights are
    # CUDA tensors; without remapping, loading fails on hosts without a GPU
    # even though the model is moved to the CPU below anyway.
    ckpt = torch.load(path, map_location="cpu")
    # The stored target is overwritten so the model always resolves to the
    # CnnPredictor of this package.
    ckpt["model_class"]["_target_"] = "emotion_models.duration_predictor.CnnPredictor"
    model = hydra.utils.instantiate(ckpt["model_class"])
    model.load_state_dict(ckpt["state_dict"])
    model.padding_token = ckpt["padding_token"]
    model = model.cpu()
    model.eval()
    return model


class Collator:
    """Pad a list of ``(tokens, durations)`` pairs into batch tensors."""

    def __init__(self, padding_idx):
        # Value used to pad both the token and the duration sequences.
        self.padding_idx = padding_idx

    def __call__(self, batch):
        """Collate ``batch``.

        Returns:
            ``(x, y, mask, lengths)``: padded tokens, padded durations, a
            boolean mask that is False wherever ``x`` equals the padding
            value, and the unpadded token lengths.
        """
        tokens = [item[0] for item in batch]
        durations = [item[1] for item in batch]
        lengths = [len(seq) for seq in tokens]

        pad = torch.nn.utils.rnn.pad_sequence
        x = pad(tokens, batch_first=True, padding_value=self.padding_idx)
        y = pad(durations, batch_first=True, padding_value=self.padding_idx)
        mask = x != self.padding_idx
        return x, y, mask, lengths
def l2_log_loss(input, target):
    """Element-wise squared error between ``input`` and ``log(target + 1)``.

    No reduction is applied, so the result has the shape of ``input`` and the
    caller can mask padded positions before averaging.
    """
    # `reduction="none"` replaces the deprecated `reduce=False` spelling.
    return F.mse_loss(
        input=input.float(),
        target=torch.log(target.float() + 1),
        reduction="none",
    )


class DurationDataset(Dataset):
    """Unit sequences paired with the run length (duration) of every unit.

    Args:
        tsv_path: manifest file; its first line is kept as ``root`` and the
            remaining lines are matched one-to-one with the lines of
            ``km_path``.
        km_path: file with one space-separated sequence of integer units per
            line.
        substring: if non-empty, keep only the entries whose manifest line
            contains it (case-insensitive).
    """

    def __init__(self, tsv_path, km_path, substring=""):
        # Context managers close the files; the original leaked both handles.
        with open(tsv_path, "r") as tsv_file:
            lines = tsv_file.readlines()
        # Stripped for consistency with PitchDataset, which strips its root.
        self.root, self.tsv = lines[0].strip(), lines[1:]
        with open(km_path, "r") as km_file:
            self.km = km_file.readlines()
        logger.info(f"loaded {len(self.km)} files")

        if substring != "":
            needle = substring.lower()
            kept = [
                (tsv_line, km_line)
                for tsv_line, km_line in zip(self.tsv, self.km)
                if needle in tsv_line.lower()
            ]
            self.tsv = [tsv_line for tsv_line, _ in kept]
            self.km = [km_line for _, km_line in kept]
            logger.info(f"after filtering: {len(self.km)} files")

    def __len__(self):
        return len(self.km)

    def __getitem__(self, i):
        """Return ``(units, durations)`` for the ``i``-th sequence.

        ``units`` is the sequence with consecutive repeats collapsed and
        ``durations[k]`` is the number of repeats of ``units[k]``. Both are
        LongTensors of equal length.
        """
        tokens = torch.LongTensor([int(tok) for tok in self.km[i].split()])
        if tokens.numel() == 0:
            return tokens, tokens.clone()
        # The previous pairwise loop only emitted a run when the *next* token
        # differed, so the final run of every sequence (and the only run of a
        # one-token sequence) was silently dropped.
        units, durations = torch.unique_consecutive(tokens, return_counts=True)
        return units, durations
def train_epoch(model, loader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``loader``.

    Args:
        model: module mapping a token batch to predicted (log-scale) durations.
        loader: iterable of ``(x, y, mask, lengths)`` batches with a ``len``.
        criterion: element-wise loss ``criterion(yhat, y)``; it is multiplied
            by ``mask`` and averaged over all positions.
        optimizer: optimizer over ``model.parameters()``.
        device: device the batch tensors are moved to.

    Returns:
        ``(loss, scaled_loss)`` averaged over the batches, where
        ``scaled_loss`` is the masked absolute error after mapping the
        predictions back with ``round(exp(yhat) - 1)``.
    """
    model.train()
    epoch_loss = 0
    epoch_loss_scaled = 0
    for x, y, mask, _ in loader:
        x, y, mask = x.to(device), y.to(device), mask.to(device)

        # Without this, gradients from every previous batch keep accumulating
        # in `.grad` and each step applies their running sum.
        optimizer.zero_grad()

        yhat = model(x)
        loss = torch.mean(criterion(yhat, y) * mask)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        epoch_loss += loss.item()

        # get normal scale loss (reporting only, so keep it out of autograd)
        yhat_scaled = torch.round(torch.exp(yhat.detach()) - 1)
        scaled_loss = torch.mean(torch.abs(yhat_scaled - y) * mask)
        epoch_loss_scaled += scaled_loss.item()
    return epoch_loss / len(loader), epoch_loss_scaled / len(loader)
"/denoising/emov/train.km" valid_tsv: "/denoising/emov/valid.tsv" valid_km: "/denoising/emov/valid.km" n_tokens: 200 batch_size: 32 lr: 0.0001 epochs: 300 model: "cnn" substring: "" rnn: _target_: emotion_models.duration_predictor.RnnPredictor n_tokens: ${n_tokens} emb_dim: 128 rnn_hidden: 128 output_dim: 1 dropout: 0 n_layers: 1 optimizer: _target_: torch.optim.Adam lr: ${lr} betas: [0.9, 0.98] eps: 0.000000001 weight_decay: 0 cnn: _target_: emotion_models.duration_predictor.CnnPredictor n_tokens: ${n_tokens} emb_dim: 128 channels: 256 kernel: 3 output_dim: 1 dropout: 0.5 n_layers: 1 hydra: run: dir: /checkpoint/felixkreuk/experiments/duration_predictor/${hydra.job.override_dirname} job: config: # configuration for the ${hydra.job.override_dirname} runtime variable override_dirname: kv_sep: '=' item_sep: ',' exclude_keys: ['train_tsv', 'train_km', 'valid_tsv', 'valid_km'] ================================================ FILE: examples/emotion_conversion/emotion_models/pitch_predictor.py ================================================ import logging import os import random import sys from collections import defaultdict import hydra import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange from einops.layers.torch import Rearrange from scipy.io.wavfile import read from scipy.ndimage import gaussian_filter1d from torch.utils.data import DataLoader, Dataset from tqdm import tqdm dir_path = os.path.dirname(__file__) resynth_path = os.path.dirname(dir_path) + "/speech-resynthesis" sys.path.append(resynth_path) from dataset import parse_speaker, parse_style from .utils import F0Stat MAX_WAV_VALUE = 32768.0 logger = logging.getLogger(__name__) def quantize_f0(speaker_to_f0, nbins, normalize, log): f0_all = [] for speaker, f0 in speaker_to_f0.items(): f0 = f0.raw_data if log: f0 = f0.log() mean = speaker_to_f0[speaker].mean_log if log else speaker_to_f0[speaker].mean std = speaker_to_f0[speaker].std_log if log else 
def save_ckpt(model, path, model_class, f0_min, f0_max, f0_bins, speaker_stats):
    """Serialize a pitch predictor and the F0 statistics it needs at inference."""
    ckpt = {
        "state_dict": model.state_dict(),
        "padding_token": model.padding_token,
        "model_class": model_class,
        "speaker_stats": speaker_stats,
        "f0_min": f0_min,
        "f0_max": f0_max,
        "f0_bins": f0_bins,
    }
    torch.save(ckpt, path)


def load_ckpt(path):
    """Rebuild a pitch predictor from a checkpoint written by ``save_ckpt``.

    The F0 statistics stored in the checkpoint are re-attached through
    ``setup_f0_stats`` so the returned model is ready for ``inference``.
    """
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    # map_location="cpu": the weights are saved from the CUDA training run and
    # would otherwise fail to deserialize on hosts without a GPU.
    ckpt = torch.load(path, map_location="cpu")
    ckpt["model_class"]["_target_"] = "emotion_models.pitch_predictor.CnnPredictor"
    model = hydra.utils.instantiate(ckpt["model_class"])
    model.load_state_dict(ckpt["state_dict"])
    model.setup_f0_stats(
        ckpt["f0_min"],
        ckpt["f0_max"],
        ckpt["f0_bins"],
        ckpt["speaker_stats"],
    )
    return model


def freq2bin(f0, f0_min, f0_max, bins):
    """Map F0 values to bin indices.

    Values are limited to ``[f0_min, f0_max]`` and then bucketized against
    the boundaries in ``bins``. The input tensor is left unmodified.
    """
    return torch.bucketize(f0.clamp(min=f0_min, max=f0_max), bins)


def bin2freq(x, f0_min, f0_max, bins, mode):
    """Convert per-bin scores back to F0 values.

    Args:
        x: scores whose last dimension has ``len(bins) + 1`` entries.
        f0_min: value associated with the first bin.
        f0_max: unused; kept for interface compatibility.
        bins: boundaries as passed to ``freq2bin``.
        mode: ``"mean"`` returns the score-weighted average of the bin
            values, ``"argmax"`` the value of the highest-scoring bin.

    Returns:
        Tensor shaped like ``x`` without its last dimension.

    Raises:
        NotImplementedError: for any other ``mode``.
    """
    n_bins = len(bins) + 1
    assert x.shape[-1] == n_bins
    # Build the f0_min entry on the device/dtype of `bins`: concatenating a
    # fresh CPU tensor fails when `bins` lives on another device.
    values = torch.cat([bins.new_full((1,), f0_min), bins]).to(x.device)
    if mode == "mean":
        return (x * values).sum(-1) / x.sum(-1)
    if mode == "argmax":
        return values[x.argmax(-1)]
    raise NotImplementedError()


def load_wav(full_path):
    """Read a wav file and return ``(data, sampling_rate)``."""
    sampling_rate, data = read(full_path)
    return data, sampling_rate


def l1_loss(input, target):
    """Element-wise absolute error (no reduction)."""
    # `reduction="none"` replaces the deprecated `reduce=False` spelling.
    return F.l1_loss(input=input.float(), target=target.float(), reduction="none")


def l2_loss(input, target):
    """Element-wise squared error (no reduction)."""
    return F.mse_loss(input=input.float(), target=target.float(), reduction="none")
def __init__(self, padding_idx): self.padding_idx = padding_idx def __call__(self, batch): tokens = [item[0] for item in batch] lengths = [len(item) for item in tokens] tokens = torch.nn.utils.rnn.pad_sequence( tokens, batch_first=True, padding_value=self.padding_idx ) f0 = [item[1] for item in batch] f0 = torch.nn.utils.rnn.pad_sequence( f0, batch_first=True, padding_value=self.padding_idx ) f0_raw = [item[2] for item in batch] f0_raw = torch.nn.utils.rnn.pad_sequence( f0_raw, batch_first=True, padding_value=self.padding_idx ) spk = [item[3] for item in batch] spk = torch.LongTensor(spk) gst = [item[4] for item in batch] gst = torch.LongTensor(gst) mask = tokens != self.padding_idx return tokens, f0, f0_raw, spk, gst, mask, lengths class CnnPredictor(nn.Module): def __init__( self, n_tokens, emb_dim, channels, kernel, dropout, n_layers, spk_emb, gst_emb, n_bins, f0_pred, f0_log, f0_norm, ): super(CnnPredictor, self).__init__() self.n_tokens = n_tokens self.emb_dim = emb_dim self.f0_log = f0_log self.f0_pred = f0_pred self.padding_token = n_tokens self.f0_norm = f0_norm # add 1 extra embedding for padding token, set the padding index to be the last token # (tokens from the clustering start at index 0) self.token_emb = nn.Embedding( n_tokens + 1, emb_dim, padding_idx=self.padding_token ) self.spk_emb = spk_emb self.gst_emb = nn.Embedding(20, gst_emb) self.setup = False feats = emb_dim + gst_emb # feats = emb_dim + gst_emb + (256 if spk_emb else 0) layers = [ nn.Sequential( Rearrange("b t c -> b c t"), nn.Conv1d( feats, channels, kernel_size=kernel, padding=(kernel - 1) // 2 ), Rearrange("b c t -> b t c"), nn.ReLU(), nn.LayerNorm(channels), nn.Dropout(dropout), ) ] for _ in range(n_layers - 1): layers += [ nn.Sequential( Rearrange("b t c -> b c t"), nn.Conv1d( channels, channels, kernel_size=kernel, padding=(kernel - 1) // 2, ), Rearrange("b c t -> b t c"), nn.ReLU(), nn.LayerNorm(channels), nn.Dropout(dropout), ) ] self.conv_layer = nn.ModuleList(layers) self.proj = 
class PitchDataset(Dataset):
    """Unit sequences paired with binned F0 targets, speaker and style ids.

    Args:
        tsv_path: manifest file; the first line is the audio root, each
            following line starts with a tab-separated relative wav path.
        km_path: file with one space-separated unit sequence per line, aligned
            with the manifest entries.
        substring: if non-empty, keep only entries whose manifest line
            contains it (case-insensitive).
        spk, gst: selectors forwarded to ``parse_speaker`` / ``parse_style``.
        spk2id, gst2id: mappings from parsed speaker / style to integer id.
        f0_bins: number of F0 bins; after construction ``self.f0_bins`` holds
            the bin boundaries instead.
        f0_bin_type: ``"adaptive"`` (boundaries from ``quantize_f0``) or
            ``"uniform"`` (evenly spaced between the observed min and max).
        f0_smoothing: sigma of the Gaussian filter applied to the one-hot
            targets; 0 disables smoothing.
        f0_norm: ``"mean"`` or ``"meanstd"`` per-speaker normalization; any
            other value leaves F0 unnormalized.
        f0_log: whether F0 is processed in the log domain.
    """

    def __init__(
        self,
        tsv_path,
        km_path,
        substring,
        spk,
        spk2id,
        gst,
        gst2id,
        f0_bins,
        f0_bin_type,
        f0_smoothing,
        f0_norm,
        f0_log,
    ):
        # Context managers close the files; the original leaked both handles.
        with open(tsv_path, "r") as tsv_file:
            lines = tsv_file.readlines()
        self.root, self.tsv = lines[0].strip(), lines[1:]
        with open(km_path, "r") as km_file:
            self.km = km_file.readlines()
        print(f"loaded {len(self.km)} files")

        self.spk = spk
        self.spk2id = spk2id
        self.gst = gst
        self.gst2id = gst2id

        self.f0_bins = f0_bins
        self.f0_smoothing = f0_smoothing
        self.f0_norm = f0_norm
        self.f0_log = f0_log

        if substring != "":
            tsv, km = [], []
            for tsv_line, km_line in zip(self.tsv, self.km):
                if substring.lower() in tsv_line.lower():
                    tsv.append(tsv_line)
                    km.append(km_line)
            self.tsv, self.km = tsv, km
            print(f"after filtering: {len(self.km)} files")

        # Order matters: the min/max pass normalizes with the speaker stats.
        self.speaker_stats = self._compute_f0_stats()
        self.f0_min, self.f0_max = self._compute_f0_minmax()
        if f0_bin_type == "adaptive":
            self.f0_bins = quantize_f0(
                self.speaker_stats, self.f0_bins, self.f0_norm, self.f0_log
            )
        elif f0_bin_type == "uniform":
            # drop both end points: N bins need N - 1 inner boundaries
            self.f0_bins = torch.linspace(self.f0_min, self.f0_max, self.f0_bins + 1)[
                1:-1
            ]
        else:
            raise NotImplementedError
        print(f"f0 min: {self.f0_min}, f0 max: {self.f0_max}")
        print(f"bins: {self.f0_bins} (shape: {self.f0_bins.shape})")

    def __len__(self):
        return len(self.km)

    def _load_f0(self, tsv_line):
        """Load the ``.yaapt.f0.npy`` file stored next to the wav of ``tsv_line``."""
        tsv_line = tsv_line.split("\t")[0]
        f0 = self.root + "/" + tsv_line.replace(".wav", ".yaapt.f0.npy")
        f0 = np.load(f0)
        f0 = torch.FloatTensor(f0)
        return f0

    def _preprocess_f0(self, f0, spk):
        """Apply the configured log / per-speaker normalization to ``f0``.

        ``f0`` is modified in place and also returned; pass a clone if the
        raw values are still needed.
        """
        mask = f0 != -999999  # process all frames
        # mask = (f0 != 0)  # only process voiced frames
        stats = self.speaker_stats[spk]
        mean = stats.mean_log if self.f0_log else stats.mean
        std = stats.std_log if self.f0_log else stats.std
        if self.f0_log:
            # zero frames are replaced first so that log() stays finite
            f0[f0 == 0] = 1e-5
            f0[mask] = f0[mask].log()
        if self.f0_norm == "mean":
            f0[mask] = f0[mask] - mean
        if self.f0_norm == "meanstd":
            f0[mask] = (f0[mask] - mean) / std
        return f0

    def _compute_f0_minmax(self):
        """Return the min and max of the preprocessed F0 over all entries."""
        f0_min, f0_max = float("inf"), -float("inf")
        for tsv_line in tqdm(self.tsv, desc="computing f0 minmax"):
            spk = self.spk2id[parse_speaker(tsv_line, self.spk)]
            f0 = self._load_f0(tsv_line)
            f0 = self._preprocess_f0(f0, spk)
            f0_min = min(f0_min, f0.min().item())
            f0_max = max(f0_max, f0.max().item())
        return f0_min, f0_max

    def _compute_f0_stats(self):
        """Accumulate one ``F0Stat`` (keeping raw data) per speaker id."""
        from functools import partial

        speaker_stats = defaultdict(partial(F0Stat, True))
        for tsv_line in tqdm(self.tsv, desc="computing speaker stats"):
            spk = self.spk2id[parse_speaker(tsv_line, self.spk)]
            f0 = self._load_f0(tsv_line)
            mask = f0 != 0
            f0 = f0[mask]  # compute stats only on voiced parts
            speaker_stats[spk].update(f0)
        return speaker_stats

    def __getitem__(self, i):
        """Return ``(units, f0_target, f0_raw, speaker_id, style_id)``.

        Both F0 tracks are interpolated to the number of units; ``f0_target``
        is the one-hot (optionally Gaussian-smoothed) encoding of the binned,
        preprocessed F0.
        """
        x = self.km[i]
        x = x.split(" ")
        x = list(map(int, x))
        x = torch.LongTensor(x)

        gst = parse_style(self.tsv[i], self.gst)
        gst = self.gst2id[gst]
        spk = parse_speaker(self.tsv[i], self.spk)
        spk = self.spk2id[spk]

        f0_raw = self._load_f0(self.tsv[i])
        # clone: _preprocess_f0 works in place and f0_raw must stay untouched
        f0 = self._preprocess_f0(f0_raw.clone(), spk)

        f0 = F.interpolate(f0.unsqueeze(0).unsqueeze(0), x.shape[0])[0, 0]
        f0_raw = F.interpolate(f0_raw.unsqueeze(0).unsqueeze(0), x.shape[0])[0, 0]

        f0 = freq2bin(f0, f0_min=self.f0_min, f0_max=self.f0_max, bins=self.f0_bins)
        f0 = F.one_hot(f0.long(), num_classes=len(self.f0_bins) + 1).float()
        if self.f0_smoothing > 0:
            f0 = torch.tensor(
                gaussian_filter1d(f0.float().numpy(), sigma=self.f0_smoothing)
            )
        return x, f0, f0_raw, spk, gst
def run_epoch(model, loader, optimizer, device, cfg, mode):
    """Run one training or validation pass of the pitch predictor.

    Args:
        model: predictor exposing ``__call__(x, gst)`` and
            ``inference(x, spk_id, gst)``.
        loader: iterable of ``(x, f0_bin, f0_raw, spk_id, gst, mask, lengths)``.
        optimizer: used only when ``mode == "train"``.
        device: device the batch tensors are moved to.
        cfg: config; ``cfg.f0_pred`` selects the loss (``"mean"`` -> binary
            cross-entropy over masked bins, ``"argmax"`` -> masked
            cross-entropy against the arg-max bin).
        mode: ``"train"`` enables training mode and parameter updates; any
            other value runs in eval mode.

    Returns:
        ``(loss, l1, l1_voiced)`` averaged over the batches, where the L1
        terms compare ``model.inference`` with the raw F0 on all masked
        frames and on masked frames with non-zero raw F0.

    Raises:
        NotImplementedError: if ``cfg.f0_pred`` is neither value above.
    """
    training = mode == "train"
    if training:
        model.train()
    else:
        model.eval()

    epoch_loss = 0
    l1 = 0
    l1_voiced = 0
    for x, f0_bin, f0_raw, spk_id, gst, mask, _ in tqdm(loader):
        x, f0_bin, f0_raw, spk_id, gst, mask = (
            x.to(device),
            f0_bin.to(device),
            f0_raw.to(device),
            spk_id.to(device),
            gst.to(device),
            mask.to(device),
        )
        b, t, n_bins = f0_bin.shape

        # Validation needs no autograd graph at all.
        with torch.set_grad_enabled(training):
            yhat = model(x, gst)
            nonzero_mask = (f0_raw != 0).logical_and(mask)
            # The inference pass only feeds the L1 metrics below.
            with torch.no_grad():
                yhat_raw = model.inference(x, spk_id, gst)
            expanded_mask = mask.unsqueeze(-1).expand(-1, -1, n_bins)
            if cfg.f0_pred == "mean":
                loss = F.binary_cross_entropy(
                    yhat[expanded_mask], f0_bin[expanded_mask]
                ).mean()
            elif cfg.f0_pred == "argmax":
                # NOTE(review): CnnPredictor.forward already applies softmax
                # in this mode, so cross_entropy receives probabilities, not
                # logits -- confirm this is intended.
                loss = F.cross_entropy(
                    rearrange(yhat, "b t d -> (b t) d"),
                    rearrange(f0_bin.argmax(-1), "b t -> (b t)"),
                    reduction="none",  # replaces the deprecated reduce=False
                )
                loss = rearrange(loss, "(b t) -> b t", b=b, t=t)
                loss = (loss * mask).sum() / mask.float().sum()
            else:
                raise NotImplementedError

        l1 += F.l1_loss(yhat_raw[mask], f0_raw[mask]).item()
        l1_voiced += F.l1_loss(yhat_raw[nonzero_mask], f0_raw[nonzero_mask]).item()
        epoch_loss += loss.item()

        if training:
            # Without this, gradients of all previous batches accumulate in
            # `.grad` and every step applies their running sum.
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

    # examples from the last batch of the epoch
    print(f"{mode} example y: {f0_bin.argmax(-1)[0, 50:60].tolist()}")
    print(f"{mode} example yhat: {yhat.argmax(-1)[0, 50:60].tolist()}")
    print(f"{mode} example y: {f0_raw[0, 50:60].round().tolist()}")
    print(f"{mode} example yhat: {yhat_raw[0, 50:60].round().tolist()}")
    return epoch_loss / len(loader), l1 / len(loader), l1_voiced / len(loader)
class Stat:
    """Running sums for the mean / std of a stream of values and of their logs.

    Args:
        keep_raw: if True, every tensor passed to ``update`` is also stored
            and can be retrieved, concatenated, through ``raw_data``.
    """

    def __init__(self, keep_raw=False):
        self.x = 0.0  # sum of x
        self.x2 = 0.0  # sum of x ** 2
        self.z = 0.0  # z = logx
        self.z2 = 0.0  # sum of z ** 2
        self.n = 0.0  # number of values seen
        self.u = 0.0  # number of update() calls
        self.keep_raw = keep_raw
        self.raw = []

    def update(self, new_x):
        """Fold the 1-D tensor ``new_x`` into the running sums."""
        new_z = new_x.log()

        self.x += new_x.sum()
        self.x2 += (new_x**2).sum()
        self.z += new_z.sum()
        self.z2 += (new_z**2).sum()
        self.n += len(new_x)
        self.u += 1

        if self.keep_raw:
            self.raw.append(new_x)

    @staticmethod
    def _sqrt_of_variance(variance):
        # E[x^2] - E[x]^2 can come out slightly negative through round-off
        # (e.g. for near-constant data); the square root would then be NaN.
        return torch.clamp(torch.as_tensor(variance), min=0.0) ** 0.5

    @property
    def mean(self):
        return self.x / self.n

    @property
    def std(self):
        return self._sqrt_of_variance(self.x2 / self.n - self.mean**2)

    @property
    def mean_log(self):
        return self.z / self.n

    @property
    def std_log(self):
        return self._sqrt_of_variance(self.z2 / self.n - self.mean_log**2)

    @property
    def n_frms(self):
        return self.n

    @property
    def n_utts(self):
        return self.u

    @property
    def raw_data(self):
        assert self.keep_raw, "does not support storing raw data!"
        return torch.cat(self.raw)


class F0Stat(Stat):
    """``Stat`` over voiced frames only."""

    def update(self, new_x):
        # assume unvoiced frames are 0 and consider only voiced frames
        if new_x is not None:
            super().update(new_x[new_x != 0])


class Accuracy:
    """Collects prediction/target pairs and reports tolerance-based accuracy."""

    def __init__(self):
        self.y, self.yhat = [], []

    def update(self, yhat, y):
        """Store one batch of predictions ``yhat`` and targets ``y``."""
        self.yhat.append(yhat)
        self.y.append(y)

    def acc(self, tol):
        """Fraction of stored pairs with ``|yhat - y| <= tol``."""
        yhat = torch.cat(self.yhat)
        y = torch.cat(self.y)
        hits = torch.abs(yhat - y) <= tol
        return hits.float().mean().item()
@register_model("multilingual_transformer_from_mbart")
class MultilingualTransformerModelFromMbart(MultilingualTransformerModel):
    """Multilingual transformer that can be initialised from a single-model checkpoint.

    ``build_model`` assembles one encoder/decoder per language pair (optionally
    shared); ``load_state_dict`` either replicates the weights of a checkpoint
    whose keys carry no language pair across all pairs, or filters a
    multi-pair checkpoint down to the pairs of this model.
    """

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Args:
            args: model arguments; missing attributes are filled in by
                ``base_multilingual_architecture`` and the two
                ``max_*_positions`` default to 1024.
            task: must be a ``MultilingualTranslationTask``; its
                ``model_lang_pairs``, ``dicts`` and ``langs`` are read.

        Returns:
            A ``MultilingualTransformerModelFromMbart`` built from ordered
            dicts mapping each language pair to its encoder / decoder.

        Raises:
            ValueError: if ``--share-all-embeddings`` is combined with
                differing embed dims or a distinct decoder embed path.
        """
        from fairseq.tasks.multilingual_translation import MultilingualTranslationTask

        assert isinstance(task, MultilingualTranslationTask)

        # make sure all arguments are present in older models
        base_multilingual_architecture(args)

        if not safe_hasattr(args, "max_source_positions"):
            args.max_source_positions = 1024
        if not safe_hasattr(args, "max_target_positions"):
            args.max_target_positions = 1024

        # "src-tgt" pair names are split into their two halves
        src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs]
        tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs]

        # sharing a whole encoder/decoder implies sharing its embeddings
        if args.share_encoders:
            args.share_encoder_embeddings = True
        if args.share_decoders:
            args.share_decoder_embeddings = True

        def build_embedding(dictionary, embed_dim, path=None):
            # one embedding row per dictionary entry, padded at dictionary.pad()
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            emb = Embedding(num_embeddings, embed_dim, padding_idx)
            # if provided, load from preloaded dictionaries
            if path:
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            return emb

        # build shared embeddings (if applicable)
        shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None
        if args.share_all_embeddings:
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                dicts=task.dicts,
                langs=task.langs,
                embed_dim=args.encoder_embed_dim,
                build_embedding=build_embedding,
                pretrained_embed_path=args.encoder_embed_path,
            )
            # the very same embedding object serves encoder and decoder
            shared_decoder_embed_tokens = shared_encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            if args.share_encoder_embeddings:
                shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                    dicts=task.dicts,
                    langs=src_langs,
                    embed_dim=args.encoder_embed_dim,
                    build_embedding=build_embedding,
                    pretrained_embed_path=args.encoder_embed_path,
                )
            if args.share_decoder_embeddings:
                shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                    dicts=task.dicts,
                    langs=tgt_langs,
                    embed_dim=args.decoder_embed_dim,
                    build_embedding=build_embedding,
                    pretrained_embed_path=args.decoder_embed_path,
                )

        # encoders/decoders for each language (built lazily, cached by language)
        lang_encoders, lang_decoders = {}, {}

        def get_encoder(lang):
            if lang not in lang_encoders:
                if shared_encoder_embed_tokens is not None:
                    encoder_embed_tokens = shared_encoder_embed_tokens
                else:
                    encoder_embed_tokens = build_embedding(
                        task.dicts[lang],
                        args.encoder_embed_dim,
                        args.encoder_embed_path,
                    )
                # presumably the leading True selects the encoder class -- confirm
                # against MultilingualTransformerModel._get_module_class
                lang_encoders[lang] = MultilingualTransformerModel._get_module_class(
                    True, args, task.dicts[lang], encoder_embed_tokens, src_langs
                )
            return lang_encoders[lang]

        def get_decoder(lang):
            if lang not in lang_decoders:
                if shared_decoder_embed_tokens is not None:
                    decoder_embed_tokens = shared_decoder_embed_tokens
                else:
                    decoder_embed_tokens = build_embedding(
                        task.dicts[lang],
                        args.decoder_embed_dim,
                        args.decoder_embed_path,
                    )
                # presumably the leading False selects the decoder class -- confirm
                lang_decoders[lang] = MultilingualTransformerModel._get_module_class(
                    False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs
                )
            return lang_decoders[lang]

        # shared encoders/decoders (if applicable): built for the first language
        shared_encoder, shared_decoder = None, None
        if args.share_encoders:
            shared_encoder = get_encoder(src_langs[0])
        if args.share_decoders:
            shared_decoder = get_decoder(tgt_langs[0])

        encoders, decoders = OrderedDict(), OrderedDict()
        for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs):
            encoders[lang_pair] = (
                shared_encoder if shared_encoder is not None else get_encoder(src)
            )
            decoders[lang_pair] = (
                shared_decoder if shared_decoder is not None else get_decoder(tgt)
            )

        return MultilingualTransformerModelFromMbart(encoders, decoders)

    def load_state_dict(self, state_dict, strict=True, model_cfg=None):
        """Load ``state_dict`` after adapting its keys to this model's language pairs.

        The second dot-separated component of every key is collected; if none
        of them contains "neutral" the checkpoint is treated as a pre-trained
        BART/mBART model ("finetune mode"), otherwise as a previously trained
        emotion translation model.

        Args:
            state_dict: checkpoint mapping; it is copied, not modified.
            strict, model_cfg: forwarded unchanged to the parent implementation.

        Raises:
            ValueError: in finetune mode, when a tensor's shape differs from
                the model's and the key is not one of the embedding /
                output-projection weights.
        """
        state_dict_subset = state_dict.copy()
        # NOTE(review): assumes every key contains at least one "." -- a key
        # without one would raise IndexError here.
        lang_pairs = set([x.split(".")[1] for x in state_dict.keys()])
        finetune_mode = not any("neutral" in lp for lp in lang_pairs)

        if finetune_mode:
            # load a pre-trained mBART/BART model
            # we need this code because mBART/BART are not of type FairseqMultiModel but FairseqModel
            # so we hackishly load the weights by replicating them for all lang pairs
            print("loading pre-trained BART")
            self_state_dict = self.state_dict()
            for k, v in state_dict.items():
                for lang_pair in self.models:
                    # keys without a "models." prefix are re-homed under each lang pair
                    new_key = k if "models." in k else f"models.{lang_pair}.{k}"
                    if self_state_dict[new_key].shape == v.shape:
                        state_dict_subset[new_key] = v
                    elif any(
                        w in k
                        for w in [
                            "encoder.embed_tokens.weight",
                            "decoder.embed_tokens.weight",
                            "decoder.output_projection.weight",
                        ]
                    ):
                        # why vocab_size - 5? because there are `vocab_size` tokens from the language
                        # and 5 additional tokens in the denoising task: eos,bos,pad,unk,mask.
                        # but in the translation task there are only `vocab_size` + 4 (no mask).
                        print(
                            f"{k}: {self_state_dict[new_key].shape} != {v.shape}",
                            end="",
                            flush=True,
                        )
                        vocab_size = v.shape[0] - 5
                        # NOTE(review): the next assignment is immediately
                        # overwritten by the one after it and looks redundant.
                        state_dict_subset[new_key] = self_state_dict[new_key]
                        # keep all rows except the last one
                        state_dict_subset[new_key] = v[: vocab_size + 4]
                        print(f" => fixed by using first {vocab_size + 4} dims")
                    else:
                        # NOTE(review): "mimatched" is a typo in the runtime message
                        raise ValueError("unable to load model due to mimatched dims!")

                # the original key has been replicated for every lang pair; drop it
                del state_dict_subset[k]
        else:
            print("loading pre-trained emotion translation model")
            for k, _ in state_dict.items():
                assert k.startswith("models.")
                lang_pair = k.split(".")[1]
                # discard weights of language pairs this model does not have
                if lang_pair not in self.models:
                    del state_dict_subset[k]

        super().load_state_dict(state_dict_subset, strict=strict, model_cfg=model_cfg)


@register_model_architecture("transformer", "transformer_small")
def transformer_small(args):
    """Architecture "transformer_small" for the "transformer" model.

    Sets each size below only when ``args`` does not already define it, then
    lets ``base_architecture`` fill in the remaining defaults.
    """
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
    args.encoder_layers = getattr(args, "encoder_layers", 3)
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 512)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
    args.decoder_layers = getattr(args, "decoder_layers", 3)
    base_architecture(args)


@register_model_architecture(
    "multilingual_transformer_from_mbart", "multilingual_small"
)
def multilingual_small(args):
    """Architecture "multilingual_small" for "multilingual_transformer_from_mbart".

    Same defaults as ``transformer_small`` (applied only when unset), followed
    by ``base_multilingual_architecture``.
    """
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
    args.encoder_layers = getattr(args, "encoder_layers", 3)
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 512)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
    args.decoder_layers = getattr(args, "decoder_layers", 3)
    base_multilingual_architecture(args)
def get_fname(s):
    """Return the first tab-separated field of a manifest line."""
    return s.split("\t")[0]


def get_emotion(s):
    """Return the lower-cased emotion encoded in a manifest line.

    Takes the file name up to its first underscore, splits that on "/" and
    returns the second component, e.g. ``"bea/Amused_1-15_0001.wav"`` gives
    ``"amused"``.
    """
    return get_fname(s).split("_")[0].split("/")[1].lower()


def get_utt_id(s):
    """Return the utterance id of a manifest line.

    This is the text after the last underscore of the file name, with
    everything from the first "." onwards removed.
    """
    return get_fname(s).split(".")[0].split("_")[-1]


def dedup(seq):
    """Collapse every run of identical consecutive tokens into one token.

    Tokens are separated by single spaces; the result always ends with a
    newline.

    >>> dedup("1 2 2 3 100 2 2 1")
    '1 2 3 100 2 1\\n'
    """
    tokens = seq.strip().split(" ")
    # groupby yields one key per run of equal neighbours
    return " ".join(token for token, _ in groupby(tokens)) + "\n"


def remove_under_k(seq, k):
    """Remove every run of identical consecutive tokens of length ``k`` or less.

    Runs longer than ``k`` are kept in full; the result always ends with a
    newline.

    >>> remove_under_k("a a a a b c c c", 1)
    'a a a a c c c\\n'
    """
    tokens = seq.strip().split(" ")
    kept = []
    for token, run in groupby(tokens):
        run_length = len(list(run))
        if run_length > k:
            kept.extend([token] * run_length)
    return " ".join(kept) + "\n"
def denoising_preprocess(path, lang, dict):
    """Run ``fairseq-preprocess`` on the source-only denoising data of one language.

    Args:
        path: directory holding ``train.<lang>``, ``valid.<lang>`` and
            ``test.<lang>``; output goes to ``<path>/tokenized/<lang>``.
        lang: suffix of the split files.
        dict: path passed as ``--srcdict``; an empty string omits the option.
    """
    executable = 'fairseq-preprocess'
    cmd = [
        executable,
        f'--trainpref {path}/train.{lang} --validpref {path}/valid.{lang} --testpref {path}/test.{lang}',
        f'--destdir {path}/tokenized/{lang}',
        '--only-source',
        '--task multilingual_denoising',
        '--workers 40',
    ]
    if dict != "":
        cmd += [f'--srcdict {dict}']
    # the command is handed to `call` as one space-joined string
    call(" ".join(cmd))


def translation_preprocess(path, src_lang, trg_lang, dict, only_train=False):
    """Run ``fairseq-preprocess`` on the parallel data of one language pair.

    Args:
        path: directory holding the ``train``/``valid``/``test`` prefixes;
            output goes to ``<path>/tokenized``.
        src_lang, trg_lang: source and target language suffixes.
        dict: path used for both ``--srcdict`` and ``--tgtdict``; an empty
            string omits both options.
        only_train: when true, the valid/test prefixes are not passed.
    """
    executable = 'fairseq-preprocess'
    cmd = [
        executable,
        f'--source-lang {src_lang} --target-lang {trg_lang}',
        f'--trainpref {path}/train',
        f'--destdir {path}/tokenized',
        '--workers 40',
    ]
    if not only_train:
        cmd += [f'--validpref {path}/valid --testpref {path}/test']
    if dict != "":
        cmd += [
            f'--srcdict {dict}',
            f'--tgtdict {dict}',
        ]
    call(" ".join(cmd))


def load_tsv_km(tsv_path, km_path):
    """Read a .tsv manifest and its matching .km file.

    Args:
        tsv_path, km_path: path objects (``.exists()`` is called on both).

    Returns:
        ``(root, tsv_lines, km_lines)``: the first line of the tsv file, its
        remaining lines, and all lines of the km file, newlines preserved.

    Raises:
        AssertionError: if a file is missing or the line counts differ.
    """
    assert tsv_path.exists() and km_path.exists()
    # context managers so the handles are closed deterministically
    with open(tsv_path, "r") as tsv_file:
        tsv_lines = tsv_file.readlines()
    root, tsv_lines = tsv_lines[0], tsv_lines[1:]
    with open(km_path, "r") as km_file:
        km_lines = km_file.readlines()
    assert len(tsv_lines) == len(km_lines), ".tsv and .km should be the same length!"
    return root, tsv_lines, km_lines
for exmaple: ❯ python build_emov_translation_manifests.py \ /checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/train.tsv \ /checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/emov_16khz_km_100/train.km \ ~/tmp/emov_pairs \ --src-emotion amused --trg-emotion neutral \ --dedup --shuffle --cross-speaker --dry-run """ parser = argparse.ArgumentParser(description=desc) parser.add_argument("data", type=Path, help="path to a dir containing .tsv and .km files containing emov dataset") parser.add_argument("output_path", type=Path, help="output directory with the manifests will be created") parser.add_argument("-cs", "--cross-speaker", action='store_true', help="if set then translation will occur also between speakers, meaning the same sentence can be translated between different speakers (default: false)") parser.add_argument("-dd", "--dedup", action='store_true', help="remove repeated tokens (example: 'aaabc=>abc')") parser.add_argument("-sh", "--shuffle", action='store_true', help="shuffle the data") parser.add_argument("-ae", "--autoencode", action='store_true', help="include training pairs from the same emotion (this includes examples of the same sentence uttered by different people and examples where the src and trg are the exact same seq)") parser.add_argument("-dr", "--dry-run", action='store_true', help="don't write anything to disk") parser.add_argument("-zs", "--zero-shot", action='store_true', help="if true, the denoising task will train on the same splits as the translation task (split by utterance id). 
if false, the denoising task will train on randomly sampled splits (not split by utterance id)") parser.add_argument("--km-ext", default="km", help="") parser.add_argument("--dict", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/fairseq.dict.txt", help="") args = parser.parse_args() SPEAKERS = ["bea", "jenie", "josh", "sam", "SAME"] EMOTIONS = ['neutral', 'amused', 'angry', 'disgusted', 'sleepy'] suffix = "" if args.cross_speaker: suffix += "_cross-speaker" if args.dedup: suffix += "_dedup" translation_suffix = "" if args.autoencode: translation_suffix += "_autoencode" denoising_suffix = "" denoising_suffix += "_zeroshot" if args.zero_shot else "_nonzeroshot" translation_dir = Path(args.output_path) / ("emov_multilingual_translation" + suffix + translation_suffix) os.makedirs(translation_dir, exist_ok=True) denoising_dir = Path(args.output_path) / ("emov_multilingual_denoising" + suffix + denoising_suffix) os.makedirs(denoising_dir, exist_ok=True) denoising_data = [p.name for p in (args.data / "denoising").glob("*") if "emov" not in p.name] for split in ["train", "valid", "test"]: root, tsv_lines, km_lines = load_tsv_km( tsv_path = args.data / "denoising" / "emov" / f"{split}.tsv", km_path = args.data / "denoising" / "emov" / f"{split}.{args.km_ext}" ) # generate data for the multilingual denoising task for EMOTION in EMOTIONS: print("---") print(split) print(f"denoising: {EMOTION}") emotion_tsv, emotion_km = [], [] for tsv_line, km_line in zip(tsv_lines, km_lines): if EMOTION.lower() in tsv_line.lower(): km_line = km_line if not args.dedup else dedup(km_line) emotion_tsv.append(tsv_line) emotion_km.append(km_line) print(f"{len(emotion_km)} samples") open(denoising_dir / f"files.{split}.{EMOTION}", "w").writelines([root] + emotion_tsv) open(denoising_dir / f"{split}.{EMOTION}", "w").writelines(emotion_km) for data in denoising_data: with open(args.data / "denoising" / data / f"{split}.{args.km_ext}", "r") as f1: with open(denoising_dir / 
f"{split}.{data}", "w") as f2: f2.writelines([l if not args.dedup else dedup(l) for l in f1.readlines()]) # start of translation preprocessing root, tsv_lines, km_lines = load_tsv_km( tsv_path = args.data / "translation" / f"{split}.tsv", km_path = args.data / "translation" / f"{split}.{args.km_ext}" ) # generate data for the multilingual translation task for SRC_EMOTION in EMOTIONS: TRG_EMOTIONS = EMOTIONS if args.autoencode else set(EMOTIONS) - set([SRC_EMOTION]) for TRG_EMOTION in TRG_EMOTIONS: # when translating back to the same emotion - we dont want these emotion # pairs to be part of the validation/test sets (because its not really emotion conversino) # if SRC_EMOTION == TRG_EMOTION and split in ["valid", "test"]: continue print("---") print(split) print(f"src emotions: {SRC_EMOTION}\ntrg emotions: {TRG_EMOTION}") # create a dictionary with the following structure: # output[SPEAKER][UTT_ID] = list with indexes of line from the tsv file # that match the speaker and utterance id. for exmaple: # output = {'sam': {'0493': [875, 1608, 1822], ...}, ...} # meaning, for speaker 'sam', utterance id '0493', the indexes in tsv_lines # are 875, 1608, 1822 spkr2utts = defaultdict(lambda: defaultdict(list)) for i, tsv_line in enumerate(tsv_lines): speaker = tsv_line.split("/")[0] if args.cross_speaker: speaker = "SAME" assert speaker in SPEAKERS, "unknown speaker! 
make sure the .tsv contains EMOV data" utt_id = get_utt_id(tsv_line) spkr2utts[speaker][utt_id].append(i) # create a tsv and km files with all the combinations for translation src_tsv, trg_tsv, src_km, trg_km = [], [], [], [] for speaker, utt_ids in spkr2utts.items(): for utt_id, indices in utt_ids.items(): # generate all pairs pairs = [(x,y) for x in indices for y in indices] # self-translation if SRC_EMOTION == TRG_EMOTION: pairs = [(x,y) for (x,y) in pairs if x == y] # filter according to src and trg emotions pairs = [(x,y) for (x,y) in pairs if get_emotion(tsv_lines[x]) == SRC_EMOTION and get_emotion(tsv_lines[y]) == TRG_EMOTION] for idx1, idx2 in pairs: assert get_utt_id(tsv_lines[idx1]) == get_utt_id(tsv_lines[idx2]) src_tsv.append(tsv_lines[idx1]) trg_tsv.append(tsv_lines[idx2]) km_line_idx1 = km_lines[idx1] km_line_idx2 = km_lines[idx2] km_line_idx1 = km_line_idx1 if not args.dedup else dedup(km_line_idx1) km_line_idx2 = km_line_idx2 if not args.dedup else dedup(km_line_idx2) src_km.append(km_line_idx1) trg_km.append(km_line_idx2) assert len(src_tsv) == len(trg_tsv) == len(src_km) == len(trg_km) print(f"{len(src_tsv)} pairs") if len(src_tsv) == 0: raise Exception("ERROR: generated 0 pairs!") if args.dry_run: continue # create files os.makedirs(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}", exist_ok=True) open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"files.{split}.{SRC_EMOTION}", "w").writelines([root] + src_tsv) open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"files.{split}.{TRG_EMOTION}", "w").writelines([root] + trg_tsv) open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"{split}.{SRC_EMOTION}", "w").writelines(src_km) open(translation_dir / f"{SRC_EMOTION}-{TRG_EMOTION}" / f"{split}.{TRG_EMOTION}", "w").writelines(trg_km) # fairseq-preprocess the denoising data for EMOTION in EMOTIONS + denoising_data: denoising_preprocess(denoising_dir, EMOTION, args.dict) os.system(f"cp {args.dict} {denoising_dir}/tokenized/dict.txt") # 
def verify_dict_size(km, dict):
    """Check that the dictionary has exactly one line per distinct km token.

    Args:
        km: path of a km file; tokens are separated by spaces or newlines.
        dict: path of the dictionary file; only its line count is used.

    Returns:
        True when the number of dictionary lines equals the number of
        distinct non-empty tokens in the km file.
    """
    logger.info(f"verifying: {km}")
    with open(dict, "r") as dict_file:
        dict_size = sum(1 for _ in dict_file)
    with open(km, "r") as km_file:
        km_vocab = set(km_file.read().replace("\n", " ").split(" "))
    # consecutive separators and the trailing newline produce empty tokens
    km_vocab.discard("")
    return dict_size == len(km_vocab)


def verify_files_exist(l):
    """Return True if every path in ``l`` exists; log the first missing one.

    Args:
        l: iterable of path objects (``.exists()`` is called on each).
    """
    for f in l:
        if not f.exists():
            # use the module logger, like the rest of this file
            logger.error(f"{f} doesn't exist!")
            return False
    return True


def run_cmd(cmd, print_output=True):
    """Run ``cmd`` through the shell and return its combined stdout/stderr.

    Failures are deliberately not raised: a non-zero exit status is logged
    together with the captured output and ``None`` is returned.

    Args:
        cmd: shell command line.
        print_output: when true, log the output of a successful command.
    """
    try:
        out = subprocess.check_output(cmd, stderr=subprocess.STDOUT, universal_newlines=True, shell=True)
    except subprocess.CalledProcessError as err:
        logger.error(f"error executing command!:\n{cmd}")
        logger.error(err.output)
        return None
    if print_output:
        logger.info(f"command output:\n{out}")
    return out
required=True, type=Path) parser.add_argument("--seed", type=int, default=1) parser.add_argument("--dict", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/fairseq.dict.txt") parser.add_argument("--manifests-dir", type=Path, default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz") args = parser.parse_args() manifests_dir = args.manifests_dir date = datetime.now().strftime('%d%m%y') outdir = manifests_dir / f"{date}" # verify input and create folders all_kms = args.km + [args.emov_km] assert verify_files_exist(all_kms), "make sure the km dir contains: train-clean-all.km, blizzard2013.km, data.km" for codes in all_kms: assert verify_dict_size(codes, args.dict), "dict argument doesn't match the vocabulary of the km file!" assert not outdir.exists(), "data dir already exists!" outdir.mkdir(parents=True, exist_ok=True) logger.info("generating denoising split (emov)") run_cmd(f"python preprocess/split_km_tsv.py {args.tsv} {args.emov_km} --destdir {outdir}/denoising/emov -sh --seed {args.seed}") for codes in args.km: codes_name = os.path.basename(codes) run_cmd(f"python preprocess/split_km.py {codes} --destdir {outdir}/denoising/{codes_name} -sh --seed {args.seed}") logger.info("generating translation split") run_cmd(f"python preprocess/split_emov_km_tsv_by_uttid.py {args.tsv} {args.emov_km} --destdir {outdir}/translation --seed {args.seed}") emov_code_name = os.path.basename(args.emov_km) logger.info("generating hifigan split") run_cmd( f"mkdir -p {outdir}/hifigan &&" f"python preprocess/build_hifigan_manifest.py --km_type hubert --tsv {outdir}/denoising/emov/train.tsv --km {outdir}/denoising/emov/train.km > {outdir}/hifigan/train.txt &&" f"python preprocess/build_hifigan_manifest.py --km_type hubert --tsv {outdir}/denoising/emov/valid.tsv --km {outdir}/denoising/emov/valid.km > {outdir}/hifigan/valid.txt &&" f"python preprocess/build_hifigan_manifest.py --km_type hubert --tsv {outdir}/denoising/emov/test.tsv --km 
def extract_f0(tsv_line):
    """Compute the pitch track of one manifest entry and save it next to the wav.

    Reads the module-level ``root`` (first manifest line) and ``args``
    (parsed command line), so it is only usable inside this script.

    Args:
        tsv_line: one manifest line with exactly two tab-separated fields;
            the first is the wav path relative to ``root``.
    """
    wav_path, _ = tsv_line.split("\t")
    wav_path = root.strip() + "/" + wav_path
    sr, wav = read(wav_path)
    # scale by MAX_WAV_VALUE (32768.0), then normalize and leave 5% headroom
    wav = wav / MAX_WAV_VALUE
    wav = normalize(wav) * 0.95

    # NOTE(review): only "pyaapt" is handled although the parser also accepts
    # "crepe"; presumably nothing is computed or saved for "crepe" -- confirm.
    if args.extractor == "pyaapt":
        # presumably milliseconds, given the /1000 * sample-rate conversion below
        frame_length = 20.0
        # pad half a frame of zeros on each side of the signal
        pad = int(frame_length / 1000 * sr) // 2
        wav = np.pad(wav.squeeze(), (pad, pad), "constant", constant_values=0)
        signal = basic.SignalObj(wav, sr)
        pitch = pYAAPT.yaapt(
            signal,
            **{
                'frame_length': frame_length,
                'frame_space': 5.0,
                'nccf_thresh1': 0.25,
                'tda_frame_length': 25.0
            })
        # two leading axes are added here and stripped again on the next line
        pitch = pitch.samp_interp[None, None, :] if args.interp else pitch.samp_values[None, None, :]
        pitch = pitch[0, 0]
        f0_path = wav_path.replace(".wav", ".yaapt")
        f0_path += ".interp.f0" if args.interp else ".f0"
        # np.save appends ".npy" because the name does not already end with it
        np.save(f0_path, pitch)
"""Standalone script that post-processes a km file.

Each line can have its short runs of identical tokens removed
(``--remove-under-k K`` drops runs of K or fewer tokens) and/or be
deduplicated (``--dedup`` collapses consecutive repeats).
"""
import sys
import argparse
from tqdm import tqdm
# the helpers live in build_translation_manifests.py in this directory
from build_translation_manifests import dedup, remove_under_k


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("km", type=str, help="path to km file")
    parser.add_argument("--dedup", action='store_true')
    parser.add_argument("--remove-under-k", type=int, default=0)
    parser.add_argument("--output", default=None)
    args = parser.parse_args()

    if not args.dedup and args.remove_under_k == 0:
        print("nothing to do! quitting...")
        sys.exit(0)

    with open(args.km, "r") as km_file:
        km = km_file.readlines()

    out = []
    for line in tqdm(km):
        # run removal happens before dedup, otherwise every run would have length 1
        if args.remove_under_k > 0:
            line = remove_under_k(line, args.remove_under_k)
        if args.dedup:
            line = dedup(line)
        out.append(line)

    # the output name is derived from --output (or the input path) by tagging
    # the ".km" suffix with the operations that were applied
    path = args.km if args.output is None else args.output
    if args.remove_under_k > 0:
        path = path.replace(".km", f"-k{args.remove_under_k}.km")
    if args.dedup:
        path = path.replace(".km", "-deduped.km")

    with open(path, "w") as out_file:
        out_file.writelines(out)
    print(f"written to {path}")
random_state=seed) train_utt_ids, test_utt_ids, _, _ = train_test_split(utt_ids, utt_ids, test_size=test_percent, shuffle=True, random_state=seed) train_idx = [i for i, line in enumerate(tsv_lines) if get_utt_id(line) in train_utt_ids] valid_idx = [i for i, line in enumerate(tsv_lines) if get_utt_id(line) in valid_utt_ids] test_idx = [i for i, line in enumerate(tsv_lines) if get_utt_id(line) in test_utt_ids] train_tsv, train_km = [tsv_lines[i] for i in train_idx], [km_lines[i] for i in train_idx] valid_tsv, valid_km = [tsv_lines[i] for i in valid_idx], [km_lines[i] for i in valid_idx] test_tsv, test_km = [tsv_lines[i] for i in test_idx], [km_lines[i] for i in test_idx] print(f"train {len(train_km)}") print(f"valid {len(valid_km)}") print(f"test {len(test_km)}") return train_tsv, train_km, valid_tsv, valid_km, test_tsv, test_km if __name__ == "__main__": """ this is a standalone script to process a km file specifically, to dedup or remove tokens that repeat less than k times in a row """ parser = argparse.ArgumentParser(description="") parser.add_argument("tsv", type=str, help="path to tsv file") parser.add_argument("km", type=str, help="path to km file") parser.add_argument("--destdir", required=True, type=str) parser.add_argument("--valid-percent", type=float, default=0.05, help="percent to allocate to validation set") parser.add_argument("--test-percent", type=float, default=0.05, help="percent to allocate to test set") parser.add_argument("--seed", type=int, default=42, help="") args = parser.parse_args() np.random.seed(args.seed) random.seed(args.seed) os.makedirs(args.destdir, exist_ok=True) km = open(args.km, "r").readlines() tsv = open(args.tsv, "r").readlines() root, tsv = tsv[0], tsv[1:] assert args.tsv.endswith(".tsv") and args.km.endswith(".km") assert len(tsv) == len(km) train_tsv, train_km, valid_tsv, valid_km, test_tsv, test_km = train_val_test_split(tsv, km, args.valid_percent, args.test_percent, args.seed) assert len(train_tsv) + len(valid_tsv) + 
if __name__ == "__main__":
    # Standalone script: split the lines of a km file into train/valid/test
    # files (train.km, valid.km, test.km) inside --destdir, optionally
    # shuffling the lines first.
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("km", type=str, help="path to km file")
    parser.add_argument("--destdir", required=True, type=str)
    parser.add_argument("--valid-percent", type=float, default=0.05, help="percent to allocate to validation set")
    parser.add_argument("--test-percent", type=float, default=0.05, help="percent to allocate to test set")
    parser.add_argument("-sh", "--shuffle", action="store_true", help="shuffle the lines before splitting")
    parser.add_argument("--seed", type=int, default=42, help="")
    args = parser.parse_args()

    np.random.seed(args.seed)
    random.seed(args.seed)

    os.makedirs(args.destdir, exist_ok=True)
    with open(args.km, "r") as km_file:
        km = km_file.readlines()

    if args.shuffle:
        km = shuffle(km)
        print("shuffled")

    # the two "percent" options are used as fractions of the line count
    N = len(km)
    N_tt = int(N * args.test_percent)
    N_cv = int(N * args.valid_percent)
    N_tr = N - N_tt - N_cv

    # train takes the head, valid the middle, test the tail
    train_km = km[:N_tr]
    valid_km = km[N_tr:N_tr + N_cv]
    test_km = km[N_tr + N_cv:]

    destdir = Path(args.destdir)
    for split_name, split_lines in (
        ("train", train_km),
        ("valid", valid_km),
        ("test", test_km),
    ):
        with open(destdir / f"{split_name}.km", "w") as split_file:
            split_file.writelines(split_lines)

    print(f"train: {len(train_km)}")
    print(f"valid: {len(valid_km)}")
    print(f"test: {len(test_km)}")
    print("done")
= Path(args.destdir) open(dir / f"train.tsv", "w").writelines([root] + train_tsv) open(dir / f"valid.tsv", "w").writelines([root] + valid_tsv) open(dir / f"test.tsv", "w").writelines([root] + test_tsv) open(dir / f"train.km", "w").writelines(train_km) open(dir / f"valid.km", "w").writelines(valid_km) open(dir / f"test.km", "w").writelines(test_km) print(f"train: {len(train_km)}") print(f"valid: {len(valid_km)}") print(f"test: {len(test_km)}") print("done") ================================================ FILE: examples/emotion_conversion/requirements.txt ================================================ scipy einops amfm_decompy joblib numba decorator requests appdirs packaging six sklearn ================================================ FILE: examples/emotion_conversion/synthesize.py ================================================ import logging import argparse import random import sys import os import numpy as np import torch import soundfile as sf import shutil import librosa import json from pathlib import Path from tqdm import tqdm import amfm_decompy.basic_tools as basic import amfm_decompy.pYAAPT as pYAAPT dir_path = os.path.dirname(__file__) resynth_path = os.path.dirname(os.path.abspath(__file__)) + "/speech-resynthesis" sys.path.append(resynth_path) from models import CodeGenerator from inference import scan_checkpoint, load_checkpoint, generate from emotion_models.pitch_predictor import load_ckpt as load_pitch_predictor from emotion_models.duration_predictor import load_ckpt as load_duration_predictor from dataset import load_audio, MAX_WAV_VALUE, parse_style, parse_speaker, EMOV_SPK2ID, EMOV_STYLE2ID logging.basicConfig( level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', handlers=[logging.FileHandler('debug.log'), logging.StreamHandler()] ) logger = logging.getLogger(__name__) class AttrDict(dict): def __init__(self, *args, **kwargs): super(AttrDict, self).__init__(*args, **kwargs) self.__dict__ = self def 
def get_code_to_fname(manifest, tokens):
    """Build a lookup from a unit-code string to the file it belongs to.

    Two input layouts are supported:

    * ``tokens is None``: every non-empty line of ``manifest`` is
      ``<fname> <code>`` separated by whitespace. Only the mapping is
      returned.
    * otherwise: the first line of ``manifest`` is a root directory and each
      following line starts with a file name relative to it, optionally
      followed by tab-separated columns. Line ``i`` of ``tokens`` holds the
      code of file ``i``. A ``(root, mapping)`` tuple is returned, with file
      names joined onto ``root``.

    In both layouts commas inside a code are replaced by spaces before the
    code is used as a key. Note the return shape differs between the two
    layouts (dict vs. tuple); this is kept for backward compatibility.

    Args:
        manifest: path to the manifest file.
        tokens: path to the code file, or ``None`` for the single-file layout.

    Returns:
        ``dict`` when ``tokens`` is ``None``, else ``(Path, dict)``.

    Raises:
        ValueError: if ``tokens`` is given and ``manifest`` is empty (no root
            line), or if a single-file manifest line does not have exactly
            two whitespace-separated fields.
    """
    if tokens is None:
        code_to_fname = {}
        with open(manifest) as f:
            for line in f:
                line = line.strip()
                if not line:
                    # tolerate blank (e.g. trailing) lines
                    continue
                fname, code = line.split()
                code_to_fname[code.replace(',', ' ')] = fname
        return code_to_fname

    with open(manifest) as f:
        entries = [line.strip() for line in f]
    if not entries:
        raise ValueError(f"manifest {manifest} is empty (missing root line)")
    root = Path(entries[0])
    # Keep only the first tab-separated column. Splitting on the tab (rather
    # than on any whitespace) preserves file names that contain spaces, and a
    # manifest holding nothing but the root line simply yields no entries.
    fnames = [entry.split('\t', 1)[0] for entry in entries[1:]]

    with open(tokens) as f:
        codes = [line.strip() for line in f]

    code_to_fname = {
        code.replace(',', ' '): str(root / fname)
        for fname, code in zip(fnames, codes)
    }
    return root, code_to_fname
def _read_lines(path):
    """Return every line of the text file at ``path``, closing the handle afterwards."""
    with open(path, 'r') as f:
        return f.readlines()


def _speaker_tensor(ref, multispkr):
    """Return a 1-element LongTensor holding the EMOV speaker id parsed from ``ref``.

    A speaker that is not a key of ``EMOV_SPK2ID`` is replaced by a randomly
    chosen known speaker.
    """
    spkr = parse_speaker(ref, multispkr)
    if spkr not in EMOV_SPK2ID:
        spkr = random.choice(list(EMOV_SPK2ID.keys()))
    return torch.LongTensor([EMOV_SPK2ID[spkr]])


@torch.no_grad()
def synth(argv, interactive=False):
    """Synthesize audio for the hypotheses in a translation-model output file.

    For up to ``-N`` randomly ordered results (``-N -1`` means all of them),
    the source and target files are copied to ``0-source`` / ``5-target``
    under ``--outdir``. Five resynthesized variants are written to the
    ``1-...`` to ``4-...`` sub-directories, combining source or generated
    tokens, source or target style, and extracted or predicted f0.

    Args:
        argv: list of command-line arguments (see the parser below).
        interactive: unused; kept for backward compatibility.

    Raises:
        SystemExit: via ``argparse`` when a required option is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--result-path', type=Path, help='Translation Model Output', required=True)
    parser.add_argument('--data', type=Path, help='a directory with the files: src.tsv, src.km, trg.tsv, trg.km, orig.tsv, orig.km')
    parser.add_argument("--orig-tsv", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/data.tsv")
    parser.add_argument("--orig-km", default="/checkpoint/felixkreuk/datasets/emov/manifests/emov_16khz/core_manifests/emov_16khz_km_100/data.km")
    parser.add_argument('--checkpoint-file', type=Path, help='Generator Checkpoint', required=True)
    parser.add_argument('--dur-model', type=Path, help='a token duration prediction model (if tokens were deduped)')
    parser.add_argument('--f0-model', type=Path, help='a f0 prediction model')
    parser.add_argument('-s', '--src-emotion', default=None)
    parser.add_argument('-t', '--trg-emotion', default=None)
    parser.add_argument('-N', type=int, default=10)
    parser.add_argument('--split', default="test")
    parser.add_argument('--outdir', type=Path, default=Path('results'))
    parser.add_argument('--orig-filename', action='store_true')
    parser.add_argument('--device', type=int, default=0)
    a = parser.parse_args(argv)

    # These options default to None but are interpolated into file paths
    # and/or used as lookup keys below, so fail early with a readable message.
    missing = [
        flag
        for flag, value in (
            ('--data', a.data),
            ('--dur-model', a.dur_model),
            ('--src-emotion', a.src_emotion),
            ('--trg-emotion', a.trg_emotion),
        )
        if value is None
    ]
    if missing:
        parser.error('missing required option(s): ' + ', '.join(missing))

    device = a.device

    seed = 52
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # config.json lives inside the checkpoint directory, or next to the
    # checkpoint file when a single file is given.
    ckpt_is_dir = os.path.isdir(a.checkpoint_file)
    config_dir = a.checkpoint_file if ckpt_is_dir else os.path.split(a.checkpoint_file)[0]
    with open(os.path.join(config_dir, 'config.json')) as f:
        h = AttrDict(json.load(f))

    generator = CodeGenerator(h).to(device)
    cp_g = scan_checkpoint(a.checkpoint_file, 'g_') if ckpt_is_dir else a.checkpoint_file
    state_dict_g = load_checkpoint(cp_g)
    generator.load_state_dict(state_dict_g['generator'])
    generator.eval()
    generator.remove_weight_norm()

    dur_models = {
        emotion: load_duration_predictor(f"{a.dur_model}/{emotion}.ckpt")
        for emotion in ("neutral", "amused", "disgusted", "angry", "sleepy")
    }
    logger.info(f"loaded duration prediction model from {a.dur_model}")

    f0_model = load_pitch_predictor(a.f0_model).to(device)
    logger.info(f"loaded f0 prediction model from {a.f0_model}")

    # we need to know how to map code back to the filename
    # (if we want the original files names as output)
    results = parse_generation_file(a.result_path)
    _, src_code_to_fname = get_code_to_fname(f'{a.data}/files.{a.split}.{a.src_emotion}', f'{a.data}/{a.split}.{a.src_emotion}')
    _, tgt_code_to_fname = get_code_to_fname(f'{a.data}/files.{a.split}.{a.trg_emotion}', f'{a.data}/{a.split}.{a.trg_emotion}')

    # we need the originals (before dedup) to get the ground-truth durations
    orig_tsv = _read_lines(a.orig_tsv)
    orig_tsv_root, orig_tsv = orig_tsv[0].strip(), orig_tsv[1:]
    orig_km = _read_lines(a.orig_km)
    fname_to_idx = {orig_tsv_root + "/" + line.split("\t")[0]: i for i, line in enumerate(orig_tsv)}

    outdir = a.outdir
    outdir.mkdir(parents=True, exist_ok=True)
    for subdir in (
        '0-source',
        '1-src-tokens-src-style-src-f0',
        '2-src-tokens-trg-style-src-f0',
        '2.5-src-tokens-trg-style-src-f0',
        '3-src-tokens-trg-style-pred-f0',
        '4-gen-tokens-trg-style-pred-f0',
        '5-target',
    ):
        (outdir / subdir).mkdir(exist_ok=True)

    results = list(results.items())
    random.shuffle(results)
    if a.N != -1:
        # -1 means "no limit"; any other value caps the number of items
        # (zero or a negative value processes nothing).
        results = results[:max(a.N, 0)]

    trg_style_str = a.trg_emotion
    dur_model = dur_models[trg_style_str]

    for sid, result in tqdm(results):
        # Drop a first source token containing '[' and a last one containing
        # '_' (presumably style/marker tokens added for translation; confirm
        # against the data preparation) before looking the sequence up.
        if '[' in result['S'][0]:
            result['S'] = result['S'][1:]
        if '_' in result['S'][-1]:
            result['S'] = result['S'][:-1]
        src_ref = src_code_to_fname[code_to_str(result['S'])]
        trg_ref = tgt_code_to_fname[code_to_str(result['T'])]

        # The audio is only needed on the CPU, for f0 extraction.
        src_audio = (load_audio(src_ref)[0] / MAX_WAV_VALUE) * 0.95
        src_audio = torch.FloatTensor(src_audio).unsqueeze(0)

        src_spkr = _speaker_tensor(src_ref, h.multispkr)
        # Not used for synthesis; still evaluated so that a random speaker
        # fallback consumes the `random` stream exactly as before.
        _speaker_tensor(trg_ref, h.multispkr)

        # Follow --device instead of the default CUDA device so these tensors
        # end up on the same device as the models.
        src_style = torch.LongTensor([EMOV_STYLE2ID[a.src_emotion]]).to(device)
        trg_style = torch.LongTensor([EMOV_STYLE2ID[a.trg_emotion]]).to(device)

        # original (non-deduplicated) source units, looked up by file name
        src_tokens = list(map(int, orig_km[fname_to_idx[src_ref]].strip().split(" ")))
        src_tokens = torch.LongTensor(src_tokens).unsqueeze(0)
        src_tokens_dur_pred = torch.LongTensor(list(map(int, result['S']))).unsqueeze(0)
        src_tokens_dur_pred = dur_model.inflate_input(src_tokens_dur_pred)
        gen_tokens = torch.LongTensor(result['H']).unsqueeze(0)
        gen_tokens = dur_model.inflate_input(gen_tokens)
        trg_tokens = torch.LongTensor(result['T']).unsqueeze(0)
        trg_tokens = dur_model.inflate_input(trg_tokens)

        src_f0 = get_praat_f0(src_audio.unsqueeze(0).numpy())
        src_f0 = torch.FloatTensor(src_f0).to(device)

        # NOTE(review): the first and last predictions below are not used
        # afterwards; they are kept so the sequence of model calls is
        # unchanged. TODO confirm they can be dropped.
        f0_model.inference(torch.LongTensor(src_tokens).to(device), src_spkr, trg_style).unsqueeze(0)
        pred_src_dur_pred_f0 = f0_model.inference(torch.LongTensor(src_tokens_dur_pred).to(device), src_spkr, trg_style).unsqueeze(0)
        pred_gen_f0 = f0_model.inference(torch.LongTensor(gen_tokens).to(device), src_spkr, trg_style).unsqueeze(0)
        f0_model.inference(torch.LongTensor(trg_tokens).to(device), src_spkr, trg_style).unsqueeze(0)

        if a.orig_filename:
            sid = str(sid) + "__" + Path(src_ref).stem
        shutil.copy(src_ref, outdir / '0-source' / f'{sid}.wav')

        # (output sub-directory, unit sequence, f0 contour, style id)
        variants = (
            ('1-src-tokens-src-style-src-f0', src_tokens, src_f0, src_style),
            ('2-src-tokens-trg-style-src-f0', src_tokens, src_f0, trg_style),
            ('2.5-src-tokens-trg-style-src-f0', src_tokens_dur_pred, src_f0, trg_style),
            ('3-src-tokens-trg-style-pred-f0', src_tokens_dur_pred, pred_src_dur_pred_f0, trg_style),
            ('4-gen-tokens-trg-style-pred-f0', gen_tokens, pred_gen_f0, trg_style),
        )
        for subdir, tokens, f0, style in variants:
            audio = generate_from_code(generator, h, tokens, spkr=src_spkr, f0=f0, gst=style, device=device)
            sf.write(outdir / subdir / f'{sid}.wav', audio, samplerate=h.sampling_rate)

        shutil.copy(trg_ref, outdir / '5-target' / f'{sid}.wav')

    logger.info("Done.")


if __name__ == '__main__':
    synth(sys.argv[1:])
```(1 / t) * log(P(y|x)) + (1 / s) * ( λ1 * log(P(x|y)) + λ2 * log(P(y)) )``` - `t` - Target Prefix Length - `s` - Source Length - `λ1` - Channel Model Weight - `λ2` - Language Model Weight The top `beam_size` candidates based on the above combined scores are chosen to continue the beams in beam search. In beam search with a direct model alone, the scores from the direct model `P(y|x)` are used to choose the top candidates in beam search. This framework provides a great way to utilize strong target language models trained on large amounts of unlabeled data. Language models can prefer targets unrelated to the source, so we also need a channel model whose role is to ensure that the target preferred by the language model also translates back to the source. ### Training Translation Models and Language Models For training Transformer models in fairseq for machine translation, refer to instructions [here](https://github.com/pytorch/fairseq/tree/main/examples/translation) For training Transformer models in fairseq for language modeling, refer to instructions [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model) ### Generation with Language Model for German-English translation with fairseq Here are instructions to generate using a direct model and a target-side language model.
Note: - Download and install fairseq as per instructions [here](https://github.com/pytorch/fairseq) - Preprocess and binarize the dataset as per instructions in section [Test Data Preprocessing](#test-data-preprocessing) ```sh binarized_data=data_dir/binarized direct_model=de_en_seed4.pt lm_model=en_lm.pt lm_data=lm_data wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt -O ${direct_model} wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt -O ${lm_model} mkdir -p ${lm_data} wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/dict.txt -O ${lm_data}/dict.txt k2=10 lenpen=0.16 lm_wt=0.14 fairseq-generate ${binarized_data} \ --user-dir examples/fast_noisy_channel \ --beam 5 \ --path ${direct_model} \ --lm-model ${lm_model} \ --lm-data ${lm_data} \ --k2 ${k2} \ --combine-method lm_only \ --task noisy_channel_translation \ --lenpen ${lenpen} \ --lm-wt ${lm_wt} \ --gen-subset valid \ --remove-bpe \ --fp16 \ --batch-size 10 ``` ### Noisy Channel Generation for German-English translation with fairseq Here are instructions for noisy channel generation with a direct model, channel model and language model as explained in section [Noisy Channel Modeling](#noisy-channel-modeling). 
Note: - Download and install fairseq as per instructions [here](https://github.com/pytorch/fairseq) - Preprocess and binarize the dataset as per instructions in section [Test Data Preprocessing](#test-data-preprocessing) ```sh binarized_data=data_dir/binarized direct_model=de_en_seed4.pt lm_model=en_lm.pt lm_data=lm_data ch_model=en_de.big.seed4.pt wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt -O ${direct_model} wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt -O ${lm_model} mkdir -p ${lm_data} wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/dict.txt -O ${lm_data}/dict.txt wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed4.pt -O ${ch_model} k2=10 lenpen=0.21 lm_wt=0.50 bw_wt=0.30 fairseq-generate ${binarized_data} \ --user-dir examples/fast_noisy_channel \ --beam 5 \ --path ${direct_model} \ --lm-model ${lm_model} \ --lm-data ${lm_data} \ --channel-model ${ch_model} \ --k2 ${k2} \ --combine-method noisy_channel \ --task noisy_channel_translation \ --lenpen ${lenpen} \ --lm-wt ${lm_wt} \ --ch-wt ${bw_wt} \ --gen-subset test \ --remove-bpe \ --fp16 \ --batch-size 1 ``` ## Fast Noisy Channel Modeling [Bhosale et al. (2020)](http://www.statmt.org/wmt20/pdf/2020.wmt-1.68.pdf) introduces 3 approximations that speed up online noisy channel decoding - - Smaller channel models (`Transformer Base` with 1 encoder and decoder layer each vs. `Transformer Big`) - This involves training a channel model that is possibly smaller and less accurate in terms of BLEU than a channel model of the same size as the direct model. - Since the role of the channel model is mainly to assign low scores to generations from the language model if they don't translate back to the source, we may not need the most accurate channel model for this purpose.
- Smaller output vocabulary size for the channel model (~30,000 -> ~1000) - The channel model doesn't need to score the full output vocabulary, it just needs to score the source tokens, which are completely known. - This is specified using the arguments `--channel-scoring-type src_vocab --top-k-vocab 500` - This means that the output vocabulary for the channel model will be the source tokens for all examples in the batch and the top-K most frequent tokens in the vocabulary - This reduces the memory consumption needed to store channel model scores significantly - Smaller number of candidates (`k2`) scored per beam - This is specified by reducing the argument `--k2` ### Fast Noisy Channel Generation for German-English translation with fairseq Here are instructions for **fast** noisy channel generation with a direct model, channel model and language model as explained in section [Fast Noisy Channel Modeling](#fast-noisy-channel-modeling). The main differences are that we use a smaller channel model, reduce `--k2`, set `--channel-scoring-type src_vocab --top-k-vocab 500` and increase the `--batch-size`. 
Note: - Download and install fairseq as per instructions [here](https://github.com/pytorch/fairseq) - Preprocess and binarize the dataset as per instructions in section [Test Data Preprocessing](#test-data-preprocessing) ```sh binarized_data=data_dir/binarized direct_model=de_en_seed4.pt lm_model=en_lm.pt lm_data=lm_data small_ch_model=en_de.base_1_1.seed4.pt wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt -O ${direct_model} wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt -O ${lm_model} mkdir -p ${lm_data} wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/dict.txt -O ${lm_data}/dict.txt wget https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed4.pt -O ${small_ch_model} k2=3 lenpen=0.23 lm_wt=0.58 bw_wt=0.26 fairseq-generate ${binarized_data} \ --user-dir examples/fast_noisy_channel \ --beam 5 \ --path ${direct_model} \ --lm-model ${lm_model} \ --lm-data ${lm_data} \ --channel-model ${small_ch_model} \ --k2 ${k2} \ --combine-method noisy_channel \ --task noisy_channel_translation \ --lenpen ${lenpen} \ --lm-wt ${lm_wt} \ --ch-wt ${bw_wt} \ --gen-subset test \ --remove-bpe \ --fp16 \ --batch-size 50 \ --channel-scoring-type src_vocab --top-k-vocab 500 ``` ## Test Data Preprocessing For preprocessing and binarizing the test sets for Romanian-English and German-English translation, we use the following script - ```sh FAIRSEQ=/path/to/fairseq cd $FAIRSEQ SCRIPTS=$FAIRSEQ/mosesdecoder/scripts if [ ! -d "${SCRIPTS}" ]; then echo 'Cloning Moses github repository (for tokenization scripts)...' 
git clone https://github.com/moses-smt/mosesdecoder.git fi TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl NORMALIZE=$SCRIPTS/tokenizer/normalize-punctuation.perl s=de t=en test=wmt18 mkdir -p data_dir # Tokenization if [ $s == "ro" ] ; then # Note: Get normalise-romanian.py and remove-diacritics.py from # https://github.com/rsennrich/wmt16-scripts/tree/master/preprocess sacrebleu -t $test -l $s-$t --echo src | \ $NORMALIZE -l $s | \ python normalise-romanian.py | \ python remove-diacritics.py | \ $TOKENIZER -l $s -a -q > data_dir/$test.$s-$t.$s else sacrebleu -t $test -l $s-$t --echo src | perl $NORMALIZE -l $s | perl $TOKENIZER -threads 8 -a -l $s > data_dir/$test.$s-$t.$s fi sacrebleu -t $test -l $s-$t --echo ref | perl $NORMALIZE -l $t | perl $TOKENIZER -threads 8 -a -l $t > data_dir/$test.$s-$t.$t # Applying BPE src_bpe_code=/path/to/source/language/bpe/code tgt_bpe_code=/path/to/target/language/bpe/code src_dict=/path/to/source/language/dict tgt_dict=/path/to/target/language/dict FASTBPE=$FAIRSEQ/fastBPE if [ ! 
-d "${FASTBPE}" ] ; then git clone https://github.com/glample/fastBPE.git # Follow compilation instructions at https://github.com/glample/fastBPE g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast fi ${FASTBPE}/fast applybpe data_dir/bpe.$test.$s-$t.$s data_dir/$test.$s-$t.$s ${src_bpe_code} ${FASTBPE}/fast applybpe data_dir/bpe.$test.$s-$t.$t data_dir/$test.$s-$t.$t ${tgt_bpe_code} fairseq-preprocess -s $s -t $t \ --testpref data_dir/bpe.$test.$s-$t \ --destdir data_dir/binarized \ --srcdict ${src_dict} \ --tgtdict ${tgt_dict} ``` ## Calculating BLEU ```sh DETOKENIZER=$SCRIPTS/tokenizer/detokenizer.perl cat ${generation_output} | grep -P "^H" | sort -V | cut -f 3- | $DETOKENIZER -l $t -q -a | sacrebleu -t $test -l $s-$t ``` ## Romanian-English Translation The direct and channel models are trained using bitext data (WMT16) combined with backtranslated data (the monolingual data used for backtranslation comes from http://data.statmt.org/rsennrich/wmt16_backtranslations/ (Sennrich et al., 2016c)). The backtranslated data is generated using an ensemble of 3 English-Romanian models trained on bitext training data (WMT16) with unrestricted sampling. ### BPE Codes and Dictionary We learn a joint BPE vocabulary of 18K types on the bitext training data which is used for both the source and target. ||Path| |----------|------| | BPE Code | [joint_bpe_18k](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/bpe_18k) | | Dictionary | [dict](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/dict) | ### Direct Models For Ro-En with backtranslation, the direct and channel models use a Transformer-Big architecture.
| Seed | Model | |----|----| | 2 | [ro_en_seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/direct_models/seed2.pt) | 4 | [ro_en_seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/direct_models/seed4.pt) | 6 | [ro_en_seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/direct_models/seed6.pt) ### Channel Models For channel models, we follow the same steps as for the direct models. But backtranslated data is generated in the opposite direction using [this Romanian monolingual data](http://data.statmt.org/rsennrich/wmt16_backtranslations/). The best lenpen, LM weight and CH weight are obtained by sweeping over the validation set (wmt16/dev) using beam 5. | Model Size | Lenpen | LM Weight | CH Weight | Seed 2 | Seed 4 | Seed 6 | |----|----|----|----|----|----|----| | `big` | 0.84 | 0.64 | 0.56 | [big.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/big.seed2.pt) | [big.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/big.seed2.pt) | [big.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/big.seed2.pt) | | `base_1_1` | 0.63 | 0.40 | 0.37 | [base_1_1.seed2.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/base_1_1.seed2.pt) | [base_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/base_1_1.seed4.pt) | [base_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/channel_models/base_1_1.seed6.pt) | ### Language Model The model is trained on de-duplicated English Newscrawl data from 2007-2018 comprising 186 million sentences or 4.5B words after normalization and tokenization. 
| | Path | |----|----| | `--lm-model` | [transformer_en_lm](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/lm_model/transformer_lm.pt) | | `--lm-data` | [lm_data](https://dl.fbaipublicfiles.com/fast_noisy_channel/ro_en/lm_model/lm_dict) ## German-English Translation ### BPE Codes and Dictionaries | | Path| |----------|------| | Source BPE Code | [de_bpe_code_24K](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/de_bpe_code_24K) | | Target BPE Code | [en_bpe_code_24K](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/en_bpe_code_24K) | Source Dictionary | [de_dict](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/de_dict) | | Target Dictionary | [en_dict](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/en_dict) | ### Direct Models We train on WMT’19 training data. Following [Ng et al., 2019](http://statmt.org/wmt19/pdf/53/WMT33.pdf), we apply language identification filtering and remove sentences longer than 250 tokens as well as sentence pairs with a source/target length ratio exceeding 1.5. This results in 26.8M sentence pairs. We use the Transformer-Big architecture for the direct model. | Seed | Model | |:----:|----| | 4 | [de_en_seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed4.pt) | 5 | [de_en_seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed5.pt) | 6 | [de_en_seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/direct_models/seed6.pt) ### Channel Models We train on WMT’19 training data. Following [Ng et al., 2019](http://statmt.org/wmt19/pdf/53/WMT33.pdf), we apply language identification filtering and remove sentences longer than 250 tokens as well as sentence pairs with a source/target length ratio exceeding 1.5. This results in 26.8M sentence pairs. 
| Model Size | Seed 4 | Seed 5 | Seed 6 | |----|----|----|----| | `big` | [big.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed4.pt) | [big.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed5.pt) | [big.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big.seed6.pt) | | `big_1_1` | [big_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big_1_1.seed4.pt) | [big_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big_1_1.seed5.pt) | [big_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/big_1_1.seed6.pt) | | `base` | [base.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base.seed4.pt) | [base.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base.seed5.pt) | [base.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base.seed6.pt) | | `base_1_1` | [base_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed4.pt) | [base_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed5.pt) | [base_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/base_1_1.seed6.pt) | | `half` | [half.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half.seed4.pt) | [half.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half.seed5.pt) | [half.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half.seed6.pt) | | `half_1_1` | [half_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half_1_1.seed4.pt) | [half_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half_1_1.seed5.pt) | 
[half_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/half_1_1.seed6.pt) | | `quarter` | [quarter.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter.seed4.pt) | [quarter.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter.seed5.pt) | [quarter.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter.seed6.pt) | | `quarter_1_1` | [quarter_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter_1_1.seed4.pt) | [quarter_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter_1_1.seed5.pt) | [quarter_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/quarter_1_1.seed6.pt) | | `8th` | [8th.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th.seed4.pt) | [8th.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th.seed5.pt) | [8th.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th.seed6.pt) | | `8th_1_1` | [8th_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th_1_1.seed4.pt) | [8th_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th_1_1.seed5.pt) | [8th_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/8th_1_1.seed6.pt) | | `16th` | [16th.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th.seed4.pt) | [16th.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th.seed5.pt) | [16th.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th.seed6.pt) | | `16th_1_1` | [16th_1_1.seed4.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th_1_1.seed4.pt) | 
[16th_1_1.seed5.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th_1_1.seed5.pt) | [16th_1_1.seed6.pt](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/channel_models/16th_1_1.seed6.pt) | ### Language Model The model is trained on de-duplicated English Newscrawl data from 2007-2018 comprising 186 million sentences or 4.5B words after normalization and tokenization. | | Path | |----|----| | `--lm-model` | [transformer_en_lm](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/transformer_lm.pt) | | `--lm-data` | [lm_data](https://dl.fbaipublicfiles.com/fast_noisy_channel/de_en/lm_model/lm_dict/) ## Citation ```bibtex @inproceedings{bhosale2020language, title={Language Models not just for Pre-training: Fast Online Neural Noisy Channel Modeling}, author={Shruti Bhosale and Kyra Yee and Sergey Edunov and Michael Auli}, booktitle={Proceedings of the Fifth Conference on Machine Translation (WMT)}, year={2020}, } @inproceedings{yee2019simple, title={Simple and Effective Noisy Channel Modeling for Neural Machine Translation}, author={Yee, Kyra and Dauphin, Yann and Auli, Michael}, booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, pages={5700--5705}, year={2019} } ``` ================================================ FILE: examples/fast_noisy_channel/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import noisy_channel_translation # noqa from . import noisy_channel_sequence_generator # noqa from . 
class NoisyChannelBeamSearch(Search):
    """Beam search step that ranks candidates by a combined noisy channel score.

    Candidates are ranked by the direct-model (forward) score combined with a
    channel/LM score, while the forward and LM scores of the selected
    candidates are tracked separately and returned alongside the combined
    scores.
    """

    def __init__(self, tgt_dict):
        super().__init__(tgt_dict)
        self.fw_scores_buf = None
        self.lm_scores_buf = None

    def _init_buffers(self, t):
        """Lazily create the result buffers with the dtype/device of ``t``."""
        # super()._init_buffers(t)
        if self.fw_scores_buf is None:
            self.scores_buf = t.new()
            self.indices_buf = torch.LongTensor().to(device=t.device)
            self.beams_buf = torch.LongTensor().to(device=t.device)
            self.fw_scores_buf = t.new()
            self.lm_scores_buf = t.new()

    def combine_fw_bw(self, combine_method, fw_cum, bw, step):
        """Combine cumulative forward scores with channel/LM scores.

        Args:
            combine_method: ``"noisy_channel"`` (forward score divided by
                ``step + 1`` before adding ``bw``) or ``"lm_only"`` (plain sum).
            fw_cum: cumulative forward log-probabilities.
            bw: channel and/or LM scores, same shape as ``fw_cum``.
            step: current decoding step (0-based).

        Raises:
            ValueError: if ``combine_method`` is not one of the two supported
                values (previously this surfaced as an ``UnboundLocalError``).
        """
        if combine_method == "noisy_channel":
            return bw + fw_cum.div(step + 1)
        if combine_method == "lm_only":
            return bw + fw_cum
        raise ValueError(
            "unsupported combine_method: {!r} "
            "(expected 'noisy_channel' or 'lm_only')".format(combine_method)
        )

    def step(self, step, fw_lprobs, scores, bw_lprobs, lm_lprobs, combine_method):
        """Select the best ``2 * beam_size`` candidates for every sentence.

        Args:
            step: current decoding step (0-based).
            fw_lprobs: forward scores, unpacked below as
                ``(bsz, beam_size, vocab_size)``.
            scores: cumulative forward scores of previous steps; column
                ``step - 1`` is added to ``fw_lprobs`` when ``step > 0``.
            bw_lprobs: channel/LM scores, viewed like ``fw_lprobs``.
            lm_lprobs: LM scores, viewed like ``fw_lprobs``.
            combine_method: forwarded to :meth:`combine_fw_bw`.

        Returns:
            Tuple ``(combined scores, forward scores, LM scores, token indices,
            beam indices)`` of the selected candidates.
        """
        self._init_buffers(fw_lprobs)
        bsz, beam_size, vocab_size = fw_lprobs.size()

        if step == 0:
            # at the first step all hypotheses are equally likely, so use
            # only the first beam
            fw_lprobs = fw_lprobs[:, ::beam_size, :].contiguous()
            bw_lprobs = bw_lprobs[:, ::beam_size, :].contiguous()
            # nothing to add since we are at the first step
            fw_lprobs_cum = fw_lprobs
        else:
            # make probs contain cumulative scores for each hypothesis
            raw_scores = scores[:, :, step - 1].unsqueeze(-1)
            fw_lprobs_cum = fw_lprobs.add(raw_scores)

        combined_lprobs = self.combine_fw_bw(
            combine_method, fw_lprobs_cum, bw_lprobs, step
        )
        flat_combined = combined_lprobs.view(bsz, -1)

        # Choose the top k according to the combined noisy channel score.
        # The results are assigned rather than written through ``out=``:
        # the batch shrinks as sentences finish, and resizing a non-empty
        # ``out`` tensor is deprecated in PyTorch.
        self.scores_buf, self.indices_buf = torch.topk(
            flat_combined,
            k=min(
                # Take the best 2 x beam_size predictions. We'll choose the
                # first beam_size of these which don't predict eos to
                # continue with.
                beam_size * 2,
                flat_combined.size(1) - 1,  # -1 so we never select pad
            ),
        )

        # save corresponding fw and lm scores
        self.fw_scores_buf = torch.gather(
            fw_lprobs_cum.view(bsz, -1), 1, self.indices_buf
        )
        self.lm_scores_buf = torch.gather(
            lm_lprobs.view(bsz, -1), 1, self.indices_buf
        )
        # Project back into relative indices and beams (beam index must be
        # computed before the in-place modulo below).
        self.beams_buf = self.indices_buf // vocab_size
        self.indices_buf.fmod_(vocab_size)
        return (
            self.scores_buf,
            self.fw_scores_buf,
            self.lm_scores_buf,
            self.indices_buf,
            self.beams_buf,
        )
Args: combine_method (string, optional): Method to combine direct, LM and channel model scores (default: None) tgt_dict (~fairseq.data.Dictionary): target dictionary src_dict (~fairseq.data.Dictionary): source dictionary beam_size (int, optional): beam width (default: 1) max_len_a/b (int, optional): generate sequences of maximum length ax + b, where x is the source length min_len (int, optional): the minimum length of the generated output (not including end-of-sentence) len_penalty (float, optional): length penalty, where <1.0 favors shorter, >1.0 favors longer sentences (default: 1.0) unk_penalty (float, optional): unknown word penalty, where <0 produces more unks, >0 produces fewer (default: 0.0) retain_dropout (bool, optional): use dropout when generating (default: False) temperature (float, optional): temperature, where values >1.0 produce more uniform samples and values <1.0 produce sharper samples (default: 1.0) match_source_len (bool, optional): outputs should match the source length (default: False) no_repeat_ngram_size (int, optional): Size of n-grams that we avoid repeating in the generation (default: 0) normalize_scores (bool, optional): normalize scores by the length of the output (default: True) channel_models (List[~fairseq.models.FairseqModel]): ensemble of models translating from the target to the source k2 (int, optional): Top K2 candidates to score per beam at each step (default:10) ch_weight (int, optional): Weight associated with the channel model score assuming that the direct model score has weight 1.0 (default: 1.0) channel_scoring_type (str, optional): String specifying how to score the channel model (default: 'log_norm') top_k_vocab (int, optional): If `channel_scoring_type` is `'src_vocab'` or `'src_vocab_batched'`, then this parameter specifies the number of most frequent tokens to include in the channel model output vocabulary, in addition to the source tokens in the input batch (default: 0) lm_models 
(List[~fairseq.models.FairseqModel]): ensemble of models generating text in the target language lm_dict (~fairseq.data.Dictionary): LM Model dictionary lm_weight (int, optional): Weight associated with the LM model score assuming that the direct model score has weight 1.0 (default: 1.0) normalize_lm_scores_by_tgt_len (bool, optional): Should we normalize LM scores by the target length? By default, we normalize the combination of LM and channel model scores by the source length """ self.pad = tgt_dict.pad() self.unk = tgt_dict.unk() self.eos = tgt_dict.eos() self.vocab_size = len(tgt_dict) self.beam_size = beam_size # the max beam size is the dictionary size - 1, since we never select pad self.beam_size = min(beam_size, self.vocab_size - 1) self.max_len_a = max_len_a self.max_len_b = max_len_b self.min_len = min_len self.normalize_scores = normalize_scores self.len_penalty = len_penalty self.unk_penalty = unk_penalty self.retain_dropout = retain_dropout self.temperature = temperature self.match_source_len = match_source_len self.no_repeat_ngram_size = no_repeat_ngram_size self.channel_models = channel_models self.src_dict = src_dict self.tgt_dict = tgt_dict self.combine_method = combine_method self.k2 = k2 self.ch_weight = ch_weight self.channel_scoring_type = channel_scoring_type self.top_k_vocab = top_k_vocab self.lm_models = lm_models self.lm_dict = lm_dict self.lm_weight = lm_weight self.log_softmax_fn = torch.nn.LogSoftmax(dim=1) self.normalize_lm_scores_by_tgt_len = normalize_lm_scores_by_tgt_len self.share_tgt_dict = (self.lm_dict == self.tgt_dict) self.tgt_to_lm = make_dict2dict(tgt_dict, lm_dict) self.ch_scoring_bsz = 3072 assert temperature > 0, '--temperature must be greater than 0' self.search = NoisyChannelBeamSearch(tgt_dict) @torch.no_grad() def generate( self, models, sample, prefix_tokens=None, bos_token=None, **kwargs ): """Generate a batch of translations. 
Args: models (List[~fairseq.models.FairseqModel]): ensemble of models sample (dict): batch prefix_tokens (torch.LongTensor, optional): force decoder to begin with these tokens """ model = EnsembleModel(models) incremental_states = torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [ torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) for i in range(model.models_size) ], ) if not self.retain_dropout: model.eval() # model.forward normally channels prev_output_tokens into the decoder # separately, but SequenceGenerator directly calls model.encoder encoder_input = { k: v for k, v in sample['net_input'].items() if k != 'prev_output_tokens' } src_tokens = encoder_input['src_tokens'] src_lengths_no_eos = (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)).long().sum(dim=1) input_size = src_tokens.size() # batch dimension goes first followed by source lengths bsz = input_size[0] src_len = input_size[1] beam_size = self.beam_size if self.match_source_len: max_len = src_lengths_no_eos.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), # exclude the EOS marker model.max_decoder_positions() - 1, ) # compute the encoder output for each beam encoder_outs = model.forward_encoder(encoder_input) new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = model.reorder_encoder_out(encoder_outs, new_order) src_lengths = encoder_input['src_lengths'] # initialize buffers scores = src_tokens.new(bsz * beam_size, max_len + 1).float().fill_(0) lm_prefix_scores = src_tokens.new(bsz * beam_size).float().fill_(0) scores_buf = scores.clone() tokens = src_tokens.new(bsz * beam_size, max_len + 2).long().fill_(self.pad) tokens_buf = tokens.clone() tokens[:, 0] = self.eos if bos_token is None else bos_token # reorder source tokens so they may be used as a reference in generating P(S|T) src_tokens = reorder_all_tokens(src_tokens, src_lengths, 
self.src_dict.eos_index) src_tokens = src_tokens.repeat(1, beam_size).view(-1, src_len) src_lengths = src_lengths.view(bsz, -1).repeat(1, beam_size).view(bsz*beam_size, -1) attn, attn_buf = None, None nonpad_idxs = None # The cands_to_ignore indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then the cands_to_ignore would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. cands_to_ignore = src_tokens.new_zeros(bsz, beam_size).eq(-1) # forward and backward-compatible False mask # list of completed sentences finalized = [[] for i in range(bsz)] finished = [False for i in range(bsz)] num_remaining_sent = bsz # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = (torch.arange(0, bsz) * beam_size).unsqueeze(1).type_as(tokens) cand_offsets = torch.arange(0, cand_size).type_as(tokens) # helper function for allocating buffers on the fly buffers = {} def buffer(name, type_of=tokens): # noqa if name not in buffers: buffers[name] = type_of.new() return buffers[name] def is_finished(sent, step, unfin_idx): """ Check whether we've finished generation for a given sentence, by comparing the worst score among finalized hypotheses to the best possible score among unfinalized hypotheses. """ assert len(finalized[sent]) <= beam_size if len(finalized[sent]) == beam_size: return True return False def finalize_hypos(step, bbsz_idx, eos_scores, combined_noisy_channel_eos_scores): """ Finalize the given hypotheses at this step, while keeping the total number of finalized hypotheses per sentence <= beam_size. Note: the input must be in the desired finalization order, so that hypotheses that appear earlier in the input are preferred to those that appear later. 
Args: step: current time step bbsz_idx: A vector of indices in the range [0, bsz*beam_size), indicating which hypotheses to finalize eos_scores: A vector of the same size as bbsz_idx containing fw scores for each hypothesis combined_noisy_channel_eos_scores: A vector of the same size as bbsz_idx containing combined noisy channel scores for each hypothesis """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors tokens_clone = tokens.index_select(0, bbsz_idx) tokens_clone = tokens_clone[:, 1:step + 2] # skip the first index, which is EOS assert not tokens_clone.eq(self.eos).any() tokens_clone[:, step] = self.eos attn_clone = attn.index_select(0, bbsz_idx)[:, :, 1:step+2] if attn is not None else None # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, :step+1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: combined_noisy_channel_eos_scores /= (step + 1) ** self.len_penalty cum_unfin = [] prev = 0 for f in finished: if f: prev += 1 else: cum_unfin.append(prev) sents_seen = set() for i, (idx, score) in enumerate(zip(bbsz_idx.tolist(), combined_noisy_channel_eos_scores.tolist())): unfin_idx = idx // beam_size sent = unfin_idx + cum_unfin[unfin_idx] sents_seen.add((sent, unfin_idx)) if self.match_source_len and step > src_lengths_no_eos[unfin_idx]: score = -math.inf def get_hypo(): if attn_clone is not None: # remove padding tokens from attn scores hypo_attn = attn_clone[i][nonpad_idxs[sent]] _, alignment = hypo_attn.max(dim=0) else: hypo_attn = None alignment = None return { 'tokens': tokens_clone[i], 'score': score, 'attention': hypo_attn, # src_len x tgt_len 'alignment': alignment, 'positional_scores': pos_scores[i], } if len(finalized[sent]) < beam_size: finalized[sent].append(get_hypo()) newly_finished = [] for sent, unfin_idx in sents_seen: # 
check termination conditions for this sentence if not finished[sent] and is_finished(sent, step, unfin_idx): finished[sent] = True newly_finished.append(unfin_idx) return newly_finished def noisy_channel_rescoring(lprobs, beam_size, bsz, src_tokens, tokens, k): """Rescore the top k hypothesis from each beam using noisy channel modeling Returns: new_fw_lprobs: the direct model probabilities after pruning the top k new_ch_lm_lprobs: the combined channel and language model probabilities new_lm_lprobs: the language model probabilities after pruning the top k """ with torch.no_grad(): lprobs_size = lprobs.size() if prefix_tokens is not None and step < prefix_tokens.size(1): probs_slice = lprobs.view(bsz, -1, lprobs.size(-1))[:, 0, :] cand_scores = torch.gather( probs_slice, dim=1, index=prefix_tokens[:, step].view(-1, 1).data ).expand(-1, beam_size).contiguous().view(bsz*beam_size, 1) cand_indices = prefix_tokens[:, step].view(-1, 1).expand(bsz, beam_size).data.contiguous().view(bsz*beam_size, 1) # need to calculate and save fw and lm probs for prefix tokens fw_top_k = cand_scores fw_top_k_idx = cand_indices k = 1 else: # take the top k best words for every sentence in batch*beam fw_top_k, fw_top_k_idx = torch.topk(lprobs.view(beam_size*bsz, -1), k=k) eos_idx = torch.nonzero(fw_top_k_idx.view(bsz*beam_size*k, -1) == self.eos)[:, 0] ch_scores = fw_top_k.new_full((beam_size*bsz*k, ), 0) src_size = torch.sum(src_tokens[:, :] != self.src_dict.pad_index, dim=1, keepdim=True, dtype=fw_top_k.dtype) if self.combine_method != "lm_only": temp_src_tokens_full = src_tokens[:, :].repeat(1, k).view(bsz*beam_size*k, -1) not_padding = temp_src_tokens_full[:, 1:] != self.src_dict.pad_index cur_tgt_size = step+2 # add eos to all candidate sentences except those that already end in eos eos_tokens = tokens[:, 0].repeat(1, k).view(-1, 1) eos_tokens[eos_idx] = self.tgt_dict.pad_index if step == 0: channel_input = torch.cat((fw_top_k_idx.view(-1, 1), eos_tokens), 1) else: # move eos from 
beginning to end of target sentence channel_input = torch.cat((tokens[:, 1:step + 1].repeat(1, k).view(-1, step), fw_top_k_idx.view(-1, 1), eos_tokens), 1) ch_input_lengths = torch.tensor(np.full(channel_input.size(0), cur_tgt_size)) ch_input_lengths[eos_idx] = cur_tgt_size-1 if self.channel_scoring_type == "unnormalized": ch_encoder_output = channel_model.encoder(channel_input, src_lengths=ch_input_lengths) ch_decoder_output, _ = channel_model.decoder(temp_src_tokens_full, encoder_out=ch_encoder_output, features_only=True) del ch_encoder_output ch_intermed_scores = channel_model.decoder.unnormalized_scores_given_target(ch_decoder_output, target_ids=temp_src_tokens_full[:, 1:]) ch_intermed_scores = ch_intermed_scores.float() ch_intermed_scores *= not_padding.float() ch_scores = torch.sum(ch_intermed_scores, dim=1) elif self.channel_scoring_type == "k2_separate": for k_idx in range(k): k_eos_tokens = eos_tokens[k_idx::k, :] if step == 0: k_ch_input = torch.cat((fw_top_k_idx[:, k_idx:k_idx+1], k_eos_tokens), 1) else: # move eos from beginning to end of target sentence k_ch_input = torch.cat((tokens[:, 1:step + 1], fw_top_k_idx[:, k_idx:k_idx+1], k_eos_tokens), 1) k_ch_input_lengths = ch_input_lengths[k_idx::k] k_ch_output = channel_model(k_ch_input, k_ch_input_lengths, src_tokens) k_ch_lprobs = channel_model.get_normalized_probs(k_ch_output, log_probs=True) k_ch_intermed_scores = torch.gather(k_ch_lprobs[:, :-1, :], 2, src_tokens[:, 1:].unsqueeze(2)).squeeze(2) k_ch_intermed_scores *= not_padding.float() ch_scores[k_idx::k] = torch.sum(k_ch_intermed_scores, dim=1) elif self.channel_scoring_type == "src_vocab": ch_encoder_output = channel_model.encoder(channel_input, src_lengths=ch_input_lengths) ch_decoder_output, _ = channel_model.decoder(temp_src_tokens_full, encoder_out=ch_encoder_output, features_only=True) del ch_encoder_output ch_lprobs = normalized_scores_with_batch_vocab( channel_model.decoder, ch_decoder_output, src_tokens, k, bsz, beam_size, 
self.src_dict.pad_index, top_k=self.top_k_vocab) ch_scores = torch.sum(ch_lprobs, dim=1) elif self.channel_scoring_type == "src_vocab_batched": ch_bsz_size = temp_src_tokens_full.shape[0] ch_lprobs_list = [None] * len(range(0, ch_bsz_size, self.ch_scoring_bsz)) for i, start_idx in enumerate(range(0, ch_bsz_size, self.ch_scoring_bsz)): end_idx = min(start_idx + self.ch_scoring_bsz, ch_bsz_size) temp_src_tokens_full_batch = temp_src_tokens_full[start_idx:end_idx, :] channel_input_batch = channel_input[start_idx:end_idx, :] ch_input_lengths_batch = ch_input_lengths[start_idx:end_idx] ch_encoder_output_batch = channel_model.encoder(channel_input_batch, src_lengths=ch_input_lengths_batch) ch_decoder_output_batch, _ = channel_model.decoder(temp_src_tokens_full_batch, encoder_out=ch_encoder_output_batch, features_only=True) ch_lprobs_list[i] = normalized_scores_with_batch_vocab( channel_model.decoder, ch_decoder_output_batch, src_tokens, k, bsz, beam_size, self.src_dict.pad_index, top_k=self.top_k_vocab, start_idx=start_idx, end_idx=end_idx) ch_lprobs = torch.cat(ch_lprobs_list, dim=0) ch_scores = torch.sum(ch_lprobs, dim=1) else: ch_output = channel_model(channel_input, ch_input_lengths, temp_src_tokens_full) ch_lprobs = channel_model.get_normalized_probs(ch_output, log_probs=True) ch_intermed_scores = torch.gather(ch_lprobs[:, :-1, :], 2, temp_src_tokens_full[:, 1:].unsqueeze(2)).squeeze().view(bsz*beam_size*k, -1) ch_intermed_scores *= not_padding.float() ch_scores = torch.sum(ch_intermed_scores, dim=1) else: cur_tgt_size = 0 ch_scores = ch_scores.view(bsz*beam_size, k) expanded_lm_prefix_scores = lm_prefix_scores.unsqueeze(1).expand(-1, k).flatten() if self.share_tgt_dict: lm_scores = get_lm_scores(lm, tokens[:, :step + 1].view(-1, step+1), lm_incremental_states, fw_top_k_idx.view(-1, 1), torch.tensor(np.full(tokens.size(0), step+1)), k) else: new_lm_input = dict2dict(tokens[:, :step + 1].view(-1, step+1), self.tgt_to_lm) new_cands = dict2dict(fw_top_k_idx.view(-1, 
1), self.tgt_to_lm) lm_scores = get_lm_scores(lm, new_lm_input, lm_incremental_states, new_cands, torch.tensor(np.full(tokens.size(0), step+1)), k) lm_scores.add_(expanded_lm_prefix_scores) ch_lm_scores = combine_ch_lm(self.combine_method, ch_scores, lm_scores, src_size, cur_tgt_size) # initialize all as min value new_fw_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1) new_ch_lm_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1) new_lm_lprobs = ch_scores.new(lprobs_size).fill_(-1e17).view(bsz*beam_size, -1) new_fw_lprobs[:, self.pad] = -math.inf new_ch_lm_lprobs[:, self.pad] = -math.inf new_lm_lprobs[:, self.pad] = -math.inf new_fw_lprobs.scatter_(1, fw_top_k_idx, fw_top_k) new_ch_lm_lprobs.scatter_(1, fw_top_k_idx, ch_lm_scores) new_lm_lprobs.scatter_(1, fw_top_k_idx, lm_scores.view(-1, k)) return new_fw_lprobs, new_ch_lm_lprobs, new_lm_lprobs def combine_ch_lm(combine_type, ch_scores, lm_scores1, src_size, tgt_size): if self.channel_scoring_type == "unnormalized": ch_scores = self.log_softmax_fn( ch_scores.view(-1, self.beam_size * self.k2) ).view(ch_scores.shape) ch_scores = ch_scores * self.ch_weight lm_scores1 = lm_scores1 * self.lm_weight if combine_type == "lm_only": # log P(T|S) + log P(T) ch_scores = lm_scores1.view(ch_scores.size()) elif combine_type == "noisy_channel": # 1/t log P(T|S) + 1/s log P(S|T) + 1/t log P(T) if self.normalize_lm_scores_by_tgt_len: ch_scores.div_(src_size) lm_scores_norm = lm_scores1.view(ch_scores.size()).div(tgt_size) ch_scores.add_(lm_scores_norm) # 1/t log P(T|S) + 1/s log P(S|T) + 1/s log P(T) else: ch_scores.add_(lm_scores1.view(ch_scores.size())) ch_scores.div_(src_size) return ch_scores if self.channel_models is not None: channel_model = self.channel_models[0] # assume only one channel_model model else: channel_model = None lm = EnsembleModel(self.lm_models) lm_incremental_states = torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [ torch.jit.annotate(Dict[str, 
Dict[str, Optional[Tensor]]], {}) for i in range(lm.models_size) ], ) reorder_state = None batch_idxs = None for step in range(max_len + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as(batch_idxs) reorder_state.view(-1, beam_size).add_(corr.unsqueeze(-1) * beam_size) model.reorder_incremental_state(incremental_states, reorder_state) encoder_outs = model.reorder_encoder_out(encoder_outs, reorder_state) lm.reorder_incremental_state(lm_incremental_states, reorder_state) fw_lprobs, avg_attn_scores = model.forward_decoder( tokens[:, :step + 1], encoder_outs, incremental_states, temperature=self.temperature, ) fw_lprobs[:, self.pad] = -math.inf # never select pad fw_lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty fw_lprobs, ch_lm_lprobs, lm_lprobs = noisy_channel_rescoring(fw_lprobs, beam_size, bsz, src_tokens, tokens, self.k2) # handle min and max length constraints if step >= max_len: fw_lprobs[:, :self.eos] = -math.inf fw_lprobs[:, self.eos + 1:] = -math.inf elif step < self.min_len: fw_lprobs[:, self.eos] = -math.inf # handle prefix tokens (possibly with different lengths) if prefix_tokens is not None and step < prefix_tokens.size(1): prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) prefix_mask = prefix_toks.ne(self.pad) prefix_fw_lprobs = fw_lprobs.gather(-1, prefix_toks.unsqueeze(-1)) fw_lprobs[prefix_mask] = -math.inf fw_lprobs[prefix_mask] = fw_lprobs[prefix_mask].scatter_( -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_fw_lprobs ) prefix_ch_lm_lprobs = ch_lm_lprobs.gather(-1, prefix_toks.unsqueeze(-1)) ch_lm_lprobs[prefix_mask] = -math.inf ch_lm_lprobs[prefix_mask] = ch_lm_lprobs[prefix_mask].scatter_( -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_ch_lm_lprobs ) prefix_lm_lprobs 
= lm_lprobs.gather(-1, prefix_toks.unsqueeze(-1)) lm_lprobs[prefix_mask] = -math.inf lm_lprobs[prefix_mask] = lm_lprobs[prefix_mask].scatter_( -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lm_lprobs ) # if prefix includes eos, then we should make sure tokens and # scores are the same across all beams eos_mask = prefix_toks.eq(self.eos) if eos_mask.any(): # validate that the first beam matches the prefix first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[:, 0, 1:step + 1] eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step] assert (first_beam == target_prefix).all() def replicate_first_beam(tensor, mask): tensor = tensor.view(-1, beam_size, tensor.size(-1)) tensor[mask] = tensor[mask][:, :1, :] return tensor.view(-1, tensor.size(-1)) # copy tokens, scores and lprobs from the first beam to all beams tokens = replicate_first_beam(tokens, eos_mask_batch_dim) scores = replicate_first_beam(scores, eos_mask_batch_dim) fw_lprobs = replicate_first_beam(fw_lprobs, eos_mask_batch_dim) ch_lm_lprobs = replicate_first_beam(ch_lm_lprobs, eos_mask_batch_dim) lm_lprobs = replicate_first_beam(lm_lprobs, eos_mask_batch_dim) if self.no_repeat_ngram_size > 0: # for each beam and batch sentence, generate a list of previous ngrams gen_ngrams = [{} for bbsz_idx in range(bsz * beam_size)] for bbsz_idx in range(bsz * beam_size): gen_tokens = tokens[bbsz_idx].tolist() for ngram in zip(*[gen_tokens[i:] for i in range(self.no_repeat_ngram_size)]): gen_ngrams[bbsz_idx][tuple(ngram[:-1])] = \ gen_ngrams[bbsz_idx].get(tuple(ngram[:-1]), []) + [ngram[-1]] # Record attention scores if avg_attn_scores is not None: if attn is None: attn = scores.new(bsz * beam_size, src_tokens.size(1), max_len + 2) attn_buf = attn.clone() nonpad_idxs = src_tokens.ne(self.pad) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(fw_lprobs) scores_buf = scores_buf.type_as(fw_lprobs) 
self.search.set_src_lengths(src_lengths_no_eos) if self.no_repeat_ngram_size > 0: def calculate_banned_tokens(bbsz_idx): # before decoding the next token, prevent decoding of ngrams that have already appeared ngram_index = tuple(tokens[bbsz_idx, step + 2 - self.no_repeat_ngram_size:step + 1].tolist()) return gen_ngrams[bbsz_idx].get(ngram_index, []) if step + 2 - self.no_repeat_ngram_size >= 0: # no banned tokens if we haven't generated no_repeat_ngram_size tokens yet banned_tokens = [calculate_banned_tokens(bbsz_idx) for bbsz_idx in range(bsz * beam_size)] else: banned_tokens = [[] for bbsz_idx in range(bsz * beam_size)] for bbsz_idx in range(bsz * beam_size): fw_lprobs[bbsz_idx, banned_tokens[bbsz_idx]] = -math.inf combined_noisy_channel_scores, fw_lprobs_top_k, lm_lprobs_top_k, cand_indices, cand_beams = self.search.step( step, fw_lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], ch_lm_lprobs.view(bsz, -1, self.vocab_size), lm_lprobs.view(bsz, -1, self.vocab_size), self.combine_method ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos (except for candidates to be ignored) eos_mask = cand_indices.eq(self.eos) eos_mask[:, :beam_size] &= ~cands_to_ignore # only consider eos when it's among the top beam_size indices eos_bbsz_idx = torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] ) finalized_sents = set() if eos_bbsz_idx.numel() > 0: eos_scores = torch.masked_select( fw_lprobs_top_k[:, :beam_size], mask=eos_mask[:, :beam_size] ) combined_noisy_channel_eos_scores = torch.masked_select( combined_noisy_channel_scores[:, :beam_size], mask=eos_mask[:, :beam_size], ) # finalize hypo using channel model score finalized_sents = finalize_hypos( step, eos_bbsz_idx, eos_scores, combined_noisy_channel_eos_scores) 
num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = cand_indices.new_ones(bsz) batch_mask[cand_indices.new(finalized_sents)] = 0 batch_idxs = torch.nonzero(batch_mask).squeeze(-1) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) lm_lprobs_top_k = lm_lprobs_top_k[batch_idxs] fw_lprobs_top_k = fw_lprobs_top_k[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths_no_eos = src_lengths_no_eos[batch_idxs] cands_to_ignore = cands_to_ignore[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) scores_buf.resize_as_(scores) tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) tokens_buf.resize_as_(tokens) src_tokens = src_tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) src_lengths = src_lengths.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) lm_prefix_scores = lm_prefix_scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1).squeeze() if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, attn.size(1), -1) attn_buf.resize_as_(attn) bsz = new_bsz else: batch_idxs = None # Set active_mask so that values > cand_size indicate eos or # ignored hypos and values < cand_size indicate candidate # active hypos. After this, the min values per row are the top # candidate active hypos. 
eos_mask[:, :beam_size] |= cands_to_ignore active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[: eos_mask.size(1)], ) # get the top beam_size active hypotheses, which are just the hypos # with the smallest values in active_mask active_hypos, new_cands_to_ignore = buffer('active_hypos'), buffer('new_cands_to_ignore') torch.topk( active_mask, k=beam_size, dim=1, largest=False, out=(new_cands_to_ignore, active_hypos) ) # update cands_to_ignore to ignore any finalized hypos cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] assert (~cands_to_ignore).any(dim=1).all() active_bbsz_idx = buffer('active_bbsz_idx') torch.gather( cand_bbsz_idx, dim=1, index=active_hypos, out=active_bbsz_idx, ) active_scores = torch.gather( fw_lprobs_top_k, dim=1, index=active_hypos, out=scores[:, step].view(bsz, beam_size), ) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses torch.index_select( tokens[:, :step + 1], dim=0, index=active_bbsz_idx, out=tokens_buf[:, :step + 1], ) torch.gather( cand_indices, dim=1, index=active_hypos, out=tokens_buf.view(bsz, beam_size, -1)[:, :, step + 1], ) if step > 0: torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx, out=scores_buf[:, :step], ) torch.gather( fw_lprobs_top_k, dim=1, index=active_hypos, out=scores_buf.view(bsz, beam_size, -1)[:, :, step], ) torch.gather( lm_lprobs_top_k, dim=1, index=active_hypos, out=lm_prefix_scores.view(bsz, beam_size) ) # copy attention for active hypotheses if attn is not None: torch.index_select( attn[:, :, :step + 2], dim=0, index=active_bbsz_idx, out=attn_buf[:, :, :step + 2], ) # swap buffers tokens, tokens_buf = tokens_buf, tokens scores, scores_buf = scores_buf, scores if attn is not None: attn, attn_buf = attn_buf, attn # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): finalized[sent] = 
def get_lm_scores(model, input_tokens, incremental_states, cand_tokens, input_len, k):
    """Return the LM log-probability of each candidate next token.

    Args:
        model: object exposing ``forward_decoder(tokens, encoder_outs=...,
            incremental_states=...)`` that returns ``(lprobs, attn)``.
        input_tokens: prefix tokens fed to the LM, one row per hypothesis.
        incremental_states: incremental decoding state passed to the model.
        cand_tokens: candidate token ids of shape ``(rows * k, 1)``; the ``k``
            candidates of a hypothesis occupy consecutive rows.
        input_len: unused; kept for interface compatibility.
        k: number of candidates per hypothesis.

    Returns:
        1-D tensor with ``rows * k`` log-probabilities.
    """
    with torch.no_grad():
        lm_lprobs, _ = model.forward_decoder(
            input_tokens, encoder_outs=None, incremental_states=incremental_states,
        )
        num_rows = lm_lprobs.size(0)
        # repeat every hypothesis row k times so it lines up with cand_tokens
        repeated = lm_lprobs.repeat(1, k).view(num_rows * k, -1)
        probs_next_wrd = torch.gather(repeated, 1, cand_tokens).squeeze().view(-1)
    return probs_next_wrd


def make_dict2dict(old_dict, new_dict):
    """Map every index of ``old_dict`` to the index of the same symbol in ``new_dict``."""
    return {old_dict.index(sym): new_dict.index(sym) for sym in old_dict.symbols}


def dict2dict(tokens, dict2dict_map):
    """Translate token ids through ``dict2dict_map`` and return a new tensor.

    The mapping runs on a private CPU copy. Previously a tensor that already
    lived on the CPU was remapped in place, silently overwriting the caller's
    data (including any buffer the argument was a view of).

    Args:
        tokens: integer tensor of token ids (any device).
        dict2dict_map: mapping from old id to new id, e.g. the result of
            :func:`make_dict2dict`.

    Returns:
        Tensor of the same shape on the same device as ``tokens``.
    """
    cpu_copy = tokens.to(device="cpu", copy=True)
    cpu_copy.apply_(lambda val, dict2dict_map=dict2dict_map: dict2dict_map[int(val)])
    return cpu_copy.to(tokens.device)


def reorder_tokens(tokens, lengths, eos):
    """Rotate one sequence so it can serve as the reference for P(S|T).

    The last ``lengths`` entries (minus the final one) are moved behind a
    leading ``eos`` and the remaining leading entries are appended at the end.
    """
    # reorder source tokens so they may be used as reference for P(S|T)
    return torch.cat((tokens.new([eos]), tokens[-lengths:-1], tokens[:-lengths]), 0)


def reorder_all_tokens(tokens, lengths, eos):
    """Apply :func:`reorder_tokens` to every row of a batch.

    Presumably turns left-padded ``[<pad> .. w1 w2 <eos>]`` rows into
    ``[<eos> w1 w2 .. <pad>]`` so source tokens can be used to predict
    P(S|T) -- the layout is inferred from the slicing, confirm with callers.
    """
    return torch.stack([reorder_tokens(token, length, eos) for token, length in zip(tokens, lengths)])


def normalized_scores_with_batch_vocab(
    model_decoder, features, target_ids, k, bsz, beam_size,
    pad_idx, top_k=0, vocab_size_meter=None, start_idx=None,
    end_idx=None, **kwargs
):
    """
    Get normalized log probs from a net's output w.r.t. a reduced vocabulary
    consisting of the target IDs in the batch plus the first ``top_k`` IDs.

    Args:
        model_decoder: decoder exposing ``adaptive_softmax`` and
            ``output_projection.weight``.
        features: decoder features of shape ``(rows, seq_len, dim)`` where
            ``rows`` matches the (optionally sliced) expanded targets.
        target_ids: target token ids of shape ``(bsz * beam_size, seq_len)``;
            not modified.
        k: number of candidates per hypothesis (each target row is repeated
            ``k`` times).
        bsz, beam_size: used to reshape the expanded targets.
        pad_idx: padding index in the *original* vocabulary.
        top_k: number of leading vocabulary ids always kept in the reduced
            vocabulary.
        vocab_size_meter: unused; kept for interface compatibility.
        start_idx, end_idx: optional row slice of the expanded targets that
            ``features`` corresponds to.

    Returns:
        Tensor of shape ``(rows, seq_len - 1)`` holding the log-probability of
        each target token, with padding positions set to 0.

    Raises:
        ValueError: if the decoder uses an adaptive softmax.
    """
    if model_decoder.adaptive_softmax is not None:
        raise ValueError("adaptive softmax doesn't work with " +
                         "`normalized_scores_with_batch_vocab()`")

    weight = model_decoder.output_projection.weight
    vocab_ids = torch.unique(
        torch.cat(
            (torch.unique(target_ids), torch.arange(top_k, device=target_ids.device))
        )
    )
    id_map = dict(zip(vocab_ids.tolist(), range(len(vocab_ids))))
    # Remap on a private copy: ``.cpu()`` is a no-op for CPU tensors, so the
    # in-place ``apply_`` used to overwrite the caller's ``target_ids``.
    mapped_target_ids = target_ids.to(device="cpu", copy=True).apply_(
        lambda x, id_map=id_map: id_map[x]
    ).to(target_ids.device)
    expanded_target_ids = mapped_target_ids[:, :].repeat(1, k).view(bsz*beam_size*k, -1)
    if start_idx is not None and end_idx is not None:
        expanded_target_ids = expanded_target_ids[start_idx:end_idx, :]
    logits = F.linear(features, weight[vocab_ids, :])
    log_softmax = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    # squeeze only the gathered dim so single-row / single-step inputs keep
    # their 2-D shape
    intermed_scores = torch.gather(
        log_softmax[:, :-1, :],
        2,
        expanded_target_ids[:, 1:].unsqueeze(2),
    ).squeeze(2)
    # The targets now hold reduced-vocabulary ids, so padding has to be
    # detected with the remapped pad id (-1 never matches when the batch
    # contains no padding at all).
    mapped_pad_idx = id_map.get(pad_idx, -1)
    not_padding = expanded_target_ids[:, 1:] != mapped_pad_idx
    intermed_scores *= not_padding.float()
    return intermed_scores
from fairseq.tasks.translation import TranslationTask from fairseq.tasks.language_modeling import LanguageModelingTask from fairseq import checkpoint_utils import argparse from fairseq.tasks import register_task import torch @register_task("noisy_channel_translation") class NoisyChannelTranslation(TranslationTask): """ Rescore the top k candidates from each beam using noisy channel modeling """ @staticmethod def add_args(parser): """Add task-specific arguments to the parser.""" TranslationTask.add_args(parser) # fmt: off parser.add_argument('--channel-model', metavar='FILE', help='path to P(S|T) model. P(S|T) and P(T|S) must share source and target dictionaries.') parser.add_argument('--combine-method', default='lm_only', choices=['lm_only', 'noisy_channel'], help="""method for combining direct and channel model scores. lm_only: decode with P(T|S)P(T) noisy_channel: decode with 1/t P(T|S) + 1/s(P(S|T)P(T))""") parser.add_argument('--normalize-lm-scores-by-tgt-len', action='store_true', default=False, help='normalize lm score by target length instead of source length') parser.add_argument('--channel-scoring-type', default='log_norm', choices=['unnormalized', 'log_norm', 'k2_separate', 'src_vocab', 'src_vocab_batched'], help="Normalize bw scores with log softmax or return bw scores without log softmax") parser.add_argument('--top-k-vocab', default=0, type=int, help='top k vocab IDs to use with `src_vocab` in channel model scoring') parser.add_argument('--k2', default=50, type=int, help='the top k2 candidates to rescore with the noisy channel model for each beam') parser.add_argument('--ch-wt', default=1, type=float, help='weight for the channel model') parser.add_argument('--lm-model', metavar='FILE', help='path to lm model file, to model P(T). 
P(T) must share the same vocab as the direct model on the target side') parser.add_argument('--lm-data', metavar='FILE', help='path to lm model training data for target language, used to properly load LM with correct dictionary') parser.add_argument('--lm-wt', default=1, type=float, help='the weight of the lm in joint decoding') # fmt: on def build_generator( self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None ): if getattr(args, "score_reference", False): raise NotImplementedError() else: from .noisy_channel_sequence_generator import NoisyChannelSequenceGenerator use_cuda = torch.cuda.is_available() and not self.args.cpu assert self.args.lm_model is not None, '--lm-model required for noisy channel generation!' assert self.args.lm_data is not None, '--lm-data required for noisy channel generation to map between LM and bitext vocabs' if self.args.channel_model is not None: import copy ch_args_task = copy.deepcopy(self.args) tmp = ch_args_task.source_lang ch_args_task.source_lang = ch_args_task.target_lang ch_args_task.target_lang = tmp ch_args_task._name = 'translation' channel_task = TranslationTask.setup_task(ch_args_task) arg_dict = {} arg_dict['task'] = 'language_modeling' arg_dict['sample_break_mode'] = 'eos' arg_dict['data'] = self.args.lm_data arg_dict['output_dictionary_size'] = -1 lm_args = argparse.Namespace(**arg_dict) lm_task = LanguageModelingTask.setup_task(lm_args) lm_dict = lm_task.output_dictionary if self.args.channel_model is not None: channel_models, _ = checkpoint_utils.load_model_ensemble(self.args.channel_model.split(':'), task=channel_task) for model in channel_models: model.make_generation_fast_( beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if self.args.fp16: model.half() if use_cuda: model.cuda() else: channel_models = None lm_models, _ = checkpoint_utils.load_model_ensemble(self.args.lm_model.split(':'), task=lm_task) for model in lm_models: model.make_generation_fast_( 
beamable_mm_beam_size=None if args.no_beamable_mm else args.beam, need_attn=args.print_alignment, ) if self.args.fp16: model.half() if use_cuda: model.cuda() return NoisyChannelSequenceGenerator( combine_method=self.args.combine_method, tgt_dict=self.target_dictionary, src_dict=self.source_dictionary, beam_size=getattr(args, 'beam', 5), max_len_a=getattr(args, 'max_len_a', 0), max_len_b=getattr(args, 'max_len_b', 200), min_len=getattr(args, 'min_len', 1), len_penalty=getattr(args, 'lenpen', 1), unk_penalty=getattr(args, 'unkpen', 0), temperature=getattr(args, 'temperature', 1.), match_source_len=getattr(args, 'match_source_len', False), no_repeat_ngram_size=getattr(args, 'no_repeat_ngram_size', 0), normalize_scores=(not getattr(args, 'unnormalized', False)), channel_models=channel_models, k2=getattr(self.args, 'k2', 50), ch_weight=getattr(self.args, 'ch_wt', 1), channel_scoring_type=self.args.channel_scoring_type, top_k_vocab=self.args.top_k_vocab, lm_models=lm_models, lm_dict=lm_dict, lm_weight=getattr(self.args, 'lm_wt', 1), normalize_lm_scores_by_tgt_len=getattr(self.args, 'normalize_lm_scores_by_tgt_len', False), ) ================================================ FILE: examples/flores101/README.md ================================================

# Flores101: Large-Scale Multilingual Machine Translation ## Introduction Baseline pretrained models for small and large tracks of WMT 21 Large-Scale Multilingual Machine Translation competition. Flores Task at WMT 21: http://www.statmt.org/wmt21/large-scale-multilingual-translation-task.html Flores announcement blog post: https://ai.facebook.com/blog/flores-researchers-kick-off-multilingual-translation-challenge-at-wmt-and-call-for-compute-grants/ ## Pretrained models Model | Num layers | Embed dimension | FFN dimension| Vocab Size | #params | Download ---|---|---|---|---|---|--- `flores101_mm100_615M` | 12 | 1024 | 4096 | 256,000 | 615M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz `flores101_mm100_175M` | 6 | 512 | 2048 | 256,000 | 175M | https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz These models are trained similarly to [M2M-100](https://arxiv.org/abs/2010.11125) with additional support for the languages that are part of the WMT Large-Scale Multilingual Machine Translation track. Full list of languages can be found at the bottom. ## Example Generation code ### Download model, sentencepiece vocab ```bash fairseq=/path/to/fairseq cd $fairseq # Download 615M param model.
wget https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz # Extract tar -xvzf flores101_mm100_615M.tar.gz ``` ### Encode using our SentencePiece Model Note: Install SentencePiece from [here](https://github.com/google/sentencepiece) ```bash fairseq=/path/to/fairseq cd $fairseq # Download example dataset From German to French sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr for lang in de fr ; do python scripts/spm_encode.py \ --model flores101_mm100_615M/sentencepiece.bpe.model \ --output_format=piece \ --inputs=raw_input.de-fr.${lang} \ --outputs=spm.de-fr.${lang} done ``` ### Binarization ```bash fairseq-preprocess \ --source-lang de --target-lang fr \ --testpref spm.de-fr \ --thresholdsrc 0 --thresholdtgt 0 \ --destdir data_bin \ --srcdict flores101_mm100_615M/dict.txt --tgtdict flores101_mm100_615M/dict.txt ``` ### Generation ```bash fairseq-generate \ data_bin \ --batch-size 1 \ --path flores101_mm100_615M/model.pt \ --fixed-dictionary flores101_mm100_615M/dict.txt \ -s de -t fr \ --remove-bpe 'sentencepiece' \ --beam 5 \ --task translation_multi_simple_epoch \ --lang-pairs flores101_mm100_615M/language_pairs.txt \ --decoder-langtok --encoder-langtok src \ --gen-subset test \ --fp16 \ --dataset-impl mmap \ --distributed-world-size 1 --distributed-no-spawn ``` ### Supported Languages and lang code Language | lang code ---|--- Afrikaans | af Amharic | am Arabic | ar Assamese | as Asturian | ast Aymara | ay Azerbaijani | az Bashkir | ba Belarusian | be Bulgarian | bg Bengali | bn Breton | br Bosnian | bs Catalan | ca Cebuano | ceb Chokwe | cjk Czech | cs Welsh | cy Danish | da German | de Dyula| dyu Greek | el English | en Spanish | es Estonian | et Persian | fa Fulah | ff Finnish | fi French | fr Western Frisian | fy Irish | ga Scottish Gaelic | gd Galician | gl Gujarati | gu Hausa | ha Hebrew | he Hindi | hi Croatian | hr Haitian
Creole | ht Hungarian | hu Armenian | hy Indonesian | id Igbo | ig Iloko | ilo Icelandic | is Italian | it Japanese | ja Javanese | jv Georgian | ka Kachin | kac Kamba | kam Kabuverdianu | kea Kongo | kg Kazakh | kk Central Khmer | km Kimbundu | kmb Northern Kurdish | kmr Kannada | kn Korean | ko Kurdish | ku Kyrgyz | ky Luxembourgish | lb Ganda | lg Lingala | ln Lao | lo Lithuanian | lt Luo | luo Latvian | lv Malagasy | mg Maori | mi Macedonian | mk Malayalam | ml Mongolian | mn Marathi | mr Malay | ms Maltese | mt Burmese | my Nepali | ne Dutch | nl Norwegian | no Northern Sotho | ns Nyanja | ny Occitan | oc Oromo | om Oriya | or Punjabi | pa Polish | pl Pashto | ps Portuguese | pt Quechua | qu Romanian | ro Russian | ru Sindhi | sd Shan | shn Sinhala | si Slovak | sk Slovenian | sl Shona | sn Somali | so Albanian | sq Serbian | sr Swati | ss Sundanese | su Swedish | sv Swahili | sw Tamil | ta Telugu | te Tajik | tg Thai | th Tigrinya | ti Tagalog | tl Tswana | tn Turkish | tr Ukrainian | uk Umbundu | umb Urdu | ur Uzbek | uz Vietnamese | vi Wolof | wo Xhosa | xh Yiddish | yi Yoruba | yo Chinese| zh Zulu | zu ================================================ FILE: examples/fully_sharded_data_parallel/README.md ================================================ # Fully Sharded Data Parallel (FSDP) ## Overview Recent work by [Microsoft](https://arxiv.org/abs/1910.02054) and [Google](https://arxiv.org/abs/2004.13336) has shown that data parallel training can be made significantly more efficient by sharding the model parameters and optimizer state across data parallel workers. These ideas are encapsulated in the new **`FullyShardedDataParallel` (FSDP)** wrapper provided by [fairscale](https://github.com/facebookresearch/fairscale/). 
Compared to PyTorch DDP: * FSDP produces identical results as PyTorch DDP (it's still synchronous data parallel training) * FSDP shards parameters (FP16 + FP32) and optimizer state across data parallel GPUs * FSDP is faster than PyTorch DDP because the optimizer step is sharded, and the communication can be overlapped with the forward pass * FSDP enables training 13B parameter models on 8 GPUs and 175B parameter models on 128 GPUs FSDP is fully supported in fairseq via the following new arguments: * `--ddp-backend=fully_sharded`: enables full sharding via FSDP * `--cpu-offload`: offloads the optimizer state and FP32 model copy to CPU (combine with `--optimizer=cpu_adam`) * `--no-reshard-after-forward`: increases training speed for large models (1B+ params) and is similar to ZeRO stage 2 * other popular options (`--fp16`, `--update-freq`, `--checkpoint-activations`, `--offload-activations`, etc.) continue to work as normal
Limitations

FSDP currently has several limitations compared to fairseq's default DDP backend (PyTorch DDP): * while FSDP is fully compatible with pointwise Optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.), it is not currently compatible with non-pointwise Optimizers (e.g., Adagrad, Adafactor, LAMB, etc.) * FSDP depends on flattening the parameters, so models that currently require `--fp16-no-flatten-grads` may not be supported See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed explanation of these and other limitations.

How it works

Fully Sharded Data Parallel See the [fairscale docs](https://fairscale.readthedocs.io/en/latest/api/nn/fsdp_tips.html) for a more detailed explanation of how FSDP works.

## Example usage The following examples illustrate how to train a very large language model with 13 billion parameters on 1 GPU by offloading parameters and optimizer states to CPU, or on 8 GPUs by fully sharding the params and optimizer states across GPUs. These examples use the WikiText-103 dataset for demonstration purposes, but in practice a much larger dataset will be needed to achieve good results. Follow the [instructions here](https://github.com/pytorch/fairseq/blob/main/examples/roberta/README.pretraining.md#1-preprocess-the-data) to preprocess the WikiText-103 dataset using the GPT-2/RoBERTa vocabulary. ### 13B params on 1 V100 GPU (with CPU offloading) The following command trains a 13B parameter GPT-3 model on a single V100 GPU using the `--cpu-offload` feature to offload parameters and optimizer states to CPU. In this setting, the optimizer step (Adam) happens on CPU. We also use the `--checkpoint-activations` feature (sometimes called [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html)), which further saves memory in exchange for a small increase in computation. **Requirements:** - Install the latest master version of fairscale: `pip install git+https://github.com/facebookresearch/fairscale.git@master` - You'll need 32GB of GPU memory and ~256GB of system memory to train the 13B param model. - If you have less system memory, the 6.7B param model can be trained with ~128GB of system memory, just set `--arch transformer_lm_gpt3_6_7` - We use the CPU Adam optimizer from [DeepSpeed](https://github.com/microsoft/DeepSpeed), so you'll need to `pip install deepspeed` before running the command. **Notes:** - The command will take ~5 minutes to start training, during which time it will appear to be hung, since randomly initializing 13B weights can be slow. - The `--cpu-offload` feature requires training in mixed precision (`--fp16`). - Tune the `OMP_NUM_THREADS` env variable for best performance with CPU offloading. 
- The example command below stops training after 10 steps (`--max-update 10`) and does not save checkpoints (`--no-save`). ```bash OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0 \ fairseq-train data-bin/wikitext-103-roberta-bpe-bin \ --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ --cpu-offload --checkpoint-activations \ --task language_modeling --tokens-per-sample 2048 --batch-size 8 \ --arch transformer_lm_gpt3_13 \ --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ --max-update 10 --no-save --log-format json --log-interval 1 ```
Example output

``` (...) 2021-03-08 12:29:51 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920) (...) 2021-03-08 12:29:51 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs) 2021-03-08 12:29:51 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8 (...) Adam Optimizer #0 is created with AVX2 arithmetic capability. Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1 (...) 2021-03-08 12:31:36 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.475", "ppl": "91120.8", "wps": "0", "ups": "0", "wpb": "16384", "bsz": "8", "num_updates": "1", "lr": "2e-05", "gnorm": "20.751", "loss_scale": "4", "train_wall": "99", "gb_free": "9.3", "wall": "105"} 2021-03-08 12:32:33 | INFO | train_inner | {"epoch": 1, "update": 0.0, "loss": "16.446", "ppl": "89281.6", "wps": "288.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "2", "lr": "4e-05", "gnorm": "19.777", "loss_scale": "4", "train_wall": "57", "gb_free": "9.3", "wall": "161"} 2021-03-08 12:33:12 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0 2021-03-08 12:33:51 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0 2021-03-08 12:34:45 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "25.22", "ppl": "3.90691e+07", "wps": "123.4", "ups": "0.01", "wpb": "16384", "bsz": "8", "num_updates": "3", "lr": "6e-05", "gnorm": "131.281", "loss_scale": "1", "train_wall": "133", "gb_free": "9.3", "wall": "294"} 2021-03-08 12:35:43 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.079", "ppl": "276809", "wps": "285.5", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "4", "lr": "8e-05", "gnorm": "13.776", "loss_scale": "1", "train_wall": "57", "gb_free": "9.3", "wall": "351"} 2021-03-08 12:36:35 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "23.729", "ppl": 
"1.39088e+07", "wps": "316.7", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "5", "lr": "0.0001", "gnorm": "72.774", "loss_scale": "1", "train_wall": "52", "gb_free": "9.3", "wall": "403"} 2021-03-08 12:37:28 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "20.429", "ppl": "1.41203e+06", "wps": "307.6", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "6", "lr": "8e-05", "gnorm": "60.846", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "456"} 2021-03-08 12:38:27 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.965", "ppl": "511684", "wps": "279.4", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "7", "lr": "6e-05", "gnorm": "22.687", "loss_scale": "1", "train_wall": "59", "gb_free": "9.3", "wall": "515"} 2021-03-08 12:39:18 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "18.345", "ppl": "332887", "wps": "319.1", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "8", "lr": "4e-05", "gnorm": "8.451", "loss_scale": "1", "train_wall": "51", "gb_free": "9.3", "wall": "566"} 2021-03-08 12:40:11 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "18.262", "ppl": "314336", "wps": "305.9", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "9", "lr": "2e-05", "gnorm": "6.457", "loss_scale": "1", "train_wall": "54", "gb_free": "9.3", "wall": "620"} 2021-03-08 12:41:04 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "17.556", "ppl": "192686", "wps": "311.8", "ups": "0.02", "wpb": "16384", "bsz": "8", "num_updates": "10", "lr": "0", "gnorm": "5.796", "loss_scale": "1", "train_wall": "53", "gb_free": "9.3", "wall": "673"} 2021-03-08 12:41:04 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10 2021-03-08 12:41:04 | INFO | fairseq_cli.train | begin validation on "valid" subset 2021-03-08 12:43:15 | INFO | valid | {"epoch": 1, "valid_loss": "17.953", "valid_ppl": "253807", "valid_wps": "1868.4", "valid_wpb": 
"15400.2", "valid_bsz": "7.6", "valid_num_updates": "10"} 2021-03-08 12:43:15 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below) 2021-03-08 12:43:15 | INFO | train | {"epoch": 1, "train_loss": "19.351", "train_ppl": "668509", "train_wps": "210.9", "train_ups": "0.01", "train_wpb": "16384", "train_bsz": "8", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "36.26", "train_loss_scale": "1", "train_train_wall": "667", "train_gb_free": "9.3", "train_wall": "804"} 2021-03-08 12:43:15 | INFO | fairseq_cli.train | done training in 798.6 seconds ```

### 13B params on 8 V100 GPUs (with full parameter + optimizer state sharding) FSDP can also shard the parameters and optimizer states across multiple GPUs, reducing memory requirements significantly. On 8 x 32GB GPUs, sharding enables training the same 13B parameter model *without offloading the parameters to CPU*. However, without CPU offloading we'd only be able to fit a batch size of 1 per GPU, which would cause training speed to suffer. We obtain the best performance on 8 GPUs by combining full sharding and CPU offloading. The following command trains the same 13B parameter GPT-3 model as before on 8 x 32GB V100 GPUs; training speed increases superlinearly from ~310 words per second to ~3200 words per second. ```bash OMP_NUM_THREADS=20 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ fairseq-train data-bin/wikitext-103-roberta-bpe-bin \ --ddp-backend fully_sharded --fp16 --fp16-init-scale 4 \ --cpu-offload --checkpoint-activations \ --task language_modeling --tokens-per-sample 2048 --batch-size 8 \ --arch transformer_lm_gpt3_13 \ --optimizer cpu_adam --adam-betas "(0.9,0.98)" \ --lr 0.0001 --lr-scheduler polynomial_decay --warmup-updates 5 --total-num-update 10 \ --max-update 10 --no-save --log-format json --log-interval 1 ```
Example output

``` (...) 2021-03-08 18:04:09 | INFO | fairseq_cli.train | num. model params: 13,110,865,920 (num. trained: 13,110,865,920) (...) 2021-03-08 18:04:09 | INFO | fairseq_cli.train | training on 8 devices (GPUs/TPUs) 2021-03-08 18:04:09 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 8 (...) Adam Optimizer #0 is created with AVX2 arithmetic capability. Config: alpha=0.000100, betas=(0.900000, 0.980000), weight_decay=0.000000, adam_w=1 (...) 2021-03-08 18:05:06 | INFO | train_inner | {"epoch": 1, "update": 0.001, "loss": "16.408", "ppl": "86945.6", "wps": "0", "ups": "0", "wpb": "131072", "bsz": "64", "num_updates": "1", "lr": "2e-05", "gnorm": "18.27", "loss_scale": "4", "train_wall": "47", "gb_free": "9.3", "wall": "56"} 2021-03-08 18:05:45 | INFO | train_inner | {"epoch": 1, "update": 0.002, "loss": "16.352", "ppl": "83644.3", "wps": "3283.4", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "2", "lr": "4e-05", "gnorm": "18.411", "loss_scale": "4", "train_wall": "40", "gb_free": "9.3", "wall": "96"} 2021-03-08 18:06:21 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 2.0 2021-03-08 18:06:56 | INFO | fairseq.trainer | NOTE: gradient overflow detected, ignoring gradient, setting loss scale to: 1.0 2021-03-08 18:07:37 | INFO | train_inner | {"epoch": 1, "update": 0.006, "loss": "23.682", "ppl": "1.34537e+07", "wps": "1176.6", "ups": "0.01", "wpb": "131072", "bsz": "64", "num_updates": "3", "lr": "6e-05", "gnorm": "119.682", "loss_scale": "1", "train_wall": "111", "gb_free": "9.3", "wall": "208"} 2021-03-08 18:08:18 | INFO | train_inner | {"epoch": 1, "update": 0.007, "loss": "18.988", "ppl": "519921", "wps": "3189.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "4", "lr": "8e-05", "gnorm": "14.934", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "249"} 2021-03-08 18:08:59 | INFO | train_inner | {"epoch": 1, "update": 0.008, "loss": "20.08", 
"ppl": "1.10798e+06", "wps": "3223.1", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "5", "lr": "0.0001", "gnorm": "59.92", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "289"} 2021-03-08 18:09:39 | INFO | train_inner | {"epoch": 1, "update": 0.009, "loss": "18.323", "ppl": "327980", "wps": "3256.6", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "6", "lr": "8e-05", "gnorm": "37.425", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "330"} 2021-03-08 18:10:20 | INFO | train_inner | {"epoch": 1, "update": 0.01, "loss": "17.264", "ppl": "157354", "wps": "3188.7", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "7", "lr": "6e-05", "gnorm": "10.824", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "371"} 2021-03-08 18:11:01 | INFO | train_inner | {"epoch": 1, "update": 0.011, "loss": "16.794", "ppl": "113647", "wps": "3230", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "8", "lr": "4e-05", "gnorm": "5.616", "loss_scale": "1", "train_wall": "41", "gb_free": "9.3", "wall": "411"} 2021-03-08 18:11:39 | INFO | train_inner | {"epoch": 1, "update": 0.012, "loss": "16.706", "ppl": "106938", "wps": "3384", "ups": "0.03", "wpb": "131072", "bsz": "64", "num_updates": "9", "lr": "2e-05", "gnorm": "5.318", "loss_scale": "1", "train_wall": "39", "gb_free": "9.3", "wall": "450"} 2021-03-08 18:12:19 | INFO | train_inner | {"epoch": 1, "update": 0.013, "loss": "16.548", "ppl": "95796.2", "wps": "3274.4", "ups": "0.02", "wpb": "131072", "bsz": "64", "num_updates": "10", "lr": "0", "gnorm": "5.22", "loss_scale": "1", "train_wall": "40", "gb_free": "9.3", "wall": "490"} 2021-03-08 18:12:19 | INFO | fairseq_cli.train | Stopping training due to num_updates: 10 >= max_update: 10 2021-03-08 18:12:19 | INFO | fairseq_cli.train | begin validation on "valid" subset 2021-03-08 18:12:45 | INFO | valid | {"epoch": 1, "valid_loss": "16.624", "valid_ppl": "101000", "valid_wps": "10855.9", 
"valid_wpb": "123202", "valid_bsz": "60.5", "valid_num_updates": "10"} 2021-03-08 18:12:45 | INFO | fairseq_cli.train | end of epoch 1 (average epoch stats below) 2021-03-08 18:12:45 | INFO | train | {"epoch": 1, "train_loss": "18.114", "train_ppl": "283776", "train_wps": "2567.8", "train_ups": "0.02", "train_wpb": "131072", "train_bsz": "64", "train_num_updates": "10", "train_lr": "0", "train_gnorm": "29.562", "train_loss_scale": "1", "train_train_wall": "480", "train_gb_free": "9.3", "train_wall": "516"} 2021-03-08 18:12:45 | INFO | fairseq_cli.train | done training in 509.9 seconds ```

================================================ FILE: examples/gottbert/README.md ================================================ # GottBERT: a pure German language model ## Introduction [GottBERT](http://arxiv.org/abs/2012.02110) is a pretrained language model trained on 145GB of German text based on RoBERTa. ## Example usage ### fairseq ##### Load GottBERT from torch.hub (PyTorch >= 1.1): ```python import torch gottbert = torch.hub.load('pytorch/fairseq', 'gottbert-base') gottbert.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Load GottBERT (for PyTorch 1.0 or custom models): ```python # Download gottbert model wget https://dl.gottbert.de/fairseq/models/gottbert-base.tar.gz tar -xzvf gottbert-base.tar.gz # Load the model in fairseq from fairseq.models.roberta import GottbertModel gottbert = GottbertModel.from_pretrained('/path/to/gottbert') gottbert.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Filling masks: ```python masked_line = 'Gott ist <mask> ! :)' gottbert.fill_mask(masked_line, topk=3) # [('Gott ist gut ! :)', 0.3642110526561737, ' gut'), # ('Gott ist überall ! :)', 0.06009674072265625, ' überall'), # ('Gott ist großartig ! :)', 0.0370681993663311, ' großartig')] ``` ##### Extract features from GottBERT ```python # Extract the last layer's features line = "Der erste Schluck aus dem Becher der Naturwissenschaft macht atheistisch , aber auf dem Grunde des Bechers wartet Gott !"
tokens = gottbert.encode(line) last_layer_features = gottbert.extract_features(tokens) assert last_layer_features.size() == torch.Size([1, 27, 768]) # Extract all layer's features (layer 0 is the embedding layer) all_layers = gottbert.extract_features(tokens, return_all_hiddens=True) assert len(all_layers) == 13 assert torch.all(all_layers[-1] == last_layer_features) ``` ## Citation If you use our work, please cite: ```bibtex @misc{scheible2020gottbert, title={GottBERT: a pure German Language Model}, author={Raphael Scheible and Fabian Thomczyk and Patric Tippmann and Victor Jaravine and Martin Boeker}, year={2020}, eprint={2012.02110}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ================================================ FILE: examples/hubert/README.md ================================================ # HuBERT ## Pre-trained and fine-tuned (ASR) models Model | Pretraining Data | Finetuning Dataset | Model | Quantizer |---|---|---|---|--- HuBERT Base (~95M params) | [Librispeech](http://www.openslr.org/12) 960 hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) | [L9 km500](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin) HuBERT Large (~316M params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt) HuBERT Extra Large (~1B params) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | No finetuning (Pretrained Model) | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt) HuBERT Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt) HuBERT Extra Large | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | 
[Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt) ## Load a model ``` ckpt_path = "/path/to/the/checkpoint.pt" models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path]) model = models[0] ``` ## Train a new model ### Data preparation Follow the steps in `./simple_kmeans` to create: - `{train,valid}.tsv` waveform list files - `{train,valid}.km` frame-aligned pseudo label files. - `dict.km.txt` a dummy dictionary The `label_rate` is the same as the feature frame rate used for clustering, which is 100Hz for MFCC features and 50Hz for HuBERT features by default. ### Pre-train a HuBERT model Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km` are saved at `/path/to/labels`, and the label rate is 100Hz. To train a base model (12 layer transformer), run: ```sh $ python fairseq_cli/hydra_train.py \ --config-dir /path/to/fairseq-py/examples/hubert/config/pretrain \ --config-name hubert_base_librispeech \ task.data=/path/to/data task.label_dir=/path/to/labels task.labels='["km"]' model.label_rate=100 ``` ### Fine-tune a HuBERT model with a CTC loss Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their corresponding character transcripts `{train,valid}.ltr` are saved at `/path/to/trans`. To fine-tune a pre-trained HuBERT model at `/path/to/checkpoint`, run ```sh $ python fairseq_cli/hydra_train.py \ --config-dir /path/to/fairseq-py/examples/hubert/config/finetune \ --config-name base_10h \ task.data=/path/to/data task.label_dir=/path/to/trans \ model.w2v_path=/path/to/checkpoint ``` ### Decode a HuBERT model Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is saved at `/path/to/checkpoint`. 
We support three decoding modes: - Viterbi decoding: greedy decoding without a language model - KenLM decoding: decoding with an arpa-format KenLM n-gram language model - Fairseq-LM decoding: decoding with a Fairseq neural language model #### Viterbi decoding `task.normalize` needs to be consistent with the value used during fine-tuning. Decoding results will be saved at `/path/to/experiment/directory/decode/viterbi/test`. ```sh $ python examples/speech_recognition/new/infer.py \ --config-dir /path/to/fairseq-py/examples/hubert/config/decode \ --config-name infer_viterbi \ task.data=/path/to/data \ task.normalize=[true|false] \ decoding.exp_dir=/path/to/experiment/directory \ common_eval.path=/path/to/checkpoint dataset.gen_subset=test \ ``` #### KenLM / Fairseq-LM decoding Suppose the pronunciation lexicon and the n-gram LM are saved at `/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be saved at `/path/to/experiment/directory/decode/kenlm/test`. ```sh $ python examples/speech_recognition/new/infer.py \ --config-dir /path/to/fairseq-py/examples/hubert/config/decode \ --config-name infer_kenlm \ task.data=/path/to/data \ task.normalize=[true|false] \ decoding.exp_dir=/path/to/experiment/directory \ common_eval.path=/path/to/checkpoint dataset.gen_subset=test \ decoding.decoder.lexicon=/path/to/lexicon \ decoding.decoder.lmpath=/path/to/arpa ``` The command above uses the default decoding hyperparameters, which can be found in `examples/speech_recognition/hydra/decoder.py`. These parameters can be configured from the command line. For example, to search with a beam size of 500, we can append `decoding.decoder.beam=500` to the command above. Important parameters include: - decoding.decoder.beam - decoding.decoder.beamthreshold - decoding.decoder.lmweight - decoding.decoder.wordscore - decoding.decoder.silweight To decode with a Fairseq LM, use `--config-name infer_fsqlm` instead, and change the paths of the lexicon and LM accordingly.
================================================ FILE: examples/hubert/config/decode/ax_sweep/ngram.yaml ================================================ # @package _global_ common_eval: results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset} hydra: sweeper: ax_config: max_trials: 60 early_stop: minimize: true max_epochs_without_improvement: 10 epsilon: 0.025 experiment: name: ${dataset.gen_subset} objective_name: wer minimize: true parameter_constraints: null outcome_constraints: null status_quo: null client: verbose_logging: false random_seed: null params: decoding.decoder.lmweight: type: range bounds: [0.0, 8.0] decoding.decoder.wordscore: type: range bounds: [-5.0, 5.0] decoding.decoder.silweight: type: range bounds: [-10.0, 0.0] ================================================ FILE: examples/hubert/config/decode/ax_sweep/transformer.yaml ================================================ # @package _global_ common_eval: results_path: ${decoding.exp_dir}/decode/${decoding.decoder.name}_ax/${dataset.gen_subset} hydra: sweeper: ax_config: max_trials: 60 early_stop: minimize: true max_epochs_without_improvement: 10 epsilon: 0.025 experiment: name: ${dataset.gen_subset} objective_name: wer minimize: true parameter_constraints: null outcome_constraints: null status_quo: null client: verbose_logging: false random_seed: null params: decoding.decoder.lmweight: type: range bounds: [0.0, 4.0] decoding.decoder.wordscore: type: range bounds: [-5.0, 5.0] decoding.decoder.silweight: type: range bounds: [-8.0, 0.0] ================================================ FILE: examples/hubert/config/decode/infer_fsqlm.yaml ================================================ # @package _group_ defaults: - model: null hydra: run: dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} sweep: dir: ${common_eval.results_path} subdir: 
beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} task: _name: hubert_pretraining single_target: true fine_tuning: true data: ??? normalize: ??? decoding: type: fairseqlm lexicon: ??? lmpath: ??? beamthreshold: 25 beam: 500 lmweight: 2 wordscore: -1 silweight: 0 unique_wer_file: true common_eval: results_path: ??? path: ??? post_process: letter dataset: max_tokens: 1100000 gen_subset: ??? ================================================ FILE: examples/hubert/config/decode/infer_kenlm.yaml ================================================ # @package _group_ defaults: - model: null hydra: run: dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} sweep: dir: ${common_eval.results_path} subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} task: _name: hubert_pretraining single_target: true fine_tuning: true data: ??? normalize: ??? decoding: type: kenlm lexicon: ??? lmpath: ??? beamthreshold: 100 beam: 500 lmweight: 2 wordscore: -1 silweight: 0 unique_wer_file: true common_eval: results_path: ??? path: ??? post_process: letter dataset: max_tokens: 1100000 gen_subset: ??? ================================================ FILE: examples/hubert/config/decode/infer_viterbi.yaml ================================================ # @package _group_ defaults: - model: null hydra: run: dir: ${common_eval.results_path}/viterbi sweep: dir: ${common_eval.results_path} subdir: viterbi task: _name: hubert_pretraining single_target: true fine_tuning: true data: ??? normalize: ??? decoding: type: viterbi unique_wer_file: true common_eval: results_path: ??? path: ??? post_process: letter dataset: max_tokens: 1100000 gen_subset: ??? 
================================================ FILE: examples/hubert/config/decode/run/submitit_slurm.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: ${distributed_training.distributed_world_size} gpus_per_node: ${distributed_training.distributed_world_size} tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 1 mem_gb: 200 timeout_min: 4320 max_num_timeout: 50 name: ${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir}/submitit distributed_training: distributed_world_size: 1 distributed_no_spawn: true distributed_port: 29761 ================================================ FILE: examples/hubert/config/decode/run/submitit_slurm_8gpu.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: ${distributed_training.distributed_world_size} gpus_per_node: ${distributed_training.distributed_world_size} tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 1 mem_gb: 200 timeout_min: 4320 max_num_timeout: 50 name: ${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir}/submitit distributed_training: distributed_world_size: 8 distributed_no_spawn: true distributed_port: 29761 ================================================ FILE: examples/hubert/config/finetune/base_10h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: save_interval: 5 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 1 distributed_port: 29671 nprocs_per_node: 8 task: _name: hubert_pretraining data: ??? fine_tuning: true label_dir: ??? 
normalize: false # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 3200000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 5 train_subset: train valid_subset: valid criterion: _name: ctc zero_infinity: true optimization: max_update: 25000 lr: [2e-5] sentence_avg: true update_freq: [1] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage warmup_steps: 8000 hold_steps: 0 decay_steps: 72000 final_lr_scale: 0.05 model: _name: hubert_ctc w2v_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.w2v_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? 
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/hubert/config/finetune/ckpt/it1.yaml ================================================ # @package _global_ task: normalize: false model: w2v_path: /checkpoint/wnhsu/w2v/hubert_final/iter1/hubert.km.randcrop.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU400k.s1337.ngpu32/checkpoint_last.pt ================================================ FILE: examples/hubert/config/finetune/lm/ls_4gram.yaml ================================================ # @package _global_ criterion: wer_kenlm_model: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/4-gram.bin wer_lexicon: /checkpoint/abdo/old_checkpoint02/datasets/librispeech/10h/raw/lexicon_ltr.lst wer_lm_weight: 2.0 wer_word_score: -1.0 ================================================ FILE: examples/hubert/config/finetune/run/submitit_reg.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: 8 gpus_per_node: 8 tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 1 comment: null mem_gb: 384 timeout_min: 4320 max_num_timeout: 100 constraint: volta32gb name: ${hydra.job.config_name}/${hydra.job.override_dirname} submitit_folder: ${hydra.sweep.dir}/submitit/%j distributed_training: distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 ================================================ FILE: examples/hubert/config/pretrain/data/iter1.yaml ================================================ # @package _global_ task: label_dir: ??? labels: ["km"] model: label_rate: 100 ================================================ FILE: examples/hubert/config/pretrain/data/iter2.yaml ================================================ # @package _global_ task: label_dir: ??? 
labels: ["km"] model: label_rate: 50 ================================================ FILE: examples/hubert/config/pretrain/hubert_base_librispeech.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 seed: 1337 tensorboard_logdir: tblog checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true distributed_training: ddp_backend: no_c10d distributed_backend: 'nccl' distributed_world_size: 32 distributed_port: 29671 nprocs_per_node: 8 find_unused_parameters: true task: _name: hubert_pretraining data: ??? label_dir: ??? labels: ??? label_rate: ${model.label_rate} sample_rate: 16000 max_sample_size: 250000 min_sample_size: 32000 pad_audio: false random_crop: true normalize: false # must be consistent with extractor dataset: num_workers: 6 max_tokens: 1400000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 validate_interval_updates: 10000 criterion: _name: hubert pred_masked_weight: 1.0 pred_nomask_weight: 0.0 loss_weights: [10,] optimization: max_update: 400000 lr: [0.0005] clip_norm: 10.0 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: hubert label_rate: ??? skip_masked: false skip_nomask: false mask_prob: 0.80 extractor_mode: default conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' final_dim: 256 encoder_layerdrop: 0.05 dropout_input: 0.1 dropout_features: 0.1 dropout: 0.1 attention_dropout: 0.1 feature_grad_mult: 0.1 untie_final_proj: true activation_dropout: 0.0 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir run: dir: ??? sweep: dir: ??? 
subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/hubert/config/pretrain/hubert_large_librivox.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 seed: 1337 tensorboard_logdir: tblog checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true distributed_training: ddp_backend: no_c10d distributed_backend: 'nccl' distributed_world_size: 128 distributed_port: 29671 nprocs_per_node: 8 find_unused_parameters: true task: _name: hubert_pretraining data: ??? label_dir: ??? labels: ??? label_rate: ${model.label_rate} sample_rate: 16000 max_sample_size: 250000 min_sample_size: 32000 pad_audio: false random_crop: true normalize: true # must be consistent with extractor dataset: num_workers: 6 max_tokens: 900000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 validate_interval_updates: 10000 criterion: _name: hubert pred_masked_weight: 1.0 pred_nomask_weight: 0.0 loss_weights: [10,] optimization: max_update: 400000 lr: [0.0015] clip_norm: 1.0 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: hubert label_rate: ??? 
encoder_layers: 24 encoder_embed_dim: 1024 encoder_ffn_embed_dim: 4096 encoder_attention_heads: 16 final_dim: 768 skip_masked: false skip_nomask: false mask_prob: 0.80 extractor_mode: layer_norm conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' encoder_layerdrop: 0.0 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 layer_norm_first: true feature_grad_mult: 1.0 untie_final_proj: true activation_dropout: 0.0 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data run: dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt sweep: dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/hubert/config/pretrain/hubert_xlarge_librivox.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 seed: 1337 tensorboard_logdir: tblog checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true distributed_training: ddp_backend: no_c10d distributed_backend: 'nccl' distributed_world_size: 256 distributed_port: 29671 nprocs_per_node: 8 find_unused_parameters: true task: _name: hubert_pretraining data: ??? label_dir: ??? labels: ??? label_rate: ${model.label_rate} sample_rate: 16000 max_sample_size: 250000 min_sample_size: 32000 pad_audio: false random_crop: true normalize: true # must be consistent with extractor dataset: num_workers: 6 max_tokens: 360000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 validate_interval_updates: 10000 criterion: _name: hubert pred_masked_weight: 1.0 pred_nomask_weight: 0.0 loss_weights: [10,] optimization: max_update: 400000 lr: [0.003] clip_norm: 1.0 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: hubert label_rate: ??? 
encoder_layers: 48 encoder_embed_dim: 1280 encoder_ffn_embed_dim: 5120 encoder_attention_heads: 16 final_dim: 1024 skip_masked: false skip_nomask: false mask_prob: 0.80 extractor_mode: layer_norm conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' encoder_layerdrop: 0.0 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 layer_norm_first: true feature_grad_mult: 1.0 untie_final_proj: true activation_dropout: 0.0 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data run: dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt sweep: dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/hubert/config/pretrain/run/submitit_reg.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: 8 gpus_per_node: 8 tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 4 comment: null mem_gb: 384 timeout_min: 4320 max_num_timeout: 100 constraint: volta32gb name: ${hydra.job.config_name}/${hydra.job.override_dirname} submitit_folder: ${hydra.sweep.dir}/submitit/%j distributed_training: distributed_world_size: 32 distributed_port: 29671 nprocs_per_node: 8 ================================================ FILE: examples/hubert/measure_teacher_quality.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import numpy as np import os.path as op import re from tabulate import tabulate from collections import Counter def comp_purity(p_xy, axis): max_p = p_xy.max(axis=axis) marg_p = p_xy.sum(axis=axis) indv_pur = max_p / marg_p aggr_pur = max_p.sum() return indv_pur, aggr_pur def comp_entropy(p): return (-p * np.log(p + 1e-8)).sum() def comp_norm_mutual_info(p_xy): p_x = p_xy.sum(axis=1, keepdims=True) p_y = p_xy.sum(axis=0, keepdims=True) pmi = np.log(p_xy / np.matmul(p_x, p_y) + 1e-8) mi = (p_xy * pmi).sum() h_x = comp_entropy(p_x) h_y = comp_entropy(p_y) return mi, mi / h_x, mi / h_y, h_x, h_y def pad(labs, n): if n == 0: return np.array(labs) return np.concatenate([[labs[0]] * n, labs, [labs[-1]] * n]) def comp_avg_seg_dur(labs_list): n_frms = 0 n_segs = 0 for labs in labs_list: labs = np.array(labs) edges = np.zeros(len(labs)).astype(bool) edges[0] = True edges[1:] = labs[1:] != labs[:-1] n_frms += len(edges) n_segs += edges.astype(int).sum() return n_frms / n_segs def comp_joint_prob(uid2refs, uid2hyps): """ Args: pad: padding for spliced-feature derived labels """ cnts = Counter() skipped = [] abs_frmdiff = 0 for uid in uid2refs: if uid not in uid2hyps: skipped.append(uid) continue refs = uid2refs[uid] hyps = uid2hyps[uid] abs_frmdiff += abs(len(refs) - len(hyps)) min_len = min(len(refs), len(hyps)) refs = refs[:min_len] hyps = hyps[:min_len] cnts.update(zip(refs, hyps)) tot = sum(cnts.values()) ref_set = sorted({ref for ref, _ in cnts.keys()}) hyp_set = sorted({hyp for _, hyp in cnts.keys()}) ref2pid = dict(zip(ref_set, range(len(ref_set)))) hyp2lid = dict(zip(hyp_set, range(len(hyp_set)))) # print(hyp_set) p_xy = np.zeros((len(ref2pid), len(hyp2lid)), dtype=float) for (ref, hyp), cnt in cnts.items(): p_xy[ref2pid[ref], hyp2lid[hyp]] = cnt p_xy /= p_xy.sum() return p_xy, ref2pid, hyp2lid, tot, abs_frmdiff, skipped def read_phn(tsv_path, rm_stress=True): uid2phns = {} with open(tsv_path) as f: for line in f: uid, phns = line.rstrip().split("\t") phns = 
phns.split(",") if rm_stress: phns = [re.sub("[0-9]", "", phn) for phn in phns] uid2phns[uid] = phns return uid2phns def read_lab(tsv_path, lab_path, pad_len=0, upsample=1): """ tsv is needed to retrieve the uids for the labels """ with open(tsv_path) as f: f.readline() uids = [op.splitext(op.basename(line.rstrip().split()[0]))[0] for line in f] with open(lab_path) as f: labs_list = [pad(line.rstrip().split(), pad_len).repeat(upsample) for line in f] assert len(uids) == len(labs_list) return dict(zip(uids, labs_list)) def main_lab_lab( tsv_dir, lab_dir, lab_name, lab_sets, ref_dir, ref_name, pad_len=0, upsample=1, verbose=False, ): # assume tsv_dir is the same for both the reference and the hypotheses tsv_dir = lab_dir if tsv_dir is None else tsv_dir uid2refs = {} for s in lab_sets: uid2refs.update(read_lab(f"{tsv_dir}/{s}.tsv", f"{ref_dir}/{s}.{ref_name}")) uid2hyps = {} for s in lab_sets: uid2hyps.update( read_lab( f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample ) ) _main(uid2refs, uid2hyps, verbose) def main_phn_lab( tsv_dir, lab_dir, lab_name, lab_sets, phn_dir, phn_sets, pad_len=0, upsample=1, verbose=False, ): uid2refs = {} for s in phn_sets: uid2refs.update(read_phn(f"{phn_dir}/{s}.tsv")) uid2hyps = {} tsv_dir = lab_dir if tsv_dir is None else tsv_dir for s in lab_sets: uid2hyps.update( read_lab( f"{tsv_dir}/{s}.tsv", f"{lab_dir}/{s}.{lab_name}", pad_len, upsample ) ) _main(uid2refs, uid2hyps, verbose) def _main(uid2refs, uid2hyps, verbose): (p_xy, ref2pid, hyp2lid, tot, frmdiff, skipped) = comp_joint_prob( uid2refs, uid2hyps ) ref_pur_by_hyp, ref_pur = comp_purity(p_xy, axis=0) hyp_pur_by_ref, hyp_pur = comp_purity(p_xy, axis=1) (mi, mi_norm_by_ref, mi_norm_by_hyp, h_ref, h_hyp) = comp_norm_mutual_info(p_xy) outputs = { "ref pur": ref_pur, "hyp pur": hyp_pur, "H(ref)": h_ref, "H(hyp)": h_hyp, "MI": mi, "MI/H(ref)": mi_norm_by_ref, "ref segL": comp_avg_seg_dur(uid2refs.values()), "hyp segL": comp_avg_seg_dur(uid2hyps.values()), "p_xy 
shape": p_xy.shape, "frm tot": tot, "frm diff": frmdiff, "utt tot": len(uid2refs), "utt miss": len(skipped), } print(tabulate([outputs.values()], outputs.keys(), floatfmt=".4f")) if __name__ == "__main__": """ compute quality of labels with respect to phone or another labels if set """ import argparse parser = argparse.ArgumentParser() parser.add_argument("tsv_dir") parser.add_argument("lab_dir") parser.add_argument("lab_name") parser.add_argument("--lab_sets", default=["valid"], type=str, nargs="+") parser.add_argument( "--phn_dir", default="/checkpoint/wnhsu/data/librispeech/960h/fa/raw_phn/phone_frame_align_v1", ) parser.add_argument( "--phn_sets", default=["dev-clean", "dev-other"], type=str, nargs="+" ) parser.add_argument("--pad_len", default=0, type=int, help="padding for hypotheses") parser.add_argument( "--upsample", default=1, type=int, help="upsample factor for hypotheses" ) parser.add_argument("--ref_lab_dir", default="") parser.add_argument("--ref_lab_name", default="") parser.add_argument("--verbose", action="store_true") args = parser.parse_args() if args.ref_lab_dir and args.ref_lab_name: main_lab_lab( args.tsv_dir, args.lab_dir, args.lab_name, args.lab_sets, args.ref_lab_dir, args.ref_lab_name, args.pad_len, args.upsample, args.verbose, ) else: main_phn_lab( args.tsv_dir, args.lab_dir, args.lab_name, args.lab_sets, args.phn_dir, args.phn_sets, args.pad_len, args.upsample, args.verbose, ) ================================================ FILE: examples/hubert/simple_kmeans/README.md ================================================ # Sharded Feature Extraction and K-means Application This folder contains scripts for preparing HUBERT labels from tsv files, the steps are: 1. feature extraction 2. k-means clustering 3. k-means application ## Data preparation `*.tsv` files contains a list of audio, where each line is the root, and following lines are the subpath for each audio: ``` ... 
``` ## Feature extraction ### MFCC feature Suppose the tsv file is at `${tsv_dir}/${split}.tsv`. To extract 39-D mfcc+delta+ddelta features for the 1st iteration HUBERT training, run: ```sh python dump_mfcc_feature.py ${tsv_dir} ${split} ${nshard} ${rank} ${feat_dir} ``` This would shard the tsv file into `${nshard}` shards and extract features for the `${rank}`-th shard, where rank is an integer in `[0, nshard-1]`. Features would be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`. ### HUBERT feature To extract features from the `${layer}`-th transformer layer of a trained HUBERT model saved at `${ckpt_path}`, run: ```sh python dump_hubert_feature.py ${tsv_dir} ${split} ${ckpt_path} ${layer} ${nshard} ${rank} ${feat_dir} ``` Features would also be saved at `${feat_dir}/${split}_${rank}_${nshard}.{npy,len}`. - if out-of-memory, decrease the chunk size with `--max_chunk` ## K-means clustering To fit a k-means model with `${n_clusters}` clusters on 10% of the `${split}` data, run ```sh python learn_kmeans.py ${feat_dir} ${split} ${nshard} ${km_path} ${n_clusters} --percent 0.1 ``` This saves the k-means model to `${km_path}`.
- set `--percent -1` to use all data - more kmeans options can be found with `-h` flag ## K-means application To apply a trained k-means model `${km_path}` to obtain labels for `${split}`, run ```sh python dump_km_label.py ${feat_dir} ${split} ${km_path} ${nshard} ${rank} ${lab_dir} ``` This would extract labels for the `${rank}`-th shard out of `${nshard}` shards and dump them to `${lab_dir}/${split}_${rank}_${nshard}.km`. Finally, merge shards for `${split}` by running ```sh for rank in $(seq 0 $((nshard - 1))); do cat $lab_dir/${split}_${rank}_${nshard}.km done > $lab_dir/${split}.km ``` ## Create a dummy dict To create a dummy dictionary, run ```sh for x in $(seq 0 $((n_clusters - 1))); do echo "$x 1" done >> $lab_dir/dict.km.txt ``` ================================================ FILE: examples/hubert/simple_kmeans/dump_hubert_feature.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree.
import logging import os import sys import fairseq import soundfile as sf import torch import torch.nn.functional as F from feature_utils import get_path_iterator, dump_feature from fairseq.data.audio.audio_utils import get_features_or_waveform logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=sys.stdout, ) logger = logging.getLogger("dump_hubert_feature") class HubertFeatureReader(object): def __init__(self, ckpt_path, layer, max_chunk=1600000): ( model, cfg, task, ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path]) self.model = model[0].eval().cuda() self.task = task self.layer = layer self.max_chunk = max_chunk logger.info(f"TASK CONFIG:\n{self.task.cfg}") logger.info(f" max_chunk = {self.max_chunk}") def read_audio(self, path, ref_len=None): wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate) if wav.ndim == 2: wav = wav.mean(-1) assert wav.ndim == 1, wav.ndim if ref_len is not None and abs(ref_len - len(wav)) > 160: logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") return wav def get_feats(self, path, ref_len=None): x = self.read_audio(path, ref_len=ref_len) with torch.no_grad(): x = torch.from_numpy(x).float().cuda() if self.task.cfg.normalize: x = F.layer_norm(x, x.shape) x = x.view(1, -1) feat = [] for start in range(0, x.size(1), self.max_chunk): x_chunk = x[:, start : start + self.max_chunk] feat_chunk, _ = self.model.extract_features( source=x_chunk, padding_mask=None, mask=False, output_layer=self.layer, ) feat.append(feat_chunk) return torch.cat(feat, 1).squeeze(0) def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk): reader = HubertFeatureReader(ckpt_path, layer, max_chunk) generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank) dump_feature(reader, generator, num, split, nshard, rank, feat_dir) if __name__ == 
"__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("tsv_dir") parser.add_argument("split") parser.add_argument("ckpt_path") parser.add_argument("layer", type=int) parser.add_argument("nshard", type=int) parser.add_argument("rank", type=int) parser.add_argument("feat_dir") parser.add_argument("--max_chunk", type=int, default=1600000) args = parser.parse_args() logger.info(args) main(**vars(args)) ================================================ FILE: examples/hubert/simple_kmeans/dump_hubert_feature_s2t.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import csv import io import logging import os import os.path as op import sys from dump_hubert_feature import HubertFeatureReader from feature_utils import get_shard_range, dump_feature from fairseq.data.audio.audio_utils import get_features_or_waveform logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=sys.stdout, ) logger = logging.getLogger("dump_hubert_feature_s2t") class HubertFeatureReaderS2T(HubertFeatureReader): def read_audio(self, path, ref_len=None): wav = get_features_or_waveform( path, need_waveform=True, use_sample_rate=self.task.cfg.sample_rate ) if wav.ndim == 2: wav = wav.mean(-1) assert wav.ndim == 1, wav.ndim if ref_len is not None and abs(ref_len - len(wav)) > 160: logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") return wav def get_path_iterator(root, tsv, nshard, rank, audio_col_name): with open(tsv) as f: reader = csv.DictReader( f, delimiter="\t", quotechar=None, doublequote=False, lineterminator="\n", quoting=csv.QUOTE_NONE, ) subpaths = [op.join(root, e[audio_col_name]) for e in reader] start, end = get_shard_range(len(subpaths), nshard, rank) subpaths = 
subpaths[start:end] def iterate(): for subpath in subpaths: yield op.join(root, subpath), None return iterate, len(subpaths) def main( root, tsv_path, ckpt_path, layer, nshard, rank, feat_dir, split, max_chunk, audio_col_name, ): reader = HubertFeatureReaderS2T(ckpt_path, layer, max_chunk) generator, num = get_path_iterator(root, tsv_path, nshard, rank, audio_col_name) dump_feature(reader, generator, num, split, nshard, rank, feat_dir) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("root") parser.add_argument("tsv_path") parser.add_argument("ckpt_path") parser.add_argument("layer", type=int) parser.add_argument("nshard", type=int) parser.add_argument("rank", type=int) parser.add_argument("feat_dir") parser.add_argument("split") parser.add_argument("--audio_col_name", type=str, default="audio") parser.add_argument("--max_chunk", type=int, default=1600000) args = parser.parse_args() logger.info(args) main(**vars(args)) ================================================ FILE: examples/hubert/simple_kmeans/dump_km_label.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging import os import sys import numpy as np import joblib import torch import tqdm logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=sys.stdout, ) logger = logging.getLogger("dump_km_label") class ApplyKmeans(object): def __init__(self, km_path): self.km_model = joblib.load(km_path) self.C_np = self.km_model.cluster_centers_.transpose() self.Cnorm_np = (self.C_np ** 2).sum(0, keepdims=True) self.C = torch.from_numpy(self.C_np) self.Cnorm = torch.from_numpy(self.Cnorm_np) if torch.cuda.is_available(): self.C = self.C.cuda() self.Cnorm = self.Cnorm.cuda() def __call__(self, x): if isinstance(x, torch.Tensor): dist = ( x.pow(2).sum(1, keepdim=True) - 2 * torch.matmul(x, self.C) + self.Cnorm ) return dist.argmin(dim=1).cpu().numpy() else: dist = ( (x ** 2).sum(1, keepdims=True) - 2 * np.matmul(x, self.C_np) + self.Cnorm_np ) return np.argmin(dist, axis=1) def get_feat_iterator(feat_dir, split, nshard, rank): feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy" leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len" with open(leng_path, "r") as f: lengs = [int(line.rstrip()) for line in f] offsets = [0] + np.cumsum(lengs[:-1]).tolist() def iterate(): feat = np.load(feat_path, mmap_mode="r") assert feat.shape[0] == (offsets[-1] + lengs[-1]) for offset, leng in zip(offsets, lengs): yield feat[offset: offset + leng] return iterate, len(lengs) def dump_label(feat_dir, split, km_path, nshard, rank, lab_dir): apply_kmeans = ApplyKmeans(km_path) generator, num = get_feat_iterator(feat_dir, split, nshard, rank) iterator = generator() lab_path = f"{lab_dir}/{split}_{rank}_{nshard}.km" os.makedirs(lab_dir, exist_ok=True) with open(lab_path, "w") as f: for feat in tqdm.tqdm(iterator, total=num): # feat = torch.from_numpy(feat).cuda() lab = apply_kmeans(feat).tolist() f.write(" ".join(map(str, lab)) + "\n") logger.info("finished successfully") if 
__name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("feat_dir") parser.add_argument("split") parser.add_argument("km_path") parser.add_argument("nshard", type=int) parser.add_argument("rank", type=int) parser.add_argument("lab_dir") args = parser.parse_args() logging.info(str(args)) dump_label(**vars(args)) ================================================ FILE: examples/hubert/simple_kmeans/dump_mfcc_feature.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import os import sys import soundfile as sf import torch import torchaudio from feature_utils import get_path_iterator, dump_feature from fairseq.data.audio.audio_utils import get_features_or_waveform logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=sys.stdout, ) logger = logging.getLogger("dump_mfcc_feature") class MfccFeatureReader(object): def __init__(self, sample_rate): self.sample_rate = sample_rate def read_audio(self, path, ref_len=None): wav = get_features_or_waveform(path, need_waveform=True, use_sample_rate=self.sample_rate) if ref_len is not None and abs(ref_len - len(wav)) > 160: logging.warning(f"ref {ref_len} != read {len(wav)} ({path})") return wav def get_feats(self, path, ref_len=None): x = self.read_audio(path, ref_len=ref_len) with torch.no_grad(): x = torch.from_numpy(x).float() x = x.view(1, -1) mfccs = torchaudio.compliance.kaldi.mfcc( waveform=x, sample_frequency=self.sample_rate, use_energy=False, ) # (time, freq) mfccs = mfccs.transpose(0, 1) # (freq, time) deltas = torchaudio.functional.compute_deltas(mfccs) ddeltas = torchaudio.functional.compute_deltas(deltas) concat = torch.cat([mfccs, deltas, ddeltas], dim=0) concat = 
concat.transpose(0, 1).contiguous() # (freq, time) return concat def main(tsv_dir, split, nshard, rank, feat_dir, sample_rate): reader = MfccFeatureReader(sample_rate) generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank) dump_feature(reader, generator, num, split, nshard, rank, feat_dir) if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("tsv_dir") parser.add_argument("split") parser.add_argument("nshard", type=int) parser.add_argument("rank", type=int) parser.add_argument("feat_dir") parser.add_argument("--sample_rate", type=int, default=16000) args = parser.parse_args() logger.info(args) main(**vars(args)) ================================================ FILE: examples/hubert/simple_kmeans/dump_w2v2_feature.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)
logger = logging.getLogger("dump_w2v2_feature")


class Wav2Vec2FeatureReader(object):
    """Extract intermediate-layer features from a fairseq checkpoint on GPU."""

    def __init__(self, ckpt_path, layer, max_chunk=1600000):
        """Load the checkpoint and keep the first model of the ensemble.

        Args:
            ckpt_path: checkpoint passed to
                ``fairseq.checkpoint_utils.load_model_ensemble_and_task``.
            layer: layer to read features from (treated as 1-based, see below).
            max_chunk: maximum number of waveform positions fed to the model
                in one forward pass.
        """
        (
            model,
            cfg,
            task,
        ) = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path])
        self.model = model[0].eval().cuda()
        self.task = task
        self.layer = layer  # assume this is 1-based like HuBERT
        self.max_chunk = max_chunk
        logger.info(f"TASK CONFIG:\n{self.task.cfg}")
        logger.info(f" max_chunk = {self.max_chunk}")
        logger.info(f" model:\n{self.model}")

    def read_audio(self, path, ref_len=None):
        """Read ``path`` with soundfile and return a 1-D waveform.

        The file's sample rate must equal ``self.task.cfg.sample_rate``;
        two-dimensional audio is averaged over its last axis. When ``ref_len``
        is given and differs from the loaded length by more than 160, a
        warning is logged.
        """
        wav, sr = sf.read(path)
        assert sr == self.task.cfg.sample_rate, sr
        if wav.ndim == 2:
            wav = wav.mean(-1)
        assert wav.ndim == 1, wav.ndim
        if ref_len is not None and abs(ref_len - len(wav)) > 160:
            # Use the module logger so the record carries this script's name
            # like every other message emitted from this file.
            logger.warning(f"ref {ref_len} != read {len(wav)} ({path})")
        return wav

    def get_feats(self, path, ref_len=None):
        """Return the layer features of one utterance as a single tensor.

        The waveform is optionally layer-normalised (when the task config
        sets ``normalize``), processed in chunks of ``max_chunk`` positions,
        and the per-chunk outputs are concatenated along dimension 1 before
        the leading batch dimension is squeezed away.
        """
        x = self.read_audio(path, ref_len)
        with torch.no_grad():
            x = torch.from_numpy(x).float().cuda()
            if self.task.cfg.normalize:
                x = F.layer_norm(x, x.shape)
            x = x.view(1, -1)

            feat = []
            for start in range(0, x.size(1), self.max_chunk):
                x_chunk = x[:, start: start + self.max_chunk]
                res = self.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    layer=self.layer - 1,  # 1-based layer -> 0-based index
                )
                feat_chunk = res["x"]
                feat.append(feat_chunk)
            return torch.cat(feat, 1).squeeze(0)


def main(tsv_dir, split, ckpt_path, layer, nshard, rank, feat_dir, max_chunk):
    """Dump features for shard ``rank`` of ``nshard`` of ``{tsv_dir}/{split}.tsv``."""
    reader = Wav2Vec2FeatureReader(ckpt_path, layer, max_chunk)
    generator, num = get_path_iterator(f"{tsv_dir}/{split}.tsv", nshard, rank)
    dump_feature(reader, generator, num, split, nshard, rank, feat_dir)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("tsv_dir")
    parser.add_argument("split")
    parser.add_argument("ckpt_path")
    parser.add_argument("layer", type=int)
    parser.add_argument("nshard", type=int)
    parser.add_argument("rank", type=int)
    parser.add_argument("feat_dir")
    parser.add_argument("--max_chunk", type=int, default=1600000)
    args = parser.parse_args()
    logger.info(args)

    main(**vars(args))
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=os.environ.get("LOGLEVEL", "INFO").upper(),
    stream=sys.stdout,
)
logger = logging.getLogger("feature_utils")


def get_shard_range(tot, nshard, rank):
    """Return the half-open ``(start, end)`` item range owned by one shard.

    Args:
        tot: total number of items.
        nshard: number of shards the items are split into.
        rank: zero-based index of the shard, ``0 <= rank < nshard``.

    Raises:
        AssertionError: if ``rank`` is out of range or the shard is empty.
    """
    assert rank < nshard and rank >= 0, f"invalid rank/nshard {rank}/{nshard}"
    # Both bounds use the same rounding expression, so the end of shard r is
    # exactly the start of shard r + 1 and the shards tile [0, tot).
    start = round(tot / nshard * rank)
    end = round(tot / nshard * (rank + 1))
    assert start < end, f"start={start}, end={end}"
    logger.info(
        f"rank {rank} of {nshard}, process {end-start} "
        f"({start}-{end}) out of {tot}"
    )
    return start, end


def get_path_iterator(tsv, nshard, rank):
    """Build an iterator factory over one shard of a manifest file.

    The first line of ``tsv`` is the root directory; every following line is
    ``<relative path>\\t<nsample>``. Blank lines are ignored.

    Returns:
        ``(iterate, n)`` where ``iterate()`` yields ``(full_path, nsample)``
        tuples for the shard and ``n`` is the number of entries in the shard.
    """
    with open(tsv, "r") as f:
        root = f.readline().rstrip()
        # Skip blank lines (e.g. a trailing newline at the end of the file);
        # they would otherwise break the two-field unpacking below.
        lines = [line.rstrip() for line in f if line.strip()]
        start, end = get_shard_range(len(lines), nshard, rank)
        lines = lines[start:end]

        def iterate():
            for line in lines:
                subpath, nsample = line.split("\t")
                yield f"{root}/{subpath}", int(nsample)

    return iterate, len(lines)


def dump_feature(reader, generator, num, split, nshard, rank, feat_dir):
    """Write the features of one shard to ``{feat_dir}/{split}_{rank}_{nshard}``.

    Features of all utterances are appended to a single ``.npy`` file; the
    matching ``.len`` file gets one line per utterance holding ``len(feat)``.

    Args:
        reader: object exposing ``get_feats(path, nsample)`` that returns an
            object with ``.cpu().numpy()``.
        generator: zero-argument callable returning ``(path, nsample)`` pairs.
        num: number of pairs, used only for the progress bar total.
        split, nshard, rank: used to build the output file names.
        feat_dir: output directory, created if missing.
    """
    iterator = generator()

    feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"

    os.makedirs(feat_dir, exist_ok=True)
    # Start from a clean file -- presumably because NpyAppendArray would
    # otherwise append to the output of a previous run.
    if os.path.exists(feat_path):
        os.remove(feat_path)

    feat_f = NpyAppendArray(feat_path)
    try:
        with open(leng_path, "w") as leng_f:
            for path, nsample in tqdm.tqdm(iterator, total=num):
                feat = reader.get_feats(path, nsample)
                feat_f.append(feat.cpu().numpy())
                leng_f.write(f"{len(feat)}\n")
    finally:
        # Release the array writer deterministically instead of relying on
        # garbage collection; older releases may not expose close().
        close = getattr(feat_f, "close", None)
        if callable(close):
            close()
    logger.info("finished successfully")


def get_km_model(
    n_clusters,
    init,
    max_iter,
    batch_size,
    tol,
    max_no_improvement,
    n_init,
    reassignment_ratio,
):
    """Return an unfitted ``MiniBatchKMeans`` built from the given settings.

    ``verbose=1``, ``compute_labels=False`` and ``init_size=None`` are fixed;
    every other argument is forwarded unchanged.
    """
    return MiniBatchKMeans(
        n_clusters=n_clusters,
        init=init,
        max_iter=max_iter,
        batch_size=batch_size,
        verbose=1,
        compute_labels=False,
        tol=tol,
        max_no_improvement=max_no_improvement,
        init_size=None,
        n_init=n_init,
        reassignment_ratio=reassignment_ratio,
    )
# Module logger used by every function below.
logger = logging.getLogger("learn_kmeans")


def load_feature_shard(feat_dir, split, nshard, rank, percent):
    """Load (a random subset of) one feature shard.

    Reads ``{feat_dir}/{split}_{rank}_{nshard}.npy`` together with the
    matching ``.len`` file, which lists the number of frames per utterance.

    Args:
        percent: negative to return the whole shard as a read-only memory
            map; otherwise the fraction of utterances to sample without
            replacement via ``np.random.choice``.

    Returns:
        The memory-mapped shard, or an array holding the frames of the
        sampled utterances concatenated along axis 0.
    """
    feat_path = f"{feat_dir}/{split}_{rank}_{nshard}.npy"
    leng_path = f"{feat_dir}/{split}_{rank}_{nshard}.len"
    with open(leng_path, "r") as f:
        lengs = [int(line.rstrip()) for line in f]
        # offsets[i] is the first frame of utterance i inside the shard.
        offsets = [0] + np.cumsum(lengs[:-1]).tolist()

    if percent < 0:
        return np.load(feat_path, mmap_mode="r")

    nsample = int(np.ceil(len(lengs) * percent))
    feat = np.load(feat_path, mmap_mode="r")
    if nsample == 0:
        # np.concatenate rejects an empty list; return an empty slice that
        # keeps the feature dimension so shards can still be concatenated.
        logger.info(
            f"sampled 0 utterances, 0 frames from shard {rank}/{nshard}"
        )
        return np.asarray(feat[:0])

    indices = np.random.choice(len(lengs), nsample, replace=False)
    sampled_feat = np.concatenate(
        [feat[offsets[i]: offsets[i] + lengs[i]] for i in indices], axis=0
    )
    logger.info(
        (
            f"sampled {nsample} utterances, {len(sampled_feat)} frames "
            f"from shard {rank}/{nshard}"
        )
    )
    return sampled_feat


def load_feature(feat_dir, split, nshard, seed, percent):
    """Concatenate the features of all ``nshard`` shards along axis 0.

    ``seed`` is accepted for interface compatibility but not used here; the
    caller is expected to seed ``np.random`` before calling.

    Raises:
        AssertionError: if ``percent`` is greater than 1.0.
    """
    assert percent <= 1.0
    feat = np.concatenate(
        [
            load_feature_shard(feat_dir, split, nshard, r, percent)
            for r in range(nshard)
        ],
        axis=0,
    )
    logger.info(f"loaded feature with dimension {feat.shape}")
    return feat


def learn_kmeans(
    feat_dir,
    split,
    nshard,
    km_path,
    n_clusters,
    seed,
    percent,
    init,
    max_iter,
    batch_size,
    tol,
    n_init,
    reassignment_ratio,
    max_no_improvement,
):
    """Fit a mini-batch k-means model on dumped features and save it.

    Seeds ``np.random`` with ``seed``, loads the features, fits the model
    returned by ``get_km_model``, stores it at ``km_path`` with joblib and
    logs the negated model score divided by the number of frames.
    """
    np.random.seed(seed)
    feat = load_feature(feat_dir, split, nshard, seed, percent)
    km_model = get_km_model(
        n_clusters,
        init,
        max_iter,
        batch_size,
        tol,
        max_no_improvement,
        n_init,
        reassignment_ratio,
    )
    km_model.fit(feat)
    joblib.dump(km_model, km_path)

    inertia = -km_model.score(feat) / len(feat)
    logger.info("total inertia: %.5f", inertia)
    logger.info("finished successfully")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("feat_dir", type=str)
    parser.add_argument("split", type=str)
    parser.add_argument("nshard", type=int)
    parser.add_argument("km_path", type=str)
    parser.add_argument("n_clusters", type=int)
    parser.add_argument("--seed", default=0, type=int)
    parser.add_argument(
        "--percent", default=-1, type=float, help="sample a subset; -1 for all"
    )
    parser.add_argument("--init", default="k-means++")
    parser.add_argument("--max_iter", default=100, type=int)
    parser.add_argument("--batch_size", default=10000, type=int)
    parser.add_argument("--tol", default=0.0, type=float)
    parser.add_argument("--max_no_improvement", default=100, type=int)
    parser.add_argument("--n_init", default=20, type=int)
    parser.add_argument("--reassignment_ratio", default=0.0, type=float)
    args = parser.parse_args()
    logger.info(str(args))

    learn_kmeans(**vars(args))
20 20 461 461 20 184 20 20 20 184 289 144 445 445 213 213 213 213 252 215 129 401 20 354 180 494 44 416 416 416 192 192 180 180 84 84 84 16 88 88 88 88 319 242 240 348 35 35 117 404 197 226 209 83 55 55 55 322 67 94 199 118 118 118 118 118 118 402 219 219 219 222 222 222 353 59 245 245 251 251 241 241 431 367 367 178 35 35 35 458 192 351 41 324 324 324 252 464 464 139 139 424 424 424 497 497 497 122 90 42 42 147 380 380 499 319 319 319 348 348 33 33 394 90 76 465 74 425 425 386 386 431 319 319 319 319 319 240 203 53 473 34 340 340 340 340 116 64 212 384 377 123 123 123 216 216 216 114 114 57 57 57 203 381 381 117 48 13 47 80 20 80 80 320 7 7 364 345 141 141 141 141 281 281 9 86 221 198 198 22 283 455 236 239 239 107 107 395 286 286 286 468 468 406 406 467 176 176 176 328 200 200 248 464 145 365 365 365 365 330 385 457 77 77 77 54 224 300 334 334 382 304 304 271 186 31 342 342 342 198 22 283 5 38 162 232 232 482 68 26 26 359 359 81 444 213 213 252 143 458 41 324 324 324 422 143 445 445 445 351 180 486 315 315 450 450 450 203 53 473 291 89 116 379 243 478 478 66 482 482 105 105 336 336 354 29 498 498 498 498 396 396 313 37 314 198 22 222 222 222 222 245 129 74 74 437 437 496 496 496 413 94 199 41 41 324 324 318 318 269 342 9 168 106 106 284 426 426 426 426 348 64 76 401 259 108 123 153 153 153 153 372 372 396 313 24 314 90 401 259 445 445 351 351 365 365 365 365 282 282 215 233 233 229 427 20 247 126 126 126 326 326 326 326 326 326 326 101 101 101 149 228 228 20 289 20 7 217 70 65 189 189 151 240 285 300 300 495 406 467 176 135 135 339 248 466 114 222 222 222 313 313 239 384 371 490 490 38 31 54 54 224 494 494 236 129 259 74 190 487 288 288 288 288 374 173 173 280 280 302 302 175 175 69 69 223 130 129 401 75 108 119 295 295 295 295 143 192 192 135 135 135 135 200 200 464 255 255 255 251 251 241 431 235 235 235 348 348 465 192 44 44 236 8 8 354 319 319 383 348 36 310 107 107 395 462 462 8 32 32 32 354 153 153 153 153 153 387 387 387 387 85 207 318 318 318 49 453 9 168 
125 125 125 125 125 466 199 44 44 143 129 144 445 351 351 351 486 486 460 285 285 302 302 497 497 122 239 161 161 79 79 499 499 499 265 265 265 85 85 85 299 299 173 352 352 427 229 170 247 15 15 15 15 15 15 193 193 193 17 ================================================ FILE: examples/hubert/tests/sample.base.L9.len ================================================ 596 ================================================ FILE: examples/hubert/tests/sample.large.L20.len ================================================ 596 ================================================ FILE: examples/hubert/tests/sample.large.hypo.word ================================================ KEEP A GOING AN IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0) ================================================ FILE: examples/hubert/tests/sample.xlarge.L30.len ================================================ 596 ================================================ FILE: examples/hubert/tests/sample.xlarge.hypo.word ================================================ KEEP A GOIN AND IF YOU'RE LUCKY YOU'LL RUN PLUMB INTO THEM WAS THE JEERING ANSWER AS THE SLEEPY COWMEN SPURRED THEIR PONIES ON TOWARD CAMP MUTTERING THEIR DISAPPROVAL OF TAKING ALONG A BUNCH OF BOYS ON A CATTLE DRIVE (None-0) ================================================ FILE: examples/hubert/tests/test_feature_and_unit.sh ================================================ #!/bin/bash set -e sizes="base large xlarge" declare -A ckpt_urls ckpt_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt" ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k.pt" ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k.pt" declare -A km_layers km_layers[base]=9 km_layers[large]=20 km_layers[xlarge]=30 declare -A km_urls 
km_urls[base]="https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960_L9_km500.bin" declare -A km_nunits km_nunits[base]=500 test_dir=./examples/hubert/tests split=sample echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv" check_feature () { echo "checking features..." size=$1 ckpt_url=$2 km_layer=$3 ckpt_path="$test_dir/$(basename "$ckpt_url")" if [ ! -f "$ckpt_path" ]; then echo "downloading $ckpt_url to $ckpt_path" wget "$ckpt_url" -O "$ckpt_path" fi python ./examples/hubert/simple_kmeans/dump_hubert_feature.py \ "${test_dir}" "${split}" "${ckpt_path}" "${km_layer}" 1 0 "${test_dir}" if diff -q "${test_dir}/${split}.${size}.L${km_layer}.npy" "${test_dir}/${split}_0_1.npy" &>/dev/null; then echo "...passed npy check" else echo "...failed npy check" fi if diff -q "${test_dir}/${split}.${size}.L${km_layer}.len" "${test_dir}/${split}_0_1.len" &>/dev/null; then echo "...passed len check" else echo "...failed len check" fi } check_unit () { echo "checking units..." size=$1 km_url=$2 km_layer=$3 km_nunit=$4 km_path="$test_dir/$(basename "$km_url")" if [ ! 
-f "$km_path" ]; then echo "downloading $km_url to $km_path" wget "$km_url" -O "$km_path" fi python ./examples/hubert/simple_kmeans/dump_km_label.py \ "${test_dir}" "${split}" "${km_path}" 1 0 "${test_dir}" if diff -q "${test_dir}/${split}.${size}.L${km_layer}.km${km_nunit}.km" "${test_dir}/${split}_0_1.km" &>/dev/null; then echo "...passed unit check" else echo "...failed unit check" fi } for size in $sizes; do echo "=== Running unit test for HuBERT $size ===" check_feature "$size" "${ckpt_urls[$size]}" "${km_layers[$size]}" if [ -n "${km_urls[$size]}" ]; then check_unit "$size" "${km_urls[$size]}" "${km_layers[$size]}" "${km_nunits[$size]}" fi rm -f $test_dir/${split}_0_1.* done ================================================ FILE: examples/hubert/tests/test_finetuned_asr.sh ================================================ #!/bin/bash set -e sizes="large xlarge" declare -A ckpt_urls ckpt_urls[large]="https://dl.fbaipublicfiles.com/hubert/hubert_large_ll60k_finetune_ls960.pt" ckpt_urls[xlarge]="https://dl.fbaipublicfiles.com/hubert/hubert_xtralarge_ll60k_finetune_ls960.pt" test_dir=$(pwd)/examples/hubert/tests split=sample echo -e "${test_dir}\n6313-76958-0021.flac\t190800" > "${test_dir}/${split}.tsv" echo -e "K E E P | A | G O I N G | A N D | I F | Y O U ' R E | L U C K Y | Y O U ' L L | R U N | P L U M B | I N T O | T H E M | W A S | T H E | J E E R I N G | A N S W E R | A S | T H E | S L E E P Y | C O W M E N | S P U R R E D | T H E I R | P O N I E S | O N | T O W A R D | C A M P | M U T T E R I N G | T H E I R | D I S A P P R O V A L | O F | T A K I N G | A L O N G | A | B U N C H | O F | B O Y S | O N | A | C A T T L E | D R I V E |" > "${test_dir}/${split}.ltr" check_asr () { echo "checking asr outputs..." size=$1 ckpt_url=$2 ckpt_path="$test_dir/$(basename "$ckpt_url")" if [ ! 
-f "$ckpt_path" ]; then echo "downloading $ckpt_url to $ckpt_path" wget "$ckpt_url" -O "$ckpt_path" fi python examples/speech_recognition/new/infer.py \ --config-dir examples/hubert/config/decode --config-name infer_viterbi \ common_eval.path="${ckpt_path}" task.data="${test_dir}" task.normalize=true \ decoding.results_path="${test_dir}/pred" \ common_eval.results_path="${test_dir}/pred" \ common_eval.quiet=false dataset.gen_subset="${split}" if diff -q "${test_dir}/pred/hypo.word" "${test_dir}/${split}.${size}.hypo.word" &>/dev/null; then echo "...passed word check" else echo "...failed word check" fi rm -rf "${test_dir}/pred" } for size in $sizes; do check_asr "$size" "${ckpt_urls[$size]}" done ================================================ FILE: examples/hubert/update_ckpt.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
src_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2.pt"
ref_ckpt = "/checkpoint/wnhsu/w2v/hubert_icassp_oss_v3/iter2_km100-400k-grp-L6/oss.km500_p0_1_s334.pmw1_0.puw0_0.grpnorm.ml10.mp0_8.untie.mxsz250000.ufreq1.maxtok1400000.MU100k.s1337.ngpu32/checkpoint_last.pt"
new_ckpt = "/checkpoint/wnhsu/w2v/archived/hubert_base_ls960_it2_updated.pt"


def update_state(state):
    """Rewrite a loaded checkpoint dict in place and return it.

    Changes applied:
      * the model entry ``label_embs`` is renamed to ``label_embs_concat``;
      * ``args.task`` is set to ``"hubert_pretraining"``;
      * ``args.labels`` is wrapped into the string form of a one-element
        list, e.g. ``km`` becomes ``['km']``.

    Raises:
        KeyError: if ``state["model"]`` has no ``label_embs`` entry.
    """
    state["model"]["label_embs_concat"] = state["model"].pop("label_embs")
    state["args"].task = "hubert_pretraining"
    state["args"].labels = f"['{state['args'].labels}']"
    return state


# Only convert when executed as a script, so importing this module (to reuse
# update_state) does not try to read the hard-coded checkpoint path.
if __name__ == "__main__":
    import torch

    src_state = torch.load(src_ckpt)
    src_state = update_state(src_state)
    torch.save(src_state, new_ckpt)
```bash fairseq-preprocess \ --source-lang en --target-lang de \ --trainpref bpe.32k/train \ --validpref bpe.32k/valid \ --testpref bpe.32k/test \ --align-suffix align \ --destdir binarized/ \ --joined-dictionary \ --workers 32 ``` ##### 4. Train a model ```bash fairseq-train \ binarized \ --arch transformer_wmt_en_de_big_align --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --activation-fn relu\ --lr 0.0002 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 3500 --label-smoothing 0.1 \ --save-dir ./checkpoints --log-interval 1000 --max-update 60000 \ --keep-interval-updates -1 --save-interval-updates 0 \ --load-alignments --criterion label_smoothed_cross_entropy_with_alignment \ --fp16 ``` Note that the `--fp16` flag requires you have CUDA 9.1 or greater and a Volta GPU or newer. If you want to train the above model with big batches (assuming your machine has 8 GPUs): - add `--update-freq 8` to simulate training on 8x8=64 GPUs - increase the learning rate; 0.0007 works well for big batches ##### 5. Evaluate and generate the alignments (BPE level) ```bash fairseq-generate \ binarized --gen-subset test --print-alignment \ --source-lang en --target-lang de \ --path checkpoints/checkpoint_best.pt --beam 5 --nbest 1 ``` ##### 6. Other resources. The code for: 1. preparing alignment test sets 2. converting BPE level alignments to token level alignments 3. symmetrizing bidirectional alignments 4. 
evaluating alignments using AER metric can be found [here](https://github.com/lilt/alignment-scripts) ## Citation ```bibtex @inproceedings{garg2019jointly, title = {Jointly Learning to Align and Translate with Transformer Models}, author = {Garg, Sarthak and Peitz, Stephan and Nallasamy, Udhyakumar and Paulik, Matthias}, booktitle = {Conference on Empirical Methods in Natural Language Processing (EMNLP)}, address = {Hong Kong}, month = {November}, url = {https://arxiv.org/abs/1909.02074}, year = {2019}, } ``` ================================================ FILE: examples/joint_alignment_translation/prepare-wmt18en2de_no_norm_no_escape_no_agressive.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. echo 'Cloning Moses github repository (for tokenization scripts)...' git clone https://github.com/moses-smt/mosesdecoder.git SCRIPTS=mosesdecoder/scripts TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl CLEAN=$SCRIPTS/training/clean-corpus-n.perl REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl URLS=( "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz" "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz" "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz" "http://data.statmt.org/wmt18/translation-task/rapid2016.tgz" "http://data.statmt.org/wmt17/translation-task/dev.tgz" "http://statmt.org/wmt14/test-full.tgz" ) CORPORA=( "training/europarl-v7.de-en" "commoncrawl.de-en" "training-parallel-nc-v13/news-commentary-v13.de-en" "rapid2016.de-en" ) if [ ! -d "$SCRIPTS" ]; then echo "Please set SCRIPTS variable correctly to point to Moses scripts." 
exit fi src=en tgt=de lang=en-de prep=wmt18_en_de tmp=$prep/tmp orig=orig dev=dev/newstest2012 codes=32000 bpe=bpe.32k mkdir -p $orig $tmp $prep $bpe cd $orig for ((i=0;i<${#URLS[@]};++i)); do url=${URLS[i]} file=$(basename $url) if [ -f $file ]; then echo "$file already exists, skipping download" else wget "$url" if [ -f $file ]; then echo "$url successfully downloaded." else echo "$url not successfully downloaded." exit 1 fi if [ ${file: -4} == ".tgz" ]; then tar zxvf $file elif [ ${file: -4} == ".tar" ]; then tar xvf $file fi fi done cd .. echo "pre-processing train data..." for l in $src $tgt; do rm -rf $tmp/train.tags.$lang.tok.$l for f in "${CORPORA[@]}"; do cat $orig/$f.$l | \ perl $REM_NON_PRINT_CHAR | \ perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/train.tags.$lang.tok.$l done done echo "pre-processing test data..." for l in $src $tgt; do if [ "$l" == "$src" ]; then t="src" else t="ref" fi grep '\s*//g' | \ sed -e 's/\s*<\/seg>\s*//g' | \ sed -e "s/\’/\'/g" | \ perl $TOKENIZER -threads 8 -l $l -no-escape > $tmp/test.$l echo "" done # apply length filtering before BPE perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train 1 100 # use newstest2012 for valid echo "pre-processing valid data..." 
for l in $src $tgt; do rm -rf $tmp/valid.$l cat $orig/$dev.$l | \ perl $REM_NON_PRINT_CHAR | \ perl $TOKENIZER -threads 8 -l $l -no-escape >> $tmp/valid.$l done mkdir output mv $tmp/{train,valid,test}.{$src,$tgt} output #BPE git clone https://github.com/glample/fastBPE.git pushd fastBPE g++ -std=c++11 -pthread -O3 fastBPE/main.cc -IfastBPE -o fast popd fastBPE/fast learnbpe $codes output/train.$src output/train.$tgt > $bpe/codes for split in {train,valid,test}; do for lang in {en,de}; do fastBPE/fast applybpe $bpe/$split.$lang output/$split.$lang $bpe/codes; done; done ================================================ FILE: examples/language_model/README.adaptive_inputs.md ================================================ # Adaptive Input Representations for Neural Language Modeling (Baevski and Auli, 2018) ## Pre-trained models Description | Parameters | Dataset | Model and Test set(s) ---|---:|---|--- Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 1026M | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2) Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853)) | 247M | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2) ## Training an LM with adaptive inputs First, see the general [language modeling README](README.md) for instructions on preprocessing the WikiText-103 data. Then use the following training command to train a model with adaptive inputs using the `transformer_lm_wiki103` model architecture: ```bash fairseq-train --task language_modeling \ data-bin/wikitext-103 \ --save-dir checkpoints/transformer_wikitext-103 \ --arch transformer_lm_wiki103 \ --max-update 286000 --lr 1.0 --t-mult 2 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 \ --warmup-updates 16000 --warmup-init-lr 1e-07 --stop-min-lr 1e-09 --optimizer nag --min-lr 0.0001 --clip-norm 0.1 \ --criterion adaptive_loss --max-tokens 3072 --update-freq 3 --tokens-per-sample 3072 --seed 1 \ --sample-break-mode none --skip-invalid-size-inputs-valid-test --ddp-backend=legacy_ddp ``` ## Citation ```bibtex @inproceedings{ baevski2018adaptive, title={Adaptive Input Representations for Neural Language Modeling}, author={Alexei Baevski and Michael Auli}, booktitle={International Conference on Learning Representations}, year={2019}, url={https://openreview.net/forum?id=ByxZX20qFQ}, } ``` ================================================ FILE: examples/language_model/README.conv.md ================================================ # Language Modeling with Gated Convolutional Networks (Dauphin et al., 2017) ## Example usage First download and preprocess the data following the main [language modeling README](README.md). 
Then to train a convolutional LM using the `fconv_lm_dauphin_wikitext103` architecture: ```bash fairseq-train --task language_modeling \ data-bin/wikitext-103 \ --save-dir checkpoints/fconv_wikitext-103 \ --arch fconv_lm_dauphin_wikitext103 \ --adaptive-softmax-cutoff 10000,20000,200000 \ --dropout 0.2 \ --criterion adaptive_loss \ --optimizer nag --clip-norm 0.1 --weight-decay 5e-06 \ --lr 1.0 --lr-scheduler reduce_lr_on_plateau --lr-shrink 0.5 \ --max-tokens 1024 --tokens-per-sample 1024 \ --ddp-backend legacy_ddp \ --max-epoch 35 ``` And evaluate with: ```bash fairseq-eval-lm data-bin/wikitext-103 --path checkpoints/fconv_wikitext-103/checkpoint_best.pt ``` ## Citation ```bibtex @inproceedings{dauphin2017language, title={Language Modeling with Gated Convolutional Networks}, author={Dauphin, Yann N and Fan, Angela and Auli, Michael and Grangier, David}, booktitle={Proceedings of the 34th International Conference on Machine Learning-Volume 70}, pages={933--941}, year={2017}, organization={JMLR} } ``` ================================================ FILE: examples/language_model/README.md ================================================ # Neural Language Modeling ## Pre-trained models Model | Description | Dataset | Download ---|---|---|--- `transformer_lm.gbw.adaptive_huge` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
1026M params | [Google Billion Words](https://github.com/ciprian-chelba/1-billion-word-language-modeling-benchmark) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_gbw_huge.tar.bz2) `transformer_lm.wiki103.adaptive` | Adaptive Inputs
([Baevski and Auli, 2018](https://arxiv.org/abs/1809.10853))
247M params | [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/lm/adaptive_lm_wiki103.v2.tar.bz2) `transformer_lm.wmt19.en` | English LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) `transformer_lm.wmt19.de` | German LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) `transformer_lm.wmt19.ru` | Russian LM
([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) | [WMT News Crawl](http://data.statmt.org/news-crawl/) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) ## Example usage We require a few additional Python dependencies for preprocessing: ```bash pip install fastBPE sacremoses ``` To sample from a language model using PyTorch Hub: ```python import torch # List available models torch.hub.list('pytorch/fairseq') # [..., 'transformer_lm.wmt19.en', ...] # Load an English LM trained on WMT'19 News Crawl data en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe') en_lm.eval() # disable dropout # Move model to GPU en_lm.cuda() # Sample from the language model en_lm.sample('Barack Obama', beam=1, sampling=True, sampling_topk=10, temperature=0.8) # "Barack Obama is coming to Sydney and New Zealand (...)" # Compute perplexity for a sequence en_lm.score('Barack Obama is coming to Sydney and New Zealand')['positional_scores'].mean().neg().exp() # tensor(15.1474) # The same interface can be used with custom models as well from fairseq.models.transformer_lm import TransformerLanguageModel custom_lm = TransformerLanguageModel.from_pretrained('/path/to/model/dir', 'checkpoint100.pt', tokenizer='moses', bpe='fastbpe') custom_lm.sample('Barack Obama', beam=5) # "Barack Obama (...)" ``` ## Training a transformer language model with the CLI tools ### 1) Preprocess the data First download and prepare the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/): ```bash cd examples/language_model/ bash prepare-wikitext-103.sh cd ../.. 
``` Next preprocess/binarize the data: ```bash TEXT=examples/language_model/wikitext-103 fairseq-preprocess \ --only-source \ --trainpref $TEXT/wiki.train.tokens \ --validpref $TEXT/wiki.valid.tokens \ --testpref $TEXT/wiki.test.tokens \ --destdir data-bin/wikitext-103 \ --workers 20 ``` ### 2) Train a language model Next we'll train a basic transformer language model on wikitext-103. For more advanced usage, see the [adaptive inputs README](README.adaptive_inputs.md). To train a basic LM (assumes 2 GPUs): ``` $ fairseq-train --task language_modeling \ data-bin/wikitext-103 \ --save-dir checkpoints/transformer_wikitext-103 \ --arch transformer_lm --share-decoder-input-output-embed \ --dropout 0.1 \ --optimizer adam --adam-betas '(0.9, 0.98)' --weight-decay 0.01 --clip-norm 0.0 \ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ --tokens-per-sample 512 --sample-break-mode none \ --max-tokens 2048 --update-freq 16 \ --fp16 \ --max-update 50000 ``` If you run out of memory, try reducing `--max-tokens` (max number of tokens per batch) or `--tokens-per-sample` (max sequence length). You can also adjust `--update-freq` to accumulate gradients and simulate training on a different number of GPUs. ### 3) Evaluate ```bash fairseq-eval-lm data-bin/wikitext-103 \ --path checkpoints/transformer_wikitext-103/checkpoint_best.pt \ --batch-size 2 \ --tokens-per-sample 512 \ --context-window 400 # | Evaluated 245569 tokens in 56.1s (4379.02 tokens/s) # | Loss: 3.4164, Perplexity: 30.46 ``` *Note:* The `--context-window` option controls how much context is provided to each token when computing perplexity. When the window size is 0, the dataset is chunked into segments of length 512 and perplexity is computed over each segment normally. However, this results in worse (higher) perplexity since tokens that appear earlier in each segment have less conditioning.
When the maximum window size is used (511 in this case), then we compute perplexity for each token fully conditioned on 511 tokens of context. This slows down evaluation significantly, since we must run a separate forward pass for every token in the dataset, but results in better (lower) perplexity. ## Convolutional language models Please see the [convolutional LM README](README.conv.md) for instructions on training convolutional language models. ================================================ FILE: examples/language_model/prepare-wikitext-103.sh ================================================ #!/bin/bash # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh URLS=( "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip" ) FILES=( "wikitext-103-v1.zip" ) for ((i=0;i<${#URLS[@]};++i)); do file=${FILES[i]} if [ -f $file ]; then echo "$file already exists, skipping download" else url=${URLS[i]} wget "$url" if [ -f $file ]; then echo "$url successfully downloaded." else echo "$url not successfully downloaded." exit -1 fi if [ ${file: -4} == ".tgz" ]; then tar zxvf $file elif [ ${file: -4} == ".tar" ]; then tar xvf $file elif [ ${file: -4} == ".zip" ]; then unzip $file fi fi done cd .. ================================================ FILE: examples/laser/README.md ================================================ # LASER Language-Agnostic SEntence Representations LASER is a library to calculate and use multilingual sentence embeddings. You can find more information about LASER and how to use it on the official [LASER repository](https://github.com/facebookresearch/LASER). This folder contains source code for training LASER embeddings. ## Prepare data and configuration file Binarize your data with fairseq, as described [here](https://fairseq.readthedocs.io/en/latest/getting_started.html#data-pre-processing). 
Create a json config file with this format: ``` { "src_vocab": "/path/to/spm.src.cvocab", "tgt_vocab": "/path/to/spm.tgt.cvocab", "train": [ { "type": "translation", "id": 0, "src": "/path/to/srclang1-tgtlang0/train.srclang1", "tgt": "/path/to/srclang1-tgtlang0/train.tgtlang0" }, { "type": "translation", "id": 1, "src": "/path/to/srclang1-tgtlang1/train.srclang1", "tgt": "/path/to/srclang1-tgtlang1/train.tgtlang1" }, { "type": "translation", "id": 0, "src": "/path/to/srclang2-tgtlang0/train.srclang2", "tgt": "/path/to/srclang2-tgtlang0/train.tgtlang0" }, { "type": "translation", "id": 1, "src": "/path/to/srclang2-tgtlang1/train.srclang2", "tgt": "/path/to/srclang2-tgtlang1/train.tgtlang1" }, ... ], "valid": [ { "type": "translation", "id": 0, "src": "/unused", "tgt": "/unused" } ] } ``` where paths are paths to binarized indexed fairseq dataset files. `id` represents the target language id. ## Training Command Line Example ``` fairseq-train \ /path/to/configfile_described_above.json \ --user-dir examples/laser/laser_src \ --log-interval 100 --log-format simple \ --task laser --arch laser_lstm \ --save-dir . \ --optimizer adam \ --lr 0.001 \ --lr-scheduler inverse_sqrt \ --clip-norm 5 \ --warmup-updates 90000 \ --update-freq 2 \ --dropout 0.0 \ --encoder-dropout-out 0.1 \ --max-tokens 2000 \ --max-epoch 50 \ --encoder-bidirectional \ --encoder-layers 5 \ --encoder-hidden-size 512 \ --decoder-layers 1 \ --decoder-hidden-size 2048 \ --encoder-embed-dim 320 \ --decoder-embed-dim 320 \ --decoder-lang-embed-dim 32 \ --warmup-init-lr 0.001 \ --disable-validation ``` ## Applications We showcase several applications of multilingual sentence embeddings with code to reproduce our results (in the directory "tasks"). 
* [**Cross-lingual document classification**](https://github.com/facebookresearch/LASER/tree/master/tasks/mldoc) using the [*MLDoc*](https://github.com/facebookresearch/MLDoc) corpus [2,6] * [**WikiMatrix**](https://github.com/facebookresearch/LASER/tree/master/tasks/WikiMatrix) Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia [7] * [**Bitext mining**](https://github.com/facebookresearch/LASER/tree/master/tasks/bucc) using the [*BUCC*](https://comparable.limsi.fr/bucc2018/bucc2018-task.html) corpus [3,5] * [**Cross-lingual NLI**](https://github.com/facebookresearch/LASER/tree/master/tasks/xnli) using the [*XNLI*](https://www.nyu.edu/projects/bowman/xnli/) corpus [4,5,6] * [**Multilingual similarity search**](https://github.com/facebookresearch/LASER/tree/master/tasks/similarity) [1,6] * [**Sentence embedding of text files**](https://github.com/facebookresearch/LASER/tree/master/tasks/embed) an example of how to calculate sentence embeddings for arbitrary text files in any of the supported languages. **For all tasks, we use exactly the same multilingual encoder, without any task-specific optimization or fine-tuning.** ## References [1] Holger Schwenk and Matthijs Douze, [*Learning Joint Multilingual Sentence Representations with Neural Machine Translation*](https://aclanthology.info/papers/W17-2619/w17-2619), ACL workshop on Representation Learning for NLP, 2017 [2] Holger Schwenk and Xian Li, [*A Corpus for Multilingual Document Classification in Eight Languages*](http://www.lrec-conf.org/proceedings/lrec2018/pdf/658.pdf), LREC, pages 3548-3551, 2018. [3] Holger Schwenk, [*Filtering and Mining Parallel Data in a Joint Multilingual Space*](http://aclweb.org/anthology/P18-2037) ACL, July 2018 [4] Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk and Veselin Stoyanov, [*XNLI: Cross-lingual Sentence Understanding through Inference*](https://aclweb.org/anthology/D18-1269), EMNLP, 2018.
[5] Mikel Artetxe and Holger Schwenk, [*Margin-based Parallel Corpus Mining with Multilingual Sentence Embeddings*](https://arxiv.org/abs/1811.01136) arXiv, Nov 3 2018. [6] Mikel Artetxe and Holger Schwenk, [*Massively Multilingual Sentence Embeddings for Zero-Shot Cross-Lingual Transfer and Beyond*](https://arxiv.org/abs/1812.10464) arXiv, Dec 26 2018. [7] Holger Schwenk, Vishrav Chaudhary, Shuo Sun, Hongyu Gong and Paco Guzman, [*WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia*](https://arxiv.org/abs/1907.05791) arXiv, July 11 2019. [8] Holger Schwenk, Guillaume Wenzek, Sergey Edunov, Edouard Grave and Armand Joulin [*CCMatrix: Mining Billions of High-Quality Parallel Sentences on the WEB*](https://arxiv.org/abs/1911.04944) ================================================ FILE: examples/laser/laser_src/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .laser_task import * # noqa from .laser_lstm import * # noqa from .laser_transformer import * # noqa ================================================ FILE: examples/laser/laser_src/laser_lstm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@register_model("laser_lstm")
class LSTMModel(FairseqEncoderDecoderModel):
    """LASER encoder-decoder model built from an LSTM encoder and decoder.

    The encoder output (including its max-pooled sentence embedding) is
    handed to the decoder together with the id of the target language.
    """

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens=None,
        tgt_tokens=None,
        tgt_lengths=None,
        target_language_id=None,
        dataset_name="",
    ):
        """Encode the source batch and decode towards ``target_language_id``.

        ``tgt_tokens`` and ``tgt_lengths`` are accepted but not used here.
        ``dataset_name`` is only forwarded to the encoder, which uses it in
        its error message when packing fails.

        Returns:
            whatever the decoder returns for ``prev_output_tokens``.
        """
        assert target_language_id is not None

        src_encoder_out = self.encoder(src_tokens, src_lengths, dataset_name)
        return self.decoder(
            prev_output_tokens, src_encoder_out, lang_id=target_language_id
        )

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # Registered in table order; each entry is (flag, add_argument kwargs).
        option_table = [
            (
                "--dropout",
                dict(default=0.1, type=float, metavar="D", help="dropout probability"),
            ),
            (
                "--encoder-embed-dim",
                dict(type=int, metavar="N", help="encoder embedding dimension"),
            ),
            (
                "--encoder-embed-path",
                dict(
                    default=None,
                    type=str,
                    metavar="STR",
                    help="path to pre-trained encoder embedding",
                ),
            ),
            (
                "--encoder-hidden-size",
                dict(type=int, metavar="N", help="encoder hidden size"),
            ),
            (
                "--encoder-layers",
                dict(type=int, metavar="N", help="number of encoder layers"),
            ),
            (
                "--encoder-bidirectional",
                dict(
                    action="store_true",
                    help="make all layers of encoder bidirectional",
                ),
            ),
            (
                "--decoder-embed-dim",
                dict(type=int, metavar="N", help="decoder embedding dimension"),
            ),
            (
                "--decoder-embed-path",
                dict(
                    default=None,
                    type=str,
                    metavar="STR",
                    help="path to pre-trained decoder embedding",
                ),
            ),
            (
                "--decoder-hidden-size",
                dict(type=int, metavar="N", help="decoder hidden size"),
            ),
            (
                "--decoder-layers",
                dict(type=int, metavar="N", help="number of decoder layers"),
            ),
            (
                "--decoder-out-embed-dim",
                dict(type=int, metavar="N", help="decoder output embedding dimension"),
            ),
            (
                "--decoder-zero-init",
                dict(
                    type=str,
                    metavar="BOOL",
                    help="initialize the decoder hidden/cell state to zero",
                ),
            ),
            (
                "--decoder-lang-embed-dim",
                dict(
                    type=int,
                    metavar="N",
                    help="decoder language embedding dimension",
                ),
            ),
            # TODO Also apply to decoder embeddings?
            (
                "--fixed-embeddings",
                dict(
                    action="store_true",
                    help="keep embeddings fixed (ENCODER ONLY)",
                ),
            ),
            # Granular dropout settings (if not specified these default to --dropout)
            (
                "--encoder-dropout-in",
                dict(
                    type=float,
                    metavar="D",
                    help="dropout probability for encoder input embedding",
                ),
            ),
            (
                "--encoder-dropout-out",
                dict(
                    type=float,
                    metavar="D",
                    help="dropout probability for encoder output",
                ),
            ),
            (
                "--decoder-dropout-in",
                dict(
                    type=float,
                    metavar="D",
                    help="dropout probability for decoder input embedding",
                ),
            ),
            (
                "--decoder-dropout-out",
                dict(
                    type=float,
                    metavar="D",
                    help="dropout probability for decoder output",
                ),
            ),
        ]
        for flag, spec in option_table:
            parser.add_argument(flag, **spec)

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            # Create a fresh embedding table, then overwrite the rows for
            # which the embedding file provides a vector.
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        pretrained_encoder_embed = None
        if args.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
            )
        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim
            )

        # Tasks that do not expose ``num_tasks`` get no language ids.
        num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0

        encoder = LSTMEncoder(
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_size=args.encoder_hidden_size,
            num_layers=args.encoder_layers,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            bidirectional=args.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
            fixed_embeddings=args.fixed_embeddings,
        )
        decoder = LSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            zero_init=options.eval_bool(args.decoder_zero_init),
            encoder_embed_dim=args.encoder_embed_dim,
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            num_langs=num_langs,
            lang_embed_dim=args.decoder_lang_embed_dim,
        )
        return cls(encoder, decoder)


class LSTMEncoder(FairseqEncoder):
    """LSTM encoder that max-pools its outputs into a sentence embedding.

    Args:
        dictionary: source dictionary; supplies the vocabulary size
            (``len``) and the padding index (``pad()``).
        embed_dim (int): token embedding size, also the LSTM input size.
        hidden_size (int): LSTM hidden size per direction.
        num_layers (int): number of stacked LSTM layers.
        dropout_in (float): dropout applied to the token embeddings.
        dropout_out (float): dropout applied to the LSTM outputs (and
            between LSTM layers when ``num_layers > 1``).
        bidirectional (bool): use a bidirectional LSTM; doubles
            ``output_units``.
        left_pad (bool): if true, ``forward`` converts the incoming batch
            from left-padding to right-padding before packing.
        pretrained_embed: embedding module to use instead of a new one.
        padding_value (float): fill value for padded positions when the
            packed LSTM output is unpacked.
        fixed_embeddings (bool): if true, the embedding weights are frozen.
    """

    def __init__(
        self,
        dictionary,
        embed_dim=512,
        hidden_size=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        bidirectional=False,
        left_pad=True,
        pretrained_embed=None,
        padding_value=0.0,
        fixed_embeddings=False,
    ):
        super().__init__(dictionary)
        self.num_layers = num_layers
        self.dropout_in = dropout_in
        self.dropout_out = dropout_out
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size

        num_embeddings = len(dictionary)
        self.padding_idx = dictionary.pad()
        if pretrained_embed is None:
            self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx)
        else:
            self.embed_tokens = pretrained_embed
        if fixed_embeddings:
            self.embed_tokens.weight.requires_grad = False

        self.lstm = LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=self.dropout_out if num_layers > 1 else 0.0,
            bidirectional=bidirectional,
        )
        self.left_pad = left_pad
        self.padding_value = padding_value

        self.output_units = hidden_size
        if bidirectional:
            self.output_units *= 2

    def forward(self, src_tokens, src_lengths, dataset_name):
        """Encode a batch of source sentences.

        Args:
            src_tokens: token ids of shape ``bsz x seqlen``.
            src_lengths: per-sentence lengths passed to
                ``pack_padded_sequence``.
            dataset_name: only used in the error message when packing
                fails.

        Returns:
            dict with

            - ``"sentemb"``: max-pooled encoder outputs,
              ``bsz x output_units``;
            - ``"encoder_out"``: tuple ``(x, final_hiddens, final_cells)``
              where ``x`` is ``seqlen x bsz x output_units`` with padded
              positions set to ``-inf``;
            - ``"encoder_padding_mask"``: ``seqlen x bsz`` mask of padding
              positions, or ``None`` when the batch has no padding.

        Raises:
            Exception: if the batch cannot be packed.
        """
        if self.left_pad:
            # convert left-padding to right-padding
            src_tokens = utils.convert_padding_direction(
                src_tokens,
                self.padding_idx,
                left_to_right=True,
            )

        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # pack embedded source tokens into a PackedSequence
        try:
            packed_x = nn.utils.rnn.pack_padded_sequence(x, src_lengths.data.tolist())
        except Exception as err:
            # Only wrap ordinary errors: KeyboardInterrupt/SystemExit must
            # propagate untouched, and the original cause stays attached.
            raise Exception(f"Packing failed in dataset {dataset_name}") from err

        # apply LSTM
        num_directions = 2 if self.bidirectional else 1
        state_size = (num_directions * self.num_layers, bsz, self.hidden_size)
        h0 = x.new_zeros(state_size)
        c0 = x.new_zeros(state_size)
        packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))

        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_value
        )
        x = F.dropout(x, p=self.dropout_out, training=self.training)
        assert list(x.size()) == [seqlen, bsz, self.output_units]

        if self.bidirectional:

            def combine_bidir(outs):
                # nn.LSTM stacks the final states as
                # (layer0 fwd, layer0 bwd, layer1 fwd, ...), each bsz x hidden.
                # Join the two directions along the feature axis so every
                # sentence keeps its own forward and backward state;
                # concatenating along the batch axis and reshaping would mix
                # states of different sentences whenever bsz > 1.
                return torch.cat(
                    [
                        torch.cat([outs[2 * i], outs[2 * i + 1]], dim=1).view(
                            1, bsz, self.output_units
                        )
                        for i in range(self.num_layers)
                    ],
                    dim=0,
                )

            final_hiddens = combine_bidir(final_hiddens)
            final_cells = combine_bidir(final_cells)

        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()

        # Set padded outputs to -inf so they are not selected by max-pooling
        padding_mask = encoder_padding_mask.unsqueeze(-1)
        if padding_mask.any():
            x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)

        # Build the sentence embedding by max-pooling over the encoder outputs
        sentemb = x.max(dim=0)[0]

        return {
            "sentemb": sentemb,
            "encoder_out": (x, final_hiddens, final_cells),
            "encoder_padding_mask": encoder_padding_mask
            if encoder_padding_mask.any()
            else None,
        }

    def reorder_encoder_out(self, encoder_out_dict, new_order):
        """Reorder the batch dimension of every encoder output in place.

        ``sentemb`` is batch-first (index 0); the ``encoder_out`` tensors and
        the padding mask carry the batch on index 1.
        """
        encoder_out_dict["sentemb"] = encoder_out_dict["sentemb"].index_select(
            0, new_order
        )
        encoder_out_dict["encoder_out"] = tuple(
            eo.index_select(1, new_order) for eo in encoder_out_dict["encoder_out"]
        )
        if encoder_out_dict["encoder_padding_mask"] is not None:
            encoder_out_dict["encoder_padding_mask"] = encoder_out_dict[
                "encoder_padding_mask"
            ].index_select(1, new_order)
        return encoder_out_dict

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return int(1e5)  # an arbitrary large number
encoder_out_dict["encoder_out"] if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] bsz, seqlen = prev_output_tokens.size() # get outputs from encoder encoder_outs, _, _ = encoder_out[:3] srclen = encoder_outs.size(0) # embed tokens x = self.embed_tokens(prev_output_tokens) x = F.dropout(x, p=self.dropout_in, training=self.training) # embed language identifier if self.embed_lang is not None: lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id) langemb = self.embed_lang(lang_ids) # TODO Should we dropout here??? # B x T x C -> T x B x C x = x.transpose(0, 1) # initialize previous states (or get from cache during incremental generation) cached_state = utils.get_incremental_state( self, incremental_state, "cached_state" ) if cached_state is not None: prev_hiddens, prev_cells, input_feed = cached_state else: num_layers = len(self.layers) if self.sentemb2init is None: prev_hiddens = [ x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers) ] prev_cells = [ x.data.new(bsz, self.hidden_size).zero_() for i in range(num_layers) ] else: init = self.sentemb2init(sentemb) prev_hiddens = [ init[:, (2 * i) * self.hidden_size : (2 * i + 1) * self.hidden_size] for i in range(num_layers) ] prev_cells = [ init[ :, (2 * i + 1) * self.hidden_size : (2 * i + 2) * self.hidden_size, ] for i in range(num_layers) ] input_feed = x.data.new(bsz, self.hidden_size).zero_() attn_scores = x.data.new(srclen, seqlen, bsz).zero_() outs = [] for j in range(seqlen): if self.embed_lang is None: input = torch.cat((x[j, :, :], sentemb), dim=1) else: input = torch.cat((x[j, :, :], sentemb, langemb), dim=1) for i, rnn in enumerate(self.layers): # recurrent cell hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i])) # hidden state becomes the input to the next layer input = F.dropout(hidden, p=self.dropout_out, training=self.training) # save state for next time step prev_hiddens[i] = hidden prev_cells[i] = cell out = hidden out = F.dropout(out, 
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with weights drawn from U(-0.1, 0.1).

    The row at ``padding_idx`` is reset to zero after the uniform draw.
    """
    layer = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    with torch.no_grad():
        layer.weight.uniform_(-0.1, 0.1)
        layer.weight[padding_idx].fill_(0)
    return layer


def LSTM(input_size, hidden_size, **kwargs):
    """Create an ``nn.LSTM`` whose weights and biases are drawn from U(-0.1, 0.1).

    Extra keyword arguments are passed straight to ``nn.LSTM``.
    """
    rnn = nn.LSTM(input_size, hidden_size, **kwargs)
    for pname, tensor in rnn.named_parameters():
        if any(tag in pname for tag in ("weight", "bias")):
            nn.init.uniform_(tensor, -0.1, 0.1)
    return rnn


def LSTMCell(input_size, hidden_size, **kwargs):
    """Create an ``nn.LSTMCell`` whose weights and biases are drawn from U(-0.1, 0.1).

    Extra keyword arguments are passed straight to ``nn.LSTMCell``.
    """
    cell = nn.LSTMCell(input_size, hidden_size, **kwargs)
    for pname, tensor in cell.named_parameters():
        if any(tag in pname for tag in ("weight", "bias")):
            nn.init.uniform_(tensor, -0.1, 0.1)
    return cell


def Linear(in_features, out_features, bias=True, dropout=0):
    """Create an ``nn.Linear`` with weight (and bias) drawn from U(-0.1, 0.1).

    ``dropout`` is accepted for call-site compatibility but is not used by
    this function.
    """
    layer = nn.Linear(in_features, out_features, bias=bias)
    nn.init.uniform_(layer.weight, -0.1, 0.1)
    if bias:
        nn.init.uniform_(layer.bias, -0.1, 0.1)
    return layer
@register_task("laser")
class LaserTask(LegacyFairseqTask):
    """Task for training LASER models on several parallel corpora at once.

    The corpora are described by a JSON config file (``configfile``) that
    holds the ``src_vocab``/``tgt_vocab`` dictionary paths and, per split, a
    list of dataset entries. Every loaded corpus is wrapped in a
    ``MultitaskDatasetWrapper`` and the collection is iterated with a
    ``MultidatasetEpochBatchIterator``.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "configfile", metavar="PATH", help="dataset configuration file in json"
        )
        parser.add_argument(
            "--weighting-alpha",
            type=float,
            default=None,
            help="alpha for automatic weighting",
        )
        parser.add_argument(
            "--raw-text", action="store_true", help="load raw text dataset"
        )
        parser.add_argument(
            "--left-pad-source",
            default="True",
            type=str,
            metavar="BOOL",
            help="pad the source on the left (default: True)",
        )
        parser.add_argument(
            "--left-pad-target",
            default="False",
            type=str,
            metavar="BOOL",
            help="pad the target on the left (default: False)",
        )
        # These options might have already been defined. Each one is guarded
        # separately so that a clash on the first does not skip the second.
        # Once we transition this to hydra it should be fine to add them here.
        for flag, side in (
            ("--max-source-positions", "source"),
            ("--max-target-positions", "target"),
        ):
            try:
                parser.add_argument(
                    flag,
                    default=1024,
                    type=int,
                    metavar="N",
                    help=f"max number of tokens in the {side} sequence",
                )
            except ArgumentError:
                pass

    def __init__(self, args, config, src_dictionary, tgt_dictionary, num_tasks):
        super().__init__(args)
        self.config = config
        self.src_dictionary = src_dictionary
        self.tgt_dictionary = tgt_dictionary
        self.num_tasks = num_tasks

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Read the JSON config, load both dictionaries and build the task.

        ``num_tasks`` is the largest ``id`` found in the ``train`` entries
        plus one.
        """
        with open(args.configfile, "r") as f:
            config = json.load(f)
        num_tasks = max(dataset["id"] for dataset in config["train"]) + 1

        args.left_pad_source = options.eval_bool(args.left_pad_source)
        args.left_pad_target = options.eval_bool(args.left_pad_target)

        src_dictionary = Dictionary.load(config["src_vocab"])
        tgt_dictionary = Dictionary.load(config["tgt_vocab"])

        logger.info(
            "| src Dictionary {} : {} types".format(
                config["src_vocab"], len(src_dictionary)
            )
        )
        logger.info(
            "| tgt Dictionary {} : {} types".format(
                config["tgt_vocab"], len(tgt_dictionary)
            )
        )

        return cls(args, config, src_dictionary, tgt_dictionary, num_tasks)

    # Experimental overriding for backtranslation
    def build_model(self, args, from_checkpoint=False):
        """Build the model for this task (``from_checkpoint`` is not used here)."""
        model = models.build_model(args, self)
        return model

    def dataset(self, split):
        """Return the loaded datasets of ``split``.

        Raises:
            KeyError: if the split has not been loaded.
        """
        if split not in self.datasets:
            raise KeyError("Dataset not loaded: " + split)
        return self.datasets[split]

    def load_dataset(self, split, epoch=1, **kwargs):
        """Load a dataset split.

        The ``valid`` split is always registered as an empty collection. For
        any other split, every config entry is loaded once (duplicates of the
        same ``corpus-languagepair`` key are ignored with a warning) and then
        added ``round(sample)`` times, with ``-up`` appended to the key for
        each extra copy. When ``--weighting-alpha`` is set, entries without
        an explicit ``sample`` get their factor from the alpha-smoothed size
        distribution instead.

        Raises:
            FileNotFoundError: if ``split`` is not described in the config.
            KeyError: if a config entry has no ``src`` path.
        """

        def indexed_dataset(path, dictionary):
            if self.args.raw_text:
                raise Exception("Unable to handle raw text.")
            dataset = IndexedDataset(path, fix_lua_indexing=True)
            return dataset

        pair_datasets = OrderedDict()

        if split == "valid":
            self.datasets[split] = pair_datasets
            return

        if split not in self.config:
            raise FileNotFoundError(
                "Dataset not found in config file: {}".format(split)
            )

        size_sum = 0
        size_sum_with_subsampling = 0
        init_pair_datasets = {}

        for dataset_config in self.config[split]:
            # The key is derived from the two directories above the source
            # file, so "src" is mandatory (a missing key raises KeyError).
            src_path = os.path.dirname(dataset_config["src"])
            corpus_name = src_path.split("/")[-2]
            language_pair_name = src_path.split("/")[-1]
            pair_datasets_key = corpus_name + "-" + language_pair_name

            logger.info(f"loading... {pair_datasets_key}")
            src_dataset = indexed_dataset(dataset_config["src"], self.src_dictionary)

            # A missing target is passed on as None together with None sizes
            # rather than dereferencing ``.sizes`` on None.
            if "tgt" in dataset_config:
                tgt_dataset = indexed_dataset(
                    dataset_config["tgt"], self.tgt_dictionary
                )
                tgt_sizes = tgt_dataset.sizes
            else:
                tgt_dataset = None
                tgt_sizes = None

            dataset = LanguagePairDataset(
                src_dataset,
                src_dataset.sizes,
                self.src_dictionary,
                tgt_dataset,
                tgt_sizes,
                self.tgt_dictionary,
                left_pad_source=self.args.left_pad_source,
                left_pad_target=self.args.left_pad_target,
            )

            if pair_datasets_key in init_pair_datasets:
                logger.warning(
                    f"Ignoring already added {pair_datasets_key}. "
                    f"Consider using `sample` key in order to upsample."
                )
            else:
                init_pair_datasets[pair_datasets_key] = {
                    "dataset": dataset,
                    "sample": dataset_config.get("sample", None),
                    "id": dataset_config.get("id", None),
                    "len": len(dataset),
                }

        # Automatic weighting: only entries without an explicit ``sample``
        # take part. Their relative sizes are smoothed with the exponent
        # ``weighting_alpha`` and renormalised.
        length_sum = 0
        weighted_freqs_sum = 0
        freq_per_dataset = {}
        vmax = 0
        weighted_freq_per_dataset = {}

        if self.args.weighting_alpha:
            for entry in init_pair_datasets.values():
                if entry["sample"] is None:
                    length_sum += len(entry["dataset"])

            for key, entry in init_pair_datasets.items():
                if entry["sample"] is None:
                    val = float(entry["len"]) / length_sum
                    freq_per_dataset[key] = val
                    weighted_freqs_sum += val ** self.args.weighting_alpha

            for key, freq in freq_per_dataset.items():
                val = freq ** self.args.weighting_alpha / weighted_freqs_sum
                vmax = max(vmax, val)
                weighted_freq_per_dataset[key] = val

        for pair_datasets_key, entry in init_pair_datasets.items():
            dataset = entry["dataset"]
            sample = entry["sample"]
            if sample is None:
                sample = 1.0

            if pair_datasets_key in weighted_freq_per_dataset:
                # Upsample relative to the most frequent weighted dataset.
                sample = vmax / weighted_freq_per_dataset[pair_datasets_key]

            sample = round(sample)
            initial_sample = sample

            # The entry always carries an "id" key (possibly None), so the
            # fallback to 0 has to be applied to the value itself.
            lang_id = entry["id"] if entry["id"] is not None else 0

            # Add one wrapper per unit of ``sample``; every extra copy gets
            # "-up" appended to its key.
            wrapper_key = pair_datasets_key
            while sample >= 1.0:
                assert wrapper_key not in pair_datasets, f"{wrapper_key} already in"
                size_sum_with_subsampling += len(dataset)
                pair_datasets[wrapper_key] = MultitaskDatasetWrapper(
                    dataset, lang_id, 1.0, name=wrapper_key
                )
                size_sum += len(dataset)
                sample -= 1.0
                wrapper_key += "-up"

            assert sample < 1e-6, f"sample remains > 0 {wrapper_key}"

            logger.info(
                f"added pair {pair_datasets_key} length {len(dataset)} new_length = {len(dataset)*initial_sample}"
            )

        self.datasets[split] = pair_datasets
        logger.info(
            f"Datasets number = {len(self.datasets[split])} size = {size_sum} size_sum_with_subsampling = {size_sum_with_subsampling}"
        )

    @property
    def source_dictionary(self):
        return self.src_dictionary

    @property
    def target_dictionary(self):
        return self.tgt_dictionary

    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        disable_iterator_cache=False,
        grouped_shuffling=False,
        update_epoch_batch_itr=False,
        **kwargs,
    ):
        """Build a batch iterator over a collection of datasets.

        Args:
            dataset: non-empty ``OrderedDict`` mapping names to
                ``FairseqDataset`` instances (as stored by ``load_dataset``).
            max_tokens, max_sentences, required_batch_size_multiple:
                batching limits handed to ``data_utils.batch_by_size``.
            max_positions: if not None, examples exceeding it are filtered
                out of every dataset.
            seed, epoch: seed the ordering (``seed + epoch``) and are
                forwarded to the iterator.
            num_shards, shard_id, num_workers: forwarded to the iterator.

        ``ignore_invalid_inputs``, ``data_buffer_size``,
        ``disable_iterator_cache``, ``grouped_shuffling`` and
        ``update_epoch_batch_itr`` are accepted but not used in this method.

        Returns:
            MultidatasetEpochBatchIterator over the batched datasets.
        """
        assert isinstance(dataset, OrderedDict)
        assert len(dataset)
        assert isinstance(dataset[next(iter(dataset))], FairseqDataset)

        # initialize the dataset with the correct starting epoch
        for dt in dataset.values():
            dt.set_epoch(epoch)

        indices = OrderedDict()
        batch_sampler = OrderedDict()

        with data_utils.numpy_seed(seed + epoch):
            for key, dt in dataset.items():
                logger.info(f"\t ordered_indices {key}")
                indices[key] = dt.ordered_indices()

        # filter examples that are too large
        if max_positions is not None:
            for key, dt in dataset.items():
                logger.info(f"\t filter_by_size {key}")
                indices[key], _ = dt.filter_indices_by_size(
                    indices[key], max_positions
                )

        for key, dt in dataset.items():
            logger.info(f"\t batch_by_size {key}")
            batch_sampler[key] = data_utils.batch_by_size(
                indices[key],
                dt.num_tokens,
                max_tokens=max_tokens,
                max_sentences=max_sentences,
                required_batch_size_multiple=required_batch_size_multiple,
            )

        epoch_iter = MultidatasetEpochBatchIterator(
            dataset=dataset,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
        )

        return epoch_iter
else 0 def load_embed_tokens(dictionary, embed_dim): num_embeddings = len(dictionary) padding_idx = dictionary.pad() return Embedding(num_embeddings, embed_dim, padding_idx) encoder_embed_tokens = load_embed_tokens( task.source_dictionary, args.encoder_embed_dim ) decoder_embed_tokens = load_embed_tokens( task.target_dictionary, args.decoder_embed_dim ) num_langs = task.num_tasks if hasattr(task, "num_tasks") else 0 encoder = LaserTransformerEncoder( args, task.source_dictionary, encoder_embed_tokens ) decoder = LaserTransformerDecoder( args, task.target_dictionary, decoder_embed_tokens, num_langs=num_langs, lang_embed_dim=args.decoder_lang_embed_dim, ) return cls(encoder, decoder) class LaserTransformerEncoder(TransformerEncoder): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def forward(self, src_tokens, *args, **kwargs): encoder_out = super().forward(src_tokens, *args, **kwargs) x = encoder_out["encoder_out"][0] # T x B x C padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1) if padding_mask.any(): x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x) # Build the sentence embedding by max-pooling over the encoder outputs sentemb = x.max(dim=0)[0] # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in # `foward` so we use a dictionary instead. # TorchScript does not support mixed values so the values are all lists. # The empty list is equivalent to None. 
class LaserTransformerEncoder(TransformerEncoder):
    """Transformer encoder whose output is a single max-pooled sentence embedding."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, src_tokens, *args, **kwargs):
        """Run the parent encoder and max-pool its output over the time axis.

        Args:
            src_tokens: source token ids; extra positional/keyword arguments
                are forwarded unchanged to the parent ``forward``.

        Returns:
            dict: ``{"sentemb": [tensor]}`` with the pooled tensor (`B x C`).
        """
        encoder_out = super().forward(src_tokens, *args, **kwargs)

        x = encoder_out["encoder_out"][0]  # T x B x C
        # Transposed and given a trailing axis so it broadcasts against ``x``.
        padding_mask = src_tokens.eq(self.padding_idx).t().unsqueeze(-1)

        if padding_mask.any():
            # -inf at padded positions so they can never win the max below.
            x = x.float().masked_fill_(padding_mask, float("-inf")).type_as(x)

        # Build the sentence embedding by max-pooling over the encoder outputs
        sentemb = x.max(dim=0)[0]

        # The PyTorch Mobile lite interpreter does not support returning a NamedTuple
        # in `forward`, so we use a dictionary instead.
        # TorchScript does not support mixed values so the values are all lists.
        # The empty list is equivalent to None.
        return {"sentemb": [sentemb]}  # B x C

    @torch.jit.export
    def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
        """
        Same as the one in transformer.py, with new_sentemb

        Only the ``"sentemb"`` entry is reordered (along dim 0, by
        ``new_order``); an empty list is passed through as an empty list.
        """
        if len(encoder_out["sentemb"]) == 0:
            new_sentemb = []
        else:
            new_sentemb = [encoder_out["sentemb"][0].index_select(0, new_order)]

        return {
            "sentemb": new_sentemb,  # B x C
        }


class LaserTransformerDecoder(TransformerDecoder):
    """Transformer decoder without encoder attention.

    Instead of attending to the encoder, every decoder input position is
    concatenated with the sentence embedding and, when ``lang_embed_dim`` is
    non-zero, with a learned target-language embedding.
    """

    def __init__(self, args, dictionary, *kargs, **kwargs):
        """
        Args:
            args: parsed arguments (``encoder_embed_dim`` is read here).
            dictionary: target dictionary; its length sizes the output layer.
            *kargs: forwarded to ``TransformerDecoder.__init__``.
            **kwargs: may carry ``num_langs`` (default 1) and
                ``lang_embed_dim`` (default 0); both are consumed here and the
                rest is forwarded to the parent.
        """
        # Set before super().__init__() because build_decoder_layer(), defined
        # below, reads self.lang_embed_dim.
        # NOTE(review): presumably the parent constructor calls
        # build_decoder_layer() -- confirm against TransformerDecoder.
        self.num_langs = kwargs.get("num_langs", 1)
        self.lang_embed_dim = kwargs.get("lang_embed_dim", 0)
        kwargs.pop("num_langs", None)
        kwargs.pop("lang_embed_dim", None)

        super().__init__(args, dictionary, *kargs, **kwargs, no_encoder_attn=True)

        if self.lang_embed_dim == 0:
            self.embed_lang = None
        else:
            self.embed_lang = nn.Embedding(self.num_langs, self.lang_embed_dim)
            nn.init.uniform_(self.embed_lang.weight, -0.1, 0.1)

        if self.output_projection is not None:
            # The features fed to the projection are the widened vectors
            # (token + language + sentence embedding), so the projection built
            # by the parent is replaced by one with the matching input size.
            laser_output_embed_dim = (
                self.output_embed_dim + self.lang_embed_dim + args.encoder_embed_dim
            )
            self.output_projection = nn.Linear(
                laser_output_embed_dim, len(dictionary), bias=False
            )
            nn.init.normal_(
                self.output_projection.weight,
                mean=0,
                std=laser_output_embed_dim ** -0.5,
            )

    def build_decoder_layer(self, args, no_encoder_attn=False):
        """Build one decoder layer sized for the widened input vectors.

        ``args.decoder_embed_dim`` is temporarily enlarged while the layer is
        constructed and restored afterwards. ``no_encoder_attn`` is ignored:
        the layer is always built without encoder attention.
        """
        decoder_embed_dim = args.decoder_embed_dim
        args.decoder_embed_dim = (
            decoder_embed_dim + self.lang_embed_dim + args.encoder_embed_dim
        )
        res = TransformerDecoderLayer(args, no_encoder_attn=True)
        args.decoder_embed_dim = decoder_embed_dim

        return res

    def extract_features(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]],
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        lang_id: Optional[int] = None,
    ):
        """
        Similar to *forward* but only return features.

        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Args:
            encoder_out (dict): must contain ``"sentemb"``, a list whose first
                element is the sentence embedding; it is indexed
                unconditionally, so ``None`` is not actually supported here.
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).
            alignment_layer (int, optional): return mean alignment over
                heads at this layer (default: last layer).
            alignment_heads (int, optional): only average alignment over
                this many heads (default: all heads).
            lang_id (int, optional): target language id, used to fill the
                per-sentence language-id vector when a language embedding
                exists.

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        if alignment_layer is None:
            alignment_layer = self.num_layers - 1

        # embed positions
        positions = (
            self.embed_positions(
                prev_output_tokens, incremental_state=incremental_state
            )
            if self.embed_positions is not None
            else None
        )

        if incremental_state is not None:
            # incremental decoding: only the newest token is processed
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        bsz, seqlen = prev_output_tokens.size()

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.quant_noise is not None:
            x = self.quant_noise(x)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions

        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)

        x = self.dropout_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        if self.embed_lang is not None:
            # one language id per sentence, broadcast over all time steps
            lang_ids = prev_output_tokens.data.new_full((bsz,), lang_id)
            langemb = self.embed_lang(lang_ids)
            langemb = langemb.unsqueeze(0)
            repeat_vals = [x.shape[0] // langemb.shape[0]] + [-1] * (
                len(langemb.shape) - 1
            )
            x = torch.cat((x, langemb.expand(*repeat_vals)), dim=-1)

        # append the sentence embedding to every time step
        sentemb = encoder_out["sentemb"][0]
        sentemb = sentemb.unsqueeze(0)

        repeat_vals = [x.shape[0] // sentemb.shape[0]] + [-1] * (len(sentemb.shape) - 1)
        x = torch.cat((x, sentemb.expand(*repeat_vals)), dim=-1)

        self_attn_padding_mask: Optional[Tensor] = None
        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

        # decoder layers
        attn: Optional[Tensor] = None
        inner_states: List[Optional[Tensor]] = [x]
        for idx, layer in enumerate(self.layers):
            if incremental_state is None and not full_context_alignment:
                self_attn_mask = self.buffered_future_mask(x)
            else:
                self_attn_mask = None

            # the two ``None`` arguments stand in for the encoder output and
            # its padding mask: these layers have no encoder attention
            x, layer_attn, _ = layer(
                x,
                None,
                None,
                incremental_state,
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask,
                need_attn=bool((idx == alignment_layer)),
                need_head_weights=bool((idx == alignment_layer)),
            )
            inner_states.append(x)
            if layer_attn is not None and idx == alignment_layer:
                attn = layer_attn.float().to(x)

        if attn is not None:
            if alignment_heads is not None:
                attn = attn[:alignment_heads]

            # average probabilities over heads
            attn = attn.mean(dim=0)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return x, {"attn": [attn], "inner_states": inner_states}

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
        lang_id: Optional[int] = None,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder; its ``"sentemb"``
                entry is concatenated to the decoder inputs
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            lang_id (int): target language id; must not be None.
            src_lengths, return_all_hiddens: accepted but not used here.

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """

        assert lang_id is not None

        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            lang_id=lang_id,
        )
        if not features_only:
            x = self.output_layer(x)
        return x, extra


@register_model_architecture("laser_transformer", "laser_transformer")
def base_laser_transformer_architecture(args):
    """Fill in Transformer defaults, plus ``decoder_lang_embed_dim`` (default 0)."""
    base_architecture(args)
    args.decoder_lang_embed_dim = getattr(args, "decoder_lang_embed_dim", 0)
class MultiItr(object):
    """Interleave several sized iterators in proportion to their lengths.

    At every step the item is taken from the sub-iterator that has been
    consumed the least relative to its own length (ties go to the earliest
    one), so all sub-iterators progress at roughly the same relative pace.

    Args:
        itr (list): sub-iterators; each must support ``len()`` and ``next()``.
    """

    def __init__(self, itr):
        self.itr = itr
        # number of items requested so far from each sub-iterator
        self._counts = [0] * len(itr)

    def __len__(self):
        return sum(len(itr) for itr in self.itr)

    def __iter__(self):
        return self

    def __next__(self):
        # An empty sub-iterator counts as fully consumed (ratio 1.0) instead
        # of raising ZeroDivisionError; it is then only ever selected once
        # every other sub-iterator is exhausted too, at which point its
        # StopIteration correctly ends the whole iteration.
        ratios = [
            count / len(itr) if len(itr) > 0 else 1.0
            for count, itr in zip(self._counts, self.itr)
        ]
        if not ratios:
            # no sub-iterators at all: nothing to yield
            raise StopIteration
        idx = ratios.index(min(ratios))
        self._counts[idx] += 1
        return next(self.itr[idx])
class MultidatasetEpochBatchIterator(iterators.EpochBatchIterating):
    """Epoch batch iterator that fans out over one iterator per dataset.

    Args:
        dataset (OrderedDict): non-empty mapping from key to FairseqDataset.
        batch_sampler: mapping with the batch sampler for every dataset key.
        seed, num_shards, shard_id, epoch: handed to every underlying
            ``EpochBatchIterator``.
        num_workers: accepted, but the underlying iterators are always built
            with ``num_workers=0``.
    """

    def __init__(
        self,
        dataset,
        batch_sampler,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
    ):
        assert isinstance(dataset, OrderedDict)
        assert len(dataset)
        first_key = next(iter(dataset))
        assert isinstance(dataset[first_key], FairseqDataset)

        self.iterators = []
        self.epoch = epoch

        for key, sub_dataset in dataset.items():
            self.iterators.append(
                iterators.EpochBatchIterator(
                    dataset=sub_dataset,
                    collate_fn=sub_dataset.collater,
                    batch_sampler=batch_sampler[key],
                    seed=seed,
                    num_shards=num_shards,
                    shard_id=shard_id,
                    num_workers=0,
                    epoch=epoch,
                )
            )

    def __len__(self):
        return sum(len(sub_itr) for sub_itr in self.iterators)

    def next_epoch_itr(self, shuffle=True, fix_batches_to_gpus=False):
        """Return a ``MultiItr`` over the next-epoch iterators of all datasets."""
        # `self.epoch += 1` should be handled by underlying `EpochBatchIterator`s.
        per_dataset = []
        for sub_itr in self.iterators:
            per_dataset.append(
                sub_itr.next_epoch_itr(
                    shuffle=shuffle, fix_batches_to_gpus=fix_batches_to_gpus
                )
            )
        return MultiItr(per_dataset)

    def end_of_epoch(self):
        """True only when every underlying iterator reports end of epoch."""
        for sub_itr in self.iterators:
            if not sub_itr.end_of_epoch():
                return False
        return True

    @property
    def next_epoch_idx(self):
        """Return the epoch index after *next_epoch_itr* is called."""
        epochs = [sub_itr.next_epoch_idx for sub_itr in self.iterators]
        # the first iterator is authoritative; the rest must agree with it
        self.epoch = epochs[0]
        assert all(epoch == self.epoch for epoch in epochs)
        return self.epoch

    @property
    def iterations_in_epoch(self):
        return sum(sub_itr.iterations_in_epoch for sub_itr in self.iterators)

    def state_dict(self):
        """Collect the state of every underlying iterator plus the epoch."""
        sub_states = [sub_itr.state_dict() for sub_itr in self.iterators]
        return {
            "iterators": sub_states,
            "epoch": self.epoch,
        }

    def load_state_dict(self, state_dict):
        """Restore the epoch and, pairwise in order, every underlying iterator."""
        self.epoch = state_dict["epoch"]
        for sub_itr, sub_state in zip(self.iterators, state_dict["iterators"]):
            sub_itr.load_state_dict(sub_state)


class MultitaskDatasetWrapper(BaseWrapperDataset):
    """Wrap a dataset and tag its batches with a target language id and a name.

    Args:
        dataset: the wrapped dataset.
        target_language_id: value stored in ``net_input["target_language_id"]``.
        sample (float): fraction of the ordered indices that is kept.
        name (str): value stored in ``net_input["dataset_name"]``.
    """

    def __init__(self, dataset, target_language_id, sample=1.0, name=""):
        super().__init__(dataset)
        self.name = name
        self.sample = sample
        self.target_language_id = target_language_id

    def collater(self, *args, **kwargs):
        """Collate with the wrapped dataset, then annotate ``net_input`` if present."""
        batch = self.dataset.collater(*args, **kwargs)
        if "net_input" not in batch:
            return batch
        net_input = batch["net_input"]
        net_input["target_language_id"] = self.target_language_id
        net_input["dataset_name"] = self.name
        return batch

    def num_tokens(self, *args, **kwargs):
        return self.dataset.num_tokens(*args, **kwargs)

    def ordered_indices(self, *args, **kwargs):
        """Return a random subset of the wrapped dataset's ordered indices.

        ``int(sample * n)`` positions are drawn at random and then sorted, so
        the surviving indices keep the order the wrapped dataset gave them.
        """
        indices = self.dataset.ordered_indices(*args, **kwargs)
        total = indices.shape[0]
        # Hacky solution for sampling
        size = int(self.sample * total)
        kept_positions = np.sort(np.random.permutation(total)[:size])
        return indices.take(kept_positions)

    def size(self, index: int):
        return self.dataset.size(index)

    @property
    def supports_prefetch(self):
        """Whether this dataset supports prefetching."""
        return getattr(self.dataset, "supports_prefetch", False)

    def prefetch(self, indices):
        return self.dataset.prefetch(indices)
```bash lang_pairs_str="eng-aze,eng-bel,eng-ces,eng-glg,eng-por,eng-rus,eng-slk,eng-tur" databin_dir= fairseq-train ${databin_dir} \ --user-dir examples/latent_depth/latent_depth_src \ --lang-pairs "${lang_pairs_str}" \ --arch multilingual_transformer_iwslt_de_en \ --task multilingual_translation_latent_depth \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --share-encoders \ --share-decoders \ --decoder-langtok \ --share-decoder-input-output-embed \ --dropout 0.3 --attention-dropout 0.3 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --lr-scheduler inverse_sqrt --stop-min-lr 1e-9 --warmup-init-lr 1e-7 --warmup-updates 8000 \ --max-tokens 4096 --update-freq 1 \ --lr 0.0015 \ --clip-norm 1.0 \ --seed 2 \ --ddp-backend=legacy_ddp \ --encoder-layers 12 \ --decoder-layers 24 \ --decoder-latent-layer \ --sparsity-weight 0.1 \ --anneal-updates 5000 \ --soft-update 500 \ --target-layers 12 \ --share-weight 0.1 ``` ## Inference command ```bash lang_pairs_str="eng-aze,eng-bel,eng-ces,eng-glg,eng-por,eng-rus,eng-slk,eng-tur" databin_dir= model_path= src_lang= tgt_lang= gen_data= fairseq-generate ${databin_dir} \ --path ${model_path} \ --task multilingual_translation_latent_depth \ --decoder-latent-layer \ --lang-pairs "${lang_pairs_str}" \ -s ${src_lang} -t ${tgt_lang} \ --gen-subset $gen_data \ --scoring sacrebleu \ --remove-bpe 'sentencepiece' \ --lenpen 1.0 \ --beam 5 \ --decoder-langtok \ --max-tokens 4096 ``` ## Citation ```bibtex @article{li2020deep, title={Deep Transformers with Latent Depth}, author={Li, Xian and Stickland, Asa Cooper and Tang, Yuqing and Kong, Xiang}, journal={arXiv preprint arXiv:2009.13102}, year={2020} } ``` ================================================ FILE: examples/latent_depth/latent_depth_src/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
class LatentLayersKLLoss(_Loss):
    """KL term between the layer-selection samples of one language and a prior.

    ``args`` must provide ``prior`` ("uniform" or "agged_posterior"),
    ``sparsity_weight``, ``soft_update`` and ``anneal_updates``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args

    def forward(self, layer_samples, lang_idx, update_num, sample_size):
        """Compute the weighted KL loss for language ``lang_idx``.

        Args:
            layer_samples (list): per-language sample tensors, one value per
                layer. With the "uniform" prior only entry ``lang_idx`` has to
                be a tensor; with "agged_posterior" all entries are stacked.
            lang_idx (int): index of the language the current batch belongs to.
            update_num (int): current update number, drives the annealing.
            sample_size: scaling factor applied to the loss.

        Returns:
            Tensor: the scalar KL loss.

        Raises:
            NotImplementedError: if ``args.prior`` is not a supported prior.
        """
        prior = self.args.prior
        samples = layer_samples[lang_idx]
        eps = 1e-7
        if prior == "uniform":
            # uniform prior: every layer is selected with probability 0.5
            kl_loss = (samples * (torch.log(samples + eps) - math.log(0.5))).sum(-1)
        elif prior == "agged_posterior":
            # aggregated posterior: the (detached) samples of all languages,
            # summed and renormalised, act as the prior
            y_t = torch.stack([x.detach() for x in layer_samples], dim=0)
            agged_q = torch.sum(y_t, dim=0)
            row_norm = agged_q.sum(-1)
            normed_agg_q = agged_q / row_norm
            kl_loss = (
                samples * (torch.log(samples + eps) - torch.log(normed_agg_q + eps))
            ).sum(-1)
        else:
            raise NotImplementedError("The specified prior is not implemented.")

        # Normalize by the number of layers. The count is read from the
        # current language's samples: entry 0 of ``layer_samples`` is still
        # None until language 0 has been sampled, which used to crash here.
        kl_loss = kl_loss / samples.size()[0]
        # Linear warm-up of the weight, capped at ``sparsity_weight``.
        # NOTE(review): the weight is negative while update_num < soft_update;
        # confirm that this is intended.
        kl_weight = min(
            self.args.sparsity_weight,
            (update_num - self.args.soft_update)
            * self.args.sparsity_weight
            / self.args.anneal_updates,
        )
        return kl_loss * (kl_weight * sample_size)


class LatentLayersSparsityLoss(_Loss):
    """Auxiliary loss on layer utilization across languages.

    Combines a sharing term (entropy-style sum over per-layer utilization)
    and an L2 penalty between the expected number of selected layers and
    ``args.target_layers``. ``args`` must also provide ``share_weight``,
    ``soft_update`` and ``anneal_updates``.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args

    def is_valid(self, update_num):
        """Whether the loss applies at ``update_num``.

        True only when a positive ``target_layers`` is configured and the
        soft-update plus annealing phase is over.
        """
        if self.args.target_layers <= 0:
            return False
        return update_num > (self.args.soft_update + self.args.anneal_updates)

    def forward(self, layer_samples_list, update_num, sample_size):
        """Compute the sparsity/sharing loss over the samples of all languages.

        Args:
            layer_samples_list (list): one sample tensor per language, each
                with one value per layer.
            update_num (int): current update number.
            sample_size: scaling factor applied to every term.

        Returns:
            ``0`` when no term is active at ``update_num``, otherwise a scalar
            Tensor.
        """
        warmup_end = self.args.soft_update + self.args.anneal_updates
        active = self.args.target_layers > 0 or self.args.share_weight > 0
        if not active or update_num <= warmup_end:
            # Nothing to compute; returning before stacking also keeps
            # not-yet-sampled (None) entries from raising in torch.stack.
            return 0

        layer_samples = torch.stack(layer_samples_list, dim=0)

        # anneal sparsity weight (update_num > warmup_end is guaranteed here,
        # so the former "weight_anneal = 0" branch could never be taken)
        if update_num < (2 * self.args.anneal_updates + self.args.soft_update):
            weight_anneal = (
                (update_num - self.args.soft_update - self.args.anneal_updates)
                * self.args.share_weight
                / self.args.anneal_updates
            )
        else:
            weight_anneal = 1

        # compute ratio among languages
        layer_utilization = layer_samples.sum(dim=0) / layer_samples.size()[0]

        batch_loss = 0
        if self.args.share_weight > 0:
            # encouraging sharing across languages; math.log() yields a plain
            # float, so the log factor is a constant for autograd
            share_loss = sum(
                -1.0 * v * math.log(v) for v in layer_utilization if v > 0
            )
            batch_loss += (
                weight_anneal * self.args.share_weight * sample_size * share_loss
            )
        if self.args.target_layers > 0:
            # compute expected number of layers selected
            expected_layers = layer_utilization.sum()
            # compute l2 loss wrt target number of layers
            global_sparsity_loss = (expected_layers - self.args.target_layers) ** 2
            # NOTE(review): this term is scaled by share_weight (not
            # sparsity_weight), exactly as before -- confirm that is intended.
            batch_loss += (
                weight_anneal
                * self.args.share_weight
                * sample_size
                * global_sparsity_loss
            )
        return batch_loss
@register_model("latent_multilingual_transformer")
class LatentMultilingualTransformerModel(MultilingualTransformerModel):
    """Multilingual Transformer whose encoder and/or decoder can use latent
    depth, following "Deep Transformer with Latent Depth"
    (https://arxiv.org/abs/2009.13102).
    """

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        MultilingualTransformerModel.add_args(parser)
        parser.add_argument(
            '--soft-select',
            action='store_true',
            help='use soft samples in training an inference',
        )
        parser.add_argument(
            '--sampling-tau',
            type=float,
            default=5.,
            help='sampling temperature',
        )

    @classmethod
    def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs):
        """Instantiate the encoder or decoder for one language.

        The latent-depth variant is used when the matching
        ``*_latent_layer`` flag is set on ``args``; it gets one set of
        selection logits per entry of ``langs``. Otherwise the plain
        Transformer module is built.
        """
        if is_encoder:
            flag = "encoder_latent_layer"
            latent_cls, plain_cls = LatentTransformerEncoder, TransformerEncoder
        else:
            flag = "decoder_latent_layer"
            latent_cls, plain_cls = LatentTransformerDecoder, TransformerDecoder

        if safe_hasattr(args, flag) and getattr(args, flag):
            return latent_cls(args, lang_dict, embed_tokens, num_logits=len(langs))
        return plain_cls(args, lang_dict, embed_tokens)


@register_model_architecture(
    "latent_multilingual_transformer", "latent_multilingual_transformer"
)
def latent_multilingual_architecture(args):
    """Fill in the defaults of the latent multilingual architecture.

    Values already present on ``args`` are kept; afterwards the remaining
    Transformer defaults are applied via ``base_architecture``.
    """
    defaults = (
        ("encoder_embed_dim", 512),
        ("encoder_ffn_embed_dim", 1024),
        ("encoder_attention_heads", 4),
        ("encoder_layers", 12),
        ("decoder_embed_dim", 512),
        ("decoder_ffn_embed_dim", 1024),
        ("decoder_attention_heads", 4),
        ("decoder_layers", 24),
        ("share_encoders", True),
        ("share_decoders", True),
        ("share_encoder_embeddings", True),
        ("share_decoder_embeddings", True),
    )
    for name, default in defaults:
        setattr(args, name, getattr(args, name, default))

    base_architecture(args)
class LatentTransformerEncoder(TransformerEncoder):
    """Latent depth (https://arxiv.org/abs/2009.13102) implemented in
    TransformerEncoder.

    Args:
        args: parsed arguments (``encoder_layers`` and, optionally,
            ``soft_select`` / ``sampling_tau`` are read here).
        dictionary, embed_tokens: forwarded to ``TransformerEncoder``.
        num_logits (int): number of independent sets of selection logits.
    """

    def __init__(self, args, dictionary, embed_tokens, num_logits=1):
        self.num_logits = num_logits
        self.num_layers = args.encoder_layers
        super().__init__(args, dictionary, embed_tokens)
        self.layer_select = LayerSelect(
            num_layers=self.num_layers,
            num_logits=self.num_logits,
            soft_select=getattr(args, "soft_select", False),
            sampling_tau=getattr(args, "sampling_tau", 5.),
        )
        self.lang_idx = None
        # Replace the parent's layers with latent layers that share the
        # LayerSelect module created above.
        self.layers = nn.ModuleList(
            [self._build_encoder_layer(args, idx) for idx in range(args.encoder_layers)]
        )

    def set_lang_idx(self, lang_idx):
        """Choose which set of selection logits the next forward pass samples from."""
        self.lang_idx = lang_idx

    def _build_encoder_layer(self, args, idx=None):
        return LatentTransformerEncoderLayer(args, idx, layer_select=self.layer_select)

    def forward(self, src_tokens, src_lengths, return_all_hiddens: bool = False):
        """Draw fresh layer-selection samples, then run the parent encoder."""
        self.layer_select.sample(self.lang_idx)
        return super().forward(src_tokens, src_lengths, return_all_hiddens)


class LatentTransformerEncoderLayer(TransformerEncoderLayer):
    """Encoder layer with each (non_residual) block weighted by samples of Bernoulli
    or Gumbel Sigmoid samples.

    Args:
        args (argparse.Namespace): parsed command-line arguments from standard
            TransformerEncoderLayer.
        idx (int): layer index (used to retrieve samples).
        layer_select (LayerSelect, optional): instance of LayerSelect module with logits
            parameters and sampling method.
    """

    def __init__(self, args, idx, layer_select=None):
        super().__init__(args)
        self.idx = idx
        self.layer_select = layer_select

    def residual_connection(self, x, residual):
        # the non-residual branch is scaled by this layer's selection sample
        return residual + x * self.layer_select(self.idx)


class LatentTransformerDecoder(TransformerDecoder):
    """Latent depth (https://arxiv.org/abs/2009.13102) implemented in
    TransformerDecoder.

    Args:
        args: parsed arguments (``decoder_layers`` and, optionally,
            ``soft_select`` / ``sampling_tau`` are read here).
        dictionary, embed_tokens, no_encoder_attn: forwarded to
            ``TransformerDecoder``.
        num_logits (int): number of independent sets of selection logits.
    """

    def __init__(
        self, args, dictionary, embed_tokens, no_encoder_attn=False, num_logits=1
    ):
        self.num_logits = num_logits
        self.num_layers = args.decoder_layers
        super().__init__(
            args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn
        )
        self.layer_select = LayerSelect(
            num_layers=self.num_layers,
            num_logits=self.num_logits,
            soft_select=getattr(args, "soft_select", False),
            sampling_tau=getattr(args, "sampling_tau", 5.),
        )
        self.lang_idx = None
        # Replace the parent's layers with latent layers that share the
        # LayerSelect module created above.
        self.layers = nn.ModuleList(
            [
                self._build_decoder_layer(args, no_encoder_attn, idx)
                for idx in range(args.decoder_layers)
            ]
        )

    def set_lang_idx(self, lang_idx):
        """Choose which set of selection logits the next forward pass samples from."""
        self.lang_idx = lang_idx

    def _build_decoder_layer(self, args, no_encoder_attn=False, idx=None):
        return LatentTransformerDecoderLayer(
            args, idx, layer_select=self.layer_select, no_encoder_attn=no_encoder_attn
        )

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[EncoderOut] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
    ):
        """Draw fresh layer-selection samples, then run the parent decoder.

        All arguments are handed to ``TransformerDecoder.forward`` unchanged.
        """
        self.layer_select.sample(self.lang_idx)
        return super().forward(
            prev_output_tokens=prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            features_only=features_only,
            alignment_layer=alignment_layer,
            # previously accepted but silently dropped
            alignment_heads=alignment_heads,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens,
        )


class LatentTransformerDecoderLayer(TransformerDecoderLayer):
    """Decoder layer with each (non_residual) block weighted by samples of Bernoulli
    or Gumbel Sigmoid samples.

    Args:
        args (argparse.Namespace): parsed command-line arguments from standard
            TransformerDecoderLayer.
        idx (int): layer index (used to retrieve samples).
        layer_select (LayerSelect, optional): instance of LayerSelect module with logits
            parameters and sampling method.
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).

    """

    def __init__(
        self,
        args,
        idx,
        layer_select=None,
        no_encoder_attn=False,
        add_bias_kv=False,
        add_zero_attn=False,
    ):
        super().__init__(args, no_encoder_attn, add_bias_kv, add_zero_attn)
        self.idx = idx
        self.layer_select = layer_select

    def residual_connection(self, x, residual):
        # the non-residual branch is scaled by this layer's selection sample
        return residual + x * self.layer_select(self.idx)
class LayerSelect(nn.Module):
    """Compute samples (from a Gumbel-Sigmoid distribution) which is used as
    either (soft) weighting or (hard) selection of residual connection.
    https://arxiv.org/abs/2009.13102

    Args:
        num_layers (int): number of layers a sample is drawn for.
        num_logits (int): number of independent sets of logits.
        soft_select (bool, optional): return soft samples instead of hard
            0/1 selections (default: False).
        sampling_tau (float, optional): sampling temperature (default: 5.0).
    """

    def __init__(self, num_layers, num_logits, soft_select=False, sampling_tau=5.):
        super().__init__()
        # Zero logits instead of ``torch.Tensor(...)``: the latter allocates
        # uninitialized memory, so training could start from arbitrary
        # (possibly NaN/inf) values. Zero makes both outcomes equally likely.
        self.layer_logits = torch.nn.Parameter(
            torch.zeros(num_logits, num_layers),
            requires_grad=True,
        )
        self.hard_select = not soft_select
        self.tau = sampling_tau
        self.detach_grad = False
        # most recent samples (set by ``sample``), read by ``forward``
        self.samples = None
        # last samples drawn for every logit index
        self.layer_samples = [None] * num_logits

    def sample(self, logit_idx):
        """To leverage the efficiency of distributed training, samples for all
        layers are computed at once for each logit_idx. Logits are parameters
        learnt independent of each other.

        Args:
            logit_idx: The index of logit parameters used for sampling.
        """
        assert logit_idx is not None
        logits = self.layer_logits[logit_idx, :]
        if self.detach_grad:
            logits = logits.detach()
        self.samples = self._gumbel_sigmoid(
            logits,
            dim=-1,
            tau=self.tau,
            hard=self.hard_select,
        )
        self.layer_samples[logit_idx] = self.samples

    def forward(self, i):
        """Return the most recently drawn sample for layer ``i``.

        Raises:
            RuntimeError: if ``sample`` has not been called yet.
        """
        if self.samples is None:
            raise RuntimeError(
                "LayerSelect.sample() must be called before LayerSelect.forward()"
            )
        return self.samples[i]

    def _gumbel_sigmoid(
        self, logits, tau=1, hard=False, eps=1e-10, dim=-1, threshold=0.5
    ):
        """Draw Gumbel-Sigmoid samples for ``logits``.

        ``eps`` and ``dim`` are accepted for interface compatibility but are
        not used.

        Returns:
            Tensor shaped like ``logits``: soft samples, or, when ``hard`` is
            set, 0/1 values (thresholded at ``threshold``) that keep the
            gradient of the soft samples.
        """

        def gumbel_noise():
            # ~Gumbel(0,1)
            return (
                -torch.empty_like(logits, memory_format=torch.legacy_contiguous_format)
                .exponential_()
                .log()
            )

        noise_a = gumbel_noise()
        noise_b = gumbel_noise()
        # Difference of two gumbels because we apply a sigmoid
        y_soft = ((logits + noise_a - noise_b) / tau).sigmoid()
        if not hard:
            # Reparametrization trick.
            return y_soft

        # Straight through: the forward value is the hard selection, the
        # gradient is taken from the soft sample.
        y_hard = torch.zeros_like(
            logits, memory_format=torch.legacy_contiguous_format
        ).masked_fill(y_soft > threshold, 1.0)
        return y_hard - y_soft.detach() + y_soft
@register_task("multilingual_translation_latent_depth")
class MultilingualTranslationTaskLatentDepth(MultilingualTranslationTask):
    """A task for multiple translation with latent depth.

    See "Deep Transformer with Latent Depth" (Li et al., 2020),
    https://arxiv.org/abs/2009.13102.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        # fmt: off
        MultilingualTranslationTask.add_args(parser)
        parser.add_argument('--encoder-latent-layer', action='store_true', help='latent layer selection in encoder')
        parser.add_argument('--decoder-latent-layer', action='store_true', help='latent layer selection in decoder')
        parser.add_argument('--target-layers', default=-1, type=int,
                            help='number of effective layers to learn; -1 means no constraint')
        parser.add_argument('--sparsity-weight', default=0.0, type=float,
                            help='weight for sparsity loss')
        parser.add_argument('--share-weight', default=0.0, type=float,
                            help='weight for sharing loss')
        parser.add_argument('--soft-update', default=1, type=int,
                            help='number of updates with soft sampling')
        parser.add_argument('--anneal-updates', default=1, type=int,
                            help='number of updates to anneal the KL loss weight')
        parser.add_argument('--prior', default="uniform", type=str,
                            help='prior used for computing KL loss')
        # fmt: on

    def __init__(self, args, dicts, training):
        super().__init__(args, dicts, training)
        # "src-tgt" pairs split into two parallel tuples, in lang_pairs order
        self.src_langs, self.tgt_langs = zip(
            *[(lang.split("-")[0], lang.split("-")[1]) for lang in args.lang_pairs]
        )
        # latent layers require the corresponding module to be shared
        if self.training and self.encoder_latent_layer:
            assert self.args.share_encoders
        if self.training and self.decoder_latent_layer:
            assert self.args.share_decoders
        if training or self.encoder_latent_layer or self.decoder_latent_layer:
            self.lang_pairs = args.lang_pairs
        else:
            self.lang_pairs = ["{}-{}".format(args.source_lang, args.target_lang)]
        self.eval_lang_pairs = self.lang_pairs
        self.model_lang_pairs = self.lang_pairs
        # the auxiliary losses only exist when training with latent layers;
        # other methods test for them with hasattr(self, "sparsity_loss")
        if self.training and (self.encoder_latent_layer or self.decoder_latent_layer):
            self.kl_loss = LatentLayersKLLoss(self.args)
            self.sparsity_loss = LatentLayersSparsityLoss(self.args)

    def _per_lang_pair_train_loss(
        self, lang_pair, model, update_num, criterion, sample, optimizer, ignore_grad
    ):
        """Compute and back-propagate the training loss of one language pair.

        Selects the language index on the latent encoder/decoder, switches
        their layer selection to hard sampling once
        ``update_num > args.soft_update``, adds the KL loss to the criterion
        loss and runs the backward pass.

        Returns:
            tuple: ``(loss, sample_size, logging_output)``.
        """
        src, tgt = lang_pair.split("-")
        if self.encoder_latent_layer:
            src_lang_idx = self.src_lang_idx_dict[src]
            model.models[lang_pair].encoder.set_lang_idx(src_lang_idx)
            model.models[lang_pair].encoder.layer_select.hard_select = (
                update_num > self.args.soft_update
            )
        if self.decoder_latent_layer:
            tgt_lang_idx = self.tgt_lang_idx_dict[tgt]
            model.models[lang_pair].decoder.set_lang_idx(tgt_lang_idx)
            model.models[lang_pair].decoder.layer_select.hard_select = (
                update_num > self.args.soft_update
            )

        loss, sample_size, logging_output = criterion(
            model.models[lang_pair], sample[lang_pair]
        )
        if self.encoder_latent_layer:
            # count the languages that have not been sampled yet
            none_samples = sum(
                1 if x is None else 0
                for x in model.models[lang_pair].encoder.layer_select.layer_samples
            )
            # the aggregated-posterior prior needs samples of every language
            if none_samples == 0 or self.args.prior != "agged_posterior":
                loss += self.kl_loss(
                    model.models[lang_pair].encoder.layer_select.layer_samples,
                    src_lang_idx,
                    update_num,
                    sample_size,
                )
        if self.decoder_latent_layer:
            # count the languages that have not been sampled yet
            none_samples = sum(
                1 if x is None else 0
                for x in model.models[lang_pair].decoder.layer_select.layer_samples
            )
            # the aggregated-posterior prior needs samples of every language
            if none_samples == 0 or self.args.prior != "agged_posterior":
                loss += self.kl_loss(
                    model.models[lang_pair].decoder.layer_select.layer_samples,
                    tgt_lang_idx,
                    update_num,
                    sample_size,
                )
        if ignore_grad:
            loss *= 0

        if hasattr(self, "sparsity_loss") and self.sparsity_loss.is_valid(update_num):
            # need to retain the graph if sparsity loss needs to be added
            # NOTE(review): this path calls loss.backward() directly instead of
            # optimizer.backward(); confirm that is fine for wrapped optimizers.
            loss.backward(retain_graph=True)
        else:
            optimizer.backward(loss)

        return loss, sample_size, logging_output

    def train_step(
        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
    ):
        """Run the parent train step, then back-propagate the sparsity loss.

        Returns:
            tuple: the aggregated ``(loss, sample_size, logging_output)`` of
            the parent; the sparsity loss is back-propagated but not added to
            the returned loss.
        """
        agg_loss, agg_sample_size, agg_logging_output = super().train_step(
            sample, model, criterion, optimizer, update_num, ignore_grad
        )
        # compute auxiliary loss from layer sparsity, based on all samples from all languages
        if hasattr(self, "sparsity_loss") and self.sparsity_loss.is_valid(update_num):
            sparsity_loss = 0
            if self.encoder_latent_layer:
                # the samples are read from the first model in model.models
                # (presumably the module is shared across models, given the
                # share_encoders assertion in __init__ -- verify)
                sparsity_loss += self.sparsity_loss(
                    next(
                        iter(model.models.values())
                    ).encoder.layer_select.layer_samples,
                    update_num,
                    agg_sample_size,
                )
            if self.decoder_latent_layer:
                sparsity_loss += self.sparsity_loss(
                    next(
                        iter(model.models.values())
                    ).decoder.layer_select.layer_samples,
                    update_num,
                    agg_sample_size,
                )
            if sparsity_loss > 0:
                optimizer.backward(sparsity_loss)
        return agg_loss, agg_sample_size, agg_logging_output

    def _per_lang_pair_valid_loss(self, lang_pair, model, criterion, sample):
        """Compute the validation loss of one language pair.

        Only selects the language index on the latent encoder/decoder before
        calling the criterion; no auxiliary loss is added.
        """
        src, tgt = lang_pair.split("-")
        if self.encoder_latent_layer:
            src_lang_idx = self.src_lang_idx_dict[src]
            model.models[lang_pair].encoder.set_lang_idx(src_lang_idx)
        if self.decoder_latent_layer:
            tgt_lang_idx = self.tgt_lang_idx_dict[tgt]
            model.models[lang_pair].decoder.set_lang_idx(tgt_lang_idx)
        loss, sample_size, logging_output = criterion(
            model.models[lang_pair], sample[lang_pair]
        )
        return loss, sample_size, logging_output

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Select the language indices from ``args.source_lang`` /
        ``args.target_lang`` on every model, then run the parent inference step.
        """
        if self.encoder_latent_layer or self.decoder_latent_layer:
            for model in models:
                if self.encoder_latent_layer:
                    assert model.encoder.layer_select is not None
                    src_lang_idx = self.src_lang_idx_dict[self.args.source_lang]
                    model.encoder.set_lang_idx(src_lang_idx)
                if self.decoder_latent_layer:
                    assert model.decoder.layer_select is not None
                    tgt_lang_idx = self.tgt_lang_idx_dict[self.args.target_lang]
                    model.decoder.set_lang_idx(tgt_lang_idx)
        return super().inference_step(
            generator, models, sample, prefix_tokens, constraints
        )

    @property
    def encoder_latent_layer(self):
        """Truthy when ``args.encoder_latent_layer`` exists and is set."""
        return (
            safe_hasattr(self.args, "encoder_latent_layer")
            and self.args.encoder_latent_layer
        )

    @property
    def decoder_latent_layer(self):
        """Truthy when ``args.decoder_latent_layer`` exists and is set."""
        return (
            safe_hasattr(self.args, "decoder_latent_layer")
            and self.args.decoder_latent_layer
        )

    @property
    def src_lang_idx_dict(self):
        """Map each source language to its position in ``self.src_langs``.

        Rebuilt on every access. A language that occurs more than once maps
        to its last position.
        """
        return {lang: lang_idx for lang_idx, lang in enumerate(self.src_langs)}

    @property
    def tgt_lang_idx_dict(self):
        """Map each target language to its position in ``self.tgt_langs``.

        Rebuilt on every access. A language that occurs more than once maps
        to its last position.
        """
        return {lang: lang_idx for lang_idx, lang in enumerate(self.tgt_langs)}
================================================ # Reducing Transformer Depth on Demand with Structured Dropout (Fan et al., 2019) This page contains information on how to train models with LayerDrop, based on this [paper](https://arxiv.org/abs/1909.11556). ## Citation: If you found this technique useful, please cite our paper: ```bibtex @article{fan2019reducing, title={Reducing Transformer Depth on Demand with Structured Dropout}, author={Fan, Angela and Grave, Edouard and Joulin, Armand}, journal={arXiv preprint arXiv:1909.11556}, year={2019} } ``` ## Pre-trained models Model | Description | Download ---|---|--- `layerdrop_wmt_en_de_12_6` | Transformer + LayerDrop 0.2 trained on WMT16 en-de with 12 encoder and 6 decoder layers | [layerdrop_wmt_en_de_12_6.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/layerdrop_wmt_en_de_12_6.tar.gz) `roberta_layerdrop.base` | RoBERTa Base + LayerDrop 0.2 | [roberta_layerdrop.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.base.qnli.tar.gz) `roberta_layerdrop.large` | RoBERTa Large + LayerDrop 0.2 | [roberta_layerdrop.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.tar.gz) `roberta_layerdrop.large.mnli` | `roberta_layerdrop.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | [roberta_layerdrop.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.mnli.tar.gz) `roberta_layerdrop.large.qnli` | `roberta_layerdrop.large` finetuned on [QNLI](https://arxiv.org/abs/1804.07461) | [roberta_layerdrop.large.qnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta_layerdrop.large.qnli.tar.gz) Evaluate performance of these pre-trained models: ```bash # Example for Machine Translation fairseq-generate /path/to/bped/wmt/data --path nmt_checkpoint.pt \ --beam 8 --lenpen 0.4 \ --batch-size 64 \ --remove-bpe \ --gen-subset test > wmt16_gen.txt bash scripts/compound_split_bleu.sh wmt16_gen.txt # prints BLEU4 =
30.17 ``` ```python # Example for RoBERTa + LayerDrop finetuned on MNLI: from fairseq.models.roberta import RobertaModel roberta_layerdrop = RobertaModel.from_pretrained( '/path/to/MNLI/model', checkpoint_file='mnli_checkpoint.pt', data_name_or_path='/path/to/MNLI/data/MNLI-bin' ) label_map = {0: 'contradiction', 2: 'neutral', 1: 'entailment'} ncorrect, nsamples = 0, 0 roberta_layerdrop.cuda() roberta_layerdrop.eval() with open('/path/to/MNLI/data/dev_matched.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[8], tokens[9], tokens[-1] tokens = roberta_layerdrop.encode(sent1, sent2) prediction = roberta_layerdrop.predict('sentence_classification_head', tokens).argmax().item() prediction_label = label_map[prediction] ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) # prints | Accuracy: 0.9026999490575649 # Example for RoBERTa + LayerDrop finetuned on QNLI: roberta = RobertaModel.from_pretrained( '/path/to/QNLI/model', checkpoint_file='qnli_checkpoint.pt', data_name_or_path='/path/to/QNLI/data/QNLI-bin' ) label_fn = lambda label: roberta.task.label_dictionary.string( [label + roberta.task.target_dictionary.nspecial] ) ncorrect, nsamples = 0, 0 roberta.cuda() roberta.eval() with open('/path/to/QNLI/data/dev.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[1], tokens[2], tokens[3] tokens = roberta.encode(sent1, sent2) prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() prediction_label = label_fn(prediction) ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) # prints | Accuracy: 0.9480139117700896 ``` ## Example usage To train a model with LayerDrop, add the following flags. We recommend 0.2, a value that worked well in our experiments. 
For Language Models that are decoder-only, you need only the decoder flag. For RoBERTa, an encoder, you need only the encoder flag. The encoder and decoder LayerDrop values can be set differently. ``` --encoder-layerdrop 0.2 --decoder-layerdrop 0.2 ``` To prune a model that has been trained with LayerDrop, add the following flags followed by a comma separated list of which layers you would like to keep. ``` --encoder-layers-to-keep 0,2,4,6,8,10,12,14 --decoder-layers-to-keep 0,2,4,6,8,10,12,14 ``` Setting these flags should print a message such as: ``` | Pruning model to specified layer configuration ``` You should also see a smaller number of parameters in the model, for example the 16-Layer Transformer Language Model prints: ``` num. model params: 246933504 ``` while a model pruned to 8 Layers prints: ``` num. model params: 146163712 ``` If you would like to pick up training with a model that has been pruned, simply adding these flags is sufficient. If you would like to use a script that only does evaluation (no training), you may need to pass an override command. A specific example would be for language modeling: ```bash fairseq-eval-lm /path/to/wikitext-103 \ --path /path/to/model/checkpoint.pt \ --model-overrides "{'decoder_layers_to_keep':'0,2,4,6,8,10,12,14'}" ``` This model override command overrides the training parameters and updates the model arguments so that the pruned model is run instead of the full model. ## Reproduce Paper Results Looking to reproduce the results in the paper? 1. For Translation on WMT16 en-de, we followed this setting [here](https://github.com/pytorch/fairseq/blob/main/examples/scaling_nmt/README.md) 2. To train RoBERTa, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/roberta) 3. To train Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model) ## Tips 1. 
If you would like to train large models with better performance, LayerDrop should be set to a smaller value such as 0.1 or 0.2. Too much LayerDrop will mean the model has too much regularization, so may not reach the best performance. Since LayerDrop adds regularization, you may achieve the best performance by slightly reducing the amount of standard dropout (for example, reduce by 0.1). 2. If you would like to train large models to be pruned and made smaller, LayerDrop should be set to a larger value such as 0.5 if you want to prune very aggressively (such as removing half the network or more). If you would like to prune fewer layers away, LayerDrop can be set to a smaller value such as 0.2. Our experiments were conducted with low values of LayerDrop (such as 0.1 and 0.2), for reference. 3. When pruning layers at inference time, it is best to spread out the layers remaining so they are evenly spaced throughout the network. For example, if you want to remove 50% of the network, keeping every other layer is good. ## FAQ 1. How did the sharing layers experiment work? In an appendix (https://openreview.net/pdf?id=SylO2yStDr) we added an experiment on Wikitext-103 language modeling that combined LayerDrop with Weight Sharing. We shared chunks of 2 layers such that every other layer had shared weights. For example, if our network has layers 1 through 6, then layer 1 and 2 are shared, layer 3 and 4 are shared, and layer 5 and 6 are shared. 2. LayerDrop hasn't been helping in my setting? During training time, LayerDrop can help regularize your network. This is most important if your network is already overfitting - if your network is underfitting, it is possible LayerDrop is adding too much regularization. We recommend using smaller values (such as 0.1 or 0.2) and also decreasing the quantity of standard dropout (for example, reduce by 0.1). 3. Can you train a model without LayerDrop and finetune with LayerDrop (e.g. for BERT)? 
In our experiments, we did not see great performance. Models such as RoBERTa have trained for a long time in the pre-training setting, so only finetuning with LayerDrop for a few epochs on a downstream task such as MNLI does not achieve the robustness required for successful pruning. ## Having an issue or have a question? Please open an issue in this repository with the details of your question. Thanks! ================================================ FILE: examples/linformer/README.md ================================================ # Linformer: Self-Attention with Linear Complexity (Wang et al., 2020) This example contains code to train Linformer models as described in our paper [Linformer: Self-Attention with Linear Complexity](https://arxiv.org/abs/2006.04768). ## Training a new Linformer RoBERTa model You can mostly follow the [RoBERTa pretraining README](/examples/roberta/README.pretraining.md), updating your training command with `--user-dir examples/linformer/linformer_src --arch linformer_roberta_base`. ## Citation If you use our work, please cite: ```bibtex @article{wang2020linformer, title={Linformer: Self-Attention with Linear Complexity}, author={Wang, Sinong and Li, Belinda and Khabsa, Madian and Fang, Han and Ma, Hao}, journal={arXiv preprint arXiv:2006.04768}, year={2020} } ``` ================================================ FILE: examples/linformer/linformer_src/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@register_model("linformer_roberta")
class LinformerModel(RobertaModel):
    """RoBERTa model variant registered under the name ``linformer_roberta``.

    Adds the Linformer compression options to the RoBERTa command-line
    arguments and builds its encoder from ``LinformerEncoder``.
    """

    @staticmethod
    def add_args(parser):
        """Register the RoBERTa options followed by the Linformer-specific ones."""
        RobertaModel.add_args(parser)

        # add args for Linformer; every option takes an integer value
        linformer_options = (
            ("--compressed", "compressed ratio of sequence length"),
            (
                "--shared-kv-compressed",
                "share compressed matrix between k and v, in each layer",
            ),
            (
                "--shared-layer-kv-compressed",
                "share compressed matrix between k and v and across all layers",
            ),
            ("--freeze-compress", "freeze the parameters in compressed layer"),
        )
        for flag, help_text in linformer_options:
            parser.add_argument(flag, type=int, help=help_text)

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure all arguments are present
        base_architecture(args)

        # Without an explicit limit, fall back to the per-sample token count.
        if not safe_hasattr(args, "max_positions"):
            args.max_positions = args.tokens_per_sample

        return cls(args, LinformerEncoder(args, task.source_dictionary))
"""Linformer encoder.""" def __init__(self, args, dictionary): super().__init__(args, dictionary) self.register_buffer("version", torch.tensor(2)) def build_encoder(self, args, dictionary, embed_tokens): encoder = LinformerTransformerEncoder(args, dictionary, embed_tokens) encoder.apply(init_bert_params) return encoder def upgrade_state_dict_named(self, state_dict, name): super().upgrade_state_dict_named(state_dict, name) prefix = name + "." if name != "" else "" # some old checkpoints had weight sharing implemented incorrectly # (note: this was correct in the original paper code) if utils.item(state_dict.get(f"{prefix}version", torch.tensor(1))) < 2: state_dict[f"{prefix}version"] = torch.tensor(1) # check if input embeddings and output embeddings were tied if not torch.allclose( state_dict[f"{prefix}sentence_encoder.embed_tokens.weight"], state_dict[f"{prefix}lm_head.weight"], ): # they weren't tied, re-init the LM head without weight sharing self.lm_head = self.build_lm_head( embed_dim=self.args.encoder_embed_dim, output_dim=len(self.dictionary), activation_fn=self.args.activation_fn, weight=None, # don't share weights ) @register_model_architecture("linformer_roberta", "linformer_roberta") def base_architecture(args): args.compressed = getattr(args, "compressed", 4) args.shared_kv_compressed = getattr(args, "shared_kv_compressed", 0) args.shared_layer_kv_compressed = getattr(args, "shared_layer_kv_compressed", 0) args.freeze_compress = getattr(args, "freeze_compress", 0) roberta_base_architecture(args) @register_model_architecture("linformer_roberta", "linformer_roberta_base") def linformer_roberta_base_architecture(args): base_architecture(args) @register_model_architecture("linformer_roberta", "linformer_roberta_large") def linformer_roberta_large_architecture(args): roberta_large_architecture(args) base_architecture(args) ================================================ FILE: examples/linformer/linformer_src/modules/__init__.py 
class LinformerTransformerEncoder(TransformerEncoder):
    """
    Bi-directional Linformer-based sentence encoder for BERT/XLM style
    pre-trained models.

    It differs from ``TransformerEncoder`` only in how layers are built: every
    layer is a ``LinformerTransformerEncoderLayer``, and when
    ``args.shared_layer_kv_compressed == 1`` a single compression
    ``nn.Linear`` is created once and handed to all layers.

    NOTE(review): input/output conventions (token matrix in, per-layer states
    and sentence representation out) are inherited from the base class and
    are not visible here -- confirm against ``TransformerEncoder``.
    """

    def __init__(self, args, dictionary, embed_tokens):
        # Must exist before the base constructor runs, since
        # build_encoder_layer reads it (presumably called from there).
        self.compress_layer = None
        super().__init__(args, dictionary, embed_tokens)

    def build_encoder_layer(self, args):
        """Return a new Linformer layer, creating the shared compression layer on first use."""
        sharing_requested = self.args.shared_layer_kv_compressed == 1
        if sharing_requested and self.compress_layer is None:
            self.compress_layer = self._build_shared_compress_layer()

        return LinformerTransformerEncoderLayer(args, self.compress_layer)

    def _build_shared_compress_layer(self):
        """Create the ``nn.Linear`` that compresses the sequence dimension for all layers."""
        shared = nn.Linear(
            self.args.max_positions,
            self.args.max_positions // self.args.compressed,
        )
        # initialize parameters for compressed layer
        nn.init.xavier_uniform_(shared.weight, gain=1 / math.sqrt(2))
        if self.args.freeze_compress == 1:
            # Only the weight is frozen; the bias is left untouched.
            shared.weight.requires_grad = False
        return shared
""" def __init__(self, args, shared_compress_layer): # wrap in a list so it's not automatically registered by PyTorch self.shared_compress_layer = [shared_compress_layer] super().__init__(args) self.register_buffer("version", torch.tensor(2)) def build_self_attention(self, embed_dim, args): return MultiheadLinearAttention( embed_dim, args.encoder_attention_heads, dropout=args.dropout, self_attention=True, q_noise=args.quant_noise_pq, qn_block_size=args.quant_noise_pq_block_size, compressed=args.compressed, max_seq_len=args.max_positions, shared_kv_compressed=args.shared_kv_compressed, shared_compress_layer=self.shared_compress_layer[0], freeze_compress=args.freeze_compress, ) def upgrade_state_dict_named(self, state_dict, name): super().upgrade_state_dict_named(state_dict, name) prefix = name + "." if name != "" else "" # some old checkpoints had weight sharing implemented incorrectly # (note: this was correct in the original paper code) if utils.item(state_dict.get(f"{prefix}version", torch.tensor(1))) < 2: state_dict[f"{prefix}version"] = torch.tensor(1) # check compression layer sharing if f"{prefix}shared_compress_layer.weight" in state_dict: # reinitialize block without sharing compression layer to match # old behavior self.shared_compress_layer = [ torch.nn.Linear( self.shared_compress_layer[0].weight.size(1), self.shared_compress_layer[0].weight.size(0), ) ] self.self_attn = self.build_self_attention(self.embed_dim, self.args) # delete shared_compress_layer, since it's already copied to # self_attn.compress_k.weight del state_dict[f"{prefix}shared_compress_layer.weight"] if f"{prefix}shared_compress_layer.bias" in state_dict: del state_dict[f"{prefix}shared_compress_layer.bias"] ================================================ FILE: examples/linformer/linformer_src/modules/multihead_linear_attention.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
@with_incremental_state
class MultiheadLinearAttention(nn.Module):
    """Multi-headed linformer attention.

    Projects the key and values down to the compressed dimension, before computing self-attention.

    See "Linformer: Self-Attention with Linear Complexity" for more details.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        add_bias_kv=False,
        add_zero_attn=False,
        self_attention=False,
        encoder_decoder_attention=False,
        q_noise=0.0,
        qn_block_size=8,
        compressed=1,
        max_seq_len=256,
        shared_kv_compressed=0,
        shared_compress_layer=None,
        freeze_compress=0,
    ):
        """Create the projections and the sequence-compression layers.

        Args:
            embed_dim: model dimension; must be divisible by ``num_heads``.
            num_heads: number of attention heads.
            kdim, vdim: key/value input dimensions (default: ``embed_dim``).
            dropout: dropout probability applied to the attention weights.
            bias: whether the q/k/v/out projections carry a bias.
            add_bias_kv: append a learned bias position to keys and values.
            add_zero_attn: append an all-zero position to keys and values.
            self_attention: keys and values are derived from ``query``.
            encoder_decoder_attention: keys and values come from ``key``.
            q_noise, qn_block_size: forwarded to ``quant_noise`` for every
                projection.
            compressed: compression ratio; a locally created compression
                layer maps ``max_seq_len`` to ``max_seq_len // compressed``.
            max_seq_len: input size of a locally created compression layer.
            shared_kv_compressed: 0 gives values their own ``compress_v``;
                1 makes values reuse ``compress_k`` in ``forward``.
            shared_compress_layer: optional layer reused as the compression
                layer (and as ``compress_v`` when ``shared_kv_compressed``
                is 0) instead of creating new ones.
            freeze_compress: 1 disables gradients for the compression weights.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim ** -0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert not self.self_attention or self.qkv_same_dim, (
            "Self-attention requires query, key and " "value to be of the same size"
        )

        self.k_proj = quant_noise(
            nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.v_proj = quant_noise(
            nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size
        )
        self.q_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        # used for compress sequence to subsequence
        if shared_compress_layer is None:
            self.compress_seq_len = max_seq_len // compressed
            self.compress_k = nn.Linear(max_seq_len, self.compress_seq_len, bias=False)
            if shared_kv_compressed == 0:
                self.compress_v = nn.Linear(
                    max_seq_len, self.compress_seq_len, bias=False
                )
            self.layerwise_sharing = False
        else:
            # Reuse the caller's layer; reset_parameters() will leave it alone.
            self.compress_k = shared_compress_layer
            if shared_kv_compressed == 0:
                self.compress_v = shared_compress_layer
            self.layerwise_sharing = True
        self.shared_kv_compressed = shared_kv_compressed

        self.out_proj = quant_noise(
            nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size
        )

        if add_bias_kv:
            self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim))
            self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim))
        else:
            self.bias_k = self.bias_v = None

        self.add_zero_attn = add_zero_attn

        self.reset_parameters()

        if freeze_compress == 1:
            self.compress_k.weight.requires_grad = False
            if shared_kv_compressed == 0:
                self.compress_v.weight.requires_grad = False

        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        """Switch the module to its ONNX-tracing code paths."""
        self.onnx_trace = True

    def reset_parameters(self):
        """(Re)initialize the projection, compression and bias parameters."""
        if self.qkv_same_dim:
            # Empirically observed the convergence to be much better with
            # the scaled initialization
            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
            if (
                not self.layerwise_sharing
            ):  # otherwise, we already initialize the parameters
                nn.init.xavier_uniform_(self.compress_k.weight, gain=1 / math.sqrt(2))
                if self.shared_kv_compressed == 0:
                    nn.init.xavier_uniform_(
                        self.compress_v.weight, gain=1 / math.sqrt(2)
                    )
        else:
            nn.init.xavier_uniform_(self.k_proj.weight)
            nn.init.xavier_uniform_(self.v_proj.weight)
            nn.init.xavier_uniform_(self.q_proj.weight)
            if (
                not self.layerwise_sharing
            ):  # otherwise, we already initialize the parameters
                nn.init.xavier_uniform_(self.compress_k.weight)
                if self.shared_kv_compressed == 0:
                    nn.init.xavier_uniform_(self.compress_v.weight)

        nn.init.xavier_uniform_(self.out_proj.weight)
        if self.out_proj.bias is not None:
            nn.init.constant_(self.out_proj.bias, 0.0)
        if self.bias_k is not None:
            nn.init.xavier_normal_(self.bias_k)
        if self.bias_v is not None:
            nn.init.xavier_normal_(self.bias_v)

    def forward(
        self,
        query,
        key: Optional[Tensor],
        value: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        need_weights: bool = True,
        static_kv: bool = False,
        attn_mask: Optional[Tensor] = None,
        before_softmax: bool = False,
        need_head_weights: bool = False,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s. NOTE: this method only
                carries the mask through the incremental-state bookkeeping;
                it is never applied to the attention scores here.
            need_weights (bool, optional): return the attention weights,
                averaged over heads (default: True).
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).
            before_softmax (bool, optional): return the raw attention
                weights and values before the attention softmax.
            need_head_weights (bool, optional): return the attention
                weights for each head. Implies *need_weights*. Default:
                return the average attention weights over all heads.

        Returns:
            ``(attn, attn_weights)`` where ``attn`` has shape
            ``(tgt_len, bsz, embed_dim)`` and ``attn_weights`` is ``None``
            unless weights were requested, in which case it has shape
            ``(bsz, tgt_len, src_len)`` or, with ``need_head_weights``,
            ``(num_heads, bsz, tgt_len, src_len)``. With ``before_softmax``
            the raw scores and the per-head values are returned instead.
        """
        if need_head_weights:
            need_weights = True

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            q = self.q_proj(query)

            # Compress along the time axis: only the first tgt_len input
            # columns of the compression weight are used.
            k_input = query.permute(1, 2, 0).contiguous()  # B * C * T
            k_input = (
                F.linear(k_input, self.compress_k.weight[:, 0:tgt_len])
                .permute(2, 0, 1)
                .contiguous()
            )
            k = self.k_proj(k_input)

            # NOTE(review): for shared_kv_compressed values other than 0 or 1
            # neither branch runs and v_input stays uncompressed in B * C * T.
            v_input = query.permute(1, 2, 0).contiguous()  # B * C * T
            if self.shared_kv_compressed == 0:
                v_input = (
                    F.linear(v_input, self.compress_v.weight[:, 0:tgt_len])
                    .permute(2, 0, 1)
                    .contiguous()
                )
            if self.shared_kv_compressed == 1:  # use shared kv compressed linear layer
                v_input = (
                    F.linear(v_input, self.compress_k.weight[:, 0:tgt_len])
                    .permute(2, 0, 1)
                    .contiguous()
                )
            v = self.v_proj(v_input)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                # Values are projected from `key`; `value` is not read here.
                k = self.k_proj(key)
                v = self.v_proj(key)

        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )
            if key_padding_mask is not None:
                key_padding_mask = torch.cat(
                    [
                        key_padding_mask,
                        key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
                    ],
                    dim=1,
                )

        # Fold the heads into the batch dimension: (bsz * num_heads, len, head_dim).
        q = (
            q.contiguous()
            .view(tgt_len, bsz * self.num_heads, self.head_dim)
            .transpose(0, 1)
        )
        if k is not None:
            k = (
                k.contiguous()
                .view(-1, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
            )
        if v is not None:
            v = (
                v.contiguous()
                .view(-1, bsz * self.num_heads, self.head_dim)
                .transpose(0, 1)
            )

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if "prev_key" in saved_state:
                _prev_key = saved_state["prev_key"]
                assert _prev_key is not None
                prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    assert k is not None
                    k = torch.cat([prev_key, k], dim=1)
            if "prev_value" in saved_state:
                _prev_value = saved_state["prev_value"]
                assert _prev_value is not None
                prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    assert v is not None
                    v = torch.cat([prev_value, v], dim=1)
            prev_key_padding_mask: Optional[Tensor] = None
            if "prev_key_padding_mask" in saved_state:
                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
            assert k is not None and v is not None
            key_padding_mask = MultiheadLinearAttention._append_prev_key_padding_mask(
                key_padding_mask=key_padding_mask,
                prev_key_padding_mask=prev_key_padding_mask,
                batch_size=bsz,
                src_len=k.size(1),
                static_kv=static_kv,
            )

            saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
            saved_state["prev_key_padding_mask"] = key_padding_mask
            # In this branch incremental_state is never None
            assert incremental_state is not None
            incremental_state = self._set_input_buffer(incremental_state, saved_state)
        assert k is not None
        src_len = k.size(1)

        if self.add_zero_attn:
            assert v is not None
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1
                )

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        attn_weights = MultiheadLinearAttention.apply_sparse_mask(
            attn_weights, tgt_len, src_len, bsz
        )

        assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            if self.onnx_trace:
                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
            attn_weights += attn_mask

        if before_softmax:
            return attn_weights, v

        attn_weights_float = utils.softmax(
            attn_weights, dim=-1, onnx_trace=self.onnx_trace
        )
        attn_weights = attn_weights_float.type_as(attn_weights)
        attn_probs = F.dropout(
            attn_weights,
            p=self.dropout,
            training=self.training,
        )
        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        if self.onnx_trace and attn.size(1) == 1:
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
        else:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)
        attn_weights: Optional[Tensor] = None
        if need_weights:
            attn_weights = attn_weights_float.view(
                bsz, self.num_heads, tgt_len, src_len
            ).transpose(1, 0)
            if not need_head_weights:
                # average attention weights over heads
                attn_weights = attn_weights.mean(dim=0)

        return attn, attn_weights

    @staticmethod
    def _append_prev_key_padding_mask(
        key_padding_mask: Optional[Tensor],
        prev_key_padding_mask: Optional[Tensor],
        batch_size: int,
        src_len: int,
        static_kv: bool,
    ) -> Optional[Tensor]:
        """Combine the cached and the current key padding masks.

        When only one of the two masks is present, the missing side is filled
        with zeros so the result has ``src_len`` columns. Returns ``None``
        when both masks are ``None``.
        """
        # saved key padding masks have shape (bsz, seq_len)
        if prev_key_padding_mask is not None and static_kv:
            new_key_padding_mask = prev_key_padding_mask
        elif prev_key_padding_mask is not None and key_padding_mask is not None:
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
            )
        # During incremental decoding, as the padding token enters and
        # leaves the frame, there will be a time when prev or current
        # is None
        elif prev_key_padding_mask is not None:
            filler = torch.zeros(
                (batch_size, src_len - prev_key_padding_mask.size(1)),
                device=prev_key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(), filler.float()], dim=1
            )
        elif key_padding_mask is not None:
            filler = torch.zeros(
                (batch_size, src_len - key_padding_mask.size(1)),
                device=key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat(
                [filler.float(), key_padding_mask.float()], dim=1
            )
        else:
            new_key_padding_mask = prev_key_padding_mask
        return new_key_padding_mask

    @torch.jit.export
    def reorder_incremental_state(
        self,
        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
        new_order: Tensor,
    ):
        """Reorder buffered internal state (for incremental generation)."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            for k in input_buffer.keys():
                input_buffer_k = input_buffer[k]
                if input_buffer_k is not None:
                    # For encoder-decoder attention, stop as soon as a buffer's
                    # first dimension already matches new_order's length.
                    if self.encoder_decoder_attention and input_buffer_k.size(
                        0
                    ) == new_order.size(0):
                        break
                    input_buffer[k] = input_buffer_k.index_select(0, new_order)
            incremental_state = self._set_input_buffer(incremental_state, input_buffer)
        return incremental_state

    def _get_input_buffer(
        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
    ) -> Dict[str, Optional[Tensor]]:
        """Return the cached ``"attn_state"`` buffer, or an empty dict if none is stored."""
        result = self.get_incremental_state(incremental_state, "attn_state")
        if result is not None:
            return result
        else:
            empty_result: Dict[str, Optional[Tensor]] = {}
            return empty_result

    def _set_input_buffer(
        self,
        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
        buffer: Dict[str, Optional[Tensor]],
    ):
        """Store ``buffer`` as the ``"attn_state"`` entry of ``incremental_state``."""
        return self.set_incremental_state(incremental_state, "attn_state", buffer)

    @staticmethod
    def apply_sparse_mask(attn_weights, tgt_len: int, src_len: int, bsz: int):
        """Return ``attn_weights`` unchanged.

        Declared static because it takes no ``self`` and ``forward`` calls it
        through the class; without the decorator, calling it on an instance
        would bind the instance to ``attn_weights``.
        """
        return attn_weights

    def upgrade_state_dict_named(self, state_dict, name):
        """Split legacy fused ``in_proj_*`` entries into q/k/v entries, in place.

        Args:
            state_dict: checkpoint tensors keyed by parameter name.
            name: key prefix of this module inside ``state_dict`` ("" for none).
        """
        prefix = name + "." if name != "" else ""
        items_to_add = {}
        keys_to_remove = []
        for k in state_dict.keys():
            if k.endswith(prefix + "in_proj_weight"):
                # in_proj_weight used to be q + k + v with same dimensions
                fused_weight = state_dict[k]
                dim = fused_weight.shape[0] // 3
                items_to_add[prefix + "q_proj.weight"] = fused_weight[:dim]
                items_to_add[prefix + "k_proj.weight"] = fused_weight[dim : 2 * dim]
                items_to_add[prefix + "v_proj.weight"] = fused_weight[2 * dim :]

                keys_to_remove.append(k)

                k_bias = prefix + "in_proj_bias"
                if k_bias in state_dict.keys():
                    # Size the split from the bias tensor itself.
                    fused_bias = state_dict[k_bias]
                    dim = fused_bias.shape[0] // 3
                    items_to_add[prefix + "q_proj.bias"] = fused_bias[:dim]
                    items_to_add[prefix + "k_proj.bias"] = fused_bias[dim : 2 * dim]
                    items_to_add[prefix + "v_proj.bias"] = fused_bias[2 * dim :]

                    keys_to_remove.append(prefix + "in_proj_bias")

        # Mutate only after iteration so the dict is not changed while looping.
        for k in keys_to_remove:
            del state_dict[k]

        for key, value in items_to_add.items():
            state_dict[key] = value
```bash # WMT - use sacrebleu, example here: sacrebleu -t wmt14 -l fr-en --echo src > wmt.test.fr-en.fr sacrebleu -t wmt14 -l fr-en --echo ref > wmt.test.fr-en.en # WAT wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip unzip wat2020.my-en.zip # FLORES # download from: https://github.com/facebookresearch/flores # TED - need to detokenize with Moses! # from: https://github.com/neulab/word-embeddings-for-nmt wget http://phontron.com/data/ted_talks.tar.gz # Autshumato # request to download: https://repo.sadilar.org/handle/20.500.12185/397 # Tatoeba Challenge # available here: https://github.com/Helsinki-NLP/Tatoeba-Challenge ``` 1. **Training Data** To produce the training data, we use a combination of [CCMatrix](https://arxiv.org/abs/1911.04944) and [CCAligned](https://arxiv.org/abs/1911.06154). Check out the instructions [here](https://github.com/facebookresearch/LASER/tree/master/tasks/CCMatrix) to download the raw data. 2. **Preprocess Data** After downloading raw data, you will need to postprocess the data, then apply SPM, then binarize. Note that it is very important you run the postprocessing script, because this removes any instance of the evaluation data in the mined training data. 
```bash # preprocess data # remove sentences with more than 50% punctuation python /path/to/fairseq/examples/m2m_100/process_data/remove_too_much_punc.py # deduplicate training data paste /path/to/datadir/train.$src /path/to/datadir/train.$tgt | awk '!x[$0]++' > /path/to/datadir/train.dedup echo "keeping $(wc -l /path/to/datadir/train.dedup) bitext out of $(wc -l /path/to/datadir/train.$src)" cut -f1 /path/to/datadir/train.dedup > /path/to/datadir/train.$src cut -f2 /path/to/datadir/train.dedup > /path/to/datadir/train.$tgt # remove all instances of evaluation data from the training data python /path/to/fairseq/examples/m2m_100/process_data/dedup_data.py # frequency cleaning wget https://dl.fbaipublicfiles.com/m2m_100/histograms.tar.gz tar -xvzf histograms.tar.gz python /path/to/fairseq/examples/m2m_100/process_data/clean_histogram.py --src $src --tgt $tgt --src-file /path/to/source/file --tgt-file /path/to/output/file --src-output-file source_output.$src --tgt-output-file target_output.$tgt --histograms /path/to/histograms # apply SPM wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model python /path/to/fairseq/scripts/spm_encode.py \ --model spm.128k.model \ --output_format=piece \ --inputs=/path/to/input/file/here \ --outputs=/path/to/output/file/here # length ratio cleaning perl mosesdecoder/scripts/training/clean-corpus-n.perl --ratio 3 /path/to/training/data/train.spm.$src-$tgt $src $tgt /path/to/output/directory/train.spm.$src-$tgt 1 250 # binarize data wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt fairseq-preprocess \ --source-lang $src --target-lang $tgt \ --testpref spm.$src.$tgt \ --thresholdsrc 0 --thresholdtgt 0 \ --destdir data_bin \ --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt ``` 3. **Training Scripts** To reproduce the training of our models, we train with fairseq-py's multilingual translation [task](https://github.com/pytorch/fairseq/tree/main/examples/multilingual). 
If you are interested in model parallel training, also check out [fairscale](https://github.com/facebookresearch/fairscale). 4. **Generation** To generate from our models, follow the commands in the generation section below. If you use any of the resources listed here, please cite: ```bibtex @article{fan2020beyond, title={Beyond English-Centric Multilingual Machine Translation}, author={Fan, Angela and Bhosale, Shruti and Schwenk, Holger and Ma, Zhiyi and El-Kishky, Ahmed and Goyal, Siddharth and Baines, Mandeep and Celebi, Onur and Wenzek, Guillaume and Chaudhary, Vishrav and Goyal, Naman and Birch, Tom and Liptchinsky, Vitaliy and Edunov, Sergey and Grave, Edouard and Auli, Michael and Joulin, Armand}, journal={arXiv preprint}, year={2020} } @article{schwenk2019ccmatrix, title={Ccmatrix: Mining billions of high-quality parallel sentences on the web}, author={Schwenk, Holger and Wenzek, Guillaume and Edunov, Sergey and Grave, Edouard and Joulin, Armand}, journal={arXiv preprint arXiv:1911.04944}, year={2019} } @article{el2019massive, title={A Massive Collection of Cross-Lingual Web-Document Pairs}, author={El-Kishky, Ahmed and Chaudhary, Vishrav and Guzman, Francisco and Koehn, Philipp}, journal={arXiv preprint arXiv:1911.06154}, year={2019} } ``` ## Trained Models ### 418M and 1.2B Model We include the last checkpoint for both of these models.
```bash wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs_small_models.txt # 418M parameter model wget https://dl.fbaipublicfiles.com/m2m_100/418M_last_checkpoint.pt # 1.2B parameter model wget https://dl.fbaipublicfiles.com/m2m_100/1.2B_last_checkpoint.pt # Generation: fairseq-generate $binarized_data_path --batch-size 32 --path $path_to_model --fixed-dictionary model_dict.128k.txt -s en -t fr --remove-bpe 'sentencepiece' --beam 5 --task translation_multi_simple_epoch --lang-pairs language_pairs_small_models.txt --decoder-langtok --encoder-langtok src --gen-subset test > gen_out ``` ### 12B Model 12B parameter model trained on many-to-many training data for 100 languages. We include the last checkpoint, average of last 5 checkpoints, average of last 10 checkpoints. There isn't a universally best choice out of these three, but all three versions are pretty close in accuracy. You can either sweep over the 3 checkpoints on a dev test and use the best performing checkpoint for final testing. Or the last checkpoint can be a good default choice. 
**Model Download Links** Configuration | 2 32GB GPUs | 4 16GB GPUs | 6 12GB GPUs | 8 8GB GPUs :--|:--|:--|:--|:-- Last Checkpoint | [12b_last_chk_2_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_2_gpus.pt) | [12b_last_chk_4_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_4_gpus.pt) | [12b_last_chk_6_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_6_gpus.pt) | [12b_last_chk_8_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_8_gpus.pt) Average of last 5 checkpoints | [12b_avg5_chk_2_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_2_gpus.pt) | [12b_avg5_chk_4_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_4_gpus.pt) | [12b_avg5_chk_6_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_6_gpus.pt) | [12b_avg5_chk_8_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg5_chk_8_gpus.pt) Average of last 10 checkpoints | [12b_avg10_chk_2_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_2_gpus.pt) | [12b_avg10_chk_4_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_4_gpus.pt) | [12b_avg10_chk_6_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_6_gpus.pt) | [12b_avg10_chk_8_gpus.pt](https://dl.fbaipublicfiles.com/m2m_100/12b_avg10_chk_8_gpus.pt) **Generation Arguments** Configuration | 2 32GB GPUs | 4 16GB GPUs | 6 12GB GPUs | 8 8GB GPUs :--|:--|:--|:--|:-- `--pipeline-encoder-balance` | `[26]` | `[1,15,10]` | `[1,9,9,7]` | `[1,6,6,6,7]` `--pipeline-encoder-devices` | `[0]` | `[0,1,0]` | `[0,1,2,0]` | `[0,4,5,1,0]` `--pipeline-decoder-balance` | `[3,22,1]` | `[3,11,11,1]` | `[3,7,7,8,1]` | `[1,6,6,6,6,1]` `--pipeline-decoder-devices` | `[0,1,0]` | `[0,2,3,0]` | `[0,3,4,5,0]` | `[0,2,6,7,3,0]` ## SentencePiece Model ```bash wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model ``` ## Generation with M2M-100 ### Encode using our SentencePiece Model Note: Install SentencePiece from [here](https://github.com/google/sentencepiece) ```bash 
fairseq=/path/to/fairseq cd $fairseq sacrebleu --echo src -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.de sacrebleu --echo ref -l de-fr -t wmt19 | head -n 20 > raw_input.de-fr.fr wget https://dl.fbaipublicfiles.com/m2m_100/spm.128k.model for lang in de fr ; do python scripts/spm_encode.py \ --model spm.128k.model \ --output_format=piece \ --inputs=raw_input.de-fr.${lang} \ --outputs=spm.de-fr.${lang} done ``` ### Binarization ```bash wget https://dl.fbaipublicfiles.com/m2m_100/data_dict.128k.txt fairseq-preprocess \ --source-lang de --target-lang fr \ --testpref spm.de-fr \ --thresholdsrc 0 --thresholdtgt 0 \ --destdir data_bin \ --srcdict data_dict.128k.txt --tgtdict data_dict.128k.txt ``` ### Generation for the 12B model Note that generation can currently be run using 2 32GB / 4 16GB / 6 12GB / 8 8GB GPUs, and the corresponding model checkpoints and pipeline arguments can be found in the [12B Model Section](#12b-model). Generation on CPUs will be added in the future. ```bash wget https://dl.fbaipublicfiles.com/m2m_100/model_dict.128k.txt wget https://dl.fbaipublicfiles.com/m2m_100/language_pairs.txt wget https://dl.fbaipublicfiles.com/m2m_100/12b_last_chk_4_gpus.pt fairseq-generate \ data_bin \ --batch-size 1 \ --path 12b_last_chk_4_gpus.pt \ --fixed-dictionary model_dict.128k.txt \ -s de -t fr \ --remove-bpe 'sentencepiece' \ --beam 5 \ --task translation_multi_simple_epoch \ --lang-pairs language_pairs.txt \ --decoder-langtok --encoder-langtok src \ --gen-subset test \ --fp16 \ --dataset-impl mmap \ --distributed-world-size 1 --distributed-no-spawn \ --pipeline-model-parallel \ --pipeline-chunks 1 \ --pipeline-encoder-balance '[1,15,10]' \ --pipeline-encoder-devices '[0,1,0]' \ --pipeline-decoder-balance '[3,11,11,1]' \ --pipeline-decoder-devices '[0,2,3,0]' > gen_out ``` ## Evaluation with M2M-100 ### Tokenization Note: Refer to tokenizers/README.md for more details on tokenization. 
# Installs the third-party tokenization tools under tokenizers/thirdparty,
# relative to the directory the script is started from. Every step is
# guarded by an existence test, so tools that are already present are
# skipped when the script is run again.

# Directory the script was launched from; restored with `cd $CWD` below.
CWD=`pwd`
INSTALL_PATH=$CWD/tokenizers/thirdparty

# Moses scripts.
MOSES=$INSTALL_PATH/mosesdecoder
if [ ! -d $MOSES ]; then
    echo 'Cloning Moses github repository (for tokenization scripts)...'
    git clone https://github.com/moses-smt/mosesdecoder.git $MOSES
    cd $MOSES
    # To deal with differences in handling ' vs "
    git checkout 03578921cc1a03402
    cd -
fi

# Romanian preprocessing scripts.
WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts
if [ ! -d $WMT16_SCRIPTS ]; then
    echo 'Cloning Romanian tokenization scripts'
    git clone https://github.com/rsennrich/wmt16-scripts.git $WMT16_SCRIPTS
fi

# KyTea, built from source and installed into its own checkout directory.
KYTEA=$INSTALL_PATH/kytea
if [ ! -f $KYTEA/bin/kytea ]; then
    git clone https://github.com/neubig/kytea.git $KYTEA
    cd $KYTEA
    autoreconf -i
    ./configure --prefix=`pwd`
    make
    make install
    # Leaves the shell in $INSTALL_PATH (the parent of $KYTEA), not in $CWD;
    # the steps below only use paths derived from $INSTALL_PATH.
    cd ..
fi

# MeCab plus its dictionary. MECAB is exported so that the `sh -c` child
# shell below can expand it inside the single-quoted command.
export MECAB=$INSTALL_PATH/mecab-0.996-ko-0.9.2
if [ ! -f $MECAB/bin/mecab ]; then
    cd $INSTALL_PATH
    curl -LO https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
    tar zxfv mecab-0.996-ko-0.9.2.tar.gz
    cd mecab-0.996-ko-0.9.2/
    ./configure --prefix=`pwd`
    make
    make install

    cd ..
    curl -LO https://bitbucket.org/eunjeon/mecab-ko-dic/downloads/mecab-ko-dic-2.1.1-20180720.tar.gz
    tar zxfv mecab-ko-dic-2.1.1-20180720.tar.gz
    cd mecab-ko-dic-2.1.1-20180720/
    ./autogen.sh
    ./configure --prefix=`pwd` --with-dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic --with-mecab-config=$MECAB/bin/mecab-config
    make
    # Write the dictionary location into mecabrc before installing.
    sh -c 'echo "dicdir=$MECAB/lib/mecab/dic/mecab-ko-dic" > $MECAB/etc/mecabrc'
    make install
    cd $CWD
fi

# Indic NLP resources.
INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources
if [ ! -d $INDIC_RESOURCES_PATH ]; then
    echo 'Cloning indic_nlp_resources'
    git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git $INDIC_RESOURCES_PATH
fi


# Burmese segmenter, extracted from the WAT 2020 my-en archive.
if [ ! -f $INSTALL_PATH/seg_my.py ]; then
    cd $INSTALL_PATH
    wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip
    unzip wat2020.my-en.zip
    # switch to python3
    # The sed chain prefixes lines starting with `sys.std` with `###`,
    # turns `### sys` back into `sys`, and replaces `unichr` with `chr`.
    cat wat2020.my-en/myseg.py |sed 's/^sys.std/###sys.std/g' | sed 's/### sys/sys/g' | sed 's/unichr/chr/g' > seg_my.py
    cd $CWD
fi


# Python packages used by the Thai, Chinese and Indic tokenizer scripts.
pip install pythainlp sacrebleu indic-nlp-library
DATADIR = "/path/to/train_data"
DEDUP_FROM_DIR = "/path/to/eval/data"
OUTPUT_DIR = "/path/to/output/data"


def main(args):
    """Deduplicate a slice of the language pairs found under ``DATADIR``.

    Sub-directories of ``DATADIR`` whose name contains an underscore are
    read as ``<src>_<tgt>`` pairs. The pairs are sorted and the slice
    ``[args.start_index, args.start_index + args.size)`` is processed.

    Args:
        args: namespace with integer ``start_index`` and ``size``.
    """
    languages = set()
    for language_directory in os.listdir(DATADIR):
        if "_" in language_directory:
            src, tgt = language_directory.split("_")
            languages.add(LanguagePair(src=src, tgt=tgt))

    data = existing_data()
    train_languages = sorted(languages)
    for language_pair in train_languages[args.start_index:args.start_index + args.size]:
        print(language_pair)
        dedup(language_pair, data)


LanguagePair = namedtuple("LanguagePair", ["src", "tgt"])


def existing_data():
    """Return the set of all lines of every file in ``DEDUP_FROM_DIR``.

    Lines keep their trailing newline so they compare equal to the lines
    read from the training files.
    """
    data = set()
    for file in os.listdir(DEDUP_FROM_DIR):
        # Read as UTF-8 explicitly: the corpora are multilingual and must
        # not depend on the locale's default encoding.
        with open(os.path.join(DEDUP_FROM_DIR, file), encoding="utf-8") as f:
            data.update(f)
    return data


def dedup(language_pair, data, verbose=True, output=True):
    """Write copies of one pair's training files without evaluation lines.

    A sentence pair is dropped when its source line or its target line
    occurs in ``data``. Nothing is written when both output files already
    exist, when ``output`` is false, or when a training file is missing.

    Args:
        language_pair: ``LanguagePair`` naming the source/target languages.
        data: collection of lines (including newlines) to remove.
        verbose: print progress messages.
        output: when false, only report status and return.

    Raises:
        ValueError: the two training files have different line counts.
    """
    pair_dir = f"{DATADIR}/{language_pair.src}_{language_pair.tgt}"
    train_filenames = LanguagePair(
        src=f"{pair_dir}/train.{language_pair.src}",
        tgt=f"{pair_dir}/train.{language_pair.tgt}",
    )

    out_prefix = f"{OUTPUT_DIR}/train.dedup.{language_pair.src}-{language_pair.tgt}"
    output_filenames = LanguagePair(
        src=f"{out_prefix}.{language_pair.src}",
        tgt=f"{out_prefix}.{language_pair.tgt}",
    )

    # If output exists, skip this pair. It has already been done.
    if os.path.exists(output_filenames.src) and os.path.exists(output_filenames.tgt):
        if verbose:
            print(f"{language_pair.src}-{language_pair.tgt} already done.")
        return

    if verbose:
        print(f"{language_pair.src}-{language_pair.tgt} ready, will check dups.")

    # If there is no output, no need to actually do the loop.
    if not output:
        return

    if not (os.path.exists(train_filenames.src) and os.path.exists(train_filenames.tgt)):
        return

    with open(train_filenames.src, encoding="utf-8") as f:
        train_source = f.readlines()
    with open(train_filenames.tgt, encoding="utf-8") as f:
        train_target = f.readlines()

    # Validate alignment *before* filtering: checking afterwards let a short
    # target file fail with an IndexError inside the loop instead.
    if len(train_source) != len(train_target):
        raise ValueError(
            f"{train_filenames.src} has {len(train_source)} lines but "
            f"{train_filenames.tgt} has {len(train_target)}"
        )

    kept = [
        (source_line, target_line)
        for source_line, target_line in zip(train_source, train_target)
        if source_line not in data and target_line not in data
    ]

    with open(output_filenames.src, "w", encoding="utf-8") as o:
        o.writelines(source_line for source_line, _ in kept)
    with open(output_filenames.tgt, "w", encoding="utf-8") as o:
        o.writelines(target_line for _, target_line in kept)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--start-index", required=True, type=int)
    parser.add_argument("-n", "--size", required=True, type=int)
    main(parser.parse_args())
def len_no_punc(s, punc):
    """Return how many characters of ``s`` occur in ``punc``.

    Despite its name, this counts the punctuation characters, which is the
    quantity ``filter_overpunc`` expects as its first argument.
    """
    return sum(1 for ch in s if ch in punc)


def filter_overpunc(len_npunc, len_sen):
    """Return True when fewer than half of ``len_sen`` characters are punctuation."""
    return len_npunc < 0.5 * len_sen


def main(args):
    """Split a gzipped TSV bitext into source/target files, dropping noisy rows.

    Every row of ``args.input`` is read; columns 1 and 2 (zero-based) are
    the source and target sentences. A row is kept only when both sides pass
    ``filter_overpunc``. Kept sentences are stripped and written to
    ``<bitext>.<src_lang>`` and ``<bitext>.<tgt_lang>``.

    Args:
        args: namespace with ``input``, ``encoding``, ``bitext``,
            ``src_lang`` and ``tgt_lang``.
    """
    punc = punctuation + "—|–"
    print('Processing file {}'.format(args.input))
    with gzip.open(args.input, 'rt', encoding=args.encoding) as tsv, \
            open(args.bitext + '.' + args.src_lang, 'wt', encoding=args.encoding) as fsrc, \
            open(args.bitext + '.' + args.tgt_lang, 'wt', encoding=args.encoding) as ftgt:
        # Iterate over the whole file: a single readline() call processed
        # only the first row and silently discarded the rest.
        for line in tsv:
            fields = line.split('\t')
            if len(fields) < 3:
                # Blank or malformed row: no source/target columns to read.
                continue

            src, tgt = fields[1], fields[2]

            nchar_npunc_src = len_no_punc(src, punc)
            nchar_npunc_tgt = len_no_punc(tgt, punc)

            if filter_overpunc(nchar_npunc_src, len(src)) and filter_overpunc(nchar_npunc_tgt, len(tgt)):
                fsrc.write(src.strip() + '\n')
                ftgt.write(tgt.strip() + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", required=True, type=str)
    parser.add_argument('--encoding', default='utf-8', help='character encoding for input/output')
    parser.add_argument('--bitext', type=str, required=True, help='language direction')
    parser.add_argument('--src-lang', type=str, required=True, help='Source language')
    parser.add_argument('--tgt-lang', type=str, required=True, help='Target language')
    main(parser.parse_args())
# set -e

# Reads text on stdin and writes it tokenized to stdout, choosing the
# pipeline from the language code given as the first argument.

TOKENIZERS_SCRIPTS=tokenizers
INSTALL_PATH=$TOKENIZERS_SCRIPTS/thirdparty

N_THREADS=8

lg=$1

# Moses scripts shared by most pipelines.
MOSES=$INSTALL_PATH/mosesdecoder
REPLACE_UNICODE_PUNCT=$MOSES/scripts/tokenizer/replace-unicode-punctuation.perl
NORM_PUNC=$MOSES/scripts/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$MOSES/scripts/tokenizer/remove-non-printing-char.perl
TOKENIZER=$MOSES/scripts/tokenizer/tokenizer.perl

# special tokenization for Romanian
WMT16_SCRIPTS=$INSTALL_PATH/wmt16-scripts

NORMALIZE_ROMANIAN=$WMT16_SCRIPTS/preprocess/normalise-romanian.py
REMOVE_DIACRITICS=$WMT16_SCRIPTS/preprocess/remove-diacritics.py

# Burmese
MY_SEGMENT=$INSTALL_PATH/seg_my.py

# Arabic
AR_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenizer_ar.sh

# Korean
KO_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ko.sh

# Japanese
JA_SEGMENT=$TOKENIZERS_SCRIPTS/seg_ja.sh

# Indic
IN_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_indic.py
INDIC_RESOURCES_PATH=$INSTALL_PATH/indic_nlp_resources

# Thai
THAI_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_thai.py

# Chinese
CHINESE_TOKENIZER=$TOKENIZERS_SCRIPTS/tokenize_zh.py

# One pipeline per language; anything not listed uses the Moses tokenizer.
case "$lg" in
  zh)  # Chinese
    cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | python $CHINESE_TOKENIZER
    ;;
  th)  # Thai
    cat - | python $THAI_TOKENIZER
    ;;
  ja)  # Japanese
    cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | ${JA_SEGMENT}
    ;;
  ko)  # Korean
    cat - | $REM_NON_PRINT_CHAR | ${KO_SEGMENT}
    ;;
  ro)  # Romanian
    cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $NORMALIZE_ROMANIAN | $REMOVE_DIACRITICS | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
    ;;
  my)  # Burmese
    cat - | python ${MY_SEGMENT}
    ;;
  ar)  # Arabic
    cat - | ${AR_TOKENIZER}
    ;;
  ne|si|hi)  # Indic
    cat - | python ${IN_TOKENIZER} $lg
    ;;
  *)  # other languages
    cat - | $REPLACE_UNICODE_PUNCT | $NORM_PUNC -l $lg | $REM_NON_PRINT_CHAR | $TOKENIZER -no-escape -threads $N_THREADS -l $lg
    ;;
esac
# Use: echo {text} | python tokenize_indic.py {language}
#
# Normalizes and tokenizes every stdin line for the language code given as
# the first command-line argument, printing one tokenized line per input.

language = sys.argv[1]

factory = IndicNormalizerFactory()
normalizer = factory.get_normalizer(
    language, remove_nuktas=False, nasals_mode="do_nothing"
)

for raw_line in sys.stdin:
    tokens = trivial_tokenize(normalizer.normalize(raw_line.strip()), language)
    print(" ".join(tokens))
# Tokenizes every input line (files named on the command line, or stdin)
# with sacrebleu's Chinese tokenizer and prints the result.
#
# Older sacrebleu releases expose ``tokenize_zh`` at package level; newer
# ones provide the ``TokenizerZh`` class instead. Prefer the original
# function when it exists so behavior on old installs is unchanged.
# NOTE(review): fallback import path taken from the sacrebleu package
# layout -- confirm against the installed version.
_tokenize_zh = getattr(sacrebleu, "tokenize_zh", None)
if _tokenize_zh is None:
    from sacrebleu.tokenizers.tokenizer_zh import TokenizerZh

    _tokenize_zh = TokenizerZh()

for line in fileinput.input():
    print(_tokenize_zh(line))
mBART is one of the first methods for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only on the encoder, decoder, or reconstructing parts of the text. ## Pre-trained models Model | Description | # params | Download ---|---|---|--- `mbart.CC25` | mBART model with 12 encoder and decoder layers trained on 25 languages' monolingual corpus | 610M | [mbart.CC25.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz) `mbart.ft.ro_en` | finetune mBART cc25 model on ro-en language pairs | 610M | [mbart.cc25.ft.enro.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.ft.enro.tar.gz) ## Results **[WMT16 EN-RO](https://www.statmt.org/wmt16/translation-task.html)** _(test set, no additional data used)_ Model | en-ro | ro-en ---|---|--- `Random` | 34.3 | 34.0 `mbart.cc25` | 37.7 | 37.8 `mbart.enro.bilingual` | 38.5 | 38.5 ## BPE data # download model wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.v2.tar.gz tar -xzvf mbart.cc25.v2.tar.gz # bpe data install SPM [here](https://github.com/google/sentencepiece) ```bash SPM=/path/to/sentencepiece/build/src/spm_encode MODEL=sentence.bpe.model ${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${SRC} > ${DATA}/${TRAIN}.spm.${SRC} & ${SPM} --model=${MODEL} < ${DATA}/${TRAIN}.${TGT} > ${DATA}/${TRAIN}.spm.${TGT} & ${SPM} --model=${MODEL} < ${DATA}/${VALID}.${SRC} > ${DATA}/${VALID}.spm.${SRC} & ${SPM} --model=${MODEL} < ${DATA}/${VALID}.${TGT} > ${DATA}/${VALID}.spm.${TGT} & ${SPM} --model=${MODEL} < ${DATA}/${TEST}.${SRC} > ${DATA}/${TEST}.spm.${SRC} & ${SPM} --model=${MODEL} < ${DATA}/${TEST}.${TGT} > ${DATA}/${TEST}.spm.${TGT} & ``` ## Preprocess data ```bash DICT=dict.txt fairseq-preprocess \ --source-lang ${SRC} \ --target-lang ${TGT} \ --trainpref ${DATA}/${TRAIN}.spm \ --validpref ${DATA}/${VALID}.spm \ --testpref ${DATA}/${TEST}.spm \ --destdir ${DEST}/${NAME} \ --thresholdtgt 0 \
--thresholdsrc 0 \ --srcdict ${DICT} \ --tgtdict ${DICT} \ --workers 70 ``` ## Finetune on EN-RO Finetune on mbart CC25 ```bash PRETRAIN=mbart.cc25 # fix if you moved the downloaded checkpoint langs=ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN fairseq-train path_2_data \ --encoder-normalize-before --decoder-normalize-before \ --arch mbart_large --layernorm-embedding \ --task translation_from_pretrained_bart \ --source-lang en_XX --target-lang ro_RO \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --lr-scheduler polynomial_decay --lr 3e-05 --warmup-updates 2500 --total-num-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ --seed 222 --log-format simple --log-interval 2 \ --restore-file $PRETRAIN \ --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler \ --langs $langs \ --ddp-backend legacy_ddp ``` ## Generate on EN-RO Get sacrebleu on finetuned en-ro model get tokenizer [here](https://github.com/rsennrich/wmt16-scripts) ```bash wget https://dl.fbaipublicfiles.com/fairseq/models/mbart/mbart.cc25.ft.enro.tar.gz tar -xzvf mbart.cc25.ft.enro.tar.gz ``` ```bash model_dir=MBART_finetuned_enro # fix if you moved the checkpoint fairseq-generate path_2_data \ --path $model_dir/model.pt \ --task translation_from_pretrained_bart \ --gen-subset test \ -t ro_RO -s en_XX \ --bpe 'sentencepiece' --sentencepiece-model $model_dir/sentence.bpe.model \ --sacrebleu --remove-bpe 'sentencepiece' \ --batch-size 32 --langs $langs > en_ro cat en_ro | grep -P "^H" |sort -V |cut -f 3- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > en_ro.hyp cat en_ro | grep -P "^T" |sort -V |cut -f 2- | sed 's/\[ro_RO\]//g' |$TOKENIZER ro > en_ro.ref 
sacrebleu -tok 'none' -s 'none' en_ro.ref < en_ro.hyp ``` ## Citation ```bibtex @article{liu2020multilingual, title={Multilingual Denoising Pre-training for Neural Machine Translation}, author={Yinhan Liu and Jiatao Gu and Naman Goyal and Xian Li and Sergey Edunov and Marjan Ghazvininejad and Mike Lewis and Luke Zettlemoyer}, year={2020}, eprint={2001.08210}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ================================================ FILE: examples/megatron_11b/README.md ================================================ # Megatron-11b Megatron-11b is a unidirectional language model with `11B` parameters based on [Megatron-LM](https://arxiv.org/pdf/1909.08053.pdf). Following the original Megatron work, we trained the model using intra-layer model parallelism with each layer's parameters split across 8 GPUs. Megatron-11b is trained on the same data and uses the same byte-pair encoding (BPE) as [RoBERTa](https://arxiv.org/pdf/1907.11692.pdf). ## Pre-trained models Model | Description | # params | # filesize | Download ---|---|---|---|--- `megatron_11b` | megatron_11b unidirectional language model | 11B | 19Gb | [megatron_11b.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/model_parallel/megatron_11b.tar.gz) #### Architecture: Param | Value ---|--- embed_dim | 3072 ffn_dim | 3072 * 6 layers | 72 attention heads | 32 #### Training details: Param | value ---|--- bsz | 512 num_updates | 300,000 peak_lr | 1.5e-04 lr scheduler | inverse_sqrt clip norm | 0.0 ## Example training command (model parallel) Megatron-11b contains too many parameters to train on a single GPU. Following the original Megatron work, we adopt an intra-layer model parallel training approach in which each layer's parameters are split across multiple GPUs and activations and gradients are communicated during the forward/backward pass, respectively. We similarly split the loss computation using the `vocab_parallel_cross_entropy` criterion. 
The following training command illustrates how to do model parallel training in fairseq. We assume that each machine (node) has 8 GPUs among which to split the model parameters (`--model-parallel-size 8`). If you have access to multiple nodes, you may combine this with data parallel training by increasing `--distributed-world-size`. To train Megatron-11b on a single node: ```bash fairseq-train \ --distributed-world-size 8 \ --memory-efficient-fp16 \ --num-workers 2 \ --model-parallel-size 8 \ --criterion vocab_parallel_cross_entropy \ --task language_modeling \ --sample-break-mode none \ --tokens-per-sample 1024 \ --arch transformer_lm_megatron_11b \ --share-decoder-input-output-embed \ --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-08 --clip-norm 0.0 \ --lr-scheduler inverse_sqrt --lr 0.00015 \ --warmup-updates 3000 --weight-decay 0.01 \ --dropout 0.1 --attention-dropout 0.1 \ --batch-size 2 \ --max-update 300000; ``` Note: Above was tested on `DGX-1` box, with `8xV100-32Gb` GPUs. ## Results **[Wikitext103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/)** Model | Valid perplexity | Test perplexity ---|---|--- `megatron_11b` | 10.64 | 10.54 ## Evaluating `megatron_11b` on Wikitext-103 #### 1. Downloading Megatron-11b ```bash # WARNING: this file is 19GB wget https://dl.fbaipublicfiles.com/fairseq/models/model_parallel/megatron_11b.tar.gz tar -xzvf megatron_11b.tar.gz ``` #### 2. Download Wikitext-103 ```bash wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip unzip wikitext-103-raw-v1.zip ``` #### 3. Detokenize test tokens Megatron-11b uses a byte-level BPE that expects raw (untokenized) input. Since the wikitext-103 dataset comes tokenized, we apply a simple detokenization process to restore the untokenized test set: ```bash python -m examples.megatron_11b.detok wikitext-103-raw/wiki.test.raw > wikitext-103-raw/wiki.test.detok ``` #### 4. 
BPE encoding ```bash wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' python -m examples.roberta.multiprocessing_bpe_encoder \ --encoder-json encoder.json \ --vocab-bpe vocab.bpe \ --inputs "wikitext-103-raw/wiki.test.detok" \ --outputs "wikitext-103-raw/wiki.test.bpe" \ --workers 60; ``` #### 5. Fairseq binarize ```bash fairseq-preprocess \ --only-source \ --testpref wikitext-103-raw/wiki.test.bpe \ --srcdict megatron_11b/dict.txt \ --destdir wikitext103-bin; ``` #### 6. Evaluating perplexity. We can now evaluate perplexity on the test set. Note that because we've modified the test set (via detokenization and BPE), the perplexity reported by `fairseq-eval-lm` needs to be renormalized. Compute unnormalized perplexity: ```bash DATA_PATH=wikitext103-bin/ fairseq-eval-lm \ $DATA_PATH \ --path megatron_11b/model.pt \ --task language_modeling \ --gen-subset test \ --batch-size 8 \ --criterion cross_entropy \ --context-window 992 \ --distributed-world-size 8 \ --model-parallel-size 8; # Expected PPL (unnormalized_ppl): [8.46] # Note: the eval command needs to run on 8 GPUs for the released model ``` Renormalizing formula: `2 ^ ( log_2(unnormalized_PPL) * (270847 / 245566))`. PPL After normalization: `10.54` To renormalize the perplexity, we must account for the change in token count after detokenizing and applying BPE. The formula for this is: `2 ^ ( log_2(unnormalized_PPL) * (new_token_cnt / orig_token_cnt))` For the wikitext-103 test set, the original token count is `245566` and the token count after detokenization and applying BPE is `270847`. The perplexity after renormalization is: `2 ^ ( log_2(8.46) * (270847 / 245566)) = 10.54` ================================================ FILE: examples/megatron_11b/detok.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates.
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import fileinput import sacremoses def main(): parser = argparse.ArgumentParser(description="") parser.add_argument("files", nargs="*", help="input files") args = parser.parse_args() detok = sacremoses.MosesDetokenizer() for line in fileinput.input(args.files, openhook=fileinput.hook_compressed): print( detok.detokenize(line.strip().split(" ")) .replace(" @", "") .replace("@ ", "") .replace(" =", "=") .replace("= ", "=") .replace(" – ", "–") ) if __name__ == "__main__": main() ================================================ FILE: examples/mms/MODEL_CARD.md ================================================ # MMS Model Card ## Model details **Organization developing the model** The FAIR team **Model version** This is version 1 of the model. **Model type** MMS is a speech model, based on the transformer architecture. The pre-trained model comes in two sizes: 300M and 1B parameters. We fine-tune the model for speech recognition and make it available in the 1B variant. We also fine-tune the 1B variant for language identification. **License** CC BY-NC **Where to send questions or comments about the model** Questions and comments about MMS can be sent via the [GitHub repository](https://github.com/pytorch/fairseq/tree/master/examples/mms) of the project, by opening an issue and tagging it as MMS. ## Uses **Primary intended uses** The primary use of MMS is to perform speech processing research for many more languages and to perform tasks such as automatic speech recognition, language identification, and speech synthesis. **Primary intended users** The primary intended users of the model are researchers in speech processing, machine learning and artificial intelligence. **Out-of-scope use cases** Fine-tuning the pre-trained models on other labeled datasets or downstream tasks requires further risk evaluation and mitigation.
## Bias and Risks The MMS models were pre-trained on a blend of data from different domains, including readings of the New Testament. In the paper, we describe two studies analyzing gender bias and the use of religious language which conclude that models perform equally well for both genders and that on average, there is little bias for religious language (section 8 of the paper). # Training Details ## Training Data MMS is pre-trained on VoxPopuli (parliamentary speech), MLS (read audiobooks), VoxLingua-107 (YouTube speech), CommonVoice (read Wikipedia text), BABEL (telephone conversations), and MMS-lab-U (New Testament readings), MMS-unlab (various read Christian texts). Models are fine-tuned on FLEURS, VoxLingua-107, MLS, CommonVoice, and MMS-lab. We obtained the language information for MMS-lab, MMS-lab-U and MMS-unlab from our data source and did not manually verify it for every language. ## Training Procedure Please refer to the research paper for details on this. # Evaluation ## Testing Data, Factors & Metrics We evaluate the model on different benchmarks for the downstream tasks. The evaluation details are presented in the paper. The model's performance is measured using standard metrics such as character error rate, word error rate, and classification accuracy. 
# Citation **BibTeX:** ``` @article{pratap2023mms, title={Scaling Speech Technology to 1,000+ Languages}, author={Vineel Pratap and Andros Tjandra and Bowen Shi and Paden Tomasello and Arun Babu and Sayani Kundu and Ali Elkahky and Zhaoheng Ni and Apoorv Vyas and Maryam Fazel-Zarandi and Alexei Baevski and Yossi Adi and Xiaohui Zhang and Wei-Ning Hsu and Alexis Conneau and Michael Auli}, journal={arXiv}, year={2023} } ``` # Model Card Contact Please reach out to the authors at: [vineelkpratap@meta.com](mailto:vineelkpratap@meta.com) [androstj@meta.com](mailto:androstj@meta.com) [bshi@meta.com](mailto:bshi@meta.com) [michaelauli@meta.com](mailto:michaelauli@gmail.com) ================================================ FILE: examples/mms/README.md ================================================ # MMS: Scaling Speech Technology to 1000+ languages The Massively Multilingual Speech (MMS) project expands speech technology from about 100 languages to over 1,000 by building a single multilingual speech recognition model supporting over 1,100 languages (more than 10 times as many as before), language identification models able to identify over [4,000 languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html) (40 times more than before), pretrained models supporting over 1,400 languages, and text-to-speech models for over 1,100 languages. Our goal is to make it easier for people to access information and to use devices in their preferred language. You can find details in the paper [Scaling Speech Technology to 1000+ languages](https://research.facebook.com/publications/scaling-speech-technology-to-1000-languages/) and the [blog post](https://ai.facebook.com/blog/multilingual-model-speech-recognition/). An overview of the languages covered by MMS can be found [here](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html). ## 🤗 Transformers MMS has been added to Transformers. 
For more information, please refer to [Transformers' MMS docs](https://huggingface.co/docs/transformers/main/en/model_doc/mms). [Click here](https://huggingface.co/models?other=mms) to find all MMS checkpoints on the Hub. Checkout the demo here [![Open In HF Spaces](https://huggingface.co/datasets/huggingface/badges/raw/main/open-in-hf-spaces-sm-dark.svg)](https://huggingface.co/spaces/facebook/MMS) ## Finetuned models ### ASR | Model | Languages | Dataset | Model | Dictionary* | Supported languages | | |---|---|---|---|---|---|--- MMS-1B:FL102 | 102 | FLEURS | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_fl102.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_fl102_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-fl102) MMS-1B:L1107| 1107 | MMS-lab | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_l1107.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_l1107/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_l1107_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-l1107) MMS-1B-all| 1162 | MMS-lab + FLEURS
+ CV + VP + MLS | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_all.pt) | [download](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_all/eng.txt) | [download](https://dl.fbaipublicfiles.com/mms/asr/mms1b_all_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-1b-all) \* In the `Dictionary` column, we provide the download link for token dictionary in English language. To download token dictionary for a different language supported by the model, modify the language code in the URL appropriately. For example, to get token dictionary of FL102 model for Hindi language, use [this](https://dl.fbaipublicfiles.com/mms/asr/dict/mms1b_fl102/hin.txt) link. ### TTS 1. Download the list of [iso codes](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html) of 1107 languages. 2. Find the iso code of the target language and download the checkpoint. Each folder contains 3 files: `G_100000.pth`, `config.json`, `vocab.txt`. The `G_100000.pth` is the generator trained for 100K updates, `config.json` is the training config, `vocab.txt` is the vocabulary for the TTS model. ``` # Examples: wget https://dl.fbaipublicfiles.com/mms/tts/eng.tar.gz # English (eng) wget https://dl.fbaipublicfiles.com/mms/tts/azj-script_latin.tar.gz # North Azerbaijani (azj-script_latin) ``` The above command downloads generator only, which is enough to run TTS inference. If you want the full model checkpoint which also includes the discriminator (`D_100000.pth`) and the optimizer states, download as follows. 
``` # Example (full checkpoint: generator + discriminator + optimizer): wget https://dl.fbaipublicfiles.com/mms/tts/full_model/eng.tar.gz # English (eng) ``` ### LID \# Languages | Dataset | Model | Dictionary | Supported languages | | |---|---|---|---|---|--- 126 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l126/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l126_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-lid-126) 256 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l256.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l256/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l256_langs.html) | [🤗 Hub](https://huggingface.co/facebook/mms-lid-256) 512 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l512.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l512/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l512_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-512) 1024 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l1024.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l1024/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l1024_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-1024) 2048 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l2048.pt) | [download](https://dl.fbaipublicfiles.com/mms/lid/dict/l2048/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l2048_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-2048) 4017 | FLEURS + VL + MMS-lab-U + MMS-unlab | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l4017.pt) | 
[download](https://dl.fbaipublicfiles.com/mms/lid/dict/l4017/dict.lang.txt) | [download](https://dl.fbaipublicfiles.com/mms/lid/mms1b_l4017_langs.html)| [🤗 Hub](https://huggingface.co/facebook/mms-lid-4017) ## Commands to run inference ### ASR Run this command to transcribe one or more audio files: ```shell command cd /path/to/fairseq-py/ python examples/mms/asr/infer/mms_infer.py --model "/path/to/asr/model" --lang lang_code \ --audio "/path/to/audio_1.wav" "/path/to/audio_2.wav" "/path/to/audio_3.wav" ``` We also provide an IPython notebook example inside the `asr/tutorial` folder [ipynb](https://github.com/facebookresearch/fairseq/blob/main/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb) or [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/facebookresearch/fairseq/blob/main/examples/mms/asr/tutorial/MMS_ASR_Inference_Colab.ipynb) For more advanced configuration and to calculate CER/WER, you can prepare a manifest folder by creating a folder with this format: ``` $ ls /path/to/manifest dev.tsv dev.wrd dev.ltr dev.uid # dev.tsv each line contains
`), which corresponds to embedding index `2`. Thus **the model never saw newline characters during pretraining** and newlines should not be used during few-shot prompting. This is more clearly illustrated in the following example, which uses fairseq's Hub Interface to tokenize two documents in the desired format: ```python from fairseq.models.transformer_lm import TransformerLanguageModel model_dir = '/path/to/en_dense_lm_125m' lm = TransformerLanguageModel.from_pretrained(model_dir, bpe='gpt2') data = """\ This is the first paragraph of the first document. This is the second paragraph of the first document. This is the first paragraph of the second document.\ """ # The following is wrong, since it will encode newlines present in `data`. tokens_bad = lm.score(data)['tokens'] assert '\n' in lm.decode(tokens_bad) # oops, we encoded a newline # Instead pass the replace_newline_with_eos option to get the correct behavior. tokens_good = lm.score(data, replace_newline_with_eos=True)['tokens'] assert '\n' not in lm.decode(tokens_good) # no newlines were encoded ``` ## Citation Coming soon. ================================================ FILE: examples/moe_lm/data_card.md ================================================ # Data card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts" ## Version 1.0.0 We follow the recommendations of Gebru et al. (2018) and provide a datacard for the dataset used to train the 1.1T parameter model. ## Motivation * **For what purpose was the dataset created? Was there a specific task in mind? Was there a specific gap that needed to be filled? Please provide a description.** The pre-training data for training the 1.1 T model was created by a union of six English language datasets, including five datasets used by RoBERTa (Liu et al 2019) and the English subset of CC 100. The purpose of creating this dataset was to pre-train the language model. 
* **Who created the dataset (e.g., which team, research group) and on behalf of which entity (e.g., company, institution, organization)?** FAIR (Fundamental Artificial Intelligence Research) * **Who funded the creation of the dataset? If there is an associated grant, please provide the name of the grantor and the grant name and number.** FAIR (Fundamental Artificial Intelligence Research) * **Any other comments?** No. ## Composition * **What do the instances that comprise the dataset represent (e.g., documents, photos, people, countries)? Are there multiple types of instances (e.g., movies, users, and ratings; people and interactions between them; nodes and edges)? Please provide a description.** The instances are textual documents. The overall dataset is composed from a union of the following datasets - * BookCorpus (Zhu et al., 2019) consists of more than 10K unpublished books (4GB); * English Wikipedia, excluding lists, tables and headers (12GB); * CC-News (Nagel,2016) contains 63 million English news articles crawled between September 2016 and February 2019 (76GB); * OpenWebText (Gokaslan and Cohen, 2019), an open source recreation of the WebText dataset used to train GPT-2 (38GB); * CC-Stories (Trinh and Le, 2018) contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas (31GB); * English CC100 (Wenzek et al., 2020), a dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia (292GB). * **How many instances are there in total (of each type, if appropriate)?** The training data contains 112B tokens corresponding to 453 GB of data. * **Does the dataset contain all possible instances or is it a sample (not necessarily random) of instances from a larger set? If the dataset is a sample, then what is the larger set? Is the sample representative of the larger set (e.g., geographic coverage)? 
If so, please describe how this representativeness was validated/verified. If it is not representative of the larger set, please describe why not (e.g., to cover a more diverse range of instances, because instances were withheld or unavailable).** The English CC100 section of the dataset is a subset of CommonCrawl snapshots extracted between January 2018 to December 2018, filtered to match the style of Wikipedia. The CC-stories dataset contains a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas. * **What data does each instance consist of? “Raw” data (e.g., unprocessed text or images) or features? In either case, please provide a description.** Each instance consists of raw text data. * **Is there a label or target associated with each instance? If so, please provide a description.** No. * **Is any information missing from individual instances? If so, please provide a description, explaining why this information is missing (e.g., because it was unavailable). This does not include intentionally removed information, but might include, e.g., redacted text.** No. * **Are relationships between individual instances made explicit (e.g., users' movie ratings, social network links)? If so, please describe how these relationships are made explicit.** There are no explicit relationships between individual instances. * **Are there recommended data splits (e.g., training, development/validation, testing)? If so, please provide a description of these splits, explaining the rationale behind them.** We hold out a random validation set of approximately 150MB from the pretraining data, sampled proportionally to each dataset's size in the pretraining corpus. * **Are there any errors, sources of noise, or redundancies in the dataset? If so, please provide a description.** N/A * **Is the dataset self-contained, or does it link to or otherwise rely on external resources (e.g., websites, tweets, other datasets)?** It's self-contained. 
* **Does the dataset contain data that might be considered confidential (e.g., data that is protected by legal privilege or by doctor-patient confidentiality, data that includes the content of individuals' non-public communications)? If so, please provide a description.** The datasets used are publicly available, and the information in them is not considered confidential. * **Does the dataset contain data that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety? If so, please describe why.** Parts of the dataset are a subset of public Common Crawl data, which could contain sentences that, if viewed directly, might be offensive, insulting, threatening, or might otherwise cause anxiety. * **Does the dataset relate to people? If not, you may skip the remaining questions in this section.** Some documents of this data relate to people, such as news articles, Wikipedia descriptions, etc. * **Does the dataset identify any subpopulations (e.g., by age, gender)? If so, please describe how these subpopulations are identified and provide a description of their respective distributions within the dataset.** No. * **Is it possible to identify individuals (i.e., one or more natural persons), either directly or indirectly (i.e., in combination with other data) from the dataset? If so, please describe how** In addition to individuals who have Wikipedia pages (celebrities, politicians, etc.), it may be possible to identify other individuals by their names, Twitter account names, etc. if that information is present in Common Crawl. * **Does the dataset contain data that might be considered sensitive in any way (e.g., data that reveals racial or ethnic origins, sexual orientations, religious beliefs, political opinions or union memberships, or locations; financial or health data; biometric or genetic data; forms of government identification, such as social security numbers; criminal history)? 
If so, please provide a description.** The training dataset is partially derived from Common Crawl, which may contain some sensitive information. * **Any other comments?** No ## Collection Process * **How was the data associated with each instance acquired? Was the data directly observable (e.g., raw text, movie ratings), reported by subjects (e.g., survey responses), or indirectly inferred/ derived from other data (e.g., part-of-speech tags, model-based guesses for age or language)? If data was reported by subjects or indirectly inferred/derived from other data, was the data validated/verified? If so, please describe how.** N/A. The dataset is a union of six publicly available datasets. * **What mechanisms or procedures were used to collect the data (e.g., hardware apparatus or sensor, manual human curation, software program, software API)? How were these mechanisms or procedures validated?** N/A * **If the dataset is a sample from a larger set, what was the sampling strategy (e.g., deterministic, probabilistic with specific sampling probabilities)?** Please refer to the main document for details. * **Who was involved in the data collection process (e.g., students, crowdworkers, contractors) and how were they compensated (e.g., how much were crowdworkers paid)?** This data is mined, filtered and sampled by machines. * **Over what timeframe was the data collected? Does this timeframe match the creation timeframe of the data associated with the instances (e.g., recent crawl of old news articles)? If not, please describe the timeframe in which the data associated with the instances was created.** Different parts of the dataset were mined over different time periods. 1. The CC-News dataset contains English news articles crawled between September 2016 and February 2019. 2. The English CC-100 dataset was extracted from CommonCrawl snapshots between January 2018 and December 2018. * **Were any ethical review processes conducted (e.g., by an institutional review board)? 
If so, please provide a description of these review processes, including the outcomes, as well as a link or other access point to any supporting documentation.** No. * **Does the dataset relate to people? If not, you may skip the remainder of the questions in this section.** No. * **Did you collect the data from the individuals in question directly, or obtain it via third parties or other sources (e.g., websites)?** N/A * **Were the individuals in question notified about the data collection? If so, please describe (or show with screenshots or other information) how notice was provided, and provide a link or other access point to, or otherwise reproduce, the exact language of the notification itself.** N/A * **Did the individuals in question consent to the collection and use of their data? If so, please describe (or show with screenshots or other information) how consent was requested and provided, and provide a link or other access point to, or otherwise reproduce, the exact language to which the individuals consented.** N/A * **If consent was obtained, were the consenting individuals provided with a mechanism to revoke their consent in the future or for certain uses? If so, please provide a description, as well as a link or other access point to the mechanism (if appropriate).** N/A * **Has an analysis of the potential impact of the dataset and its use on data subjects (e.g., a data protection impact analysis) been conducted? If so, please provide a description of this analysis, including the outcomes, as well as a link or other access point to any supporting documentation.** Some responsible AI related evaluations were performed. Please refer to the main document and the model card for the paper. 
* **Any other comments?** No ## Preprocessing/cleaning/labeling * **Was any preprocessing/cleaning/labeling of the data done (e.g., discretization or bucketing, tokenization, part-of-speech tagging, SIFT feature extraction, removal of instances, processing of missing values)? If so, please provide a description. If not, you may skip the remainder of the questions in this section.** The component datasets went through standard cleaning and re-formatting practices, including removing repetitive/non-informative text like "Chapter One", or "This ebook by Project Gutenberg". * **Was the “raw” data saved in addition to the preprocessed/cleaned/labeled data (e.g., to support unanticipated future uses)? If so, please provide a link or other access point to the “raw” data.** The "raw" component datasets are publicly available in their respective locations (more details can be seen in the respective papers linked in references). * **Is the software used to preprocess/clean/label the instances available? If so, please provide a link or other access point.** The software is proprietary to Meta Platforms and currently unavailable publicly. * **Any other comments?** No ## Uses * **Has the dataset been used for any tasks already? If so, please provide a description.** Yes, this dataset was used to pre-train the models described in the paper. * **Is there a repository that links to any or all papers or systems that use the dataset? If so, please provide a link or other access point.** No. * **What (other) tasks could the dataset be used for?** This data can be used to pretrain English language models, which are foundational to many current and future language tasks. * **Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? 
For example, is there anything that a future user might need to know to avoid uses that could result in unfair treatment of individuals or groups (e.g., stereotyping, quality of service issues) or other undesirable harms (e.g., financial harms, legal risks) If so, please provide a description. Is there anything a future user could do to mitigate these undesirable harms?** The pipeline for creating this dataset paves a way for building a scalable infrastructure for mining datasets to be used for training large-scale models. * **Are there tasks for which the dataset should not be used? If so, please provide a description.** No. * **Any other comments?** No. ## Distribution * **Will the dataset be distributed to third parties outside of the entity (e.g., company, institution, organization) on behalf of which the dataset was created? If so, please provide a description.** No. * **How will the dataset will be distributed (e.g., tarball on website, API, GitHub)? Does the dataset have a digital object identifier (DOI)?** N/A * **When will the dataset be distributed?** No. * **Will the dataset be distributed under a copyright or other intellectual property (IP) license, and/or under applicable terms of use (ToU)? If so, please describe this license and/or ToU, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms or ToU, as well as any fees associated with these restrictions.** No. * **Have any third parties imposed IP-based or other restrictions on the data associated with the instances? If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any relevant licensing terms, as well as any fees associated with these restrictions.** No. * **Do any export controls or other regulatory restrictions apply to the dataset or to individual instances? 
If so, please describe these restrictions, and provide a link or other access point to, or otherwise reproduce, any supporting documentation.** N/A * **Any other comments?** No. ## Maintenance * **Who is supporting/hosting/maintaining the dataset?** FAIR (Fundamental Artificial Intelligence Research) * **How can the owner/curator/manager of the dataset be contacted (e.g., email address)?** Refer to the main document. * **Is there an erratum? If so, please provide a link or other access point.** N/A * **Will the dataset be updated (e.g., to correct labeling errors, add new instances, delete instances)? If so, please describe how often, by whom, and how updates will be communicated to users (e.g., mailing list, GitHub)?** No plan for updating. * **If the dataset relates to people, are there applicable limits on the retention of the data associated with the instances (e.g., were individuals in question told that their data would be retained for a fixed period of time and then deleted)? If so, please describe these limits and explain how they will be enforced.** N/A * **Will older versions of the dataset continue to be supported/hosted/maintained? If so, please describe how. If not, please describe how its obsolescence will be communicated to users.** N/A * **If others want to extend/augment/build on/contribute to the dataset, is there a mechanism for them to do so? If so, please provide a description. Will these contributions be validated/ verified? If so, please describe how. If not, why not? Is there a process for communicating/ distributing these contributions to other users? If so, please provide a description.** No. * **Any other comments?** No. ## References Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov. 2019. Roberta: A robustly optimized bert pretraining approach. arXiv preprint arXiv:1907.11692. 
Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724. Sebastian Nagel. 2016. Cc-news. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available. Aaron Gokaslan and Vanya Cohen. 2019. Openwebtext corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus Trieu H Trinh and Quoc V Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847. Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association. ================================================ FILE: examples/moe_lm/model_card.md ================================================ # Model card for the paper "Efficient Large Scale Language Modeling with Mixtures of Experts" ## Version 1.0.0 ### Model developer FAIR (Fundamental Artificial Intelligence Research) ### Model type An autoregressive English language model trained on a union of six English language datasets. We explore dense and sparse (MoE based) architectures in the paper. * Dense models - Our dense models range from 125M parameters to 13B parameters. * Sparse (MoE) models - Our MoE based models range from 15B parameters to 1.1 Trillion parameters. This model card focuses on the 1.1 Trillion parameter model, but the discussion applies to all of the models explored in this work. ### Citation details Artetxe et al. (2021): Efficient Large Scale Language Modeling with Mixtures of Experts ### Model Feedback Channel fairseq ## Intended use ### Primary intended use For research purposes only, e.g. 
reproducing model evaluation results. Generation is only used in a limited capacity for explanation/justification or for prompting/probing/priming for class labels. ### Out of scope uses The primary purpose of the model is not to generate language, although the model is capable of doing that. ## Factors influencing model performance This section discusses potential risks associated with using the model. ### Relevant factors Based on known problems with NLP technology, potential relevant factors include bias (gender, profession, race and religion). ### Evaluation factors The 1.1T model was evaluated on StereoSet and CrowS-Pairs datasets to quantify encoded bias in the model. ## Metrics ### Model performance measures The 1.1T parameter model was primarily evaluated on: 1. In-domain and out-of-domain language modeling perplexity. 2. Zero-shot and few-shot priming. 3. Fully supervised finetuning. ### Approaches to handle uncertainty For few-shot learning, we report the average results across 25 runs, randomly sampling a different set of few-shot examples from the training set each time. ## Evaluation data ## Zero Shot evaluation ### HellaSwag #### Description HellaSwag is a dataset for evaluating commonsense reasoning. ### PIQA #### Description PIQA is a dataset designed to evaluate reasoning about Physical Commonsense in Natural Language. ### ReCoRD #### Description Reading Comprehension with Commonsense Reasoning Dataset (ReCoRD) is a large-scale reading comprehension dataset which requires commonsense reasoning. ReCoRD consists of queries automatically generated from CNN/Daily Mail news articles; the answer to each query is a text span from a summarizing passage of the corresponding news. The goal of ReCoRD is to evaluate a machine's ability of commonsense reasoning in reading comprehension. ## Few Shot evaluation ### Winogrande #### Description Winogrande is a benchmark for commonsense reasoning. 
The dataset contains pronoun resolution problems originally designed to be unsolvable for statistical models that rely on selectional preferences or word associations. ### StoryCloze #### Description StoryCloze is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story. ### OpenBookQA #### Description OpenBookQA is a new kind of question-answering dataset modeled after open book exams for assessing human understanding of a subject. It consists of 5,957 multiple-choice elementary-level science questions (4,957 train, 500 dev, 500 test), which probe the understanding of a small “book” of 1,326 core science facts and the application of these facts to novel situations. ## Fully supervised evaluation ### BoolQ #### Description BoolQ is a question answering dataset for yes/no questions containing 15942 examples. These questions are naturally occurring – they are generated in unprompted and unconstrained settings. Each example is a triplet of (question, passage, answer), with the title of the page as optional additional context. ### SST-2 #### Description SST-2 (or SST-binary) is a binary classification dataset where the goal is to differentiate between negative or somewhat negative vs somewhat positive or positive. ### MNLI #### Description The Multi-Genre Natural Language Inference (MultiNLI) corpus is a crowd-sourced collection of 433k sentence pairs annotated with textual entailment information. The corpus is modeled on the SNLI corpus, but differs in that it covers a range of genres of spoken and written text, and supports a distinctive cross-genre generalization evaluation. 
## Responsible AI (RAI) evaluation ### StereoSet #### Description A large-scale natural dataset in English to measure stereotypical biases in four domains: gender, profession, race, and religion. #### Motivation for dataset use The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model's stereotype bias in gender, profession, race, and religion. ### CrowS #### Description Challenge Dataset for Measuring Social Biases in Masked Language Models. #### Motivation for dataset use The motivation for evaluating the 1.1T parameter model on this dataset is to evaluate the model’s bias in the domains of race, religion and age. ---- ## Training data ### BookCorpus #### Description A dataset consisting of more than 10K unpublished books. 4GB in size. (Zhu et al., 2019) ### English Wikipedia #### Description Data from English Wikipedia, excluding lists, tables and headers. 12GB in size. ### CC-News #### Description A dataset containing 63 million English news articles crawled between September 2016 and February 2019. 76GB in size. (Nagel, 2016) ### OpenWebText #### Description An open source recreation of the WebText dataset used to train GPT-2. 38GB in size. (Gokaslan and Cohen, 2019) ### CC-Stories #### Description A dataset containing a subset of CommonCrawl data filtered to match the story-like style of Winograd schemas. 31GB in size. (Trinh and Le, 2018) ### English CC100 #### Description A dataset extracted from CommonCrawl snapshots between January 2018 and December 2018, filtered to match the style of Wikipedia following the methodology introduced in CCNet (https://arxiv.org/abs/1911.00359). 292GB in size. (Wenzek et al., 2020) ## Responsible AI (RAI) Dimensions ### Fairness (Bias and inclusion) The 1.1T parameter model was evaluated on the StereoSet and CrowS-Pairs datasets for inherent bias in the model, and bias as a result of the data. 
Similar to StereoSet, we observe that both the dense and MoE models get worse in terms of the Stereotype Score (SS) with scale. ### Privacy and security The 1.1T model did not have any special Privacy and Security considerations. The training data and evaluation data were both public and went through standard Meta privacy and licensing procedures. ### Transparency and control In the spirit of transparency and accountability we have created this model card for the 1.1T parameter model and a data card for the training data (referenced in Artetxe et al. (2021)). ### Efficiency (Green AI) The 1.1T parameter model is trained as a Mixture of Experts (MoE) model. Mixture of Experts (MoE) models are efficient because they leverage sparse computation, i.e., only a small fraction of parameters are active for any given input. For instance, our 1.1T parameter MoE model requires only 30% more FLOPS compared to a 6.7B parameter dense model, i.e., a 160x increase in parameters with only a 30% increase in FLOPS. Notably, MoE models achieve much better validation perplexity for a given compute budget compared to dense models. ## References Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi. 2019. HellaSwag: Can a machine really finish your sentence? In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 4791–4800, Florence, Italy. Association for Computational Linguistics. Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, and Yejin Choi. 2020. Piqa: Reasoning about physical commonsense in natural language. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):7432–7439. Sheng Zhang, Xiaodong Liu, Jingjing Liu, Jianfeng Gao, Kevin Duh, and Benjamin Van Durme. 2018. ReCoRD: Bridging the gap between human and machine commonsense reading comprehension. arXiv preprint 1810.12885. Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi. 2020. 
Winogrande: An adversarial winograd schema challenge at scale. Proceedings of the AAAI Conference on Artificial Intelligence, 34(05):8732–8740. Nasrin Mostafazadeh, Nathanael Chambers, Xiaodong He, Devi Parikh, Dhruv Batra, Lucy Vanderwende, Pushmeet Kohli, and James Allen. 2016. A corpus and cloze evaluation for deeper understanding of commonsense stories. In Proceedings of the 2016 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, pages 839–849, San Diego, California. Association for Computational Linguistics. Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. 2018. Can a suit of armor conduct electricity? a new dataset for open book question answering. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 2381–2391, Brussels, Belgium. Association for Computational Linguistics. Christopher Clark and Kenton Lee and Ming-Wei Chang and Tom Kwiatkowski and Michael Collins and Kristina Toutanova. 2019. BoolQ: Exploring the Surprising Difficulty of Natural Yes/No Questions. Moin Nadeem, Anna Bethke, and Siva Reddy. 2021. StereoSet: Measuring stereotypical bias in pretrained language models. In Association for Computational Linguistics (ACL). Nikita Nangia, Clara Vania, Rasika Bhalerao, and Samuel R. Bowman. 2020. CrowS-pairs: A challenge dataset for measuring social biases in masked language models. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1953–1967, Online. Association for Computational Linguistics. Yukun Zhu, Ryan Kiros, Richard Zemel, Ruslan Salakhutdinov, Raquel Urtasun, Antonio Torralba, and Sanja Fidler. 2019. Aligning books and movies: Towards story-like visual explanations by watching movies and reading books. arXiv:1506.06724. Sebastian Nagel. 2016. Cc-news. http://web.archive.org/save/http://commoncrawl.org/2016/10/news-dataset-available. Aaron Gokaslan and Vanya Cohen. 
2019. Openwebtext corpus. http://web.archive.org/save/http://Skylion007.github.io/OpenWebTextCorpus Trieu H Trinh and Quoc V Le. 2018. A simple method for commonsense reasoning. arXiv preprint arXiv:1806.02847. Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, and Edouard Grave. 2020. CCNet: Extracting high quality monolingual datasets from web crawl data. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4003–4012, Marseille, France. European Language Resources Association. ================================================ FILE: examples/mr_hubert/README.md ================================================ # MR-HuBERT ## Pre-trained models ### Main models Model | Pretraining Data | Model | Paper Reference |---|---|---|--- MR-HuBERT Base (~97M) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_base/mrhubert_mono_base.pt) | mono\_base MR-HuBERT Large (~321M) | [Libri-Light](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_large/mrhubert_mono_large.pt) | mono\_large Multilingual MR-HuBERT Base (~97M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_base/multi_base.pt) | multi\_base Multilingual MR-HuBERT Large (~321M) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download 400k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_400k.pt) or [download 600k steps](https://dl.fbaipublicfiles.com/mrhubert/multi_large/multi_large_600k.pt) | Not in the paper ### Ablation models Model | Pretraining Data | Model | Paper Reference |---|---|---|--- MR-HuBERT Base (2-4-6 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-a/b1-a.pt) | (B.1)-a MR-HuBERT Base (5-2-5 lyrs) | 
[Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-b/b1-b.pt) | (B.1)-b MR-HuBERT Base (6-4-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b1-c/b1-c.pt) | (B.1)-c MR-HuBERT Base (3res 3-2-2-2-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-a/b2-a.pt) | (B.2)-a MR-HuBERT Base (3res 2-2-4-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-b/b2-b.pt) | (B.2)-b MR-HuBERT Base (3res 2-2-2-2-2 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b2-c/b2-c.pt) | (B.2)-c MR-HuBERT Base (Simple sampling) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b3-a/b3-a.pt) | (B.3)-a MR-HuBERT Base (Single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-a/b4-a.pt) | (B.4)-a MR-HuBERT Base (Simple Sampling + single target) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b4-b/b4-b.pt) | (B.4)-b MR-HuBERT Base (Mono-resolution 20ms) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b5-a/b5-a.pt) | (B.5)-a MR-HuBERT Base (3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-a/b6-a.pt) | (B.6)-a MR-HuBERT Base (Mono-resolution 20ms, 3-3-3 lyrs) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b6-b/b6-b.pt) | (B.6)-b MR-HuBERT Base (HuBERT 20ms&40ms units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-a/b7-a.pt) | (B.7)-a MR-HuBERT Base (Encodec 50Hz unit) | 
[Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-b/b7-b.pt) | (B.7)-b MR-HuBERT Base (Encodec 50Hz units and 25Hz units) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-c/b7-c.pt) | (B.7)-c MR-HuBERT Base (Encodec 50Hz units stream 0&1 ) | [Librispeech](http://www.openslr.org/12) 960 hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b7-d/b7-d.pt) | (B.7)-d MR-HuBERT Large (no audio norm) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-a/b8-a.pt) | (B.8)-a MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-b/b8-b.pt) | (B.8)-b MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-c/b8-c.pt) | (B.8)-c MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-d/b8-d.pt) | (B.8)-d MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-e/b8-e.pt) | (B.8)-e MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-f/b8-f.pt) | (B.8)-f MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-g/b8-g.pt) | (B.8)-g MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-h/b8-h.pt) | (B.8)-h MR-HuBERT Large (check paper ) | 
[LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-i/b8-i.pt) | (B.8)-i MR-HuBERT Large (check paper ) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/b8-j/b8-j.pt) | (B.8)-j Multilingual MR-HuBERT Large (Simple sampling) | [Voxpopuli](https://github.com/facebookresearch/voxpopuli) 100k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/multi_large_simple/multi_large_simple.pt) | Not in paper MR-HuBERT xLarge (from HuBERT-base label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v1.pt) | Not in paper MR-HuBERT xLarge (from HuBERT-large label) | [LibriLight](https://github.com/facebookresearch/libri-light) 60k hr | [download](https://dl.fbaipublicfiles.com/mrhubert/mono_xlarge/v2.pt) | Not in paper ## Load a model ``` ckpt_path = "/path/to/the/checkpoint.pt" models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([ckpt_path]) model = models[0] ``` ## Train a new model ### Data preparation Follow the steps in `./simple_kmeans` to create: - `{train,valid}.tsv` waveform list files with length information ``` /path/to/your/audio/files file1.wav\t160000 file2.wav\t154600 ... filen.wav\t54362 ``` - `{train,valid}.km` frame-aligned pseudo label files (the order is the same as wavefiles in the tsv file). ``` 44 44 44 48 48 962 962 962 962 962 962 962 962 967 967 967 967 967 967 967 967 370 852 370 ... 18 18 745 745 44 44 44 48 48 962 962 962 147 147 147 147 147 147 147 147 147 147 147 147 176 176 271 271 ... 27 27 745 745 ... 44 44 44 48 962 962 962 962 962 962 377 377 377 77 77 852 696 694 433 578 578 82 740 622 ... 27 27 745 745 ``` - `dict.km.txt` a dummy dictionary (first column is id, the second is dummy one) ``` 0 1 1 1 2 1 ... 
999 1 ``` The `label_rate` is the same as the feature frame rate used for clustering, which is 100Hz for MFCC features and 50Hz for HuBERT features by default. ### Pre-train a MR-HuBERT model Suppose `{train,valid}.tsv` are saved at `/path/to/data`, `{train,valid}.km` are saved at `/path/to/labels`, and the label rate is 100Hz. To train a base model (12 layer transformer), run: ```sh $ python fairseq_cli/hydra_train.py \ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/pretrain \ --config-name mrhubert_base_librispeech \ task.data=/path/to/data task.label_dir=/path/to/labels \ task.labels='["km"]' model.label_rate=100 \ task.label_rate_ratios='[1, 2]' ``` Please see the sample pre-training script `train.sh` for an example. ### Fine-tune a MR-HuBERT model with a CTC loss Suppose `{train,valid}.tsv` are saved at `/path/to/data`, and their corresponding character transcripts `{train,valid}.ltr` are saved at `/path/to/trans`. A typical ltr file lists the transcripts in the same order as the waveform files in the tsv file: ``` HOW | ARE | YOU ... THANK | YOU ``` To fine-tune a pre-trained MR-HuBERT model at `/path/to/checkpoint`, run ```sh $ python fairseq_cli/hydra_train.py \ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/finetune \ --config-name base_10h \ task.data=/path/to/data task.label_dir=/path/to/trans \ model.multires_hubert_path=/path/to/checkpoint ``` Please see the sample fine-tuning script `finetune.sh` for an example. ### Decode a MR-HuBERT model Suppose the `test.tsv` and `test.ltr` are the waveform list and transcripts of the split to be decoded, saved at `/path/to/data`, and the fine-tuned model is saved at `/path/to/checkpoint`. 
We support three decoding modes: - Viterbi decoding: greedy decoding without a language model - KenLM decoding: decoding with an arpa-format KenLM n-gram language model - Fairseq-LM decoding: decoding with a Fairseq neural language model (not fully tested) #### Viterbi decoding `task.normalize` needs to be consistent with the value used during fine-tuning. Decoding results will be saved at `/path/to/experiment/directory/decode/viterbi/test`. ```sh $ python examples/speech_recognition/new/infer.py \ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \ --config-name infer \ task.data=/path/to/data \ task.normalize=[true|false] \ decoding.exp_dir=/path/to/experiment/directory \ common_eval.path=/path/to/checkpoint dataset.gen_subset=test ``` #### KenLM / Fairseq-LM decoding Suppose the pronunciation lexicon and the n-gram LM are saved at `/path/to/lexicon` and `/path/to/arpa`, respectively. Decoding results will be saved at `/path/to/experiment/directory/decode/kenlm/test`. ```sh $ python examples/speech_recognition/new/infer.py \ --config-dir /path/to/fairseq-py/examples/mr_hubert/config/decode \ --config-name infer_lm \ task.data=/path/to/data \ task.normalize=[true|false] \ decoding.exp_dir=/path/to/experiment/directory \ common_eval.path=/path/to/checkpoint dataset.gen_subset=test \ decoding.decoder.lexicon=/path/to/lexicon \ decoding.decoder.lmpath=/path/to/arpa ``` The command above uses the default decoding hyperparameters, which can be found in `examples/speech_recognition/hydra/decoder.py`. These parameters can be configured from the command line. For example, to search with a beam size of 500, we can append the command above with `decoding.decoder.beam=500`. Important parameters include: - decoding.decoder.beam - decoding.decoder.beamthreshold - decoding.decoder.lmweight - decoding.decoder.wordscore - decoding.decoder.silweight To decode with a Fairseq LM, you may check the usage examples in wav2vec2 or hubert examples. 
Please see sample decoding scripts `decode.sh` for an example script. ================================================ FILE: examples/mr_hubert/config/decode/infer.yaml ================================================ # @package _group_ defaults: - model: null hydra: run: dir: ${common_eval.results_path}/viterbi sweep: dir: ${common_eval.results_path} subdir: viterbi task: _name: multires_hubert_pretraining single_target: true fine_tuning: true label_rate_ratios: ??? data: ??? normalize: false decoding: type: viterbi unique_wer_file: true common_eval: results_path: ??? path: ??? post_process: letter dataset: max_tokens: 1100000 gen_subset: ??? ================================================ FILE: examples/mr_hubert/config/decode/infer_lm.yaml ================================================ # @package _group_ defaults: - model: null hydra: run: dir: ${common_eval.results_path}/beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} sweep: dir: ${common_eval.results_path} subdir: beam${decoding.beam}_th${decoding.beamthreshold}_lmw${decoding.lmweight}_wrd${decoding.wordscore}_sil${decoding.silweight} task: _name: multires_hubert_pretraining single_target: true fine_tuning: true data: ??? label_rate_ratios: ??? normalize: ??? decoding: type: kenlm lexicon: ??? lmpath: ??? beamthreshold: 100 beam: 500 lmweight: 1.5 wordscore: -1 silweight: 0 unique_wer_file: true common_eval: results_path: ??? path: ??? post_process: letter dataset: max_tokens: 1100000 gen_subset: ??? 
================================================ FILE: examples/mr_hubert/config/decode/run/submitit_slurm.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: ${distributed_training.distributed_world_size} gpus_per_node: ${distributed_training.distributed_world_size} tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 1 mem_gb: 200 timeout_min: 4320 max_num_timeout: 50 name: ${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir}/submitit distributed_training: distributed_world_size: 1 distributed_no_spawn: true distributed_port: 29761 ================================================ FILE: examples/mr_hubert/config/decode/run/submitit_slurm_8gpu.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: ${distributed_training.distributed_world_size} gpus_per_node: ${distributed_training.distributed_world_size} tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 1 mem_gb: 200 timeout_min: 4320 max_num_timeout: 50 name: ${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir}/submitit distributed_training: distributed_world_size: 8 distributed_no_spawn: true distributed_port: 29761 ================================================ FILE: examples/mr_hubert/config/finetune/base_100h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 task: _name: multires_hubert_pretraining data: ??? fine_tuning: true label_dir: ??? label_rate_ratios: ??? 
normalize: false # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 3200000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 5 train_subset: train_100h valid_subset: dev_other criterion: _name: ctc zero_infinity: true optimization: max_update: 80000 lr: [3e-5] sentence_avg: true update_freq: [1] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: multires_hubert_ctc multires_hubert_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.multires_hubert_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/finetune/base_100h_large.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 task: _name: multires_hubert_pretraining data: ??? fine_tuning: true label_dir: ??? label_rate_ratios: ??? 
normalize: true # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 1600000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 5 train_subset: train_100h valid_subset: dev_other criterion: _name: ctc zero_infinity: true optimization: max_update: 80000 lr: [3e-5] sentence_avg: true update_freq: [2] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: multires_hubert_ctc multires_hubert_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.multires_hubert_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/finetune/base_10h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: save_interval: 5 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 task: _name: multires_hubert_pretraining data: ??? fine_tuning: true label_dir: ??? label_rate_ratios: ??? 
normalize: false # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 3200000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 5 train_subset: train_10h valid_subset: dev criterion: _name: ctc zero_infinity: true optimization: max_update: 25000 lr: [2e-5] sentence_avg: true update_freq: [1] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage warmup_steps: 8000 hold_steps: 0 decay_steps: 72000 final_lr_scale: 0.05 model: _name: multires_hubert_ctc multires_hubert_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.multires_hubert_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/finetune/base_10h_large.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: save_interval: 5 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 task: _name: multires_hubert_pretraining data: ??? fine_tuning: true label_dir: ??? label_rate_ratios: ??? 
normalize: true # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 3200000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 5 train_subset: train_10h valid_subset: dev criterion: _name: ctc zero_infinity: true optimization: max_update: 25000 lr: [2e-5] sentence_avg: true update_freq: [1] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage warmup_steps: 8000 hold_steps: 0 decay_steps: 72000 final_lr_scale: 0.05 model: _name: multires_hubert_ctc multires_hubert_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.multires_hubert_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/finetune/base_1h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: save_interval: 50 keep_interval_updates: 1 save_interval_updates: 1000 no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 task: _name: multires_hubert_pretraining data: ??? fine_tuning: true label_dir: ??? label_rate_ratios: ??? 
normalize: false # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 3200000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 1000 train_subset: train_1h valid_subset: dev_other criterion: _name: ctc zero_infinity: true optimization: max_update: 13000 lr: [5e-5] sentence_avg: true update_freq: [4] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: multires_hubert_ctc multires_hubert_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.multires_hubert_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/finetune/base_1h_large.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tblog seed: 1337 checkpoint: save_interval: 1000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer distributed_training: ddp_backend: c10d find_unused_parameters: true distributed_world_size: 8 distributed_port: 29671 nprocs_per_node: 8 task: _name: multires_hubert_pretraining data: ??? fine_tuning: true label_dir: ??? label_rate_ratios: ??? 
normalize: true # must be consistent with pre-training labels: ["ltr"] single_target: true dataset: num_workers: 0 max_tokens: 1280000 validate_after_updates: ${model.freeze_finetune_updates} validate_interval: 5 train_subset: train_10h valid_subset: dev criterion: _name: ctc zero_infinity: true optimization: max_update: 25000 lr: [3e-4] sentence_avg: true update_freq: [5] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: multires_hubert_ctc multires_hubert_path: ??? apply_mask: true mask_selection: static mask_length: 10 mask_other: 0 mask_prob: 0.75 mask_channel_selection: static mask_channel_length: 64 mask_channel_other: 0 mask_channel_prob: 0.5 layerdrop: 0.1 dropout: 0.0 activation_dropout: 0.1 attention_dropout: 0.0 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data - task.label_dir - model.multires_hubert_path - dataset.train_subset - dataset.valid_subset - criterion.wer_kenlm_model - criterion.wer_lexicon run: dir: ??? sweep: dir: ??? subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/pretrain/mrhubert_base_librispeech.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 seed: 1337 tensorboard_logdir: tblog min_loss_scale: 1e-8 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true distributed_training: ddp_backend: no_c10d distributed_backend: 'nccl' distributed_world_size: 32 distributed_port: 29671 nprocs_per_node: 8 find_unused_parameters: true task: _name: multires_hubert_pretraining data: ??? label_dir: ??? labels: ??? label_rate: ${model.label_rate} label_rate_ratios: ??? 
sample_rate: 16000 max_sample_size: 250000 min_sample_size: 32000 pad_audio: false random_crop: true normalize: false # must be consistent with extractor # max_keep_size: 300000 # max_keep_size: 50000 dataset: num_workers: 0 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 validate_interval_updates: 10000 criterion: _name: hubert pred_masked_weight: 1.0 pred_nomask_weight: 0.0 loss_weights: [10,] optimization: max_update: 400000 lr: [0.0005] clip_norm: 10.0 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: multires_hubert label_rate: ??? label_rate_ratios: ${task.label_rate_ratios} skip_masked: false skip_nomask: false mask_prob: 0.80 extractor_mode: default conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' final_dim: 256 encoder_layers: 4 encoder_layerdrop: 0.05 dropout_input: 0.1 dropout_features: 0.1 dropout: 0.1 attention_dropout: 0.1 feature_grad_mult: 0.1 untie_final_proj: true activation_dropout: 0.0 conv_adapator_kernal: 1 use_single_target: true hydra: job: config: override_dirname: kv_sep: '-' item_sep: '/' exclude_keys: - run - task.data - task.label_dir - common.min_loss_scale - common.log_interval - optimization.clip_norm ================================================ FILE: examples/mr_hubert/config/pretrain/mrhubert_large_librilight.yaml ================================================ # @package _group_ common: memory_efficient_fp16: true log_format: json log_interval: 200 seed: 1337 tensorboard_logdir: tblog checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true distributed_training: ddp_backend: no_c10d distributed_backend: 'nccl' distributed_world_size: 128 distributed_port: 29671 nprocs_per_node: 8 find_unused_parameters: true task: _name: multires_hubert_pretraining data: ??? label_dir: ??? labels: ??? 
label_rate: ${model.label_rate} label_rate_ratios: ??? sample_rate: 16000 max_sample_size: 250000 min_sample_size: 32000 pad_audio: false random_crop: true normalize: true # must be consistent with extractor # max_keep_size: 50000 dataset: num_workers: 0 max_tokens: 300000 skip_invalid_size_inputs_valid_test: true validate_interval: 5 validate_interval_updates: 10000 criterion: _name: hubert pred_masked_weight: 1.0 pred_nomask_weight: 0.0 loss_weights: [10,] optimization: max_update: 400000 lr: [0.0015] clip_norm: 1.0 update_freq: [3] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: multires_hubert label_rate: ??? label_rate_ratios: ${task.label_rate_ratios} encoder_layers: 8 encoder_embed_dim: 1024 encoder_ffn_embed_dim: 4096 encoder_attention_heads: 16 final_dim: 768 skip_masked: false skip_nomask: false mask_prob: 0.80 extractor_mode: layer_norm conv_feature_layers: '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2' encoder_layerdrop: 0.0 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 layer_norm_first: true feature_grad_mult: 1.0 untie_final_proj: true activation_dropout: 0.0 conv_adapator_kernal: 1 use_single_target: true hydra: job: config: override_dirname: kv_sep: '-' item_sep: '__' exclude_keys: - run - task.data run: dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt sweep: dir: /checkpoint/wnhsu/w2v/hubert_final/hydra_pt subdir: ${hydra.job.config_name}__${hydra.job.override_dirname} ================================================ FILE: examples/mr_hubert/config/pretrain/run/submitit_reg.yaml ================================================ # @package _global_ hydra: launcher: cpus_per_task: 8 gpus_per_node: 8 tasks_per_node: ${hydra.launcher.gpus_per_node} nodes: 4 comment: null mem_gb: 384 timeout_min: 4320 max_num_timeout: 100 constraint: volta32gb name: ${hydra.job.config_name}/${hydra.job.override_dirname} 
#!/bin/bash
# Decode a fine-tuned multi-resolution HuBERT model with the fairseq
# speech_recognition inference entry point.

FAIRSEQ=  # Setup your fairseq directory

config_dir=${FAIRSEQ}/examples/mr_hubert/config
# Name of the Hydra inference config passed to --config-name below.
# NOTE(review): confirm that ${config_dir} is the directory that actually
# contains this config file.
config_name=infer_multires

# Prepared Data Directory

data_dir=librispeech
# -- data_dir
#    -- test.tsv
#    -- test.ltr
#    -- dict.ltr.txt

exp_dir=exp     # Target experiments directory (where you have your pre-trained model with checkpoint_best.pt)
ratios="[1, 2]" # Default label rate ratios

_opts=

# If use slurm, uncomment this line and modify the job submission at
# _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg"

# If want to set additional experiment tag, uncomment this line
# _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}"

# If use un-normalized audio, uncomment this line
# _opts="${_opts} task.normalize=false"

# ${_opts} is intentionally left unquoted so that it splits into separate
# overrides. ${ratios} must be double-quoted: single quotes would pass the
# literal text '${ratios}' to Hydra instead of the list defined above.
PYTHONPATH=${FAIRSEQ} python "${FAIRSEQ}/examples/speech_recognition/new/infer.py" \
  --config-dir "${config_dir}" \
  --config-name "${config_name}" \
  ${_opts} \
  task.data="${data_dir}" \
  task.label_rate_ratios="${ratios}" \
  common_eval.results_path="${exp_dir}" \
  common_eval.path="${exp_dir}/checkpoint_best.pt" \
  dataset.max_tokens=2000000 \
  dataset.gen_subset=test \
  dataset.skip_invalid_size_inputs_valid_test=true
#!/bin/bash
# Launch multi-resolution HuBERT pre-training through fairseq's Hydra trainer.

FAIRSEQ=  # Setup your fairseq directory

# The pre-training configs (and the optional run/submitit_reg launcher group)
# live under config/pretrain, so Hydra must be pointed at that directory and
# at the config file's real base name.
config_dir=${FAIRSEQ}/examples/mr_hubert/config/pretrain
config_name=mrhubert_base_librispeech

# Prepared Data Directory
data_dir=librispeech
# -- data_dir
#    -- train.tsv
#    -- valid.tsv

label_dir=labels
# -- label_dir
#    -- train.km
#    -- valid.km
#    -- dict.km.txt

exp_dir=exp     # Target experiments directory
ratios="[1, 2]" # Default label rate ratios
label_rate=50   # Base label rate

_opts=

# If use slurm, uncomment this line and modify the job submission at
# _opts="${_opts} hydra/launcher=submitit_slurm +hydra.launcher.partition=${your_slurm_partition} +run=submitit_reg"

# If want to set additional experiment tag, uncomment this line
# _opts="${_opts} hydra.sweep.subdir=${your_experiment_tag}"

# ${_opts} is intentionally left unquoted so that it splits into separate
# overrides. ${ratios} must be double-quoted: single quotes would pass the
# literal text '${ratios}' to Hydra instead of the list defined above.
# The trailing '&' keeps the original behaviour of launching in the background.
python "${FAIRSEQ}/fairseq_cli/hydra_train.py" \
  -m --config-dir "${config_dir}" --config-name "${config_name}" ${_opts} \
  task.data="${data_dir}" \
  task.label_dir="${label_dir}" \
  task.labels='["km"]' \
  model.label_rate="${label_rate}" \
  task.label_rate_ratios="${ratios}" \
  hydra.sweep.dir="${exp_dir}" &
================================================ FILE: examples/multilingual/ML50_langs.txt ================================================ ar_AR cs_CZ de_DE en_XX es_XX et_EE fi_FI fr_XX gu_IN hi_IN it_IT ja_XX kk_KZ ko_KR lt_LT lv_LV my_MM ne_NP nl_XX ro_RO ru_RU si_LK tr_TR vi_VN zh_CN af_ZA az_AZ bn_IN fa_IR he_IL hr_HR id_ID ka_GE km_KH mk_MK ml_IN mn_MN mr_IN pl_PL ps_AF pt_XX sv_SE sw_KE ta_IN te_IN th_TH tl_XX uk_UA ur_PK xh_ZA gl_ES sl_SI ================================================ FILE: examples/multilingual/README.md ================================================ # Multilingual Translation [[Multilingual Translation with Extensible Multilingual Pretraining and Finetuning, https://arxiv.org/abs/2008.00401]](https://arxiv.org/abs/2008.00401) ## Introduction This work is for training multilingual translation models with multiple bitext datasets. This multilingual translation framework supports (see [[training section]](#Training) and [[finetuning section]](#Finetuning) for examples) * temperature based sampling over unbalanced datasets of different translation directions - --sampling-method with choices=['uniform', 'temperature', 'concat'] - --sampling-temperature * configurable to automatically add source and/or target language tokens to source/target sentences using data which are prepared in the same way as bilingual training - --encoder-langtok with choices=['src', 'tgt', None] to specify whether to add source or target language tokens to the source sentences - --decoder-langtok (binary option) to specify whether to add target language tokens to the target sentences or not * finetuning mBART pretrained models for multilingual translation - --finetune-from-model to specify the path from which to load the pretrained model ## Preprocessing data Multilingual training requires a joint BPE vocab.
Please follow [mBART's preprocessing steps](https://github.com/pytorch/fairseq/tree/main/examples/mbart#bpe-data) to reuse our pretrained sentence-piece model. You can also train a joint BPE model on your own dataset and then follow the steps in [[link]](https://github.com/pytorch/fairseq/tree/main/examples/translation#multilingual-translation). ## Training ```bash lang_pairs= path_2_data= lang_list= fairseq-train $path_2_data \ --encoder-normalize-before --decoder-normalize-before \ --arch transformer --layernorm-embedding \ --task translation_multi_simple_epoch \ --sampling-method "temperature" \ --sampling-temperature 1.5 \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ --seed 222 --log-format simple --log-interval 2 ``` ## Finetuning We can also finetune multilingual models from a monolingual pretrained models, e.g. [mMBART](https://github.com/pytorch/fairseq/tree/main/examples/mbart). 
```bash lang_pairs= path_2_data= lang_list= pretrained_model= fairseq-train $path_2_data \ --finetune-from-model $pretrained_model \ --encoder-normalize-before --decoder-normalize-before \ --arch transformer --layernorm-embedding \ --task translation_multi_simple_epoch \ --sampling-method "temperature" \ --sampling-temperature 1.5 \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ --seed 222 --log-format simple --log-interval 2 ``` ## Generate The following command uses the multilingual task (translation_multi_simple_epoch) to generate translation from $source_lang to $target_lang on the test dataset. During generation, the source language tokens are added to source sentences and the target language tokens are added as the starting token to decode target sentences. Options --lang-dict and --lang-pairs are needed to tell the generation process the ordered list of languages and translation directions that the trained model is aware of; they need to be consistent with those used in training. ```bash model= source_lang= target_lang= fairseq-generate $path_2_data \ --path $model \ --task translation_multi_simple_epoch \ --gen-subset test \ --source-lang $source_lang \ --target-lang $target_lang --sacrebleu --remove-bpe 'sentencepiece'\ --batch-size 32 \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" \ --lang-pairs "$lang_pairs" > ${source_lang}_${target_lang}.txt ``` Fairseq will write the translations to the file ${source_lang}_${target_lang}.txt, with the sacreBLEU score at the end.
You can also use a customized tokenizer to compare the performance with the literature. For example, you can get a tokenizer [here](https://github.com/rsennrich/wmt16-scripts) and do the following: ```bash TOKENIZER= TOK_CMD=<"$TOKENIZER $target_lang" or cat for sacrebleu> cat ${source_lang}_${target_lang}.txt | grep -P "^H" |sort -V |cut -f 3- |$TOK_CMD > ${source_lang}_${target_lang}.hyp cat ${source_lang}_${target_lang}.txt | grep -P "^T" |sort -V |cut -f 2- |$TOK_CMD > ${source_lang}_${target_lang}.ref sacrebleu -tok 'none' -s 'none' ${source_lang}_${target_lang}.ref < ${source_lang}_${target_lang}.hyp ``` # mBART50 models * [mMBART 50 pretrained model](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.pretrained.tar.gz). * [mMBART 50 finetuned many-to-one](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.n1.tar.gz). * [mMBART 50 finetuned one-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.1n.tar.gz). * [mMBART 50 finetuned many-to-many](https://dl.fbaipublicfiles.com/fairseq/models/mbart50/mbart50.ft.nn.tar.gz). Please download and extract from the above tarballs.
Each tarball contains * The fairseq model checkpoint: model.pt * The list of supported languages: ML50_langs.txt * Sentence piece model: sentence.bpe.model * Fairseq dictionary of each language: dict.{lang}.txt (please replace lang with a language specified in ML50_langs.txt) To use the trained models, * use the tool [binarize.py](./data_scripts/binarize.py) to binarize your data using sentence.bpe.model and dict.{lang}.txt, and copy the dictionaries to your data path * then run the generation command: ```bash path_2_data= model=/model.pt lang_list=/ML50_langs.txt source_lang= target_lang= fairseq-generate $path_2_data \ --path $model \ --task translation_multi_simple_epoch \ --gen-subset test \ --source-lang $source_lang \ --target-lang $target_lang --sacrebleu --remove-bpe 'sentencepiece'\ --batch-size 32 \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" ``` ## Citation ```bibtex @article{tang2020multilingual, title={Multilingual Translation with Extensible Multilingual Pretraining and Finetuning}, author={Yuqing Tang and Chau Tran and Xian Li and Peng-Jen Chen and Naman Goyal and Vishrav Chaudhary and Jiatao Gu and Angela Fan}, year={2020}, eprint={2008.00401}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ================================================ FILE: examples/multilingual/data_scripts/README.md ================================================ # Install dependency ```bash pip install -r requirement.txt ``` # Download the data set ```bash export WORKDIR_ROOT= ``` The downloaded data will be at $WORKDIR_ROOT/ML50 # preprocess the data Install SPM [here](https://github.com/google/sentencepiece) ```bash export WORKDIR_ROOT= export SPM_PATH= ``` * $WORKDIR_ROOT/ML50/raw: extracted raw data * $WORKDIR_ROOT/ML50/dedup: dedup data * $WORKDIR_ROOT/ML50/clean: data with valid and test sentences removed from the dedup data ================================================ FILE: examples/multilingual/data_scripts/binarize.py 
def get_data_size(raw):
    """Return the number of newline characters in the file at path *raw*.

    This is the same quantity ``wc -l`` reports, but it is computed
    in-process, so paths containing spaces or shell metacharacters are
    handled correctly and no external ``wc`` binary is required.

    Args:
        raw: path of the file to measure.

    Returns:
        The number of ``\\n`` bytes in the file (a final line that lacks a
        trailing newline is therefore not counted).

    Raises:
        OSError: if the file cannot be opened or read.
    """
    newline_count = 0
    # Read in binary mode and in fixed-size chunks: no decoding is needed to
    # count newlines and memory use stays bounded for very large corpora.
    with open(raw, 'rb') as fin:
        for chunk in iter(lambda: fin.read(1 << 20), b''):
            newline_count += chunk.count(b'\n')
    return newline_count
tgt = direction.split('-') try: shutil.rmtree(f'{databin_dir}', ignore_errors=True) os.mkdir(f'{databin_dir}') except OSError as error: print(error) cmds = [ "fairseq-preprocess", f"--source-lang {src} --target-lang {tgt}", f"--destdir {databin_dir}/", f"--workers 8", ] if isinstance(spm_vocab, tuple): src_vocab, tgt_vocab = spm_vocab cmds.extend( [ f"--srcdict {src_vocab}", f"--tgtdict {tgt_vocab}", ] ) else: cmds.extend( [ f"--joined-dictionary", f"--srcdict {spm_vocab}", ] ) input_options = [] if 'train' in splits and glob.glob(f"{bpe_dir}/train.bpe*"): input_options.append( f"--trainpref {bpe_dir}/train.bpe", ) if 'valid' in splits and glob.glob(f"{bpe_dir}/valid.bpe*"): input_options.append(f"--validpref {bpe_dir}/valid.bpe") if 'test' in splits and glob.glob(f"{bpe_dir}/test.bpe*"): input_options.append(f"--testpref {bpe_dir}/test.bpe") if len(input_options) > 0: cmd = " ".join(cmds + input_options) print(cmd) call(cmd) def binarize( databin_dir, direction, spm_vocab=SPM_VOCAB, prefix='', splits=['train', 'test', 'valid'], pairs_per_shard=None, ): def move_databin_files(from_folder, to_folder): for bin_file in glob.glob(f"{from_folder}/*.bin") \ + glob.glob(f"{from_folder}/*.idx") \ + glob.glob(f"{from_folder}/dict*"): try: shutil.move(bin_file, to_folder) except OSError as error: print(error) bpe_databin_dir = f"{BPE_DIR}/{direction}{prefix}_databin" bpe_dir = f"{BPE_DIR}/{direction}{prefix}" if pairs_per_shard is None: binarize_(bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=splits) move_databin_files(bpe_databin_dir, databin_dir) else: # binarize valid and test which will not be sharded binarize_( bpe_dir, bpe_databin_dir, direction, spm_vocab=spm_vocab, splits=[s for s in splits if s != "train"]) for shard_bpe_dir in glob.glob(f"{bpe_dir}/shard*"): path_strs = os.path.split(shard_bpe_dir) shard_str = path_strs[-1] shard_folder = f"{bpe_databin_dir}/{shard_str}" databin_shard_folder = f"{databin_dir}/{shard_str}" print(f'working from 
if __name__ == '__main__':
    # Script entry point: SPM-encode and binarize every translation direction
    # found among the raw train/valid/test files under the data root.
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", default=f"{WORKDIR_ROOT}/ML50")
    parser.add_argument("--raw-folder", default='raw')
    parser.add_argument("--bpe-folder", default='bpe')
    parser.add_argument("--databin-folder", default='databin')

    args = parser.parse_args()

    # RAW_DIR and BPE_DIR are read as module globals by encode_spm() and
    # binarize(), so they must be assigned at module level here.
    DATA_PATH = args.data_root
    RAW_DIR = f'{DATA_PATH}/{args.raw_folder}'
    BPE_DIR = f'{DATA_PATH}/{args.bpe_folder}'
    DATABIN_DIR = f'{DATA_PATH}/{args.databin_folder}'
    os.makedirs(BPE_DIR, exist_ok=True)

    raw_files = itertools.chain(
        glob.glob(f'{RAW_DIR}/train*'),
        glob.glob(f'{RAW_DIR}/valid*'),
        glob.glob(f'{RAW_DIR}/test*'),
    )

    # File names look like "<split>.<src>-<tgt>.<lang>"; field 1 is the
    # direction. Every direction shows up once per split and per language
    # side, so de-duplicate: otherwise the same direction is wiped,
    # re-encoded and re-binarized several times over.
    directions = sorted(
        {os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files}
    )

    for direction in directions:
        prefix = ""
        splits = ['train', 'valid', 'test']
        try:
            # Start from an empty per-direction BPE folder.
            shutil.rmtree(f'{BPE_DIR}/{direction}{prefix}', ignore_errors=True)
            os.mkdir(f'{BPE_DIR}/{direction}{prefix}')
            os.makedirs(DATABIN_DIR, exist_ok=True)
        except OSError as error:
            # Best effort, as in the original: report and carry on.
            print(error)
        spm_model, spm_vocab = SPM_MODEL, SPM_VOCAB
        encode_spm(spm_model, direction=direction, splits=splits)
        binarize(DATABIN_DIR, direction, spm_vocab=spm_vocab, splits=splits)
# Matches a sacrebleu-style result line such as "BLEU+case.mixed = 34.5 ..."
# and captures the score token that follows " = ".
BLEU_REGEX = re.compile("^BLEU\\S* = (\\S+) ")


def run_eval_bleu(cmd):
    """Run shell command *cmd* and return the first BLEU score it prints.

    stdout and stderr of the command are captured together, decoded as
    UTF-8, stripped and echoed to stdout. The output is then scanned line by
    line for the first line matching ``BLEU_REGEX``.

    Returns:
        The captured score as a float, or ``-1.0`` when no line matches.

    Raises:
        subprocess.CalledProcessError: if the command exits non-zero.
        ValueError: if the captured score token is not a valid float.
    """
    raw = check_output(cmd, shell=True, stderr=subprocess.STDOUT)
    text = raw.decode("utf-8").strip()
    print(text)

    # Lazily search each line; only the first successful match is used.
    matches = (BLEU_REGEX.search(row) for row in text.strip().split('\n'))
    first_hit = next((m for m in matches if m is not None), None)
    if first_hit is None:
        return -1.0
    return float(first_hit.groups()[0])
-eq 0 ] || echo ""' bleu1 = run_eval_bleu(cmd1) if bleu1 != 100.0: not_matchings.append(f'{sacrebleu_set}:{src_tgt} source side not matching: {test_src}') bleu2 = run_eval_bleu(cmd2) if bleu2 != 100.0: not_matchings.append(f'{sacrebleu_set}:{src_tgt} target side not matching: {test_tgt}') return not_matchings if __name__ == "__main__": to_data_path = f'{WORKDIR_ROOT}/iwsltv2' not_matching = check_data_test_bleu( f'{to_data_path}/raw', [ ('iwslt17', ['en_XX-ar_AR', 'en_XX-ko_KR', 'ar_AR-en_XX', 'ko_KR-en_XX']), ('iwslt17', ['en_XX-it_IT', 'en_XX-nl_XX', 'it_IT-en_XX', 'nl_XX-en_XX']), ('iwslt17/tst2015', ['en_XX-vi_VN', "vi_VN-en_XX"]), ] ) if len(not_matching) > 0: print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching)) ================================================ FILE: examples/multilingual/data_scripts/check_self_overlaps.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import glob import argparse from utils.dedup import deup import sys WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): print('please specify your working directory root in OS environment variable WORKDIR_ROOT. 
def check_diff(
    from_src_file, from_tgt_file,
    to_src_file, to_tgt_file,
):
    """Count how much of the "from" parallel data re-appears in the "to" data.

    Both file pairs are read in lockstep, one (source, target) line pair at a
    time; lines are compared verbatim, including their line endings.

    Returns:
        A tuple ``(common, common_src, common_tgt, from_count, to_count)``:
        ``common`` is the number of distinct "to" pairs that also occur in
        "from"; ``common_src`` / ``common_tgt`` are the numbers of distinct
        source / target lines shared by both sides; ``from_count`` and
        ``to_count`` are the numbers of line pairs read from each side.
    """
    pairs_in_from = set()
    srcs_in_from = set()
    tgts_in_from = set()
    from_count = 0
    with open(from_src_file, encoding='utf-8') as src_lines, \
            open(from_tgt_file, encoding='utf-8') as tgt_lines:
        for from_count, (src_line, tgt_line) in enumerate(
                zip(src_lines, tgt_lines), start=1):
            pairs_in_from.add((src_line, tgt_line))
            srcs_in_from.add(src_line)
            tgts_in_from.add(tgt_line)

    common = common_src = common_tgt = 0
    to_count = 0
    visited = set()
    with open(to_src_file, encoding='utf-8') as src_lines, \
            open(to_tgt_file, encoding='utf-8') as tgt_lines:
        for to_count, pair in enumerate(zip(src_lines, tgt_lines), start=1):
            # Each distinct "to" pair is examined only once.
            if pair in visited:
                continue
            visited.add(pair)
            src_line, tgt_line = pair
            if pair in pairs_in_from:
                common += 1
            # A shared line is dropped from the pool once counted so that it
            # contributes at most one hit.
            if src_line in srcs_in_from:
                srcs_in_from.remove(src_line)
                common_src += 1
            if tgt_line in tgts_in_from:
                tgts_in_from.remove(tgt_line)
                common_tgt += 1
    return common, common_src, common_tgt, from_count, to_count
f'{args.folder}/{args.split}.{src}-{tgt}.{tgt}' if not os.path.exists(from_src_file): # some test/valid data might in reverse directinos: from_src_file = f'{args.folder}/{args.split}.{tgt}-{src}.{src}' from_tgt_file = f'{args.folder}/{args.split}.{tgt}-{src}.{tgt}' to_src_file = f'{args.folder}/train.{src}-{tgt}.{src}' to_tgt_file = f'{args.folder}/train.{src}-{tgt}.{tgt}' if not os.path.exists(to_src_file) or not os.path.exists(from_src_file): continue r = check_diff(from_src_file, from_tgt_file, to_src_file, to_tgt_file) results.append(r) print(f'{direction}\t', '\t'.join(map(str, r))) if __name__ == "__main__": main() ================================================ FILE: examples/multilingual/data_scripts/check_valid_test_overlaps.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import argparse import pandas as pd import sys WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): print('please specify your working directory root in OS environment variable WORKDIR_ROOT. 
def check_train_sentences(src_path, tgt_path, direction, all_test_data, mess_up_train=None):
    """Find training sentences of one direction that also occur in test data.

    The source and target training files are read in lockstep; every line is
    stripped and looked up in ``all_test_data``. Each hit is recorded under
    the sentence with ``direction`` added to its set.

    Args:
        src_path: path of the source-side training file.
        tgt_path: path of the target-side training file.
        direction: direction label (e.g. ``"en_XX-de_DE"``) to record.
        all_test_data: container of stripped test sentences; only membership
            tests are performed on it.
        mess_up_train: optional dict mapping an overlapping sentence to the
            set of directions it was found in. It is updated in place so that
            results can be accumulated over several calls. When omitted, a
            fresh dict is created for this call (the previous ``{}`` default
            was shared between calls and leaked results from one call into
            the next).

    Returns:
        ``(mess_up_train, size, overlapped_size_counted_dup)`` where ``size``
        is the number of line pairs read and the last value counts every hit
        (source and target sides separately, duplicates included). If either
        file is missing, the counts are 0 and nothing is recorded.
    """
    if mess_up_train is None:
        mess_up_train = {}
    print(f'check training data for {direction} in {src_path} and {tgt_path}')
    size = 0
    overlapped_size_counted_dup = 0
    if not os.path.exists(tgt_path) or not os.path.exists(src_path):
        return mess_up_train, size, overlapped_size_counted_dup

    with open(src_path) as f, open(tgt_path) as g:
        for src_line, tgt_line in zip(f, g):
            size += 1
            # Check the source side first, then the target side, exactly as
            # before; each side counts as its own hit.
            for sentence in (src_line.strip(), tgt_line.strip()):
                if sentence in all_test_data:
                    mess_up_train.setdefault(sentence, set()).add(direction)
                    overlapped_size_counted_dup += 1
    print(f'{direction}: size={size}, overlapped={overlapped_size_counted_dup}')
    return mess_up_train, size, overlapped_size_counted_dup
def main():
    """Report how many training sentences overlap with the test sets.

    Command-line options:
        --folder      root folder of the training data (required).
        --test-data   folder holding the test files (required).
        --directions  comma-separated list of directions such as
                      ``en_XX-de_DE,en_XX-fr_XX``. The option is declared
                      optional, but the script has no way to discover
                      directions on its own, so omitting it is now reported
                      as a usage error instead of failing with an
                      AttributeError on ``None``.

    Prints the ``{direction: (size, overlapped)}`` mapping returned by
    ``check_train_all``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--folder", type=str, required=True,
                        help="the data folder ")
    parser.add_argument("--test-data", type=str, required=True,
                        help="the test data folder ")
    parser.add_argument('--directions', type=str, default=None, required=False)
    args = parser.parse_args()

    if args.directions is None:
        # parser.error() prints the usage text and exits with status 2.
        parser.error(
            "--directions is required: pass a comma-separated list such as "
            "en_XX-de_DE,en_XX-fr_XX")
    directions = sorted(set(args.directions.split(',')))

    raw_data = args.folder
    all_test_data, test_data = get_all_test_data(args.test_data, directions, split='test')
    mess_up_train, data_sizes = check_train_all(raw_data, directions, all_test_data)
    print(data_sizes)
def main():
    """Dedup every ``train.<src>-<tgt>.<lang>`` pair of a folder into another.

    Directions come from ``--directions`` (comma separated) or, when omitted,
    from the ``train*`` file names found in ``--from-folder``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--from-folder", type=str, required=True,
                        help="the data folder to be dedup")
    parser.add_argument("--to-folder", type=str, required=True,
                        help="the data folder to save deduped data")
    parser.add_argument('--directions', type=str, default=None, required=False)
    args = parser.parse_args()

    # Refuse to write the outputs over the inputs. This is an explicit check
    # rather than an ``assert`` so it still runs under ``python -O``.
    if os.path.abspath(args.from_folder) == os.path.abspath(args.to_folder):
        parser.error('--from-folder and --to-folder must be different folders')
    os.makedirs(args.to_folder, exist_ok=True)

    if args.directions is None:
        raw_files = glob.glob(f'{args.from_folder}/train*')
        # Keep only names shaped like ``train.<src>-<tgt>...``; any other
        # ``train*`` entry has no direction component to extract.
        name_parts = (os.path.basename(file_path).split('.') for file_path in raw_files)
        directions = [
            parts[1] for parts in name_parts
            if len(parts) > 1 and parts[1].count('-') == 1
        ]
    else:
        directions = args.directions.split(',')
    directions = sorted(set(directions))

    for direction in directions:
        src, tgt = direction.split('-')
        src_file = f'{args.from_folder}/train.{src}-{tgt}.{src}'
        tgt_file = f'{args.from_folder}/train.{src}-{tgt}.{tgt}'
        src_file_out = f'{args.to_folder}/train.{src}-{tgt}.{src}'
        tgt_file_out = f'{args.to_folder}/train.{src}-{tgt}.{tgt}'
        print(f'deduping {src_file}, {tgt_file}')
        deup(src_file, tgt_file, src_file_out, tgt_file_out)


if __name__ == "__main__":
    main()
# set -x -e

# Quoted so the emptiness test stays well-formed whatever the variable holds.
if [ -z "$WORKDIR_ROOT" ] ;
then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
  exit
fi

# put intermediate files
TMP_DIR=$WORKDIR_ROOT/temp/af_xhv2
# output {train,valid,test} files to dest
DEST=${WORKDIR_ROOT}/ML50/raw

ROOT=${WORKDIR_ROOT}
UTILS=$PWD/utils
TMX2CORPUS="${UTILS}/tmx2corpus"
TMX_TOOL="python ${TMX2CORPUS}/tmx2corpus.py"

mkdir -p $TMP_DIR
mkdir -p $DEST
mkdir -p $UTILS

# download_opus SRC TGT SUBSET URL
# Fetch one OPUS TMX archive into extract_SUBSET.SRC-TGT/, convert it with
# $TMX_TOOL and move the resulting bitext.* files one level up as
# SUBSET.SRC-TGT.{SRC,TGT}.
function download_opus(){
  src=$1
  tgt=$2
  subset=$3
  # This was spelled "ulr" while wget read "$url"; the download only worked
  # because the caller's global $url happened to be set.
  url=$4

  mkdir -p extract_$subset.$src-$tgt
  pushd extract_$subset.$src-$tgt
  if [ ! -f "$subset.$src-$tgt.tmx.gz" ]; then
    wget "$url" -O "$subset.$src-$tgt.tmx.gz"
    gzip -d "$subset.$src-$tgt.tmx.gz"
    f=$subset.$src-$tgt.tmx
    $TMX_TOOL $f
    mv bitext.$src ../$subset.$src-$tgt.$src
    mv bitext.$tgt ../$subset.$src-$tgt.$tgt
  fi
  popd
}

# concat_subsets SRC TGT "SUBSET1 SUBSET2 ..."
# Concatenate the per-subset files into raw_train.SRC-TGT.{SRC,TGT}.
# The subset names must arrive as ONE space-separated argument.
function concat_subsets(){
  src=$1
  tgt=$2
  subsets=$3
  src_train=raw_train.$src-$tgt.$src
  tgt_train=raw_train.$src-$tgt.$tgt
  > $src_train
  > $tgt_train
  for subset in $subsets; do
    cat $subset.$src-$tgt.$src >> $src_train
    cat $subset.$src-$tgt.$tgt >> $tgt_train
  done
}

# Emit a reproducible pseudo-random byte stream derived from the seed, for use
# as "shuf --random-source". The cipher needs an endless input to encrypt.
function get_seeded_random()
{
  seed="$1"
  openssl enc -aes-256-ctr -pass pass:"$seed" -nosalt \
    </dev/zero 2>/dev/null
}

# split_train_valid SRC TGT
# Shuffle both sides with the same seed, then take the first 1500 lines as
# valid.* and the rest as train.*.
# NOTE(review): the two sides stay aligned only if both files have the same
# number of lines; confirm for new corpora.
function split_train_valid(){
  src=$1
  tgt=$2
  raw_src_train=raw_train.$src-$tgt.$src
  raw_tgt_train=raw_train.$src-$tgt.$tgt

  shuf --random-source=<(get_seeded_random 43) $raw_src_train > shuffled.$src-$tgt.$src
  shuf --random-source=<(get_seeded_random 43) $raw_tgt_train > shuffled.$src-$tgt.$tgt

  head -n 1500 shuffled.$src-$tgt.$src > valid.$src-$tgt.$src
  head -n 1500 shuffled.$src-$tgt.$tgt > valid.$src-$tgt.$tgt

  # "tail +N" is the obsolete spelling; "-n +N" is the portable one.
  tail -n +1501 shuffled.$src-$tgt.$src > train.$src-$tgt.$src
  tail -n +1501 shuffled.$src-$tgt.$tgt > train.$src-$tgt.$tgt
}

# copy2dst LSRC LTGT
# Copy the two-letter-code files to $DEST under their long locale names.
function copy2dst(){
  lsrc=$1
  ltgt=$2
  src=${lsrc:0:2}
  tgt=${ltgt:0:2}

  cp valid.$src-$tgt.$src $DEST/valid.$lsrc-$ltgt.$lsrc
  cp valid.$src-$tgt.$tgt $DEST/valid.$lsrc-$ltgt.$ltgt

  cp train.$src-$tgt.$src $DEST/train.$lsrc-$ltgt.$lsrc
  cp train.$src-$tgt.$tgt $DEST/train.$lsrc-$ltgt.$ltgt
}


#for xh-en
declare -A xh_en_urls
xh_en_urls=(
  [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/en-xh.tmx.gz
  [wikimedia]=https://object.pouta.csc.fi/OPUS-wikimedia/v20190628/tmx/en-xh.tmx.gz
  [memat]=https://object.pouta.csc.fi/OPUS-memat/v1/tmx/en-xh.tmx.gz
  [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/en-xh.tmx.gz
  [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/en-xh.tmx.gz
  [XhosaNavy]=https://object.pouta.csc.fi/OPUS-XhosaNavy/v1/tmx/en-xh.tmx.gz
  [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/en-xh.tmx.gz
  [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/en-xh.tmx.gz
)

mkdir -p $TMP_DIR/xh-en
pushd $TMP_DIR/xh-en

for k in "${!xh_en_urls[@]}"
do
  name=$k
  url=${xh_en_urls[$k]}
  echo "$name: $url"
  download_opus xh en $name $url
done
# [*] joins the keys into a single word; [@] would hand concat_subsets only
# the first key as its third argument.
concat_subsets xh en "${!xh_en_urls[*]}"
split_train_valid xh en
copy2dst xh_ZA en_XX
popd


##
#for af-en
declare -A af_en_urls
af_en_urls=(
  [Tatoeba]=https://object.pouta.csc.fi/OPUS-Tatoeba/v20190709/tmx/af-en.tmx.gz
  [uedin]=https://object.pouta.csc.fi/OPUS-bible-uedin/v1/tmx/af-en.tmx.gz
  [GNOME]=https://object.pouta.csc.fi/OPUS-GNOME/v1/tmx/af-en.tmx.gz
  [QED]=https://object.pouta.csc.fi/OPUS-QED/v2.0a/tmx/af-en.tmx.gz
  [KDE4]=https://object.pouta.csc.fi/OPUS-KDE4/v2/tmx/af-en.tmx.gz
  [OpenSubtitles]=https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/tmx/af-en.tmx.gz
  [SPC]=https://object.pouta.csc.fi/OPUS-SPC/v1/tmx/af-en.tmx.gz
  [Ubuntu]=https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/tmx/af-en.tmx.gz
)

mkdir -p $TMP_DIR/af-en
pushd $TMP_DIR/af-en
for k in "${!af_en_urls[@]}"
do
  name=$k
  url=${af_en_urls[$k]}
  echo "$name: $url"
  download_opus af en $name $url
done
concat_subsets af en "${!af_en_urls[*]}"
split_train_valid af en
copy2dst af_ZA en_XX
popd
if [ -z "$WORKDIR_ROOT" ] ;
then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
  exit
fi

set -e
set -o pipefail

SRC=en
SI_TGT=si
NE_TGT=ne

DESTDIR=${WORKDIR_ROOT}/ML50/raw/

ROOT=${WORKDIR_ROOT}/tmp
mkdir -p $ROOT
DATA=$ROOT/data
NE_ROOT=$DATA/all-clean-ne
SI_ROOT=$DATA/all-clean-si

# $DESTDIR is created up front: with "set -e" a missing output folder would
# otherwise abort the script only at the final copy step.
mkdir -p $DATA $NE_ROOT $SI_ROOT $DESTDIR

SI_OPUS_DATASETS=(
  "$SI_ROOT/GNOME.en-si"
  "$SI_ROOT/Ubuntu.en-si"
  "$SI_ROOT/KDE4.en-si"
  "$SI_ROOT/OpenSubtitles.en-si"
)

SI_OPUS_URLS=(
  "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-si.txt.zip"
  "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-si.txt.zip"
  "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-si.txt.zip"
  "https://object.pouta.csc.fi/OPUS-OpenSubtitles/v2018/moses/en-si.txt.zip"
)

NE_OPUS_DATASETS=(
  "$NE_ROOT/GNOME.en-ne"
  "$NE_ROOT/Ubuntu.en-ne"
  "$NE_ROOT/KDE4.en-ne"
)

NE_OPUS_URLS=(
  "https://object.pouta.csc.fi/OPUS-GNOME/v1/moses/en-ne.txt.zip"
  "https://object.pouta.csc.fi/OPUS-Ubuntu/v14.10/moses/en-ne.txt.zip"
  "https://object.pouta.csc.fi/OPUS-KDE4/v2/moses/en-ne.txt.zip"
)

# Paths collected here are deleted once all corpora have been assembled.
REMOVE_FILE_PATHS=()

# Download data
# download_data DEST_FILE URL: skip when DEST_FILE exists, otherwise fetch it
# and abort the script if the file is still missing afterwards.
download_data() {
  CORPORA=$1
  URL=$2

  if [ -f $CORPORA ]; then
    echo "$CORPORA already exists, skipping download"
  else
    echo "Downloading $URL"
    wget $URL -O $CORPORA --no-check-certificate || rm -f $CORPORA
    if [ -f $CORPORA ]; then
      echo "$URL successfully downloaded."
    else
      echo "$URL not successfully downloaded."
      rm -f $CORPORA
      # exit statuses are 0-255; "-1" is not a valid one
      exit 1
    fi
  fi
}

# Example: download_opus_data $LANG_ROOT $TGT
# Downloads and unzips every OPUS archive for TGT ("si", anything else means
# "ne") and concatenates the first three datasets into GNOMEKDEUbuntu.*.
download_opus_data() {
  LANG_ROOT=$1
  TGT=$2

  if [ "$TGT" = "si" ]; then
    URLS=("${SI_OPUS_URLS[@]}")
    DATASETS=("${SI_OPUS_DATASETS[@]}")
  else
    URLS=("${NE_OPUS_URLS[@]}")
    DATASETS=("${NE_OPUS_DATASETS[@]}")
  fi

  # Download and extract data
  for ((i=0;i<${#URLS[@]};++i)); do
    URL=${URLS[i]}
    CORPORA=${DATASETS[i]}

    download_data $CORPORA $URL
    unzip -o $CORPORA -d $LANG_ROOT
    REMOVE_FILE_PATHS+=( $CORPORA $CORPORA.xml $CORPORA.ids $LANG_ROOT/README $LANG_ROOT/LICENSE )
  done

  cat ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$SRC
  cat ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT > $LANG_ROOT/GNOMEKDEUbuntu.$SRC-$TGT.$TGT

  REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$SRC ${DATASETS[1]}.$SRC ${DATASETS[2]}.$SRC )
  REMOVE_FILE_PATHS+=( ${DATASETS[0]}.$TGT ${DATASETS[1]}.$TGT ${DATASETS[2]}.$TGT )
}

download_opus_data $SI_ROOT $SI_TGT
cp ${SI_OPUS_DATASETS[3]}.$SRC $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SRC
cp ${SI_OPUS_DATASETS[3]}.$SI_TGT $SI_ROOT/OpenSubtitles2018.$SRC-$SI_TGT.$SI_TGT
REMOVE_FILE_PATHS+=( ${SI_OPUS_DATASETS[3]}.$SRC ${SI_OPUS_DATASETS[3]}.$SI_TGT )

download_opus_data $NE_ROOT $NE_TGT


# Download and extract Global Voices data
GLOBAL_VOICES="$NE_ROOT/globalvoices.2018q4.ne-en"
GLOBAL_VOICES_URL="http://www.casmacat.eu/corpus/global-voices/globalvoices.ne-en.xliff.gz"

download_data $GLOBAL_VOICES.gz $GLOBAL_VOICES_URL
gunzip -Nf $GLOBAL_VOICES.gz

# The XLIFF <source> elements feed the Nepali side and the <target ...>
# elements feed the English side. The element names are part of the patterns:
# without them both commands match every line and print an empty capture.
sed -ne 's?.*<source>\(.*\)</source>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$NE_TGT
sed -ne 's?.*<target[^>]*>\(.*\)</target>.*?\1?p' $GLOBAL_VOICES > $GLOBAL_VOICES.$SRC

REMOVE_FILE_PATHS+=( $GLOBAL_VOICES )

# Download and extract the bible dataset
BIBLE_TOOLS=bible-corpus-tools
XML_BIBLES=XML_Bibles
XML_BIBLES_DUP=XML_Bibles_dup

if [ ! -e $BIBLE_TOOLS ]; then
    echo "Cloning bible-corpus-tools repository..."
    git clone https://github.com/christos-c/bible-corpus-tools.git
fi

mkdir -p $BIBLE_TOOLS/bin $XML_BIBLES $XML_BIBLES_DUP
javac -cp "$BIBLE_TOOLS/lib/*" -d $BIBLE_TOOLS/bin $BIBLE_TOOLS/src/bible/readers/*.java $BIBLE_TOOLS/src/bible/*.java

download_data bible.tar.gz "https://github.com/christos-c/bible-corpus/archive/v1.2.1.tar.gz"
tar xvzf bible.tar.gz

cp bible-corpus-1.2.1/bibles/{Greek.xml,English.xml,Nepali.xml} $XML_BIBLES/
cp bible-corpus-1.2.1/bibles/{Greek.xml,English-WEB.xml,Nepali.xml} $XML_BIBLES_DUP/

java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateMLBooks $XML_BIBLES_DUP
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES
java -cp $BIBLE_TOOLS/lib/*:$BIBLE_TOOLS/bin bible.CreateVerseAlignedBooks $XML_BIBLES_DUP

cat $XML_BIBLES/aligned/*/English.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$SRC
cat $XML_BIBLES/aligned/*/Nepali.txt > $NE_ROOT/bible.$SRC-$NE_TGT.$NE_TGT
cat $XML_BIBLES_DUP/aligned/*/English-WEB.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$SRC
cat $XML_BIBLES_DUP/aligned/*/Nepali.txt > $NE_ROOT/bible_dup.$SRC-$NE_TGT.$NE_TGT
REMOVE_FILE_PATHS+=( bible-corpus-1.2.1 bible.tar.gz $BIBLE_TOOLS $XML_BIBLES $XML_BIBLES_DUP )

# Download and extract the Penn Treebank dataset
NE_TAGGED=$ROOT/new_submissions_parallel_corpus_project_Nepal
NE_TAGGED_URL="http://www.cle.org.pk/Downloads/ling_resources/parallelcorpus/NepaliTaggedCorpus.zip"
EN_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.en.patch"
NE_TAGGED_PATCH_URL="https://dl.fbaipublicfiles.com/fairseq/data/nepali-penn-treebank.ne.patch"
MOSES=mosesdecoder
MOSES_TOK=$MOSES/scripts/tokenizer
# perl programs that undo the treebank bracket/quote escapes (-LCB-, -RRB-, ...)
EN_PATCH_REGEX="{s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"
NE_PATCH_REGEX="{s:\p{Cf}::g;s:\\\/:\/:g;s/\*\T\*\-\n+//g;s/\-LCB\-/\{/g;s/\-RCB\-/\}/g; s/\-LSB\-/\[/g; s/\-RSB\-/\]/g;s/\-LRB\-/\(/g; s/\-RRB\-/\)/g; s/\'\'/\"/g; s/\`\`/\"/g; s/\ +\'s\ +/\'s /g; s/\ +\'re\ +/\'re /g; s/\"\ +/\"/g; s/\ +\"/\"/g; s/\ n't([\ \.\"])/n't\1/g; s/\r+(.)/\1/g;}"

download_data $DATA/nepali-penn-treebank.$SRC.patch $EN_TAGGED_PATCH_URL
download_data $DATA/nepali-penn-treebank.$NE_TGT.patch $NE_TAGGED_PATCH_URL
download_data original.zip $NE_TAGGED_URL
unzip -o original.zip -d $ROOT

cat $NE_TAGGED/00.txt $NE_TAGGED/01.txt $NE_TAGGED/02.txt > $NE_TAGGED/nepali-penn-treebank.$SRC
cat $NE_TAGGED/00ne_revised.txt $NE_TAGGED/01ne_revised.txt $NE_TAGGED/02ne_revised.txt > $NE_TAGGED/nepali-penn-treebank.$NE_TGT

patch $NE_TAGGED/nepali-penn-treebank.$SRC -i $DATA/nepali-penn-treebank.$SRC.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$SRC
patch $NE_TAGGED/nepali-penn-treebank.$NE_TGT -i $DATA/nepali-penn-treebank.$NE_TGT.patch -o $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT

if [ ! -e $MOSES ]; then
    echo "Cloning moses repository..."
    git clone https://github.com/moses-smt/mosesdecoder.git
fi

cat $NE_TAGGED/nepali-penn-treebank-patched.$SRC | \
  perl -anpe "$EN_PATCH_REGEX" | \
  $MOSES_TOK/tokenizer.perl -l $SRC | \
  $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$SRC

cat $NE_TAGGED/nepali-penn-treebank-patched.$NE_TGT | \
  perl -CIO -anpe "$NE_PATCH_REGEX" | \
  $MOSES_TOK/detokenizer.perl -l $SRC > $NE_ROOT/nepali-penn-treebank.$NE_TGT


# Download nepali dictionary data
NE_DICT=$NE_ROOT/dictionaries
download_data $NE_DICT "http://www.seas.upenn.edu/~nlp/resources/TACL-data-release/dictionaries.tar.gz"
tar xvzf $NE_DICT
cp dictionaries/dict.ne $NE_ROOT/dictionary.$NE_TGT-$SRC
REMOVE_FILE_PATHS+=( $NE_DICT dictionaries )

REMOVE_FILE_PATHS+=( $MOSES $NE_TAGGED original.zip $DATA/nepali-penn-treebank.$SRC.patch $DATA/nepali-penn-treebank.$NE_TGT.patch )


# Remove the temporary files
for ((i=0;i<${#REMOVE_FILE_PATHS[@]};++i)); do
  rm -rf ${REMOVE_FILE_PATHS[i]}
done

# Copy the training data
si=si_LK
ne=ne_NP
en=en_XX
cat $SI_ROOT/GNOMEKDEUbuntu.en-si.si $SI_ROOT/OpenSubtitles2018.en-si.si > $DESTDIR/train.$si-$en.$si
cat $SI_ROOT/GNOMEKDEUbuntu.en-si.en $SI_ROOT/OpenSubtitles2018.en-si.en > $DESTDIR/train.$si-$en.$en

cat $NE_ROOT/bible_dup.en-ne.ne $NE_ROOT/bible.en-ne.ne $NE_ROOT/globalvoices.2018q4.ne-en.ne $NE_ROOT/GNOMEKDEUbuntu.en-ne.ne $NE_ROOT/nepali-penn-treebank.ne > $DESTDIR/train.$ne-$en.$ne
cat $NE_ROOT/bible_dup.en-ne.en $NE_ROOT/bible.en-ne.en $NE_ROOT/globalvoices.2018q4.ne-en.en $NE_ROOT/GNOMEKDEUbuntu.en-ne.en $NE_ROOT/nepali-penn-treebank.en > $DESTDIR/train.$ne-$en.$en


#Download the test sets
wget https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz
tar -xvzf wikipedia_en_ne_si_test_sets.tgz

cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.ne $DESTDIR/valid.$ne-$en.$ne
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.ne-en.en $DESTDIR/valid.$ne-$en.$en

cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.si $DESTDIR/valid.$si-$en.$si
cp wikipedia_en_ne_si_test_sets/wikipedia.dev.si-en.en $DESTDIR/valid.$si-$en.$en

cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.ne $DESTDIR/devtest.$ne-$en.$ne
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.ne-en.en $DESTDIR/devtest.$ne-$en.$en

cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.si $DESTDIR/devtest.$si-$en.$si
cp wikipedia_en_ne_si_test_sets/wikipedia.devtest.si-en.en $DESTDIR/devtest.$si-$en.$en

cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.ne $DESTDIR/test.$ne-$en.$ne
cp wikipedia_en_ne_si_test_sets/wikipedia.test.ne-en.en $DESTDIR/test.$ne-$en.$en

cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.si $DESTDIR/test.$si-$en.$si
cp wikipedia_en_ne_si_test_sets/wikipedia.test.si-en.en $DESTDIR/test.$si-$en.$en

rm -rf wikipedia_en_ne_si_test_sets.tgz wikipedia_en_ne_si_test_sets
if [ -z "$WORKDIR_ROOT" ] ;
then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
  exit
fi

# Working folder for the IITB English-Hindi archives.
IITB=$WORKDIR_ROOT/IITB
mkdir -p $IITB
pushd $IITB

wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/parallel.tgz
tar -xvzf parallel.tgz

wget http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download/dev_test.tgz
tar -xvzf dev_test.tgz

DESTDIR=${WORKDIR_ROOT}/ML50/raw/
# Nothing else in this script creates the output folder, so the copies below
# would fail when it is run on its own.
mkdir -p $DESTDIR

cp parallel/IITB.en-hi.en $DESTDIR/train.hi_IN-en_XX.en_XX
cp parallel/IITB.en-hi.hi $DESTDIR/train.hi_IN-en_XX.hi_IN

cp dev_test/dev.en $DESTDIR/valid.hi_IN-en_XX.en_XX
cp dev_test/dev.hi $DESTDIR/valid.hi_IN-en_XX.hi_IN

cp dev_test/test.en $DESTDIR/test.hi_IN-en_XX.en_XX
cp dev_test/test.hi $DESTDIR/test.hi_IN-en_XX.hi_IN
popd
#echo 'Cloning Moses github repository (for tokenization scripts)...'
#git clone https://github.com/moses-smt/mosesdecoder.git

if [ -z "$WORKDIR_ROOT" ] ;
then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
  exit
fi


data_root=${WORKDIR_ROOT}/iwsltv2
DESTDIR=${WORKDIR_ROOT}/ML50/raw


langs="ar_AR it_IT nl_XX ko_KR vi_VN"
echo "data_root: $data_root"

download_path=${data_root}/downloads
raw=${DESTDIR}
tmp=${data_root}/tmp
orig=${data_root}/orig

mkdir -p $download_path $orig $raw $tmp
#######################
# download_iwslt KEY SRC TGT [SAVE_PREFIX]
# Fetch SRC-TGT.tgz of archive KEY into $download_path unless it is already
# there. Returns wget's status (0 when the download was skipped).
download_iwslt(){
    iwslt_key=$1
    src=$2
    tgt=$3
    save_prefix=$4
    status=0
    pushd ${download_path}
    if [[ ! -f ${save_prefix}$src-$tgt.tgz ]]; then
        wget https://wit3.fbk.eu/archive/${iwslt_key}/texts/$src/$tgt/$src-$tgt.tgz -O ${save_prefix}$src-$tgt.tgz
        status=$?
        # "wget -O" leaves a file behind even when it fails; remove it so the
        # existence test above lets a rerun retry the download.
        [ $status -ne 0 ] && rm -f ${save_prefix}$src-$tgt.tgz
    fi
    # Always undo the pushd; the early "return 0" used to skip it.
    popd
    return $status
}

# extract_iwslt SRC TGT [PREFIX]: unpack a downloaded archive into $orig.
extract_iwslt(){
    src=$1
    tgt=$2
    prefix=$3
    pushd $orig
    tar zxvf ${download_path}/${prefix}$src-${tgt}.tgz
    popd
}

# generate_train LSRC LTGT
# Strip the metadata lines and the title/description tags from
# train.tags.* and write the plain text to $raw/train.LSRC-LTGT.{LSRC,LTGT}.
generate_train(){
    lsrc=$1
    ltgt=$2
    src=${lsrc:0:2}
    tgt=${ltgt:0:2}
    for ll in $lsrc $ltgt; do
        l=${ll:0:2}
        f="$orig/*/train.tags.$src-$tgt.$l"
        f_raw=$raw/train.$lsrc-$ltgt.$ll
        # Each filter names one metadata tag. With an empty pattern
        # "grep -v" discards every line and the output file ends up empty.
        cat $f \
        | grep -v '<url>' \
        | grep -v '<talkid>' \
        | grep -v '<keywords>' \
        | grep -v '<speaker>' \
        | grep -v '<reviewer' \
        | grep -v '<translator' \
        | grep -v '<doc' \
        | grep -v '</doc>' \
        | sed -e 's/<title>//g' \
        | sed -e 's/<\/title>//g' \
        | sed -e 's/<description>//g' \
        | sed -e 's/<\/description>//g' \
        | sed 's/^\s*//g' \
        | sed 's/\s*$//g' \
        > $f_raw
        [ $? -eq 0 ] && echo "extracted $f to $f_raw"
    done
    return 0
}

# convert_valid_test SRC TGT
# Turn every IWSLT*.TED*.SRC-TGT.LANG.xml under $orig into a plain text file
# in $tmp holding the contents of its <seg> elements.
convert_valid_test(){
    src=$1
    tgt=$2
    for l in $src $tgt; do
        echo "lang: ${l}"
        for o in `ls $orig/*/IWSLT*.TED*.$src-$tgt.$l.xml`; do
            fname=${o##*/}
            f=$tmp/${fname%.*}
            echo "$o => $f"
            grep '<seg id' $o \
            | sed -e 's/<seg id="[0-9]*">\s*//g' \
            | sed -e 's/\s*<\/seg>\s*//g' \
            | sed -e "s/\’/\'/g" \
            > $f
            echo ""
        done
    done
}

# generate_subset LSRC LTGT SUBSET PREFIX
# Copy $tmp/PREFIX.src-tgt.lang to $raw/SUBSET.LSRC-LTGT.LANG when it exists.
generate_subset(){
    lsrc=$1
    ltgt=$2
    src=${lsrc:0:2}
    tgt=${ltgt:0:2}
    subset=$3
    prefix=$4
    for ll in $lsrc $ltgt; do
        l=${ll:0:2}
        f=$tmp/$prefix.${src}-${tgt}.$l
        if [[ -f $f ]]; then
            cp $f $raw/$subset.${lsrc}-$ltgt.${ll}
        fi
    done
}
#################

echo "downloading iwslt training and dev data"
# using multilingual for it, nl
download_iwslt "2017-01-trnmted" DeEnItNlRo DeEnItNlRo
download_iwslt "2017-01-trnted" ar en
download_iwslt "2017-01-trnted" en ar
download_iwslt "2017-01-trnted" ko en
download_iwslt "2017-01-trnted" en ko
download_iwslt "2015-01" vi en
download_iwslt "2015-01" en vi

echo "donwloading iwslt test data"
download_iwslt "2017-01-mted-test" it en "test."
download_iwslt "2017-01-mted-test" en it "test."
download_iwslt "2017-01-mted-test" nl en "test."
download_iwslt "2017-01-mted-test" en nl "test."

download_iwslt "2017-01-ted-test" ar en "test."
download_iwslt "2017-01-ted-test" en ar "test."
download_iwslt "2017-01-ted-test" ko en "test."
download_iwslt "2017-01-ted-test" en ko "test."
download_iwslt "2015-01-test" vi en "test."
download_iwslt "2015-01-test" en vi "test."

echo "extract training data tar balls"
extract_iwslt DeEnItNlRo DeEnItNlRo
extract_iwslt ar en
extract_iwslt en ar
extract_iwslt ko en
extract_iwslt en ko
extract_iwslt vi en
extract_iwslt en vi


echo "extracting iwslt test data"
for lang in $langs; do
    l=${lang:0:2}
    extract_iwslt $l en "test."
    extract_iwslt en $l "test."
done

echo "convert dev and test data"
for lang in $langs; do
    s_lang=${lang:0:2}
    convert_valid_test $s_lang en
    convert_valid_test en $s_lang
done



echo "creating training data into $raw"
for lang in $langs; do
    generate_train $lang en_XX
    generate_train en_XX $lang
done

echo "creating iwslt dev data into raw"
generate_subset en_XX vi_VN valid "IWSLT15.TED.tst2013"
generate_subset vi_VN en_XX valid "IWSLT15.TED.tst2013"

generate_subset en_XX ar_AR valid "IWSLT17.TED.tst2016"
generate_subset ar_AR en_XX valid "IWSLT17.TED.tst2016"
generate_subset en_XX ko_KR valid "IWSLT17.TED.tst2016"
generate_subset ko_KR en_XX valid "IWSLT17.TED.tst2016"


generate_subset en_XX it_IT valid "IWSLT17.TED.tst2010"
generate_subset it_IT en_XX valid "IWSLT17.TED.tst2010"
generate_subset en_XX nl_XX valid "IWSLT17.TED.tst2010"
generate_subset nl_XX en_XX valid "IWSLT17.TED.tst2010"

echo "creating iswslt test data into raw"
generate_subset en_XX vi_VN test "IWSLT15.TED.tst2015"
generate_subset vi_VN en_XX test "IWSLT15.TED.tst2015"

generate_subset en_XX ar_AR test "IWSLT17.TED.tst2017"
generate_subset ar_AR en_XX test "IWSLT17.TED.tst2017"
generate_subset en_XX ko_KR test "IWSLT17.TED.tst2017"
generate_subset ko_KR en_XX test "IWSLT17.TED.tst2017"

generate_subset en_XX it_IT test "IWSLT17.TED.tst2017.mltlng"
generate_subset it_IT en_XX test "IWSLT17.TED.tst2017.mltlng"
generate_subset en_XX nl_XX test "IWSLT17.TED.tst2017.mltlng"
generate_subset nl_XX en_XX test "IWSLT17.TED.tst2017.mltlng"

# normalize iwslt directions into x-en
pushd $raw
for lang in $langs; do
    for split in test valid; do
        x_en_f1=$split.$lang-en_XX.en_XX
        x_en_f2=$split.$lang-en_XX.${lang}

        en_x_f1=$split.en_XX-$lang.en_XX
        en_x_f2=$split.en_XX-$lang.${lang}

        if [ -f $en_x_f1 ] && [ ! -f $x_en_f1 ]; then
            echo "cp $en_x_f1 $x_en_f1"
            cp $en_x_f1 $x_en_f1
        fi
        # The existence test must look at the en-x file; testing $x_en_f2
        # against itself could never be true, so this copy never ran.
        if [ -f $en_x_f2 ] && [ ! -f $x_en_f2 ]; then
            echo "cp $en_x_f2 $x_en_f2"
            cp $en_x_f2 $x_en_f2
        fi
    done
done
popd
if [ -z $WORKDIR_ROOT ] ;
then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
  exit
fi

# Where the archive is unpacked, and where the ML50 raw files are written.
SRCDIR=$WORKDIR_ROOT/indic_languages_corpus
DESTDIR=${WORKDIR_ROOT}/ML50/raw/
mkdir -p $SRCDIR
mkdir -p $DESTDIR

cd $SRCDIR
wget http://lotus.kuee.kyoto-u.ac.jp/WAT/indic-multilingual/indic_languages_corpus.tar.gz
tar -xvzf indic_languages_corpus.tar.gz

SRC_EXTRACT_DIR=$SRCDIR/indic_languages_corpus/bilingual

# For each language: copy train/dev/test of <code>-en into
# train/valid/test.<locale>-en_XX.{<locale>,en_XX}, foreign side first.
for locale in ml_IN ur_PK te_IN; do
  code=${locale%%_*}
  for mapping in train:train dev:valid test:test; do
    in_split=${mapping%%:*}
    out_split=${mapping##*:}
    cp $SRC_EXTRACT_DIR/$code-en/$in_split.$code $DESTDIR/$out_split.$locale-en_XX.$locale
    cp $SRC_EXTRACT_DIR/$code-en/$in_split.en $DESTDIR/$out_split.$locale-en_XX.en_XX
  done
done
def call(cmd):
    """Echo ``cmd`` and run it through the shell, raising on a non-zero exit."""
    print(f"Executing: {cmd}")
    check_call(cmd, shell=True)


class MultiLingualAlignedCorpusReader(object):
    """Read the TED talk ``all_talks_{train,dev,test}.tsv`` files.

    On construction the three splits are loaded into ``self.data``, a dict
    ``split -> {'source': [...], 'target': [...]}`` of aligned sentences.
    """

    def __init__(self, corpus_path, delimiter='\t',
                 target_token=True, bilingual=True, corpus_type='file',
                 lang_dict=None,
                 eval_lang_dict=None, zero_shot=False,
                 detok=True,
                 ):
        """Load all splits found under ``corpus_path``.

        ``lang_dict`` maps ``'source'``/``'target'`` to lists of column
        names; when omitted it is ``{'source': ['fr'], 'target': ['en']}``.
        A fresh dict is built per instance so that instances never share
        (and cannot corrupt) one default object.
        """
        if lang_dict is None:
            lang_dict = {'source': ['fr'], 'target': ['en']}

        self.empty_line_flag = 'NULL'
        self.corpus_path = corpus_path
        self.delimiter = delimiter
        self.bilingual = bilingual
        self.lang_dict = lang_dict
        self.lang_set = set()
        self.target_token = target_token
        self.zero_shot = zero_shot
        self.eval_lang_dict = eval_lang_dict
        self.corpus_type = corpus_type
        self.detok = detok

        for list_ in self.lang_dict.values():
            self.lang_set.update(list_)

        self.data = dict()
        self.data['train'] = self.read_aligned_corpus(split_type='train')
        self.data['test'] = self.read_aligned_corpus(split_type='test')
        self.data['dev'] = self.read_aligned_corpus(split_type='dev')

    def read_data(self, file_loc_):
        """Return the stripped lines of the UTF-8 file ``file_loc_``."""
        with io.open(file_loc_, 'r', encoding='utf8') as fp:
            return [line.strip() for line in fp]

    def filter_text(self, dict_):
        """Drop sentence pairs where either side is empty or contains the flag.

        When ``self.target_token`` is set, the first whitespace-separated
        field of the source (the ``__lang__`` token) is ignored for the
        emptiness check; the sentence itself is kept unmodified.
        """
        field_index = 1 if self.target_token else 0
        data_dict = defaultdict(list)
        for sent1, sent2 in zip(dict_['source'], dict_['target']):
            src_sent = ' '.join(sent1.split()[field_index:])
            if self.empty_line_flag in src_sent or len(src_sent) == 0:
                continue
            if self.empty_line_flag in sent2 or len(sent2) == 0:
                continue
            data_dict['source'].append(sent1)
            data_dict['target'].append(sent2)
        return data_dict

    def read_file(self, split_type, data_type):
        """Return the loaded sentences of ``split_type`` / ``data_type``."""
        return self.data[split_type][data_type]

    def save_file(self, path_, split_type, data_type, lang):
        """Write one side of a split next to ``path_`` as a ``.tok.`` file.

        When ``self.detok`` is set the file is then detokenized via
        ``de_tok``.
        """
        tok_file = tok_file_name(path_, lang)
        with io.open(tok_file, 'w', encoding='utf8') as fp:
            for line in self.data[split_type][data_type]:
                fp.write(line + '\n')
        if self.detok:
            de_tok(tok_file, lang)

    def add_target_token(self, list_, lang_id):
        """Prefix every sentence of ``list_`` with ``__<lang_id>__ ``."""
        token = '__' + lang_id + '__'
        return [token + ' ' + sent for sent in list_]

    def read_from_single_file(self, path_, s_lang, t_lang):
        """Read the ``s_lang`` and ``t_lang`` columns of the table ``path_``.

        Returns ``(source_sentences, target_sentences)``; the sources carry
        the target-language token when ``self.target_token`` is set.
        """
        data_dict = defaultdict(list)
        with io.open(path_, 'r', encoding='utf8') as fp:
            # Honour the configured delimiter (a tab unless overridden).
            reader = csv.DictReader(fp, delimiter=self.delimiter, quoting=csv.QUOTE_NONE)
            for row in reader:
                data_dict['source'].append(row[s_lang])
                data_dict['target'].append(row[t_lang])

        if self.target_token:
            text = self.add_target_token(data_dict['source'], t_lang)
            data_dict['source'] = text

        return data_dict['source'], data_dict['target']

    def read_aligned_corpus(self, split_type='train'):
        """Load and filter every language pair of one split.

        Pairs are zipped from ``lang_dict`` (or ``eval_lang_dict`` for
        non-train splits) in zero-shot mode, and taken as the cross product
        of sources and targets in bilingual mode.
        """
        data_dict = defaultdict(list)
        iterable = []
        s_list = []
        t_list = []

        if self.zero_shot:
            if split_type == "train":
                iterable = zip(self.lang_dict['source'], self.lang_dict['target'])
            else:
                iterable = zip(self.eval_lang_dict['source'], self.eval_lang_dict['target'])

        elif self.bilingual:
            iterable = itertools.product(self.lang_dict['source'], self.lang_dict['target'])

        for s_lang, t_lang in iterable:
            if s_lang == t_lang:
                continue
            if self.corpus_type == 'file':
                split_type_file_path = os.path.join(self.corpus_path,
                                                    "all_talks_{}.tsv".format(split_type))
                s_list, t_list = self.read_from_single_file(split_type_file_path,
                                                            s_lang=s_lang,
                                                            t_lang=t_lang)
            data_dict['source'] += s_list
            data_dict['target'] += t_list
        new_data_dict = self.filter_text(data_dict)
        return new_data_dict


def read_langs(corpus_path):
    """Return the language columns of ``<corpus_path>/extracted/all_talks_dev.tsv``.

    The names are read from the header row itself, so a table without any
    data row still works. The ``talk_name`` column is excluded.
    """
    split_type_file_path = os.path.join(corpus_path, 'extracted', "all_talks_dev.tsv")
    with io.open(split_type_file_path, 'r', encoding='utf8') as fp:
        reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
        header = reader.fieldnames or []
        return [k for k in header if k != 'talk_name']
def extra_english(corpus_path, split):
    """Extract the ``en`` column of ``all_talks_<split>.tsv`` and detokenize it.

    The column is first written to ``all_talks_<split>.tok.en`` and then
    detokenized into ``all_talks_<split>.en``. Detokenizing a file onto
    itself would make the shell truncate it before it is read.
    """
    split_type_file_path = os.path.join(corpus_path, f"all_talks_{split}.tsv")
    output_split_type_file_path = os.path.join(corpus_path, f"all_talks_{split}.en")
    tok_file = tok_file_name(output_split_type_file_path, 'en')
    with io.open(split_type_file_path, 'r', encoding='utf8') as fp, \
            io.open(tok_file, 'w', encoding='utf8') as fw:
        reader = csv.DictReader(fp, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            fw.write(row['en'] + '\n')
    de_tok(tok_file, 'en')


def tok_file_name(filename, lang):
    """Insert a ``tok`` component before the last dot-separated part.

    ``lang`` is accepted for interface compatibility and is not used.
    """
    seps = filename.split('.')
    seps.insert(-1, 'tok')
    tok_file = '.'.join(seps)
    return tok_file


def detok_file_name(tok_file):
    """Inverse of :func:`tok_file_name`: drop the ``tok`` part before the suffix.

    Only that one component is removed, so a ``.tok.`` elsewhere in the path
    (for example in a folder name) is left alone. A name without the
    component is returned unchanged.
    """
    seps = tok_file.split('.')
    if len(seps) >= 3 and seps[-2] == 'tok':
        del seps[-2]
    return '.'.join(seps)


def de_tok(tok_file, lang):
    """Detokenize ``tok_file`` with the Moses detokenizer script.

    The output goes to the name given by :func:`detok_file_name`; only the
    first two characters of ``lang`` are passed to the script.

    Raises:
        ValueError: if ``tok_file`` has no ``.tok.`` component, because the
            output would then be the input file itself.
    """
    import shlex

    de_tok_file = detok_file_name(tok_file)
    if de_tok_file == tok_file:
        raise ValueError(
            f'{tok_file} is not a ".tok." file; refusing to detokenize it in place')
    # Paths are quoted because the command is interpreted by a shell.
    cmd = 'perl {detok_cmd} -l {lang} < {tok_file} > {de_tok_file}'.format(
        detok_cmd=shlex.quote(detok_cmd),
        tok_file=shlex.quote(tok_file),
        de_tok_file=shlex.quote(de_tok_file),
        lang=shlex.quote(lang[:2]))
    call(cmd)


def extra_bitex(
    ted_data_path,
    lsrc_lang,
    ltrg_lang,
    target_token,
    output_data_path,
):
    """Write train/test/valid files of one language pair to ``output_data_path``.

    ``lsrc_lang``/``ltrg_lang`` are the long codes used in the output file
    names (``-`` is replaced by ``_``); the TED column name is derived from
    each of them by ``get_ted_lang``.
    """
    def get_ted_lang(lang):
        """Map a long language code to the column name used in the TED table."""
        long_langs = ['pt-br', 'zh-cn', 'zh-tw', 'fr-ca']
        if lang[:5] in long_langs:
            return lang[:5]
        elif lang[:4] == 'calv':
            return lang[:5]
        elif lang in ['pt_BR', 'zh_CN', 'zh_TW', 'fr_CA']:
            return lang.lower().replace('_', '-')
        return lang[:2]

    src_lang = get_ted_lang(lsrc_lang)
    trg_lang = get_ted_lang(ltrg_lang)
    train_lang_dict = {'source': [src_lang], 'target': [trg_lang]}
    eval_lang_dict = {'source': [src_lang], 'target': [trg_lang]}

    obj = MultiLingualAlignedCorpusReader(corpus_path=ted_data_path,
                                          lang_dict=train_lang_dict,
                                          target_token=target_token,
                                          corpus_type='file',
                                          eval_lang_dict=eval_lang_dict,
                                          zero_shot=False,
                                          bilingual=True)

    os.makedirs(output_data_path, exist_ok=True)
    lsrc_lang = lsrc_lang.replace('-', '_')
    ltrg_lang = ltrg_lang.replace('-', '_')
    # The corpus calls its validation split "dev"; output files use "valid".
    for out_split, split_type in (('train', 'train'), ('test', 'test'), ('valid', 'dev')):
        prefix = output_data_path + f"/{out_split}.{lsrc_lang}-{ltrg_lang}"
        obj.save_file(prefix + f".{lsrc_lang}",
                      split_type=split_type, data_type='source', lang=src_lang)
        obj.save_file(prefix + f".{ltrg_lang}",
                      split_type=split_type, data_type='target', lang=trg_lang)
def bar_custom(current, total, width=80):
    """Progress callback for ``wget.download``: print an in-place status line.

    ``width`` is accepted because the callback is invoked with it; it is not
    used. A ``total`` that is not positive is reported as 0% instead of
    dividing by zero.
    """
    percent = current / total * 100 if total > 0 else 0
    print("Downloading: %d%% [%d / %d] Ks" % (percent, current / 1000, total / 1000), end='\r')


def download_and_extract(download_to, extract_to):
    """Download ``ted_talks.tar.gz`` into ``download_to`` and unpack it.

    Both steps are skipped when their result already exists: the archive
    file, respectively ``<extract_to>/all_talks_train.tsv``.
    """
    url = 'http://phontron.com/data/ted_talks.tar.gz'
    filename = f"{download_to}/ted_talks.tar.gz"
    if os.path.exists(filename):
        print(f'{filename} has already been downloaded so skip')
    else:
        filename = wget.download(url, filename, bar=bar_custom)
    if os.path.exists(f'{extract_to}/all_talks_train.tsv'):
        print('Already extracted so skip')
    else:
        extract_cmd = f'tar xzfv "{filename}" -C "{extract_to}"'
        call(extract_cmd)


if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('--ted_data_path', type=str, default=WORKDIR_ROOT, required=False)
    parser.add_argument(
        '--direction-list',
        type=str,
        # default=None,
        # for ML50
        default=(
            "bn_IN-en_XX,he_IL-en_XX,fa_IR-en_XX,id_ID-en_XX,sv_SE-en_XX,pt_XX-en_XX,ka_GE-en_XX,ka_GE-en_XX,th_TH-en_XX,"
            "mr_IN-en_XX,hr_HR-en_XX,uk_UA-en_XX,az_AZ-en_XX,mk_MK-en_XX,gl_ES-en_XX,sl_SI-en_XX,mn_MN-en_XX,"
            # non-english directions
            # "fr_XX-de_DE,"  # replaced with wmt20
            # "ja_XX-ko_KR,es_XX-pt_XX,ru_RU-sv_SE,hi_IN-bn_IN,id_ID-ar_AR,cs_CZ-pl_PL,ar_AR-tr_TR"
        ),
        required=False)
    parser.add_argument('--target-token', action='store_true', default=False)
    parser.add_argument('--extract-all-english', action='store_true', default=False)

    args = parser.parse_args()

    # TED Talks data directory
    ted_data_path = args.ted_data_path

    download_to = f'{ted_data_path}/downloads'
    extract_to = f'{ted_data_path}/extracted'

    # DESTDIR=${WORKDIR_ROOT}/ML50/raw/
    output_path = f'{ted_data_path}/ML50/raw'
    os.makedirs(download_to, exist_ok=True)
    os.makedirs(extract_to, exist_ok=True)
    os.makedirs(output_path, exist_ok=True)
    download_and_extract(download_to, extract_to)

    if args.extract_all_english:
        # The tables are unpacked into ``extract_to`` (see the existence
        # check in download_and_extract), so read them from there.
        for split in ['train', 'dev', 'test']:
            extra_english(extract_to, split)
        sys.exit(0)
    if args.direction_list is not None:
        directions = args.direction_list.strip().split(',')
        directions = [tuple(d.strip().split('-', 1)) for d in directions if d]
        # Drop repeated entries (the default list names ka_GE-en_XX twice)
        # while keeping the given order, so no pair is processed twice.
        directions = list(dict.fromkeys(directions))
    else:
        langs = read_langs(ted_data_path)
        directions = [('en', tgt) for tgt in langs if tgt != 'en']
    print(f'num directions={len(directions)}: {directions}')

    for src_lang, trg_lang in directions:
        print('--working on {}-{}'.format(src_lang, trg_lang))
        extra_bitex(
            extract_to,
            src_lang,
            trg_lang,
            target_token=args.target_token,
            output_data_path=output_path
        )
wget http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/$WAT_MY_EN unzip $WAT_MY_EN SRC_EXTRACT_DIR=$SRCDIR/wat2020.my-en/alt cp $SRC_EXTRACT_DIR/train.alt.en $DESTDIR/train.my_MM-en_XX.en_XX cp $SRC_EXTRACT_DIR/train.alt.my $DESTDIR/train.my_MM-en_XX.my_MM cp $SRC_EXTRACT_DIR/dev.alt.en $DESTDIR/valid.my_MM-en_XX.en_XX cp $SRC_EXTRACT_DIR/dev.alt.my $DESTDIR/valid.my_MM-en_XX.my_MM cp $SRC_EXTRACT_DIR/test.alt.en $DESTDIR/test.my_MM-en_XX.en_XX cp $SRC_EXTRACT_DIR/test.alt.my $DESTDIR/test.my_MM-en_XX.my_MM ================================================ FILE: examples/multilingual/data_scripts/download_wmt19_and_before.py ================================================ from typing import NamedTuple, List from urllib.parse import urlparse import os, sys import subprocess from subprocess import check_call, check_output import glob import wget import re import multiprocessing as mp from functools import partial import pathlib from collections import OrderedDict WORKDIR_ROOT = os.environ.get('WORKDIR_ROOT', None) if WORKDIR_ROOT is None or not WORKDIR_ROOT.strip(): print('please specify your working directory root in OS environment variable WORKDIR_ROOT. 
def get_downloaded_file(dl_folder, url):
    """Resolve a URL record to ``(url, local_path)``.

    Args:
        dl_folder: folder that holds downloaded files.
        url: either a plain URL string, or a ``(url, file_name)`` tuple that
            names the local file explicitly.

    Returns:
        A ``(url, path)`` pair where ``path`` is ``dl_folder/<file name>``.
        For a plain URL the file name is the URL path with its segments joined
        by ``_`` (e.g. ``/wmt13/dev.tgz`` -> ``wmt13_dev.tgz``).
    """
    if not isinstance(url, tuple):
        path_segments = urlparse(url).path.split('/')
        local_name = '_'.join(path_segments[1:])
        return url, f"{dl_folder}/{local_name}"

    remote, local_name = url
    return remote, f"{dl_folder}/{local_name}"
def download_files(dl_folder, urls, completed_urls=None):
    """Download every URL record in ``urls`` and record where each one landed.

    Args:
        dl_folder: folder the files are downloaded into.
        urls: iterable of URL records (a URL string, or a tuple as accepted by
            ``get_downloaded_file``).
        completed_urls: optional dict to update in place; when omitted a fresh
            dict is created for this call.

    Returns:
        The dict mapping ``str(url)`` to the downloaded file path.
    """
    # A `{}` default would be created once and shared by every call that omits
    # the argument, leaking entries from one call into the next.
    if completed_urls is None:
        completed_urls = {}
    for url_record in urls:
        url, _ = get_downloaded_file(dl_folder, url_record)
        filename = download_a_url(dl_folder, url_record)
        completed_urls[str(url)] = filename
    return completed_urls
def extract_all_files(
    completed_urls, extract_folder,
    get_extract_name=get_extract_name,
    completed_extraction=None,
    debug=False):
    """Extract every downloaded archive and report where each one was unpacked.

    Args:
        completed_urls: dict mapping URL -> downloaded file path.
        extract_folder: parent folder for the per-archive extraction folders.
        get_extract_name: callable turning a downloaded file path into the name
            of its extraction folder.
        completed_extraction: optional container of downloaded file paths to
            skip.  It is only read here, never updated.
        debug: forwarded to ``extract_file``.

    Returns:
        An ``OrderedDict`` mapping URL -> extraction folder, in the iteration
        order of ``completed_urls``, for every file that was not skipped.
    """
    if completed_extraction is None:
        completed_extraction = {}
    extracted_folders = OrderedDict()
    # Iterate the dict directly: wrapping the items in set() added nothing
    # (keys are already unique) and made the resulting order differ per run.
    for url, downloaded_file in completed_urls.items():
        if downloaded_file in completed_extraction:
            print(f'{downloaded_file} is already extracted; so skip')
            continue
        extracted_folders[url] = extract_file(
            downloaded_file, extract_folder, get_extract_name, debug)
    return extracted_folders
def cut_wikitles(wiki_file, debug):
    """Split the fi-en wiki-titles file into one plain-text file per language.

    Only a path ending in ``wiki/fi-en/titles.fi-en`` is handled.  Its
    ``|||``-separated columns are written to ``<wiki_file>.fi`` (column 1) and
    ``<wiki_file>.en`` (column 2) with shell commands.

    Args:
        wiki_file: path of the candidate wiki-titles file.
        debug: when truthy, print a note if both outputs already exist; also
            forwarded to ``call``.

    Returns:
        ``wiki_file`` when the file is handled (converted now or earlier),
        ``None`` for any other path.
    """
    # different languages have different file names:
    if not wiki_file.endswith('wiki/fi-en/titles.fi-en'):
        return None

    # Kept in a variable so the backslash never appears inside an f-string
    # replacement field.
    BACKSLASH = '\\'
    outputs = [f'{wiki_file}.fi', f'{wiki_file}.en']
    commands = [
        f"cat {wiki_file} | sed 's/|||/{BACKSLASH}t/g' |cut -f{column} |awk '{{$1=$1}};1' > {target}"
        for column, target in enumerate(outputs, start=1)
    ]

    if all(os.path.exists(target) for target in outputs):
        if debug:
            print(f'{wiki_file} already processed to {outputs[0]} and {outputs[1]}; so skip')
        return wiki_file

    for command in commands:
        call(command, debug=debug)
    return wiki_file
def match_patts(file_path, file_patterns, src, tgt, lang):
    """Tell whether ``file_path`` contains any of the formatted ``file_patterns``.

    Matching is a plain substring test (not a glob) on the pattern after its
    ``{src}``, ``{tgt}`` and ``{lang}`` placeholders have been filled in.

    Args:
        file_path: path to test.
        file_patterns: iterable whose items are either a pattern string, or a
            ``(pattern, directions)`` tuple restricting the pattern to the
            listed ``"src-tgt"`` directions.
        src: source language code.
        tgt: target language code.
        lang: language code of the file side being looked for.

    Returns:
        ``True`` as soon as one applicable pattern matches, else ``False``.
    """
    direction = f'{src}-{tgt}'
    for file_pattern in file_patterns:
        # Unpack tuple entries before formatting: calling .format on the tuple
        # itself raised AttributeError for every (pattern, directions) entry.
        if isinstance(file_pattern, tuple):
            pattern, directions = file_pattern
            if direction not in directions:
                continue
        else:
            pattern = file_pattern
        # str.format ignores keyword arguments the pattern does not reference.
        if pattern.format(src=src, tgt=tgt, lang=lang) in file_path:
            return True
    return False
def concat_files(split, src, tgt, extracted_folders, split_urls, path_patterns, to_folder, debug=False):
    """Concatenate the extracted files of one split into a file per language side.

    For each of ``src`` and ``tgt`` the files matching ``path_patterns`` are
    collected from the extraction folder of every URL in ``split_urls`` and
    written, sorted and de-duplicated, to
    ``{to_folder}/{split}.{src}-{tgt}.{lang}`` with ``cat``.

    Args:
        split: split name used in the output file name.
        src: source language code; only the part before ``_`` is used for
            pattern matching.
        tgt: target language code, treated like ``src``.
        extracted_folders: dict mapping ``str(url)`` -> extraction folder.
        split_urls: URL records of this split (a URL, or a tuple whose first
            item is the URL).
        path_patterns: file patterns passed to ``extracted_glob``.
        to_folder: output folder.
        debug: forwarded to ``call``.
    """
    for lang in [src, tgt]:
        to_file = f'{to_folder}/{split}.{src}-{tgt}.{lang}'
        s_src, s_tgt, s_lang = src.split('_')[0], tgt.split('_')[0], lang.split('_')[0]
        files = set()
        for url in split_urls:
            if isinstance(url, tuple):
                url, downloaded_file = url
            if str(url) not in extracted_folders:
                # Warn and move on; indexing the missing key right after the
                # warning used to abort the whole run with a KeyError.
                print(f'warning: {url} not in extracted files')
                continue
            files.update(
                extracted_glob(
                    extracted_folders[str(url)], path_patterns,
                    s_src, s_tgt, s_lang))
        if not files:
            print('warning: ', f'No files found for split {to_file}')
            continue
        files = sorted(files)
        print(f'concating {len(files)} files into {to_file}')
        # Quote the output path the same way as the inputs so a space in the
        # working directory does not break the redirection.
        cmd = " ".join(['cat'] + [f'"{f}"' for f in files] + [f'>"{to_file}"'])
        call(cmd, debug=debug)
def download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=False):
    """Pre-fetch ``urls`` into ``dl_folder`` using a pool of worker processes.

    Args:
        dl_folder: folder the files are downloaded into; created if missing.
        extract_folder: unused; kept for signature compatibility.
        urls: iterable of URL records accepted by ``download_a_url``.
        num_processes: number of worker processes.
        debug: unused; kept for signature compatibility.

    Returns:
        None.  Results are not collected, so an exception raised inside a
        worker is not re-raised here.
    """
    # The download target must exist before any worker tries to write into it.
    os.makedirs(dl_folder, exist_ok=True)
    urls = list(urls)
    if not urls:
        return

    download_f = partial(download_a_url, dl_folder)
    pool = mp.Pool(processes=num_processes)
    try:
        # The returned iterator is deliberately not consumed: this pass is a
        # best-effort pre-fetch and download_a_url skips files that already
        # exist when it is called again later.
        pool.imap_unordered(download_f, urls)
    finally:
        # Always release the workers, even if submitting the jobs failed.
        pool.close()
        pool.join()
def download_and_extract(
    to_folder, lang_pairs, dl_dataset,
    to_manually_download_urls,
    completed_urls=None, completed_extraction=None,
    debug=False):
    """Download, extract, convert and assemble one dataset for the given language pairs.

    Args:
        to_folder: root folder; ``downloads``, ``extracted``, ``raw`` and
            ``lid_filtered`` sub-folders are created under it.
        lang_pairs: iterable of ``"src-tgt"`` strings to build splits for.
        dl_dataset: ``DLDataset`` describing URLs and file patterns.
        to_manually_download_urls: ``(url_record, instruction)`` pairs checked
            (and reported when missing) before downloading.
        completed_urls: optional dict of already downloaded URLs, updated in
            place; a fresh dict is used when omitted.
        completed_extraction: optional container of already extracted files;
            a fresh dict is used when omitted.
        debug: print intermediate state and forward to the helpers.
    """
    # `{}` defaults would be shared between calls that omit the arguments.
    if completed_urls is None:
        completed_urls = {}
    if completed_extraction is None:
        completed_extraction = {}

    dl_folder = f'{to_folder}/downloads'
    extract_folder = f'{to_folder}/extracted'
    raw_folder = f'{to_folder}/raw'
    lid_filtered = f'{to_folder}/lid_filtered'

    # The downloads folder is needed too: files are written straight into it.
    for folder in (dl_folder, extract_folder, raw_folder, lid_filtered):
        os.makedirs(folder, exist_ok=True)

    # Called for its printed instructions; the returned list is not needed here.
    check_need_manual_downalod(dl_folder, to_manually_download_urls)

    completed_urls = download_dataset(
        dl_folder, dl_dataset, completed_urls)
    if debug:
        print('completed urls: ', completed_urls)

    extracted_folders = extract_all_files(
        completed_urls,
        extract_folder=extract_folder,
        completed_extraction=completed_extraction,
        debug=debug)
    if debug:
        print('download files have been extracted to folders: ', extracted_folders)

    # Called for its on-disk conversions; the returned mapping is not used.
    convert_files_if_needed(extracted_folders, debug=False)
    for src_tgt in lang_pairs:
        print(f'working on {dl_dataset.name}: {src_tgt}')
        src, tgt = src_tgt.split('-')
        concat_into_splits(dl_dataset,
                           src=src, tgt=tgt,
                           extracted_folders=extracted_folders,
                           to_folder=raw_folder, debug=debug)
    print('completed data into: ', raw_folder)
def extract_czeng17(extract_folder, debug=False):
    """Fetch and unpack the CzEng 1.6 -> 1.7 conversion script.

    Same as ``download_czeng17_script`` with the module-level ``download_to``
    folder as the download location; this function used to carry a verbatim
    copy of that function's body.

    Args:
        extract_folder: parent folder for the extracted script.
        debug: forwarded to the extraction step.

    Returns:
        Path of ``convert_czeng16_to_17.pl`` inside the extraction folder.
    """
    return download_czeng17_script(download_to, extract_folder, debug=debug)
# for es-en # Punctuation in the official test sets will be encoded with ASCII characters (not complex Unicode characters) as much as possible. You may want to normalize your system's output before submission. You are able able to use a rawer version of the test sets that does not have this normalization. # script to normalize punctuation: http://www.statmt.org/wmt11/normalize-punctuation.perl wmt13_es_en = DLDataset( name='wmt13_es-en', train_urls=[ 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', 'http://www.statmt.org/wmt13/training-parallel-un.tgz', 'http://www.statmt.org/wmt13/training-parallel-nc-v8.tgz', ], valid_urls=[ ('http://www.statmt.org/wmt13/dev.tgz', 'wmt13_dev.tgz') ], test_urls=[ ('http://www.statmt.org/wmt13/test.tgz', 'wmt13_test.tgz') ], train_files_patterns=[ ('*/europarl-v7.{src}-{tgt}.{lang}', ['es-en']), ('*commoncrawl.{src}-{tgt}.{lang}', ['es-en']), ('*/news-commentary-v8.{src}-{tgt}.{lang}', ['es-en']), ('un/*undoc.2000.{src}-{tgt}.{lang}', ['es-en']), ] , valid_files_patterns=[ ('dev/newstest2012.{lang}', ['es-en']) ], test_files_patterns=[ ('test/newstest*.{lang}', ['es-en']) ], ) wmt14_de_fr_en = DLDataset( name='wmt14_de_fr_en', train_urls=[ 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', 'http://www.statmt.org/wmt13/training-parallel-un.tgz', 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz', ('http://www.statmt.org/wmt10/training-giga-fren.tar', 'training-giga-fren.gz.tar'), #it is actuall a gz.tar ], valid_urls=[ ('http://www.statmt.org/wmt14/dev.tgz', 'wmt14_dev.tgz'), ], test_urls=[ ('http://www.statmt.org/wmt14/test-full.tgz', 'wmt14_test_full.tgz'), # cleaned test sets ], train_files_patterns=[ ('*/europarl-v7.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), ('*commoncrawl.{src}-{tgt}.{lang}', ['fr-en', 'de-en']), ('*/*news-commentary-v9.{src}-{tgt}.{lang}', 
# pip install git+https://github.com/amake/tmx2corpus.git
#
# URL entries are (url, local file name) tuples.
# Pattern entries are (file pattern, [language pairs the pattern applies to]).
wmt16_ro_en = DLDataset(
    name='wmt16_ro-en',
    train_urls=[
        ('http://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz', 'wmt16_training-parallel-ep-v8.tgz'),
        ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-ro.tmx.gz', 'en-ro.tmx.gz'),
    ],
    valid_urls=[
        ('http://data.statmt.org/wmt16/translation-task/dev-romanian-updated.tgz', 'wmt16_dev.tgz')
    ],
    test_urls=[
        ('http://data.statmt.org/wmt16/translation-task/test.tgz', 'wmt16_test.tgz')
    ],
    train_files_patterns=[
        ('*/*europarl-v8.{src}-{tgt}.{lang}', ['ro-en']),
        ('bitext.{lang}', ['ro-en'])  # SETIMES, converted from tmx
    ] ,
    valid_files_patterns=[
        # Both directions, matching test_files_patterns below: the list used to
        # read ['ro-en', 'ro-en'], which left the en-ro direction without a
        # validation split.
        ('dev/newsdev2016*{src}{tgt}*.{lang}', ['ro-en', 'en-ro'])
    ],
    test_files_patterns=[
        ('test/newstest*{src}{tgt}*.{lang}', ['ro-en', 'en-ro'])
    ],
)
('http://data.statmt.org/wmt17/translation-task/training-parallel-ep-v8.tgz', 'wmt17_training-parallel-ep-v8.tgz'), 'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz', 'http://www.statmt.org/wmt15/wiki-titles.tgz', ('http://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-tr.tmx.gz', 'en-tr.tmx.gz'), ('http://data.statmt.org/wmt17/translation-task/rapid2016.tgz', 'wmt17_rapid2016.tgz'), 'http://data.statmt.org/wmt17/translation-task/leta.v1.tgz', 'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz', 'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz', (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00', 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01',), 'UNv1.0.en-zh.tar.gz'), #manually download files: ('http://nlp.nju.edu.cn/cwmt-wmt/CASIA2015.zip', 'CASIA2015.zip'), ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2011.zip', 'CASICT2011.zip'), ('http://nlp.nju.edu.cn/cwmt-wmt/CASICT2015.zip', 'CASICT2015.zip'), ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2015.zip', 'Datum2015.zip'), ('http://nlp.nju.edu.cn/cwmt-wmt/Datum2017.zip', 'Datum2017.zip'), ('http://nlp.nju.edu.cn/cwmt-wmt/NEU2017.zip', 'NEU2017.zip'), ], valid_urls=[ ('http://data.statmt.org/wmt17/translation-task/dev.tgz', 'wmt17_dev.tgz'), ], test_urls=[ #NEW: Improved translations for zh test sets ('http://data.statmt.org/wmt17/translation-task/test-update-1.tgz', 'wmt17_test_zh_en.tgz'), ('http://data.statmt.org/wmt17/translation-task/test.tgz', 'wmt17_test_others.tgz') ], train_files_patterns=[ ('casict*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ), ('casia*/cas*{src:ch}{tgt:en}.txt', ['zh-en', 'zh-en'] ), ('dataum*/Book*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en']), ('neu*/NEU*{src:cn}{tgt:en}.txt', ['zh-en', 'zh-en'] ), ('*/*UNv1.0.en-zh.{src:zh}{tgt:en}', ['zh-en']), ('training/*news-commentary-v12.{src}-{tgt}.{lang}', ['zh-en', ]), ('*/*europarl-v8.{src}-{tgt}.{lang}', ['fi-en', 'lv-en']), 
('wiki/fi-en/titles.{src}-{tgt}.{lang}', ['fi-en', ]), ('rapid2016.{tgt}-{src}.{lang}', ['fi-en', 'lv-en']), ('*/leta.{lang}', ['lv-en']), ('*/dcep.{lang}', ['lv-en']), ('*/farewell.{lang}', ['lv-en']), ('bitext.{lang}', ['tr-en']), ] , valid_files_patterns=[ ('dev/newsdev2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', [ 'fi-en', 'lv-en', 'tr-en', 'zh-en', 'en-fi', 'en-lv', 'en-tr', 'en-zh' ]), ('dev/newstest2016*{src}{tgt}-{src:src}{tgt:ref}.{lang}', [ 'fi-en', 'tr-en', 'en-fi', 'en-tr', ]), ], test_files_patterns=[ ('test/newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}', [ 'fi-en', 'lv-en', 'tr-en', 'en-fi', 'en-lv', 'en-tr', ]), ('newstest2017-{src}{tgt}-{src:src}{tgt:ref}.{lang}', [ 'zh-en', 'en-zh' ]), ], ) czeng_instruction = 'download instruction at: http://ufal.mff.cuni.cz/czeng/czeng16' #alternative: use the prepared data but detokenize it? wmt18_cs_et_en_manual_downloads = [ #for cs, need to register and download; Register and download CzEng 1.6. #Better results can be obtained by using a subset of sentences, released under a new version name CzEng 1.7. 
# ((f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar', # f'data-plaintext-format.{i}.tar'), czeng_instruction) # for i in range(10) ] wmt18_cs_et_en = DLDataset( name='wmt18_cs_et_en', train_urls=[ 'http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz', 'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz', 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-cs.zipporah0-dedup-clean.tgz', 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-et.zipporah0-dedup-clean.tgz', 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', 'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz', ('http://data.statmt.org/wmt18/translation-task/rapid2016.tgz', 'wmt18_rapid2016.tgz'), # (tuple( # (f'http://ufallab.ms.mff.cuni.cz/~bojar/czeng16-data/data-plaintext-format.{i}.tar', # f'data-plaintext-format.{i}.tar') # for i in range(10) # ), # 'czeng16_data_plaintext.gz.tar'), ], valid_urls=[ ('http://data.statmt.org/wmt18/translation-task/dev.tgz', 'wmt18_dev.tgz'), ], test_urls=[ ('http://data.statmt.org/wmt18/translation-task/test.tgz', 'wmt18_test.tgz'), ], train_files_patterns=[ # ('*/*europarl-v7.{src}-{tgt}.{lang}', ['cs-en']), ('*/*europarl-v8.{src}-{tgt}.{lang}', ['et-en']), # ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['cs-en', 'et-en']), ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['et-en']), # ('*commoncrawl.{src}-{tgt}.{lang}', ['cs-en']), # ('*/news-commentary-v13.{src}-{tgt}.{lang}', ['cs-en']), # ('data.plaintext-format/*train.{lang}', ['cs-en']), ('rapid2016.{tgt}-{src}.{lang}', ['et-en']), ] , valid_files_patterns=[ ('dev/newsdev2018*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['et-en']), # ('dev/newstest2017*{src}{tgt}-{src:src}{tgt:ref}.{lang}', ['cs-en']) ], test_files_patterns=[ ('test/newstest2018-{src}{tgt}-{src:src}{tgt:ref}.{lang}', # ['cs-en', 'et-en']), 
['et-en']), ] ) ru_en_yandex_instruction = 'Yandex Corpus download instruction at: https://translate.yandex.ru/corpus?lang=en' wmt19_ru_gu_kk_lt_manual_downloads = [ (('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ru_en_yandex_instruction) ] wmt19_ru_gu_kk_lt = DLDataset( name='wmt19_ru_gu_kk_lt', train_urls=[ 'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz', 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz', 'https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz', 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz', 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14-wmt19.en-kk.tsv.gz', 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-ru.tsv.gz', 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz', 'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz', 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz', 'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz', 'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz', (('https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00', 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01', 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02',), 'wmt19_UNv1.0.en-ru.tar.gz'), 'https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-lt.tmx.zip', ('https://translate.yandex.ru/corpus?lang=en', 'wmt19_1mcorpus.zip'), ], valid_urls=[ ('http://data.statmt.org/wmt19/translation-task/dev.tgz', 'wmt19_dev.tgz'), ], test_urls=[ ('http://data.statmt.org/wmt19/translation-task/test.tgz', 'wmt19_test.tgz'), ], train_files_patterns=[ ('*europarl-v9.{src}-{tgt}.tsv.{lang}', ['lt-en']), #paracrawl ('*paracrawl-release1.{tgt}-{src}.zipporah0-dedup-clean.{lang}', ['ru-en']), ('bitext.{lang}', 
if __name__ == "__main__":
    # Script entry point: pre-fetch all archives in parallel, verify the files
    # that need a manual download, build the raw splits for every WMT dataset
    # and finally compare the produced test sets against sacrebleu's.
    dl_folder = f'{to_data_path}/downloads'
    extract_folder = f'{to_data_path}/extracted'
    # The parallel pre-fetch writes straight into the downloads folder, so it
    # has to exist before the first download starts.
    os.makedirs(dl_folder, exist_ok=True)

    # speed up the downloads with multiple processing
    urls = {
        url
        for dataset in [wmt13_es_en, wmt14_de_fr_en, wmt16_ro_en, wmt18_cs_et_en, wmt19_ru_gu_kk_lt]
        for split_urls in [dataset.train_urls, dataset.valid_urls, dataset.test_urls]
        for url in split_urls
    }
    download_multi(dl_folder, extract_folder, urls, num_processes=8, debug=True)

    # check manual downloads
    to_manually_download_urls = (
        wmt17_fi_lv_tr_zh_en_manual_downloads + wmt18_cs_et_en_manual_downloads +
        wmt19_ru_gu_kk_lt_manual_downloads
    )
    to_be_manually_dowloaded = check_need_manual_downalod(dl_folder, to_manually_download_urls)
    if len(to_be_manually_dowloaded) > 0:
        print('Missing files that need to be downloaded manually; stop the process now.')
        # sys.exit rather than the interactive-only `exit` helper.
        sys.exit(-1)

    # Shared across all datasets so archives used by several of them are only
    # downloaded once.
    completed_urls = {}
    completed_extraction = {}

    def work_on_wmt(directions, wmt_data):
        download_and_extract(
            to_data_path,
            directions,
            wmt_data,
            to_manually_download_urls=to_manually_download_urls,
            completed_urls=completed_urls, completed_extraction=completed_extraction, debug=True)

    work_on_wmt(
        ['es_XX-en_XX'],
        wmt13_es_en,)
    work_on_wmt(
        [
            'fr_XX-en_XX', 'en_XX-fr_XX',
            # 'en_XX-de_DE', 'de_DE-en_XX',
        ],
        wmt14_de_fr_en,)
    # Romanian is ro_RO on both sides; the reversed direction used to be
    # spelled 'en_XX-ro_XX', producing files under a different language code.
    work_on_wmt(
        ['ro_RO-en_XX', 'en_XX-ro_RO'],
        wmt16_ro_en,)
    work_on_wmt(
        [
            # 'zh_CN-en_XX',
            'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX',
            #in case the reversed directions have different train/valid/test data
            # 'en_XX-zh_CN',
            'en_XX-lv_LV', 'en_XX-fi_FI', 'en_XX-tr_TR',
        ],
        wmt17_fi_lv_tr_zh_en, )
    # czeng17_script_path = download_czeng17_script(download_to, extract_to, debug=False)
    # cz_username = None
    work_on_wmt(
        [
            # 'cs_CZ-en_XX',
            'et_EE-en_XX'],
        wmt18_cs_et_en,)
    work_on_wmt(
        [
            # 'ru_RU-en_XX', 'en_XX-ru_RU',
            'gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX',
            #in case the reversed directions have different train/valid/test data
            'en_XX-gu_IN', 'en_XX-kk_KZ', 'en_XX-lt_LT'
        ],
        wmt19_ru_gu_kk_lt,)

    not_matching = check_wmt_test_bleu(
        f'{to_data_path}/raw',
        [
            ('wmt13', ['es_XX-en_XX']),
            ('wmt14/full', ['fr_XX-en_XX',]),
            ('wmt16', ['ro_RO-en_XX',]),
            # ('wmt17/improved', ['zh_CN-en_XX']),
            ('wmt17', [ 'lv_LV-en_XX', 'fi_FI-en_XX', 'tr_TR-en_XX']),
            ('wmt18', ['cs_CZ-en_XX', 'et_EE-en_XX']),
            ('wmt19', ['gu_IN-en_XX', 'kk_KZ-en_XX', 'lt_LT-en_XX']),
            #'ru_RU-en_XX',
        ]
    )
    if len(not_matching) > 0:
        print('the following datasets do not have matching test datasets:\n\t', '\n\t'.join(not_matching))
if [ -z "$WORKDIR_ROOT" ] ;
then
  echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exitting..."
  # report the misconfiguration to the caller instead of exiting with status 0
  exit 1
fi

set -x -e

# TODO update the workdir and dest dir name
# put fasttext model
WORKDIR=$WORKDIR_ROOT
# put intermediate files
TMP_DIR=$WORKDIR_ROOT/tmp/tmp_wmt20_lowres_download
# output {train,valid,test} files to dest
DEST=$WORKDIR_ROOT/ML50/raw

UTILS=$PWD/utils

# per dataset locations
COMMONCRAWL_DIR=$TMP_DIR/commoncrawl
YANDEX_CORPUS=$WORKDIR_ROOT/wmt20/official/ru/yandex/1mcorpus.zip
# unzipped
CZENG_CORPUS=$WORKDIR_ROOT/wmt20/official/cs/czeng/czeng20-train
CCMT_DIR=$WORKDIR_ROOT/wmt20/official/zh/ccmt/parallel

# Download one corpus into its own sub-folder and expose it as
# <subfolder>.<lang> / <subfolder>.en symlinks in the current directory.
#   $1 sub-folder name      $2 download URL       $3 uncompress command
#   $4 language code        $5 path of the extracted file (relative to the sub-folder)
#   $6 $7 (optional) 1-based TSV columns holding the <lang> and the English text;
#         when given, the columns are cut out into <file>.<lang> and <file>.en
download_and_select() {
  SUBFOLDER=$1
  URL=$2
  UNCOMPRESS_CMD=$3
  LANG=$4
  INPUT_FILEPATH=$5
  if [[ $# -gt 5 ]]; then
    LANG_COL=$6
    EN_COL=$7
  fi

  mkdir -p $SUBFOLDER
  cd $SUBFOLDER
  wget -nc --content-disposition $URL
  $UNCOMPRESS_CMD

  if [[ $# -gt 5 ]]; then
    cut -f$LANG_COL $INPUT_FILEPATH > $INPUT_FILEPATH.$LANG
    cut -f$EN_COL $INPUT_FILEPATH > $INPUT_FILEPATH.en
  fi
  cd ..

  ln -sf $SUBFOLDER/$INPUT_FILEPATH.$LANG $SUBFOLDER.$LANG
  ln -sf $SUBFOLDER/$INPUT_FILEPATH.en $SUBFOLDER.en
}

# Install fasttext and fetch the lid.176 language-identification model if missing.
prepare_lid() {
  pip install fasttext

  # TODO specify global workdir
  MODEL=$WORKDIR/fasttext/lid.176.bin
  LID_MULTI=$UTILS/fasttext_multi_filter.py

  if [ ! -f "$MODEL" ]; then
    echo "downloading fasttext lid model..."
    mkdir -p $WORKDIR/fasttext
    wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $MODEL
  fi
}

# Clone the Moses decoder repository (tokenization scripts) into $UTILS.
prepare_moses() {
  pushd $UTILS
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  # git clone fails on an existing checkout, which would abort a re-run under `set -e`
  if [ ! -d mosesdecoder ]; then
    git clone https://github.com/moses-smt/mosesdecoder.git
  fi
  popd
}

# Run the fasttext language-id filter on a pair of parallel files.
#   $1 src lang   $2 src input   $3 src output
#   $4 tgt lang   $5 tgt input   $6 tgt output
lid_filter() {
  # TODO specify global workdir
  MODEL=$WORKDIR/fasttext/lid.176.bin
  LID_MULTI=$UTILS/fasttext_multi_filter.py

  prepare_lid

  SRC=$1
  SRC_FILE=$2
  SRC_OUTPUT=$3
  TGT=$4
  TGT_FILE=$5
  TGT_OUTPUT=$6
  python $LID_MULTI --model $MODEL --inputs $SRC_FILE $TGT_FILE --langs $SRC $TGT --outputs $SRC_OUTPUT $TGT_OUTPUT
}

# Download the en-ja TED talks and strip the markup lines and the
# leading/trailing whitespace; exposes ted.ja / ted.en symlinks.
prepare_ja_ted() {
  mkdir -p ted
  cd ted

  wget -nc https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz
  tar -zxvf en-ja.tgz
  cat en-ja/train.tags.en-ja.en | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.en
  cat en-ja/train.tags.en-ja.ja | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.ja

  cd ..
  ln -sf ted/en-ja/train.en-ja.ja ted.ja
  ln -sf ted/en-ja/train.en-ja.en ted.en
}
# Build the ja-en training set: download every corpus in parallel,
# concatenate them and keep only the pairs accepted by the language-id filter.
prepare_ja() {
  OUTPUT_DIR=$TMP_DIR/ja
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/en-ja.tar.gz" "tar -zxvf en-ja.tar.gz" ja en-ja/en-ja.bicleaner05.txt 4 3 &
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ja.tsv.gz" "gunzip -f news-commentary-v15.en-ja.tsv.gz" ja news-commentary-v15.en-ja.tsv 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ja-en.tsv.gz" "gunzip -f wikititles-v2.ja-en.tsv.gz" ja wikititles-v2.ja-en.tsv 1 2 &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ja.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ja.langid.tsv.gz" ja WikiMatrix.v1.en-ja.langid.tsv 3 2 &
  download_and_select subtitle "https://nlp.stanford.edu/projects/jesc/data/split.tar.gz" "tar -zxvf split.tar.gz" ja split/train 2 1 &
  download_and_select kftt "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" "tar -zxvf kftt-data-1.0.tar.gz" ja kftt-data-1.0/data/orig/kyoto-train &

  # the TED data needs its own markup stripping, see prepare_ja_ted
  prepare_ja_ted &

  wait

  # remove previous results
  rm -f all.??
  # `! -name "all.*"`: the redirection creates all.xx while find may still be
  # listing the directory, so the output file must never be one of the inputs
  find ./ -maxdepth 1 -name "*.ja" ! -name "all.*" | sort -V | xargs cat > all.ja
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter ja all.ja $DEST/train.ja_XX-en_XX.ja_XX en all.en $DEST/train.ja_XX-en_XX.en_XX
}

# Build the ta-en training set (same scheme as prepare_ja).
prepare_ta() {
  OUTPUT_DIR=$TMP_DIR/ta
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ta-en.tsv.gz" "gunzip -f wikititles-v2.ta-en.tsv.gz" ta wikititles-v2.ta-en.tsv 1 2 &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ta.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ta.langid.tsv.gz" ta WikiMatrix.v1.en-ta.langid.tsv 3 2 &
  download_and_select pmindia "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.ta-en.tsv" "" ta pmindia.v1.ta-en.tsv 2 1 &
  download_and_select tanzil "https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-ta.txt.zip" "unzip en-ta.txt.zip" ta Tanzil.en-ta &
  download_and_select pib "http://preon.iiit.ac.in/~jerin/resources/datasets/pib-v0.tar" "tar -xvf pib-v0.tar" ta pib/en-ta/train &
  download_and_select mkb "http://preon.iiit.ac.in/~jerin/resources/datasets/mkb-v0.tar" "tar -xvf mkb-v0.tar" ta mkb/en-ta/mkb &
  download_and_select ufal "http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz" "tar -zxvf en-ta-parallel-v2.tar.gz" ta en-ta-parallel-v2/corpus.bcn.train &

  wait

  # need special handling for nlpc
  mkdir -p nlpc
  cd nlpc
  wget -nc https://raw.githubusercontent.com/nlpc-uom/English-Tamil-Parallel-Corpus/master/En-Ta%20Corpus/En-Ta%20English.txt
  wget -nc https://github.com/nlpc-uom/English-Tamil-Parallel-Corpus/raw/master/En-Ta%20Corpus/En-Ta%20Tamil.txt
  # both files are used from their 4th line on
  tail -n +4 "En-Ta English.txt" > en-ta.en
  tail -n +4 "En-Ta Tamil.txt" > en-ta.ta
  cd ..
  ln -sf nlpc/en-ta.en nlpc.en
  ln -sf nlpc/en-ta.ta nlpc.ta

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ta" ! -name "all.*" | sort -V | xargs cat > all.ta
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter ta all.ta $DEST/train.ta_IN-en_XX.ta_IN en all.en $DEST/train.ta_IN-en_XX.en_XX
}

# Build the iu-en training set. No language-id filter here: the Inuktitut side
# is spelling-normalized and pairs with an empty side are dropped instead.
prepare_iu() {
  OUTPUT_DIR=$TMP_DIR/iu
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select nh "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" "tar -zxvf Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0.1.tgz" iu Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/NunavutHansard > /dev/null &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.iu-en.tsv.gz" "gunzip -f wikititles-v2.iu-en.tsv.gz" iu wikititles-v2.iu-en.tsv 1 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.iu" ! -name "all.*" | sort -V | xargs cat | nh/Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/scripts/normalize-iu-spelling.pl > all.iu
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  paste all.iu all.en | awk -F $'\t' '$1!=""&&$2!=""' > all.iuen
  cut -f1 all.iuen > $DEST/train.iu_CA-en_XX.iu_CA
  cut -f2 all.iuen > $DEST/train.iu_CA-en_XX.en_XX
}

# Build the km-en training set (same scheme as prepare_ja).
prepare_km() {
  OUTPUT_DIR=$TMP_DIR/km
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  # the archive is wmt20-sent.en-km.xz (the uncompress command used to say ".zx")
  download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-km.xz" "unxz wmt20-sent.en-km.xz" km wmt20-sent.en-km 2 1 &

  # km-parallel has multiple sets, concat all of them together
  mkdir -p opus
  cd opus
  wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/km-parallel.tgz"
  tar -zxvf km-parallel.tgz
  find ./km-parallel -maxdepth 1 -name "*.km" | sort -V | xargs cat > opus.km
  find ./km-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
  cd ..
  ln -sf opus/opus.km .
  ln -sf opus/opus.en .

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.km" ! -name "all.*" | sort -V | xargs cat > all.km
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter km all.km $DEST/train.km_KH-en_XX.km_KH en all.en $DEST/train.km_KH-en_XX.en_XX
}
# Build the ps-en training set: paracrawl, wikititles and the concatenated
# ps-parallel sets, filtered by language id.
prepare_ps() {
  OUTPUT_DIR=$TMP_DIR/ps
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz" "unxz wmt20-sent.en-ps.xz" ps wmt20-sent.en-ps 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ps-en.tsv.gz" "gunzip -f wikititles-v2.ps-en.tsv.gz" ps wikititles-v2.ps-en.tsv 1 2 &

  # ps-parallel has multiple sets, concat all of them together
  mkdir -p opus
  cd opus
  wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz"
  tar -zxvf ps-parallel.tgz
  find ./ps-parallel -maxdepth 1 -name "*.ps" | sort -V | xargs cat > opus.ps
  find ./ps-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
  cd ..
  ln -sf opus/opus.ps opus.ps
  ln -sf opus/opus.en opus.en

  wait

  # remove previous results
  rm -f all.??
  # `! -name "all.*"`: the redirection creates all.xx while find may still be
  # listing the directory, so the output file must never be one of the inputs
  find ./ -maxdepth 1 -name "*.ps" ! -name "all.*" | sort -V | xargs cat > all.ps
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter ps all.ps $DEST/train.ps_AF-en_XX.ps_AF en all.en $DEST/train.ps_AF-en_XX.en_XX
}

# Download and unpack the WMT13 commoncrawl archive into $COMMONCRAWL_DIR.
download_commoncrawl() {
  mkdir -p $COMMONCRAWL_DIR
  cd $COMMONCRAWL_DIR

  wget -nc "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
  tar -zxvf training-parallel-commoncrawl.tgz
}

# Symlink the commoncrawl files of language $1 into the current directory.
# NOTE: assigns the global LANG, which stays set for whatever runs afterwards
# in the calling shell.
link_commoncrawl() {
  LANG=$1
  ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.en commoncrawl.en
  ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.$LANG commoncrawl.$LANG
}

# Extract the <source>/<target> segments of an XLIFF file.
#   $1 input file   $2 source lang   $3 target lang
# Writes $1.$2 and $1.$3 with the enclosing tags removed.
strip_xlf() {
  INPUT_FILE=$1
  SRC=$2
  TGT=$3
  grep '<source xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$SRC
  grep '<target xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$TGT
}

# Download a corpus into ./tilde, convert it with a custom command and expose
# tilde.<lang> / tilde.en symlinks.
#   $1 URL   $2 uncompress command   $3 base name of the converted files
#   $4 language code   $5 conversion command (must produce $3.$4 and $3.en)
download_and_process_tilde() {
  URL=$1
  UNCOMPRESS_CMD=$2
  FILENAME=$3
  LANG=$4
  PROCESS_CMD=$5

  mkdir -p tilde
  cd tilde
  wget -nc $URL
  $UNCOMPRESS_CMD
  echo "executing cmd"
  echo $PROCESS_CMD
  $PROCESS_CMD
  cd ..
  ln -sf tilde/$FILENAME.$LANG tilde.$LANG
  ln -sf tilde/$FILENAME.en tilde.en
}

# Build the cs-en training set from the manually downloaded CzEng corpus
# (columns 5 and 6 of $CZENG_CORPUS); the per-corpus downloads are disabled.
prepare_cs() {
  OUTPUT_DIR=$TMP_DIR/cs
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  #download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz" "gunzip europarl-v10.cs-en.tsv.gz" cs europarl-v10.cs-en.tsv 1 2 &
  #download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-cs.txt.gz" "gunzip en-cs.txt.gz" cs en-cs.txt 2 1 &
  #link_commoncrawl cs
  #download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.cs-en.tsv.gz" "gunzip news-commentary-v15.cs-en.tsv.gz" cs news-commentary-v15.cs-en.tsv 1 2 &
  #download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.cs-en.tsv.gz" "gunzip wikititles-v2.cs-en.tsv.gz" cs wikititles-v2.cs-en.tsv 1 2 &
  #download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.cs-en.xlf.gz" "gunzip RAPID_2019.cs-en.xlf.gz" RAPID_2019.cs-en.xlf cs "strip_xlf RAPID_2019.cs-en.xlf cs en" &
  #download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.cs-en.langid.tsv.gz" "gunzip WikiMatrix.v1.cs-en.langid.tsv.gz" cs WikiMatrix.v1.cs-en.langid.tsv 2 3 &

  #wait

  # remove previous results
  #rm -f all.??
  #find ./ -maxdepth 1 -name "*.cs" | sort -V | xargs cat > all.cs
  #find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en

  # CZENG_CORPUS is always a non-empty path, so test for the file itself
  if [ ! -f "$CZENG_CORPUS" ] ;
  then
    echo "Please download CZENG_CORPUS manually and place them at $CZENG_CORPUS. Exitting..."
    exit 1
  fi
  cat $CZENG_CORPUS | sed '/^$/d' | cut -f5 > all.cs
  cat $CZENG_CORPUS | sed '/^$/d' | cut -f6 > all.en

  lid_filter cs all.cs $DEST/train.cs_CZ-en_XX.cs_CZ en all.en $DEST/train.cs_CZ-en_XX.en_XX
}
# Build the de-en training set: download every corpus in parallel,
# concatenate them and keep only the pairs accepted by the language-id filter.
prepare_de() {
  OUTPUT_DIR=$TMP_DIR/de
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  # gunzip -f (as in the ja/ta functions) so that a re-run overwrites the old output
  download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz" "gunzip -f europarl-v10.de-en.tsv.gz" de europarl-v10.de-en.tsv 1 2 &
  download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-de.txt.gz" "gunzip -f en-de.txt.gz" de en-de.txt 2 1 &
  link_commoncrawl de
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.de-en.tsv.gz" "gunzip -f news-commentary-v15.de-en.tsv.gz" de news-commentary-v15.de-en.tsv 1 2 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.de-en.tsv.gz" "gunzip -f wikititles-v2.de-en.tsv.gz" de wikititles-v2.de-en.tsv 1 2 &
  download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.de-en.xlf.gz" "gunzip -f RAPID_2019.de-en.xlf.gz" RAPID_2019.de-en.xlf de "strip_xlf RAPID_2019.de-en.xlf de en" &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.de-en.langid.tsv.gz" "gunzip -f WikiMatrix.v1.de-en.langid.tsv.gz" de WikiMatrix.v1.de-en.langid.tsv 2 3 &

  wait

  # remove previous results
  rm -f all.??
  # `! -name "all.*"`: the redirection creates all.xx while find may still be
  # listing the directory, so the output file must never be one of the inputs
  find ./ -maxdepth 1 -name "*.de" ! -name "all.*" | sort -V | xargs cat > all.de
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter de all.de $DEST/train.de_DE-en_XX.de_DE en all.en $DEST/train.de_DE-en_XX.en_XX
}

# Convert the TMX file $1 into plain-text files with TMX2Corpus.
prepare_tmx() {
  TMX_FILE=$1
  # git clone fails on an existing checkout, which would abort a re-run under `set -e`
  if [ ! -d "$UTILS/tmx2corpus" ]; then
    git clone https://github.com/amake/TMX2Corpus $UTILS/tmx2corpus
  fi
  pip install tinysegmenter

  python $UTILS/tmx2corpus/tmx2corpus.py $TMX_FILE
}

# Build the pl-en training set (currently only the RAPID 2019 corpus is enabled).
prepare_pl() {
  OUTPUT_DIR=$TMP_DIR/pl
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  # download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.pl-en.tsv.gz" "gunzip europarl-v10.pl-en.tsv.gz" pl europarl-v10.pl-en.tsv 1 2 &
  # download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-pl.txt.gz" "gunzip en-pl.txt.gz" pl en-pl.txt 2 1 &
  # download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.pl-en.tsv.gz" "gunzip wikititles-v2.pl-en.tsv.gz" pl wikititles-v2.pl-en.tsv 1 2 &
  # The argument list (URL, uncompress, file base name, lang, process command) is
  # the one of download_and_process_tilde; it used to be passed to
  # download_and_select, which read "bitext" as the language and the
  # prepare_tmx command as a column number. The archive is a .zip, hence unzip.
  # NOTE(review): assumes prepare_tmx leaves bitext.pl / bitext.en behind - confirm.
  download_and_process_tilde "https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2019.en-pl.tmx.zip" "unzip -o rapid2019.en-pl.tmx.zip" bitext pl "prepare_tmx RAPID_2019.UNIQUE.en-pl.tmx" &
  # download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-pl.langid.tsv.gz" "gunzip WikiMatrix.v1.en-pl.langid.tsv.gz" pl WikiMatrix.v1.en-pl.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.pl" ! -name "all.*" | sort -V | xargs cat > all.pl
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter pl all.pl $DEST/train.pl_PL-en_XX.pl_PL en all.en $DEST/train.pl_PL-en_XX.en_XX
}

# Download the multi-part UN corpus archive, join the parts and unpack them.
#   $1 space separated part URLs
#   $2 space separated part file names, in concatenation order
#   $3 (optional) language code; defaults to the code embedded in the first
#      part name (UNv1.0.en-<lang>.tar.gz.NN)
prepare_uncorpus() {
  # plain assignments: "$URLS=$1" expands to a command name and fails
  URLS=$1
  FILES=$2
  # do not rely on a global $LANG left behind by another function
  local first_file=${FILES%% *}
  local lang=${first_file#UNv1.0.en-}
  lang=${3:-${lang%%.*}}

  mkdir -p uncorpus
  cd uncorpus

  for URL in $URLS; do
    wget -nc $URL
  done
  cat $FILES > uncorpus.tar.gz
  tar -zxvf uncorpus.tar.gz

  cd ..
  ln -sf uncorpus/en-$lang/UNv1.0.en-$lang.$lang uncorpus.$lang
  ln -sf uncorpus/en-$lang/UNv1.0.en-$lang.en uncorpus.en
}

# Unpack the manually downloaded Yandex corpus and expose yandex.en / yandex.ru.
prepare_yandex() {
  mkdir -p yandex
  cd yandex
  # the extraction directory is given with -d; -o overwrites on a re-run
  unzip -o $YANDEX_CORPUS -d ./
  cd ..
  # -f so that a re-run does not fail on the existing links
  ln -sf yandex/corpus.en_ru.1m.en yandex.en
  ln -sf yandex/corpus.en_ru.1m.ru yandex.ru
}

# Build the ru-en training set (same scheme as prepare_de).
prepare_ru() {
  OUTPUT_DIR=$TMP_DIR/ru
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" "tar -zxvf paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" ru paracrawl-release1.en-ru.zipporah0-dedup-clean &
  link_commoncrawl ru
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ru.tsv.gz" "gunzip -f news-commentary-v15.en-ru.tsv.gz" ru news-commentary-v15.en-ru.tsv 2 1 &
  prepare_yandex &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ru-en.tsv.gz" "gunzip -f wikititles-v2.ru-en.tsv.gz" ru wikititles-v2.ru-en.tsv 1 2 &
  prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" "UNv1.0.en-ru.tar.gz.00 UNv1.0.en-ru.tar.gz.01 UNv1.0.en-ru.tar.gz.02" &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ru.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ru.langid.tsv.gz" ru WikiMatrix.v1.en-ru.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ru" ! -name "all.*" | sort -V | xargs cat > all.ru
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter ru all.ru $DEST/train.ru_RU-en_XX.ru_RU en all.en $DEST/train.ru_RU-en_XX.en_XX
}

# Concatenate the manually downloaded CCMT corpora into ccmt.zh / ccmt.en.
prepare_ccmt() {
  mkdir -p ccmt
  cd ccmt
  # assume ccmt data is already unzipped under CCMT_DIR folder
  # spaces are removed from the Chinese side of datum2017
  cat $CCMT_DIR/datum2017/Book*_cn.txt | sed 's/ //g' > datum2017.detok.zh
  cat $CCMT_DIR/datum2017/Book*_en.txt > datum2017.detok.en
  cat $CCMT_DIR/casict2011/casict-A_ch.txt $CCMT_DIR/casict2011/casict-B_ch.txt $CCMT_DIR/casict2015/casict2015_ch.txt $CCMT_DIR/datum2015/datum_ch.txt $CCMT_DIR/neu2017/NEU_cn.txt datum2017.detok.zh > ccmt.zh
  cat $CCMT_DIR/casict2011/casict-A_en.txt $CCMT_DIR/casict2011/casict-B_en.txt $CCMT_DIR/casict2015/casict2015_en.txt $CCMT_DIR/datum2015/datum_en.txt $CCMT_DIR/neu2017/NEU_en.txt datum2017.detok.en > ccmt.en
  cd ..
  ln -sf ccmt/ccmt.zh ccmt.zh
  ln -sf ccmt/ccmt.en ccmt.en
}
# Build the zh-en training set: download every corpus in parallel,
# concatenate them and keep only the pairs accepted by the language-id filter.
prepare_zh() {
  OUTPUT_DIR=$TMP_DIR/zh
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  # gunzip -f (as in the ja/ta functions) so that a re-run overwrites the old output
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-zh.tsv.gz" "gunzip -f news-commentary-v15.en-zh.tsv.gz" zh news-commentary-v15.en-zh.tsv 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.zh-en.tsv.gz" "gunzip -f wikititles-v2.zh-en.tsv.gz" zh wikititles-v2.zh-en.tsv 1 2 &
  prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" "UNv1.0.en-zh.tar.gz.00 UNv1.0.en-zh.tar.gz.01" &
  prepare_ccmt &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-zh.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-zh.langid.tsv.gz" zh WikiMatrix.v1.en-zh.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  # `! -name "all.*"`: the redirection creates all.xx while find may still be
  # listing the directory, so the output file must never be one of the inputs
  find ./ -maxdepth 1 -name "*.zh" ! -name "all.*" | sort -V | xargs cat > all.zh
  find ./ -maxdepth 1 -name "*.en" ! -name "all.*" | sort -V | xargs cat > all.en
  lid_filter zh all.zh $DEST/train.zh_CN-en_XX.zh_CN en all.en $DEST/train.zh_CN-en_XX.en_XX
}

# Helper of prepare_tests: strip the newsdev2020 <lang>-en SGM files and split
# them round-robin, lines of the first half -> valid, of the second half -> test.
#   $1 two-letter language code (e.g. ja)   $2 locale-style code (e.g. ja_XX)
_split_newsdev2020() {
  local lang=$1
  local code=$2
  local stem=newsdev2020-${lang}en

  cat $stem-src.$lang.sgm | $UTILS/strip_sgm.sh > $stem.$lang
  cat $stem-ref.en.sgm | $UTILS/strip_sgm.sh > $stem.en
  split $stem.$lang -a 0 -n r/1/2 > $DEST/valid.$code-en_XX.$code
  split $stem.en -a 0 -n r/1/2 > $DEST/valid.$code-en_XX.en_XX
  split $stem.$lang -a 0 -n r/2/2 > $DEST/test.$code-en_XX.$code
  split $stem.en -a 0 -n r/2/2 > $DEST/test.$code-en_XX.en_XX
}

# Helper of prepare_tests: newstest2018 -> valid and newstest2019 -> test for
# one translation direction.
#   $1 $2 two-letter source/target codes   $3 $4 locale-style source/target codes
_strip_newstest() {
  local src=$1
  local tgt=$2
  local src_code=$3
  local tgt_code=$4
  local pair=$src$tgt
  local direction=$src_code-$tgt_code

  cat newstest2018-$pair-src.$src.sgm | $UTILS/strip_sgm.sh > $DEST/valid.$direction.$src_code
  cat newstest2018-$pair-ref.$tgt.sgm | $UTILS/strip_sgm.sh > $DEST/valid.$direction.$tgt_code
  cat newstest2019-$pair-src.$src.sgm | $UTILS/strip_sgm.sh > $DEST/test.$direction.$src_code
  cat newstest2019-$pair-ref.$tgt.sgm | $UTILS/strip_sgm.sh > $DEST/test.$direction.$tgt_code
}

# Download the WMT20 dev archive and write the valid/test files to $DEST.
# strip_sgm.sh is always called through $UTILS: only the ja lines used to do
# so, every other call depended on the script being on $PATH.
prepare_tests() {
  OUTPUT_DIR=$TMP_DIR
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR
  wget -nc http://data.statmt.org/wmt20/translation-task/dev.tgz
  tar -zxvf dev.tgz
  cd dev

  _split_newsdev2020 ja ja_XX
  _split_newsdev2020 iu iu_CA
  _split_newsdev2020 ta ta_IN

  cp wikipedia.dev.km-en.km $DEST/valid.km_KH-en_XX.km_KH
  cp wikipedia.dev.km-en.en $DEST/valid.km_KH-en_XX.en_XX
  cp wikipedia.devtest.km-en.km $DEST/test.km_KH-en_XX.km_KH
  cp wikipedia.devtest.km-en.en $DEST/test.km_KH-en_XX.en_XX

  cp wikipedia.dev.ps-en.ps $DEST/valid.ps_AF-en_XX.ps_AF
  cp wikipedia.dev.ps-en.en $DEST/valid.ps_AF-en_XX.en_XX
  cp wikipedia.devtest.ps-en.ps $DEST/test.ps_AF-en_XX.ps_AF
  cp wikipedia.devtest.ps-en.en $DEST/test.ps_AF-en_XX.en_XX

  _split_newsdev2020 pl pl_PL

  # cs only has the en -> cs direction
  _strip_newstest en cs en_XX cs_CZ

  _strip_newstest de en de_DE en_XX
  _strip_newstest en de en_XX de_DE

  _strip_newstest ru en ru_RU en_XX
  _strip_newstest en ru en_XX ru_RU

  _strip_newstest zh en zh_CN en_XX
  _strip_newstest en zh en_XX zh_CN
}

mkdir -p $DEST

prepare_lid
prepare_moses
download_commoncrawl

prepare_ja &
prepare_ta &
prepare_km &
prepare_ps &
prepare_iu &
prepare_cs &
prepare_de &
prepare_pl &
prepare_ru &
prepare_zh &

# prepare valid/test set
prepare_tests &

# do not return before every background job has finished
wait

# TODO remove intermediate files
# rm -rf $TMP_DIR
def load_langs(path):
    """Return the stripped lines of *path* (one language code per line)."""
    with open(path, encoding='utf8') as fr:
        return [line.strip() for line in fr]


def _read_lines(path):
    """Return the lines of the UTF-8 text file *path* without line endings."""
    # context manager: the original left the file handles to the garbage collector
    with open(path, encoding='utf8') as f:
        return f.read().splitlines()


def load_sentences(raw_data, split, direction):
    """Load both sides of ``{raw_data}/{split}.{direction}.*``.

    Returns ``[(src_lang, src_lines), (tgt_lang, tgt_lines)]``, or an empty
    list when either of the two files does not exist.
    """
    src, tgt = direction.split('-')
    src_path = f"{raw_data}/{split}.{direction}.{src}"
    tgt_path = f"{raw_data}/{split}.{direction}.{tgt}"
    if os.path.exists(src_path) and os.path.exists(tgt_path):
        return [(src, _read_lines(src_path)), (tgt, _read_lines(tgt_path))]
    return []


def swap_direction(d):
    """Turn ``'src-tgt'`` into ``'tgt-src'``."""
    src, tgt = d.split('-')
    return f'{tgt}-{src}'


def get_all_test_data(raw_data, directions, split='test'):
    """Collect the sentences of *split* for every direction and its reverse.

    Returns ``(all_test_data, test_data)``: *test_data* is the flat list of
    ``(lang, lines)`` entries produced by :func:`load_sentences`, and
    *all_test_data* maps each stripped sentence to the set of languages whose
    files contain it.
    """
    test_data = [
        x
        for dd in directions
        for d in [dd, swap_direction(dd)]
        for x in load_sentences(raw_data, split, d)
    ]
    all_test_data = {}
    for lang, sentences in test_data:
        for sentence in sentences:
            all_test_data.setdefault(sentence.strip(), set()).add(lang)
    return all_test_data, test_data
def check_train_sentences(raw_data, direction, all_test_data, mess_up_train=None):
    """Find the training sentences of *direction* that also occur in *all_test_data*.

    Every source or target line of ``{raw_data}/train.{direction}.*`` that is
    a key of *all_test_data* is recorded in *mess_up_train* as
    ``sentence -> set of directions``.

    Args:
        raw_data: folder holding the ``train.{direction}.{lang}`` files.
        direction: ``'src-tgt'`` language pair.
        all_test_data: container of held-out sentences (only ``in`` is used).
        mess_up_train: dict to update; a fresh dict is created when omitted.

    Returns:
        ``(mess_up_train, size)`` where *size* is the number of line pairs
        read (0 when a training file is missing).
    """
    # a `{}` default would be shared by every call that omits the argument
    if mess_up_train is None:
        mess_up_train = {}
    src, tgt = direction.split('-')
    tgt_path = f"{raw_data}/train.{direction}.{tgt}"
    src_path = f"{raw_data}/train.{direction}.{src}"
    print(f'check training data in {raw_data}/train.{direction}')
    size = 0
    if not os.path.exists(tgt_path) or not os.path.exists(src_path):
        return mess_up_train, size
    with open(src_path, encoding='utf8') as f, open(tgt_path, encoding='utf8') as g:
        for src_line, tgt_line in zip(f, g):
            size += 1
            for sentence in (src_line.strip(), tgt_line.strip()):
                if sentence in all_test_data:
                    mess_up_train.setdefault(sentence, set()).add(direction)
    return mess_up_train, size


def check_train_all(raw_data, directions, all_test_data):
    """Run :func:`check_train_sentences` over all *directions*.

    Returns ``(mess_up_train, data_sizes)``: the accumulated
    ``sentence -> directions`` dict and ``direction -> number of line pairs``.
    """
    mess_up_train = {}
    data_sizes = {}
    for direction in directions:
        _, size = check_train_sentences(raw_data, direction, all_test_data, mess_up_train)
        data_sizes[direction] = size
    return mess_up_train, data_sizes


def count_train_in_other_set(mess_up_train):
    """Count, per direction, the sentences recorded in *mess_up_train*."""
    counts = {}
    for directions in mess_up_train.values():
        for direction in directions:
            counts[direction] = counts.get(direction, 0) + 1
    return counts


def train_size_if_remove_in_otherset(data_sizes, mess_up_train):
    """Project the training sizes after removing the recorded sentences.

    Returns one ``(direction, remaining, original, removed, removed percent)``
    tuple per direction that has at least one recorded sentence.
    """
    counts_in_other = count_train_in_other_set(mess_up_train)
    return [
        (
            direction,
            data_sizes[direction] - count,
            data_sizes[direction],
            count,
            100 * count / data_sizes[direction],
        )
        for direction, count in counts_in_other.items()
    ]
def remove_messed_up_sentences(raw_data, direction, mess_up_train, mess_up_train_pairs, corrected_langs):
    """Write the training data of *direction* without held-out sentences.

    A line pair is kept only if neither side is in *mess_up_train* and the
    pair (in either order) is not in *mess_up_train_pairs*. The filtered
    files are written to the module-level ``to_folder``. *direction* is added
    to *corrected_langs* as soon as one pair has been kept.

    Returns:
        ``(line_num, keep_num)``: pairs read and pairs kept; ``(0, 0)`` when a
        training file is missing.
    """
    split = 'train'
    src_lang, tgt_lang = direction.split('-')

    tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}"
    src = f"{raw_data}/{split}.{direction}.{src_lang}"
    print(f'working on {direction}: ', src, tgt)
    if not os.path.exists(tgt) or not os.path.exists(src):
        # the caller unpacks two values; a bare `return` made it crash
        return 0, 0

    corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}"
    corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}"
    line_num = 0
    keep_num = 0
    with open(src, encoding='utf8') as fsrc, \
            open(tgt, encoding='utf8') as ftgt, \
            open(corrected_src, 'w', encoding='utf8') as fsrc_corrected, \
            open(corrected_tgt, 'w', encoding='utf8') as ftgt_corrected:
        for s, t in zip(fsrc, ftgt):
            s = s.strip()
            t = t.strip()
            if t not in mess_up_train \
                    and s not in mess_up_train \
                    and (s, t) not in mess_up_train_pairs \
                    and (t, s) not in mess_up_train_pairs:
                corrected_langs.add(direction)
                print(s, file=fsrc_corrected)
                print(t, file=ftgt_corrected)
                keep_num += 1
            line_num += 1
            if line_num % 1000 == 0:
                print(f'completed {line_num} lines', end='\r')
    return line_num, keep_num


##########


def merge_valid_test_messup(mess_up_train_valid, mess_up_train_test):
    """Merge two ``sentence -> set`` dicts, taking the union of the sets.

    Falsy keys (such as the empty sentence) are left out of the result.
    """
    return {
        s: mess_up_train_valid.get(s, set()) | mess_up_train_test.get(s, set())
        for s in mess_up_train_valid.keys() | mess_up_train_test.keys()
        if s
    }


#########
def check_train_pairs(raw_data, direction, all_test_data, mess_up_train=None):
    """Find the training pairs of *direction* that also occur in *all_test_data*.

    A pair ``(s, t)`` of stripped training lines is recorded when ``(s, t)``
    or ``(t, s)`` is in *all_test_data*; *mess_up_train* then maps ``(s, t)``
    to the set holding both languages of *direction*.

    Args:
        mess_up_train: dict to update; a fresh dict is created when omitted.

    Returns:
        The updated dict (the original returned nothing, so callers that
        ignore the result are unaffected).
    """
    # a `{}` default would be shared by every call that omits the argument
    if mess_up_train is None:
        mess_up_train = {}
    src, tgt = direction.split('-')
    # a hack; TODO: check the reversed directions
    path1 = f"{raw_data}/train.{src}-{tgt}.{src}"
    path2 = f"{raw_data}/train.{src}-{tgt}.{tgt}"
    if not os.path.exists(path1) or not os.path.exists(path2):
        return mess_up_train

    with open(path1, encoding='utf8') as f1, open(path2, encoding='utf8') as f2:
        for src_line, tgt_line in zip(f1, f2):
            s = src_line.strip()
            t = tgt_line.strip()
            if (s, t) in all_test_data or (t, s) in all_test_data:
                mess_up_train.setdefault((s, t), set()).update((src, tgt))
    return mess_up_train


def load_pairs(raw_data, split, direction):
    """Load ``{raw_data}/{split}.{direction}.*`` as a list of line pairs.

    When the target language is not ``en_XX`` the two files are swapped, so
    the first element of each pair then comes from the target-side file.
    Returns an empty list when either file is missing.
    """
    src, tgt = direction.split('-')
    src_f = f"{raw_data}/{split}.{direction}.{src}"
    tgt_f = f"{raw_data}/{split}.{direction}.{tgt}"
    if tgt != 'en_XX':
        src_f, tgt_f = tgt_f, src_f
    if os.path.exists(src_f) and os.path.exists(tgt_f):
        # context manager: the original never closed the two files
        with open(src_f, encoding='utf8') as fs, open(tgt_f, encoding='utf8') as ft:
            return list(zip(fs.read().splitlines(), ft.read().splitlines()))
    return []
# skip_langs = ['cs_CZ', 'en_XX', 'tl_XX', 'tr_TR']
def get_messed_up_test_pairs(split, directions):
    """Find the training pairs that also occur as pairs of *split*.

    Reads from the module-level ``raw_data`` folder.

    Returns:
        ``(all_test_pairs, mess_up_train_pairs)``: the pairs of *split* mapped
        to the languages of the directions containing them, and the training
        pairs found among them (see :func:`check_train_pairs`).
    """
    test_pairs = [
        (d, load_pairs(raw_data, split, d))
        for d in directions
    ]
    all_test_pairs = {}
    for direction, pairs in test_pairs:
        src, tgt = direction.split('-')
        for pair in pairs:
            all_test_pairs.setdefault(pair, set()).update((src, tgt))
    mess_up_train_pairs = {}
    for direction in directions:
        check_train_pairs(raw_data, direction, all_test_pairs, mess_up_train_pairs)
    return all_test_pairs, mess_up_train_pairs


if __name__ == "__main__":
    #######
    import argparse
    import shutil

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--from-folder',
        required=True,
        type=str)
    parser.add_argument(
        '--to-folder',
        required=True,
        type=str)
    parser.add_argument(
        '--directions',
        default=None,
        type=str)

    args = parser.parse_args()
    raw_data = args.from_folder
    to_folder = args.to_folder
    os.makedirs(to_folder, exist_ok=True)

    if args.directions:
        directions = args.directions.split(',')
    else:
        raw_files = itertools.chain(
            glob.glob(f'{raw_data}/train*'),
            glob.glob(f'{raw_data}/valid*'),
            glob.glob(f'{raw_data}/test*'),
        )
        directions = [os.path.split(file_path)[-1].split('.')[1] for file_path in raw_files]
    # Every direction shows up once per split and per language file; process
    # each one only once (dict.fromkeys keeps the first-seen order).
    directions = list(dict.fromkeys(directions))
    print('working on directions: ', directions)

    ##########

    all_test_data, test_data = get_all_test_data(raw_data, directions, 'test')
    print('==loaded test data==')
    all_valid_data, valid_data = get_all_test_data(raw_data, directions, 'valid')
    print('==loaded valid data==')
    all_valid_test_data = merge_valid_test_messup(all_test_data, all_valid_data)
    mess_up_train, data_sizes = check_train_all(raw_data, directions, all_valid_test_data)
    print('training messing up with valid, test data:', len(mess_up_train))
    data_situation = train_size_if_remove_in_otherset(data_sizes, mess_up_train)
    df = pd.DataFrame(data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent'])
    # sort_values returns a new frame; the result used to be thrown away
    df = df.sort_values('remove_percent', ascending=False)
    df.to_csv(f'{raw_data}/clean_summary.tsv', sep='\t')
    print(f'projected data clean summary in: {raw_data}/clean_summary.tsv')

    # correct the dataset:
    all_test_pairs, mess_up_test_train_pairs = get_messed_up_test_pairs('test', directions)
    all_valid_pairs, mess_up_valid_train_pairs = get_messed_up_test_pairs('valid', directions)

    all_messed_pairs = set(mess_up_test_train_pairs.keys()).union(set(mess_up_valid_train_pairs.keys()))
    corrected_directions = set()

    real_data_situation = []
    for direction in directions:
        result = remove_messed_up_sentences(raw_data, direction, mess_up_train, all_messed_pairs, corrected_directions)
        if result is None:
            # no training files for this direction
            print(f"{direction} has size 0")
            continue
        org_size, new_size = result
        if org_size == 0:
            print(f"{direction} has size 0")
            continue
        real_data_situation.append(
            (direction, new_size, org_size, org_size - new_size, (org_size - new_size) / org_size * 100)
        )
    print('corrected directions: ', corrected_directions)
    df = pd.DataFrame(real_data_situation, columns=['direction', 'train_size_after_remove', 'orig_size', 'num_to_remove', 'remove_percent'])
    df = df.sort_values('remove_percent', ascending=False)
    df.to_csv(f'{raw_data}/actual_clean_summary.tsv', sep='\t')
    print(f'actual data clean summary (which can be different from the projected one because of duplications) in: {raw_data}/actual_clean_summary.tsv')

    for direction in directions:
        src_lang, tgt_lang = direction.split('-')
        for split in ['train', 'valid', 'test']:
            # copying valid, test and uncorrected train
            if direction in corrected_directions and split == 'train':
                continue
            tgt = f"{raw_data}/{split}.{direction}.{tgt_lang}"
            src = f"{raw_data}/{split}.{direction}.{src_lang}"
            if not (os.path.exists(src) and os.path.exists(tgt)):
                continue
            corrected_tgt = f"{to_folder}/{split}.{direction}.{tgt_lang}"
            corrected_src = f"{to_folder}/{split}.{direction}.{src_lang}"
            print(f'copying {src} to {corrected_src}')
            shutil.copyfile(src, corrected_src)
            print(f'copying {tgt} to {corrected_tgt}')
            shutil.copyfile(tgt, corrected_tgt)

    print('completed')
def deup(src_file, tgt_file, src_file_out, tgt_file_out):
    """Copy a parallel corpus, keeping only the first copy of each line pair.

    The two input files are read in lockstep; a (source, target) line pair
    already written is skipped.  The number of skipped pairs is printed.
    """
    written_pairs = set()
    n_duplicates = 0
    with open(src_file, encoding='utf-8') as src_in:
        with open(tgt_file, encoding='utf-8') as tgt_in:
            with open(src_file_out, 'w', encoding='utf-8') as src_out:
                with open(tgt_file_out, 'w', encoding='utf-8') as tgt_out:
                    for pair in zip(src_in, tgt_in):
                        if pair in written_pairs:
                            n_duplicates += 1
                            continue
                        src_line, tgt_line = pair
                        src_out.write(src_line)
                        tgt_out.write(tgt_line)
                        written_pairs.add(pair)
    print(f'number of duplication: {n_duplicates}')


def main():
    """Command-line entry point: parse the four file paths and run ``deup``."""
    parser = argparse.ArgumentParser()
    for flag, description in (
        ("--src-file", "src file"),
        ("--tgt-file", "tgt file"),
        ("--src-file-out", "src ouptut file"),
        ("--tgt-file-out", "tgt ouput file"),
    ):
        parser.add_argument(flag, type=str, required=True, help=description)
    opts = parser.parse_args()
    deup(opts.src_file, opts.tgt_file, opts.src_file_out, opts.tgt_file_out)


if __name__ == "__main__":
    main()
model = None


def init(model_path):
    """Pool initializer: load the fastText model into this worker's global."""
    global model
    model = fasttext.load_model(model_path)


def pred(lines):
    """Predict a label for every line of one parallel example.

    Returns ``lines`` unchanged together with the list of predicted labels.
    The first nine characters of each label are dropped (presumably the
    ``__label__`` prefix; confirm against the model in use).
    """
    return lines, [model.predict(line.strip())[0][0][9:] for line in lines]


def main():
    """Keep only the parallel lines whose predicted languages all match ``--langs``."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True,
                        help="model to load")
    parser.add_argument("--inputs", nargs="+", default=['-'],
                        help="input files to filter")
    parser.add_argument("--langs", nargs="+", required=True,
                        help="lang ids of each input file")
    parser.add_argument("--outputs", nargs="+", default=['-'],
                        help="path to save lid filtered outputs")
    parser.add_argument("--num-workers", type=int, metavar="N", default=10,
                        help="number of processes in parallel")
    args = parser.parse_args()

    # Validate explicitly: an ``assert`` disappears under ``python -O``.
    if not (len(args.inputs) == len(args.langs) == len(args.outputs)):
        parser.error("--inputs, --langs and --outputs must have the same number of entries")

    with contextlib.ExitStack() as stack:
        inputs = [
            stack.enter_context(open(path, "r", encoding="utf-8", newline="\n", errors="replace"))
            if path != "-" else io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors="replace")
            for path in args.inputs
        ]
        outputs = [
            stack.enter_context(open(path, "w", encoding="utf-8", newline="\n"))
            if path != "-" else sys.stdout
            for path in args.outputs
        ]
        with Pool(args.num_workers, initializer=partial(init, args.model)) as p:
            skip_cnt = 0
            # Pass the zip iterator itself: imap accepts any iterable, so the
            # corpus no longer has to be materialised as one list first.
            for lines, preds in p.imap(pred, zip(*inputs), chunksize=500):
                if not all(a == b for a, b in zip(preds, args.langs)):
                    skip_cnt += 1
                    continue
                for line, output_h in zip(lines, outputs):
                    print(line.strip(), file=output_h)
        # stderr keeps the report out of the data when an output is "-".
        print(f"Skipped {skip_cnt} lines.", file=sys.stderr)


if __name__ == "__main__":
    main()
================================================ FILE: examples/multilingual/finetune_multilingual_model.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. path_2_data=$1 # <path to data> which contains binarized data for each direction lang_list=$2 # <path to a file which contains a list of languages separated by new lines> lang_pairs=$3 # a list of language pairs to train multilingual models, e.g. "en-fr,en-cs,fr-en,cs-en" # pretrained can be an mBART pretrained model as well pretrained_model=$4 #<path to a pretrained model> fairseq-train "$path_2_data" \ --encoder-normalize-before --decoder-normalize-before \ --arch transformer --layernorm-embedding \ --task translation_multi_simple_epoch \ --finetune-from-model "$pretrained_model" \ --sampling-method "temperature" \ --sampling-temperature "1.5" \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ --seed 222 --log-format simple --log-interval 2 ================================================ FILE: examples/multilingual/multilingual_fairseq_gen.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree.
lang_pairs="en-fr,en-cs,fr-en,cs-en" path_2_data=$1 # <path to data> lang_list=$2 # <path to a file which contains a list of languages separated by new lines> model=$3 # <path to a trained model> source_lang=cs target_lang=en fairseq-generate "$path_2_data" \ --path "$model" \ --task translation_multi_simple_epoch \ --gen-subset test \ --source-lang "$source_lang" \ --target-lang "$target_lang" \ --sacrebleu --remove-bpe 'sentencepiece'\ --batch-size 32 \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" \ --lang-pairs "$lang_pairs" ================================================ FILE: examples/multilingual/train_multilingual_model.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. path_2_data=$1 # <path to data> which contains binarized data for each direction lang_list=$2 # <path to a file which contains a list of languages separated by new lines> lang_pairs=$3 # a list of language pairs to train multilingual models, e.g.
"en-fr,en-cs,fr-en,cs-en" fairseq-train "$path_2_data" \ --encoder-normalize-before --decoder-normalize-before \ --arch transformer --layernorm-embedding \ --task translation_multi_simple_epoch \ --sampling-method "temperature" \ --sampling-temperature 1.5 \ --encoder-langtok "src" \ --decoder-langtok \ --lang-dict "$lang_list" \ --lang-pairs "$lang_pairs" \ --criterion label_smoothed_cross_entropy --label-smoothing 0.2 \ --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' \ --lr-scheduler inverse_sqrt --lr 3e-05 --warmup-updates 2500 --max-update 40000 \ --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 \ --max-tokens 1024 --update-freq 2 \ --save-interval 1 --save-interval-updates 5000 --keep-interval-updates 10 --no-epoch-checkpoints \ --seed 222 --log-format simple --log-interval 2 ================================================ FILE: examples/noisychannel/README.md ================================================ # Simple and Effective Noisy Channel Modeling for Neural Machine Translation (Yee et al., 2019) This page contains pointers to pre-trained models as well as instructions on how to run the reranking scripts. 
## Citation: ```bibtex @inproceedings{yee2019simple, title = {Simple and Effective Noisy Channel Modeling for Neural Machine Translation}, author = {Kyra Yee and Yann Dauphin and Michael Auli}, booktitle = {Conference on Empirical Methods in Natural Language Processing}, year = {2019}, } ``` ## Pre-trained Models: Model | Description | Download ---|---|--- `transformer.noisychannel.de-en` | De->En Forward Model | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/forward_de2en.tar.bz2) `transformer.noisychannel.en-de` | En->De Channel Model | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/backward_en2de.tar.bz2) `transformer_lm.noisychannel.en` | En Language model | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/reranking_en_lm.tar.bz2) Test Data: [newstest_wmt17](https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/wmt17test.tar.bz2) ## Example usage ``` mkdir rerank_example curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/forward_de2en.tar.bz2 | tar xvjf - -C rerank_example curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/backward_en2de.tar.bz2 | tar xvjf - -C rerank_example curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/reranking_en_lm.tar.bz2 | tar xvjf - -C rerank_example curl https://dl.fbaipublicfiles.com/fairseq/models/noisychannel/wmt17test.tar.bz2 | tar xvjf - -C rerank_example beam=50 num_trials=1000 fw_name=fw_model_ex bw_name=bw_model_ex lm_name=lm_ex data_dir=rerank_example/hyphen-splitting-mixed-case-wmt17test-wmt14bpe data_dir_name=wmt17 lm=rerank_example/lm/checkpoint_best.pt lm_bpe_code=rerank_example/lm/bpe32k.code lm_dict=rerank_example/lm/dict.txt batch_size=32 bw=rerank_example/backward_en2de.pt fw=rerank_example/forward_de2en.pt # reranking with P(T|S) P(S|T) and P(T) python examples/noisychannel/rerank_tune.py $data_dir --tune-param lenpen weight1 weight3 \ --lower-bound 0 0 0 --upper-bound 3 3 3
--data-dir-name $data_dir_name \ --num-trials $num_trials --source-lang de --target-lang en --gen-model $fw \ -n $beam --batch-size $batch_size --score-model2 $fw --score-model1 $bw \ --backwards1 --weight2 1 \ -lm $lm --lm-dict $lm_dict --lm-name en_newscrawl --lm-bpe-code $lm_bpe_code \ --model2-name $fw_name --model1-name $bw_name --gen-model-name $fw_name # reranking with P(T|S) and P(T) python examples/noisychannel/rerank_tune.py $data_dir --tune-param lenpen weight3 \ --lower-bound 0 0 --upper-bound 3 3 --data-dir-name $data_dir_name \ --num-trials $num_trials --source-lang de --target-lang en --gen-model $fw \ -n $beam --batch-size $batch_size --score-model1 $fw \ -lm $lm --lm-dict $lm_dict --lm-name en_newscrawl --lm-bpe-code $lm_bpe_code \ --model1-name $fw_name --gen-model-name $fw_name # to run with a preconfigured set of hyperparameters for the lenpen and model weights, use rerank.py instead. python examples/noisychannel/rerank.py $data_dir \ --lenpen 0.269 --weight1 1 --weight2 0.929 --weight3 0.831 \ --data-dir-name $data_dir_name --source-lang de --target-lang en --gen-model $fw \ -n $beam --batch-size $batch_size --score-model2 $fw --score-model1 $bw --backwards1 \ -lm $lm --lm-dict $lm_dict --lm-name en_newscrawl --lm-bpe-code $lm_bpe_code \ --model2-name $fw_name --model1-name $bw_name --gen-model-name $fw_name ``` ================================================ FILE: examples/noisychannel/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .rerank_options import * # noqa ================================================ FILE: examples/noisychannel/rerank.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates.
def score_target_hypo(
    args, a, b, c, lenpen, target_outfile, hypo_outfile, write_hypos, normalize
):
    """Rerank the n-best lists with one set of weights and return the parsed BLEU.

    For every shard returned by ``load_score_files`` the hypotheses are
    scored with ``rerank_utils.get_score`` using weights ``a``, ``b``, ``c``
    and ``lenpen``; the best hypothesis of each n-best group is added to a
    BLEU scorer.  When ``write_hypos`` is true the chosen hypotheses and
    their targets are also collected, and the BLEU string is printed.  The
    two output files are written whenever ``args.num_shards`` equals the
    number of loaded shards.

    Returns:
        The value of ``rerank_utils.parse_bleu_scoring`` on the BLEU string.
    """
    print("lenpen", lenpen, "weight1", a, "weight2", b, "weight3", c)
    gen_output_lst, bitext1_lst, bitext2_lst, lm_res_lst = load_score_files(args)
    # Named ``vocab`` so the builtin ``dict`` is not shadowed.
    vocab = dictionary.Dictionary()
    scorer = bleu.Scorer(
        bleu.BleuConfig(
            pad=vocab.pad(),
            eos=vocab.eos(),
            unk=vocab.unk(),
        )
    )

    ordered_hypos = {}
    ordered_targets = {}

    for shard_id in range(len(bitext1_lst)):
        bitext1 = bitext1_lst[shard_id]
        bitext2 = bitext2_lst[shard_id]
        gen_output = gen_output_lst[shard_id]
        lm_res = lm_res_lst[shard_id]

        total = len(bitext1.rescore_source.keys())
        hypo_lst = []

        j = 1  # position of the current hypothesis inside its n-best group
        best_score = -math.inf
        # Start from the same value the per-group reset uses, so the name is
        # bound even if no score in the first group beats -inf.
        best_hypo = ""

        for i in range(total):
            # length is measured in terms of words, not bpe tokens, since models may not share the same bpe
            target_len = len(bitext1.rescore_hypo[i].split())

            if lm_res is not None:
                lm_score = lm_res.score[i]
            else:
                lm_score = 0

            if bitext2 is not None:
                bitext2_score = bitext2.rescore_score[i]
                bitext2_backwards = bitext2.backwards
            else:
                bitext2_score = None
                bitext2_backwards = None

            score = rerank_utils.get_score(
                a,
                b,
                c,
                target_len,
                bitext1.rescore_score[i],
                bitext2_score,
                lm_score=lm_score,
                lenpen=lenpen,
                src_len=bitext1.source_lengths[i],
                tgt_len=bitext1.target_lengths[i],
                bitext1_backwards=bitext1.backwards,
                bitext2_backwards=bitext2_backwards,
                normalize=normalize,
            )

            if score > best_score:
                best_score = score
                best_hypo = bitext1.rescore_hypo[i]

            # end of the current n-best group: keep its winner and reset
            if j == gen_output.num_hypos[i] or j == args.num_rescore:
                j = 1
                hypo_lst.append(best_hypo)
                best_score = -math.inf
                best_hypo = ""
            else:
                j += 1

        gen_keys = sorted(gen_output.no_bpe_target.keys())

        for idx, gen_key in enumerate(gen_keys):
            if args.prefix_len is None:
                # The message looks up the hypothesis by ``gen_key``; indexing
                # by the running position can itself raise KeyError.
                assert hypo_lst[idx] in gen_output.no_bpe_hypo[gen_key], (
                    "pred and rescore hypo mismatch: i: "
                    + str(idx)
                    + ", "
                    + str(hypo_lst[idx])
                    + str(gen_key)
                    + str(gen_output.no_bpe_hypo[gen_key])
                )
                hypo_text = hypo_lst[idx]
            else:
                hypo_text = rerank_utils.get_full_from_prefix(
                    hypo_lst[idx], gen_output.no_bpe_hypo[gen_key]
                )
            sys_tok = vocab.encode_line(hypo_text)
            ref_tok = vocab.encode_line(gen_output.no_bpe_target[gen_key])
            scorer.add(ref_tok, sys_tok)

        # if only one set of hyper parameters is provided, write the predictions to a file
        if write_hypos:
            # recover the original ids from n best list generation
            for idx, gen_key in enumerate(gen_keys):
                if args.prefix_len is None:
                    assert hypo_lst[idx] in gen_output.no_bpe_hypo[gen_key], (
                        "pred and rescore hypo mismatch:"
                        + "i:"
                        + str(idx)
                        + str(hypo_lst[idx])
                        + str(gen_output.no_bpe_hypo[gen_key])
                    )
                    ordered_hypos[gen_key] = hypo_lst[idx]
                else:
                    ordered_hypos[gen_key] = rerank_utils.get_full_from_prefix(
                        hypo_lst[idx], gen_output.no_bpe_hypo[gen_key]
                    )
                ordered_targets[gen_key] = gen_output.no_bpe_target[gen_key]

    # write the hypos in the original order from nbest list generation
    if args.num_shards == (len(bitext1_lst)):
        with open(target_outfile, "w") as t:
            with open(hypo_outfile, "w") as h:
                for key in range(len(ordered_hypos)):
                    t.write(ordered_targets[key])
                    h.write(ordered_hypos[key])

    res = scorer.result_string(4)
    if write_hypos:
        print(res)
    score = rerank_utils.parse_bleu_scoring(res)
    return score
def match_target_hypo(args, target_outfile, hypo_outfile):
    """combine scores from the LM and bitext models, and write the top scoring hypothesis to a file"""
    num_settings = len(args.weight1)
    if num_settings != 1:
        # several weight settings: score each one in a worker process,
        # without writing hypotheses
        print("launching pool")
        jobs = [
            (
                args,
                args.weight1[k],
                args.weight2[k],
                args.weight3[k],
                args.lenpen[k],
                target_outfile,
                hypo_outfile,
                False,
                args.normalize,
            )
            for k in range(num_settings)
        ]
        with Pool(32) as pool:
            rerank_scores = pool.starmap(score_target_hypo, jobs)
    else:
        # a single setting: score it here and write the hypotheses
        single_score = score_target_hypo(
            args,
            args.weight1[0],
            args.weight2[0],
            args.weight3[0],
            args.lenpen[0],
            target_outfile,
            hypo_outfile,
            True,
            args.normalize,
        )
        rerank_scores = [single_score]

    if len(rerank_scores) <= 1:
        return (
            args.lenpen[0],
            args.weight1[0],
            args.weight2[0],
            args.weight3[0],
            rerank_scores[0],
        )

    best_index = np.argmax(rerank_scores)
    best_score = rerank_scores[best_index]
    print("best score", best_score)
    print("best lenpen", args.lenpen[best_index])
    print("best weight1", args.weight1[best_index])
    print("best weight2", args.weight2[best_index])
    print("best weight3", args.weight3[best_index])
    return (
        args.lenpen[best_index],
        args.weight1[best_index],
        args.weight2[best_index],
        args.weight3[best_index],
        best_score,
    )
def load_score_files(args):
    """Load generation output and rescoring results for the selected shards.

    Uses every shard when ``args.all_shards`` is set, otherwise only
    ``args.shard_id``.

    Returns:
        Four parallel lists with one entry per shard: the generation output,
        the first scorer's output, the second scorer's output (or ``None``)
        and the language-model output (or ``None``).
    """
    if args.all_shards:
        shard_ids = list(range(args.num_shards))
    else:
        shard_ids = [args.shard_id]

    gen_output_lst = []
    bitext1_lst = []
    bitext2_lst = []
    lm_res1_lst = []

    for shard_id in shard_ids:
        using_nbest = args.nbest_list is not None
        # only the generation directory is needed here
        pre_gen, _, _, _, _ = rerank_utils.get_directories(
            args.data_dir_name,
            args.num_rescore,
            args.gen_subset,
            args.gen_model_name,
            shard_id,
            args.num_shards,
            args.sampling,
            args.prefix_len,
            args.target_prefix_frac,
            args.source_prefix_frac,
        )

        rerank1_is_gen = (
            args.gen_model == args.score_model1 and args.source_prefix_frac is None
        )
        rerank2_is_gen = (
            args.gen_model == args.score_model2 and args.source_prefix_frac is None
        )
        # With a predefined n-best list and no second scoring model there is
        # no score2 file to read; use the generation output, as
        # gen_and_reprocess_nbest does.  Otherwise ``score2_file`` below would
        # be read without ever having been assigned.
        if using_nbest and args.score_model2 is None:
            rerank2_is_gen = True

        score1_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model1_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards1,
        )
        if args.score_model2 is not None:
            score2_file = rerank_utils.rescore_file_name(
                pre_gen,
                args.prefix_len,
                args.model2_name,
                target_prefix_frac=args.target_prefix_frac,
                source_prefix_frac=args.source_prefix_frac,
                backwards=args.backwards2,
            )
        if args.language_model is not None:
            lm_score_file = rerank_utils.rescore_file_name(
                pre_gen, args.prefix_len, args.lm_name, lm_file=True
            )

        # get gen output
        predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"
        if using_nbest:
            print("Using predefined n-best list from interactive.py")
            predictions_bpe_file = args.nbest_list
        gen_output = rerank_utils.BitextOutputFromGen(
            predictions_bpe_file,
            bpe_symbol=args.post_process,
            nbest=using_nbest,
            prefix_len=args.prefix_len,
            target_prefix_frac=args.target_prefix_frac,
        )

        if rerank1_is_gen:
            bitext1 = gen_output
        else:
            bitext1 = rerank_utils.BitextOutput(
                score1_file,
                args.backwards1,
                args.right_to_left1,
                args.post_process,
                args.prefix_len,
                args.target_prefix_frac,
                args.source_prefix_frac,
            )

        if args.score_model2 is not None or args.nbest_list is not None:
            if rerank2_is_gen:
                bitext2 = gen_output
            else:
                bitext2 = rerank_utils.BitextOutput(
                    score2_file,
                    args.backwards2,
                    args.right_to_left2,
                    args.post_process,
                    args.prefix_len,
                    args.target_prefix_frac,
                    args.source_prefix_frac,
                )

                assert (
                    bitext2.source_lengths == bitext1.source_lengths
                ), "source lengths for rescoring models do not match"
                assert (
                    bitext2.target_lengths == bitext1.target_lengths
                ), "target lengths for rescoring models do not match"
        else:
            if args.diff_bpe:
                assert args.score_model2 is None
                bitext2 = gen_output
            else:
                bitext2 = None

        if args.language_model is not None:
            lm_res1 = rerank_utils.LMOutput(
                lm_score_file,
                args.lm_dict,
                args.prefix_len,
                args.post_process,
                args.target_prefix_frac,
            )
        else:
            lm_res1 = None

        gen_output_lst.append(gen_output)
        bitext1_lst.append(bitext1)
        bitext2_lst.append(bitext2)
        lm_res1_lst.append(lm_res1)
    return gen_output_lst, bitext1_lst, bitext2_lst, lm_res1_lst
def rerank(args):
    """Run generation, rescoring and hypothesis matching for the given options.

    Returns the tuple ``(best_lenpen, best_weight1, best_weight2,
    best_weight3, best_score)`` produced by ``match_target_hypo``.
    """
    # wrap scalar hyper-parameters so they can be indexed like lists below
    if type(args.lenpen) is not list:
        args.lenpen = [args.lenpen]
    if type(args.weight1) is not list:
        args.weight1 = [args.weight1]
    if type(args.weight2) is not list:
        args.weight2 = [args.weight2]
    if type(args.weight3) is not list:
        args.weight3 = [args.weight3]
    if args.all_shards:
        shard_ids = list(range(args.num_shards))
    else:
        shard_ids = [args.shard_id]

    # NOTE(review): the statement nesting below was reconstructed from a
    # whitespace-collapsed copy of the file; confirm against the repository.
    for shard_id in shard_ids:
        (
            pre_gen,
            left_to_right_preprocessed_dir,
            right_to_left_preprocessed_dir,
            backwards_preprocessed_dir,
            lm_preprocessed_dir,
        ) = rerank_utils.get_directories(
            args.data_dir_name,
            args.num_rescore,
            args.gen_subset,
            args.gen_model_name,
            shard_id,
            args.num_shards,
            args.sampling,
            args.prefix_len,
            args.target_prefix_frac,
            args.source_prefix_frac,
        )
        # each step receives the full ``args``, not the loop's ``shard_id``
        rerank_generate.gen_and_reprocess_nbest(args)
        rerank_score_bw.score_bw(args)
        rerank_score_lm.score_lm(args)

        # default output files live in the generation directory
        if args.write_hypos is None:
            write_targets = pre_gen + "/matched_targets"
            write_hypos = pre_gen + "/matched_hypos"
        else:
            write_targets = args.write_hypos + "_targets" + args.gen_subset
            write_hypos = args.write_hypos + "_hypos" + args.gen_subset

    if args.all_shards:
        write_targets += "_all_shards"
        write_hypos += "_all_shards"

    (
        best_lenpen,
        best_weight1,
        best_weight2,
        best_weight3,
        best_score,
    ) = match_target_hypo(args, write_targets, write_hypos)

    return best_lenpen, best_weight1, best_weight2, best_weight3, best_score


def cli_main():
    """Parse the reranking command-line options and run ``rerank``."""
    parser = rerank_options.get_reranking_parser()
    args = options.parse_args_and_arch(parser)
    rerank(args)


if __name__ == "__main__":
    cli_main()
def gen_and_reprocess_nbest(args):
    """Generate an n-best list and prepare it for the rescoring models.

    STEP 1 runs ``generate.main`` unless a predefined n-best list is given or
    the generation output file already exists.  STEP 2 writes the
    hypotheses out as plain text files, and STEP 3 binarizes them with
    ``preprocess.main``.

    Returns:
        The ``rerank_utils.BitextOutputFromGen`` object for the predictions.

    NOTE(review): the statement nesting was reconstructed from a
    whitespace-collapsed copy of the file; confirm against the repository.
    """
    if args.score_dict_dir is None:
        args.score_dict_dir = args.data
    if args.prefix_len is not None:
        assert (
            args.right_to_left1 is False
        ), "prefix length not compatible with right to left models"
        assert (
            args.right_to_left2 is False
        ), "prefix length not compatible with right to left models"

    if args.nbest_list is not None:
        assert args.score_model2 is None

    # a backwards scorer reads the translation direction reversed
    if args.backwards1:
        scorer1_src = args.target_lang
        scorer1_tgt = args.source_lang
    else:
        scorer1_src = args.source_lang
        scorer1_tgt = args.target_lang

    store_data = (
        os.path.join(os.path.dirname(__file__)) + "/rerank_data/" + args.data_dir_name
    )
    if not os.path.exists(store_data):
        os.makedirs(store_data)

    (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    ) = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    assert not (
        args.right_to_left1 and args.backwards1
    ), "backwards right to left not supported"
    assert not (
        args.right_to_left2 and args.backwards2
    ), "backwards right to left not supported"
    assert not (
        args.prefix_len is not None and args.target_prefix_frac is not None
    ), "target prefix frac and target prefix len incompatible"

    # make directory to store generation results
    if not os.path.exists(pre_gen):
        os.makedirs(pre_gen)

    # a scorer that is the generation model itself can reuse its scores
    rerank1_is_gen = (
        args.gen_model == args.score_model1 and args.source_prefix_frac is None
    )
    rerank2_is_gen = (
        args.gen_model == args.score_model2 and args.source_prefix_frac is None
    )

    if args.nbest_list is not None:
        rerank2_is_gen = True

    # make directories to store preprocessed nbest list for reranking
    if not os.path.exists(left_to_right_preprocessed_dir):
        os.makedirs(left_to_right_preprocessed_dir)
    if not os.path.exists(right_to_left_preprocessed_dir):
        os.makedirs(right_to_left_preprocessed_dir)
    if not os.path.exists(lm_preprocessed_dir):
        os.makedirs(lm_preprocessed_dir)
    if not os.path.exists(backwards_preprocessed_dir):
        os.makedirs(backwards_preprocessed_dir)

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )
    if args.score_model2 is not None:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    using_nbest = args.nbest_list is not None

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list

    else:
        # an existing generation output is reused rather than regenerated
        if not os.path.isfile(predictions_bpe_file):
            print("STEP 1: generate predictions using the p(T|S) model with bpe")
            print(args.data)
            # NOTE(review): "--batch-size" appears twice in this list, first
            # with args.batch_size and then with args.num_rescore; confirm
            # which value is meant to take effect.
            param1 = [
                args.data,
                "--path",
                args.gen_model,
                "--shard-id",
                str(args.shard_id),
                "--num-shards",
                str(args.num_shards),
                "--nbest",
                str(args.num_rescore),
                "--batch-size",
                str(args.batch_size),
                "--beam",
                str(args.num_rescore),
                "--batch-size",
                str(args.num_rescore),
                "--gen-subset",
                args.gen_subset,
                "--source-lang",
                args.source_lang,
                "--target-lang",
                args.target_lang,
            ]
            if args.sampling:
                param1 += ["--sampling"]

            gen_parser = options.get_generation_parser()
            input_args = options.parse_args_and_arch(gen_parser, param1)

            print(input_args)
            # capture everything generate prints as the predictions file
            with open(predictions_bpe_file, "w") as f:
                with redirect_stdout(f):
                    generate.main(input_args)

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file,
        bpe_symbol=args.post_process,
        nbest=using_nbest,
        prefix_len=args.prefix_len,
        target_prefix_frac=args.target_prefix_frac,
    )

    if args.diff_bpe:
        # write the text without bpe, then re-apply args.rescore_bpe_code
        rerank_utils.write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            pre_gen + "/source_gen_bpe." + args.source_lang,
            pre_gen + "/target_gen_bpe." + args.target_lang,
            pre_gen + "/reference_gen_bpe." + args.target_lang,
        )
        bitext_bpe = args.rescore_bpe_code
        bpe_src_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/source_gen_bpe." + args.source_lang,
            "--output",
            pre_gen + "/rescore_data." + args.source_lang,
        ]
        bpe_tgt_param = [
            "-c",
            bitext_bpe,
            "--input",
            pre_gen + "/target_gen_bpe." + args.target_lang,
            "--output",
            pre_gen + "/rescore_data." + args.target_lang,
        ]

        subprocess.call(
            [
                "python",
                os.path.join(
                    os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"
                ),
            ]
            + bpe_src_param,
            shell=False,
        )

        subprocess.call(
            [
                "python",
                os.path.join(
                    os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"
                ),
            ]
            + bpe_tgt_param,
            shell=False,
        )

    # only reprocess when a needed score file is missing and that scorer is
    # not the generation model itself
    if (not os.path.isfile(score1_file) and not rerank1_is_gen) or (
        args.score_model2 is not None
        and not os.path.isfile(score2_file)
        and not rerank2_is_gen
    ):
        print(
            "STEP 2: process the output of generate.py so we have clean text files with the translations"
        )

        rescore_file = "/rescore_data"
        if args.prefix_len is not None:
            prefix_len_rescore_file = rescore_file + "prefix" + str(args.prefix_len)
        if args.target_prefix_frac is not None:
            target_prefix_frac_rescore_file = (
                rescore_file + "target_prefix_frac" + str(args.target_prefix_frac)
            )
        if args.source_prefix_frac is not None:
            source_prefix_frac_rescore_file = (
                rescore_file + "source_prefix_frac" + str(args.source_prefix_frac)
            )

        if not args.right_to_left1 or not args.right_to_left2:
            if not args.diff_bpe:
                rerank_utils.write_reprocessed(
                    gen_output.source,
                    gen_output.hypo,
                    gen_output.target,
                    pre_gen + rescore_file + "." + args.source_lang,
                    pre_gen + rescore_file + "." + args.target_lang,
                    pre_gen + "/reference_file",
                    bpe_symbol=args.post_process,
                )
                # bw_rescore_file: file prefix binarized for backwards scorers
                if args.prefix_len is not None:
                    bw_rescore_file = prefix_len_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen + prefix_len_rescore_file + "." + args.source_lang,
                        pre_gen + prefix_len_rescore_file + "." + args.target_lang,
                        pre_gen + "/reference_file",
                        prefix_len=args.prefix_len,
                        bpe_symbol=args.post_process,
                    )
                elif args.target_prefix_frac is not None:
                    bw_rescore_file = target_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen
                        + target_prefix_frac_rescore_file
                        + "."
                        + args.source_lang,
                        pre_gen
                        + target_prefix_frac_rescore_file
                        + "."
                        + args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        target_prefix_frac=args.target_prefix_frac,
                    )
                else:
                    bw_rescore_file = rescore_file

                # fw_rescore_file: file prefix binarized for forward scorers
                if args.source_prefix_frac is not None:
                    fw_rescore_file = source_prefix_frac_rescore_file
                    rerank_utils.write_reprocessed(
                        gen_output.source,
                        gen_output.hypo,
                        gen_output.target,
                        pre_gen
                        + source_prefix_frac_rescore_file
                        + "."
                        + args.source_lang,
                        pre_gen
                        + source_prefix_frac_rescore_file
                        + "."
                        + args.target_lang,
                        pre_gen + "/reference_file",
                        bpe_symbol=args.post_process,
                        source_prefix_frac=args.source_prefix_frac,
                    )
                else:
                    fw_rescore_file = rescore_file

        if args.right_to_left1 or args.right_to_left2:
            rerank_utils.write_reprocessed(
                gen_output.source,
                gen_output.hypo,
                gen_output.target,
                pre_gen + "/right_to_left_rescore_data." + args.source_lang,
                pre_gen + "/right_to_left_rescore_data." + args.target_lang,
                pre_gen + "/right_to_left_reference_file",
                right_to_left=True,
                bpe_symbol=args.post_process,
            )

        print("STEP 3: binarize the translations")
        if (
            not args.right_to_left1
            or args.score_model2 is not None
            and not args.right_to_left2
            or not rerank1_is_gen
        ):

            if args.backwards1 or args.backwards2:
                if args.backwards_score_dict_dir is not None:
                    bw_dict = args.backwards_score_dict_dir
                else:
                    bw_dict = args.score_dict_dir
                bw_preprocess_param = [
                    "--source-lang",
                    scorer1_src,
                    "--target-lang",
                    scorer1_tgt,
                    "--trainpref",
                    pre_gen + bw_rescore_file,
                    "--srcdict",
                    bw_dict + "/dict." + scorer1_src + ".txt",
                    "--tgtdict",
                    bw_dict + "/dict." + scorer1_tgt + ".txt",
                    "--destdir",
                    backwards_preprocessed_dir,
                ]
                preprocess_parser = options.get_preprocessing_parser()
                input_args = preprocess_parser.parse_args(bw_preprocess_param)
                preprocess.main(input_args)

            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + fw_rescore_file,
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                left_to_right_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

        if args.right_to_left1 or args.right_to_left2:
            preprocess_param = [
                "--source-lang",
                scorer1_src,
                "--target-lang",
                scorer1_tgt,
                "--trainpref",
                pre_gen + "/right_to_left_rescore_data",
                "--srcdict",
                args.score_dict_dir + "/dict." + scorer1_src + ".txt",
                "--tgtdict",
                args.score_dict_dir + "/dict." + scorer1_tgt + ".txt",
                "--destdir",
                right_to_left_preprocessed_dir,
            ]
            preprocess_parser = options.get_preprocessing_parser()
            input_args = preprocess_parser.parse_args(preprocess_param)
            preprocess.main(input_args)

    return gen_output
def get_reranking_parser(default_task="translation"):
    """Return the parser used for n-best generation and reranking.

    The parser comes from ``options.get_parser`` and is extended with the
    "Reranking" argument group.
    """
    parser = options.get_parser("Generation and reranking", default_task)
    add_reranking_args(parser)
    return parser


def get_tuning_parser(default_task="translation"):
    """Return the parser used for tuning the reranking weights.

    Same as :func:`get_reranking_parser` plus the "Tuning" argument group.
    """
    parser = options.get_parser("Reranking tuning", default_task)
    add_reranking_args(parser)
    add_tuning_args(parser)
    return parser


def add_reranking_args(parser):
    """Add the "Reranking" argument group to ``parser`` and return the group."""
    group = parser.add_argument_group("Reranking")
    # fmt: off
    group.add_argument('--score-model1', '-s1', type=str, metavar='FILE', required=True,
                       help='path to first model or ensemble of models for rescoring')
    group.add_argument('--score-model2', '-s2', type=str, metavar='FILE', required=False,
                       help='path to second model or ensemble of models for rescoring')
    group.add_argument('--num-rescore', '-n', type=int, metavar='N', default=10,
                       help='the number of candidate hypothesis to rescore')
    group.add_argument('-bz', '--batch-size', type=int, metavar='N', default=128,
                       help='batch size for generating the nbest list')
    group.add_argument('--gen-subset', default='test', metavar='SET', choices=['test', 'train', 'valid'],
                       help='data subset to generate (train, valid, test)')
    group.add_argument('--gen-model', default=None, metavar='FILE',
                       help='the model to generate translations')
    group.add_argument('-b1', '--backwards1', action='store_true',
                       help='whether or not the first model group is backwards')
    group.add_argument('-b2', '--backwards2', action='store_true',
                       help='whether or not the second model group is backwards')
    group.add_argument('-a', '--weight1', default=1, nargs='+', type=float,
                       help='the weight(s) of the first model')
    group.add_argument('-b', '--weight2', default=1, nargs='+', type=float,
                       help='the weight(s) of the second model, or the gen model if using nbest from interactive.py')
    group.add_argument('-c', '--weight3', default=1, nargs='+', type=float,
                       help='the weight(s) of the third model')

    # lm arguments
    group.add_argument('-lm', '--language-model', default=None, metavar='FILE',
                       help='language model for target language to rescore translations')
    group.add_argument('--lm-dict', default=None, metavar='FILE',
                       help='the dict of the language model for the target language')
    group.add_argument('--lm-name', default=None,
                       help='the name of the language model for the target language')
    group.add_argument('--lm-bpe-code', default=None, metavar='FILE',
                       help='the bpe code for the language model for the target language')
    group.add_argument('--data-dir-name', default=None,
                       help='name of data directory')
    group.add_argument('--lenpen', default=1, nargs='+', type=float,
                       help='length penalty: <1.0 favors shorter, >1.0 favors longer sentences')
    group.add_argument('--score-dict-dir', default=None,
                       help='the directory with dictionaries for the scoring models')
    group.add_argument('--right-to-left1', action='store_true',
                       help='whether the first model group is a right to left model')
    group.add_argument('--right-to-left2', action='store_true',
                       help='whether the second model group is a right to left model')
    group.add_argument('--post-process', '--remove-bpe', default='@@ ',
                       help='the bpe symbol, used for the bitext and LM')
    group.add_argument('--prefix-len', default=None, type=int,
                       help='the length of the target prefix to use in rescoring (in terms of words wo bpe)')
    group.add_argument('--sampling', action='store_true',
                       help='use sampling instead of beam search for generating n best list')
    group.add_argument('--diff-bpe', action='store_true',
                       help='bpe for rescoring and nbest list not the same')
    group.add_argument('--rescore-bpe-code', default=None,
                       help='bpe code for rescoring models')
    group.add_argument('--nbest-list', default=None,
                       help='use predefined nbest list in interactive.py format')
    group.add_argument('--write-hypos', default=None,
                       help='filename prefix to write hypos to')
    group.add_argument('--ref-translation', default=None,
                       help='reference translation to use with nbest list from interactive.py')
    # The two literals below are concatenated by the parser, so the first one
    # must end with a space or the help reads "model,if None".
    group.add_argument('--backwards-score-dict-dir', default=None,
                       help='the directory with dictionaries for the backwards model, '
                            'if None then it is assumed the fw and backwards models share dictionaries')

    # extra scaling args
    group.add_argument('--gen-model-name', default=None,
                       help='the name of the models that generated the nbest list')
    group.add_argument('--model1-name', default=None,
                       help='the name of the set for model1 group ')
    group.add_argument('--model2-name', default=None,
                       help='the name of the set for model2 group')
    group.add_argument('--shard-id', default=0, type=int,
                       help='the id of the shard to generate')
    group.add_argument('--num-shards', default=1, type=int,
                       help='the number of shards to generate across')
    group.add_argument('--all-shards', action='store_true',
                       help='use all shards')
    group.add_argument('--target-prefix-frac', default=None, type=float,
                       help='the fraction of the target prefix to use in rescoring (in terms of words wo bpe)')
    group.add_argument('--source-prefix-frac', default=None, type=float,
                       help='the fraction of the source prefix to use in rescoring (in terms of words wo bpe)')
    group.add_argument('--normalize', action='store_true',
                       help='whether to normalize by src and target len')
    # fmt: on
    return group


def add_tuning_args(parser):
    """Add the "Tuning" argument group to ``parser`` and return the group."""
    group = parser.add_argument_group("Tuning")

    group.add_argument(
        "--lower-bound",
        default=[-0.7],
        nargs="+",
        type=float,
        help="lower bound of search space",
    )
    group.add_argument(
        "--upper-bound",
        default=[3],
        nargs="+",
        type=float,
        help="upper bound of search space",
    )
    group.add_argument(
        "--tune-param",
        default=["lenpen"],
        nargs="+",
        choices=["lenpen", "weight1", "weight2", "weight3"],
        help="the parameter(s) to tune",
    )
    group.add_argument(
        "--tune-subset",
        default="valid",
        choices=["valid", "test", "train"],
        help="the subset to tune on ",
    )
    group.add_argument(
        "--num-trials",
        default=1000,
        type=int,
        help="number of trials to do for random search",
    )
    group.add_argument(
        "--share-weights", action="store_true", help="share weight2 and weight 3"
    )
    return group
def _scorer_languages(args, is_backwards):
    """Return the (source, target) languages a scoring model is run with."""
    if is_backwards:
        return args.target_lang, args.source_lang
    return args.source_lang, args.target_lang


def _pick_rerank_data(is_right_to_left, is_backwards, r2l_dir, bw_dir, l2r_dir):
    """Choose the binarized directory; right-to-left takes priority over backwards."""
    if is_right_to_left:
        return r2l_dir
    if is_backwards:
        return bw_dir
    return l2r_dir


def _score_to_file(data_dir, gen_param, model_path, src_lang, tgt_lang, out_path):
    """Run ``generate.main`` on ``data_dir`` and capture its stdout in ``out_path``."""
    model_param = [
        "--path",
        model_path,
        "--source-lang",
        src_lang,
        "--target-lang",
        tgt_lang,
    ]
    gen_parser = options.get_generation_parser()
    input_args = options.parse_args_and_arch(
        gen_parser, [data_dir] + gen_param + model_param
    )
    with open(out_path, "w") as f:
        with redirect_stdout(f):
            generate.main(input_args)


def score_bw(args):
    """Score the n-best list with the first and, if given, second scoring model.

    A model is skipped when it is the generation model itself (and no source
    prefix fraction is used) or when its score file already exists.
    """
    scorer1_src, scorer1_tgt = _scorer_languages(args, args.backwards1)

    has_model2 = args.score_model2 is not None
    if has_model2:
        scorer2_src, scorer2_tgt = _scorer_languages(args, args.backwards2)

    rerank1_is_gen = (
        args.gen_model == args.score_model1 and args.source_prefix_frac is None
    )
    rerank2_is_gen = (
        args.gen_model == args.score_model2 and args.source_prefix_frac is None
    )

    directories = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    pre_gen, l2r_dir, r2l_dir, bw_dir, _lm_dir = directories

    score1_file = rerank_utils.rescore_file_name(
        pre_gen,
        args.prefix_len,
        args.model1_name,
        target_prefix_frac=args.target_prefix_frac,
        source_prefix_frac=args.source_prefix_frac,
        backwards=args.backwards1,
    )
    if has_model2:
        score2_file = rerank_utils.rescore_file_name(
            pre_gen,
            args.prefix_len,
            args.model2_name,
            target_prefix_frac=args.target_prefix_frac,
            source_prefix_frac=args.source_prefix_frac,
            backwards=args.backwards2,
        )

    gen_param = ["--batch-size", str(128), "--score-reference", "--gen-subset", "train"]

    if not rerank1_is_gen and not os.path.isfile(score1_file):
        print("STEP 4: score the translations for model 1")
        _score_to_file(
            _pick_rerank_data(
                args.right_to_left1, args.backwards1, r2l_dir, bw_dir, l2r_dir
            ),
            gen_param,
            args.score_model1,
            scorer1_src,
            scorer1_tgt,
            score1_file,
        )

    if has_model2 and not os.path.isfile(score2_file) and not rerank2_is_gen:
        print("STEP 4: score the translations for model 2")
        _score_to_file(
            _pick_rerank_data(
                args.right_to_left2, args.backwards2, r2l_dir, bw_dir, l2r_dir
            ),
            gen_param,
            args.score_model2,
            scorer2_src,
            scorer2_tgt,
            score2_file,
        )


def cli_main():
    """Parse the reranking command line and run :func:`score_bw`."""
    args = options.parse_args_and_arch(rerank_options.get_reranking_parser())
    score_bw(args)


if __name__ == "__main__":
    cli_main()
def score_lm(args):
    """Score the n-best hypotheses with the target-side language model.

    The n-best list is always parsed; LM scoring itself only runs when a
    language model was given and its score file does not exist yet.
    """
    using_nbest = args.nbest_list is not None

    directories = rerank_utils.get_directories(
        args.data_dir_name,
        args.num_rescore,
        args.gen_subset,
        args.gen_model_name,
        args.shard_id,
        args.num_shards,
        args.sampling,
        args.prefix_len,
        args.target_prefix_frac,
        args.source_prefix_frac,
    )
    pre_gen = directories[0]
    lm_preprocessed_dir = directories[4]

    if using_nbest:
        print("Using predefined n-best list from interactive.py")
        predictions_bpe_file = args.nbest_list
    else:
        predictions_bpe_file = pre_gen + "/generate_output_bpe.txt"

    gen_output = rerank_utils.BitextOutputFromGen(
        predictions_bpe_file, bpe_symbol=args.post_process, nbest=using_nbest
    )

    if args.language_model is None:
        return

    lm_score_file = rerank_utils.rescore_file_name(
        pre_gen, args.prefix_len, args.lm_name, lm_file=True
    )
    if os.path.isfile(lm_score_file):
        return

    print("STEP 4.5: language modeling for P(T)")
    if args.lm_bpe_code is None:
        bpe_status = "no bpe"
    else:
        bpe_status = "shared" if args.lm_bpe_code == "shared" else "different"

    rerank_utils.lm_scoring(
        lm_preprocessed_dir,
        bpe_status,
        gen_output,
        pre_gen,
        args.lm_dict,
        args.lm_name,
        args.language_model,
        args.lm_bpe_code,
        128,
        lm_score_file,
        args.target_lang,
        args.source_lang,
        prefix_len=args.prefix_len,
    )


def cli_main():
    """Parse the reranking command line and run :func:`score_lm`."""
    args = options.parse_args_and_arch(rerank_options.get_reranking_parser())
    score_lm(args)


if __name__ == "__main__":
    cli_main()
def random_search(args):
    """Random-search the reranking hyper-parameters, then rerank with the best.

    Parameters named in ``args.tune_param`` are sampled uniformly between the
    matching ``lower_bound`` / ``upper_bound`` entries for ``num_trials``
    trials; the remaining ones keep the first value given on the command line.
    """
    fixed_names = ["lenpen", "weight1", "weight2", "weight3"]
    fixed_values = [
        given if type(given) is list else [given]
        for given in (args.lenpen, args.weight1, args.weight2, args.weight3)
    ]

    # whatever is tuned is removed from the "fixed" lists
    for pos, name in enumerate(args.tune_param):
        assert args.upper_bound[pos] >= args.lower_bound[pos]
        at = fixed_names.index(name)
        del fixed_names[at]
        del fixed_values[at]

    # column order of the trial matrix: tuned parameters first, fixed ones after
    column_names = args.tune_param.copy()
    column_names += fixed_names
    num_tuned = len(args.tune_param)

    random.seed(args.seed)
    sampled_rows = []
    for _ in range(args.num_trials):
        sampled_rows.append(
            [
                random.uniform(args.lower_bound[pos], args.upper_bound[pos])
                for pos in range(num_tuned)
            ]
        )
    constant_rows = [
        [values[0] for values in fixed_values] for _ in range(args.num_trials)
    ]
    random_params = np.concatenate(
        (np.array(sampled_rows), np.array(constant_rows)), 1
    )

    def namespace_with(overrides):
        # fresh copy of the parsed args with some entries replaced
        merged = vars(args).copy()
        merged.update(overrides)
        return argparse.Namespace(**merged)

    trial_overrides = {
        "gen_subset": "test" if args.nbest_list else args.tune_subset
    }
    for col, name in enumerate(column_names):
        trial_overrides[name] = list(random_params[:, col])
    if args.share_weights:
        weight2_col = column_names.index("weight2")
        trial_overrides["weight3"] = list(random_params[:, weight2_col])

    best_lenpen, best_weight1, best_weight2, best_weight3, best_score = rerank.rerank(
        namespace_with(trial_overrides)
    )

    def best_overrides(subset=None):
        # new single-element lists on every call, like the original code built
        chosen = {
            "lenpen": [best_lenpen],
            "weight1": [best_weight1],
            "weight2": [best_weight2],
            "weight3": [best_weight3],
        }
        if subset is not None:
            chosen["gen_subset"] = subset
        return chosen

    # write the hypothesis from the valid set from the best trial
    if args.gen_subset != "valid":
        rerank.rerank(namespace_with(best_overrides("valid")))

    # test with the best hyperparameters on gen subset
    rerank.rerank(namespace_with(best_overrides(args.gen_subset)))


def cli_main():
    """Parse the tuning command line and run :func:`random_search`."""
    args = options.parse_args_and_arch(rerank_options.get_tuning_parser())
    random_search(args)


if __name__ == "__main__":
    cli_main()
def reprocess(fle):
    """Parse the output of generate.py.

    Lines are recognised by a leading ``S-<id>``, ``T-<id>``, ``H-<id>`` or
    ``P-<id>`` tag; every other line is ignored.

    Returns five dicts keyed by the integer id:
    ``(source_dict, hypothesis_dict, score_dict, target_dict, pos_score_dict)``.
    Source and target values are strings. There may be several hypotheses per
    source, so the hypothesis, score and positional-score values are lists
    with one entry per ``H`` / ``P`` line.
    """
    with open(fle, "r") as f:
        txt = f.read()

    prefix_re = re.compile(r"[STHP][-]\d+\s*")
    score_re = re.compile(r"(\s*[-]?\d+[.]?\d+\s*)|(\s*(-inf)\s*)")

    source_dict = {}
    hypothesis_dict = {}
    score_dict = {}
    target_dict = {}
    pos_score_dict = {}

    for line in txt.split("\n"):
        line += "\n"
        # The tag must be at the start of the line. Searching anywhere would
        # let text such as "T-34" inside an unrelated line (e.g. a "D-" line)
        # be mistaken for a target entry.
        prefix = prefix_re.match(line)
        if prefix is None:
            continue
        assert len(prefix.group()) > 2, "prefix id not found"
        j = prefix.end()
        id_num = int(prefix.group()[2:])
        line_type = prefix.group()[0]

        if line_type == "H":
            h_txt = line[j:]
            hypo = score_re.search(h_txt)
            assert (
                hypo is not None
            ), "regular expression failed to find the hypothesis scoring"
            score = float(hypo.group())
            hypothesis_dict.setdefault(id_num, []).append(h_txt[hypo.end():])
            score_dict.setdefault(id_num, []).append(score)
        elif line_type == "S":
            source_dict[id_num] = line[j:]
        elif line_type == "T":
            target_dict[id_num] = line[j:]
        elif line_type == "P":
            pos_scores = [float(x) for x in line[j:].split()]
            pos_score_dict.setdefault(id_num, []).append(pos_scores)

    return source_dict, hypothesis_dict, score_dict, target_dict, pos_score_dict


def reprocess_nbest(fle):
    """Parse interactive.py output.

    ``O`` lines start a new sentence (ids count up from 0), ``H`` lines carry
    a score followed by a hypothesis and ``P`` lines carry positional scores.
    Targets are not available in this format, so every target is "filler".

    Returns the same five dicts as :func:`reprocess`.
    """
    with open(fle, "r") as f:
        txt = f.read()

    source_dict = {}
    hypothesis_dict = {}
    score_dict = {}
    target_dict = {}
    pos_score_dict = {}

    hp = re.compile(r"[-]?\d+[.]?\d+")
    j = -1  # id of the sentence currently being read

    for line in txt.split("\n"):
        line += "\n"
        line_type = line[0]

        if line_type == "H":
            hypo = hp.search(line)
            start_index = hypo.end()
            score_dict.setdefault(j, []).append(float(hypo.group()))
            hypothesis_dict.setdefault(j, []).append(line[start_index:].strip("\t"))
        elif line_type == "O":
            j += 1
            source_dict[j] = line[2:]
            # we don't have the targets for interactive.py
            target_dict[j] = "filler"
        elif line_type == "P":
            pos_scores = [float(pos_score) for pos_score in line.split()[1:]]
            pos_score_dict.setdefault(j, []).append(pos_scores)

    assert source_dict.keys() == hypothesis_dict.keys()
    assert source_dict.keys() == pos_score_dict.keys()
    assert source_dict.keys() == score_dict.keys()

    return source_dict, hypothesis_dict, score_dict, target_dict, pos_score_dict
def _sentence_ids(sources):
    """Return the ids to iterate over: sorted keys for a mapping, positions otherwise."""
    if hasattr(sources, "keys"):
        return sorted(sources.keys())
    return range(len(sources))


def write_reprocessed(
    sources,
    hypos,
    targets,
    source_outfile,
    hypo_outfile,
    target_outfile,
    right_to_left=False,
    prefix_len=None,
    bpe_symbol=None,
    target_prefix_frac=None,
    source_prefix_frac=None,
):
    """writes nbest hypothesis for rescoring

    For every hypothesis of every sentence one line is written to each of the
    three output files, so the source and target are repeated once per
    hypothesis.

    Args:
        sources, targets: id -> sentence.
        hypos: id -> list of hypothesis sentences.
        right_to_left: write every sentence with its tokens reversed.
        prefix_len: keep only that many words (counted without bpe) of each
            hypothesis.
        target_prefix_frac: keep only that fraction of each hypothesis.
        source_prefix_frac: keep only that fraction of each source.

    Raises:
        NotImplementedError: if ``prefix_len`` is combined with
            ``right_to_left`` and there is at least one hypothesis to write.
    """
    assert not (
        prefix_len is not None and target_prefix_frac is not None
    ), "in writing reprocessed, only one type of prefix may be used"
    assert not (
        prefix_len is not None and source_prefix_frac is not None
    ), "in writing reprocessed, only one type of prefix may be used"
    assert not (
        target_prefix_frac is not None and source_prefix_frac is not None
    ), "in writing reprocessed, only one type of prefix may be used"

    with open(source_outfile, "w") as source_file, open(
        hypo_outfile, "w"
    ) as hypo_file, open(target_outfile, "w") as target_file:

        assert len(sources) == len(hypos), "sources and hypos list length mismatch"

        # Both directions walk the ids in the same sorted order. Indexing with
        # range(len(sources)) breaks as soon as the ids are not 0..n-1.
        sentence_ids = _sentence_ids(sources)

        if right_to_left:
            for i in sentence_ids:
                for hypo in hypos[i]:
                    if prefix_len is not None:
                        raise NotImplementedError(
                            "prefix_len is not supported together with right_to_left"
                        )
                    hypo_file.write(make_right_to_left(hypo) + "\n")
                    source_file.write(make_right_to_left(sources[i]) + "\n")
                    target_file.write(make_right_to_left(targets[i]) + "\n")
        else:
            for i in sentence_ids:
                for hypo in hypos[i]:
                    hypo_line = hypo
                    source_line = sources[i]
                    if prefix_len is not None:
                        hypo_line = (
                            get_prefix_no_bpe(hypo, bpe_symbol, prefix_len) + "\n"
                        )
                    elif target_prefix_frac is not None:
                        _, shortened, _ = calc_length_from_frac(
                            hypo, target_prefix_frac, bpe_symbol
                        )
                        hypo_line = shortened + "\n"
                    elif source_prefix_frac is not None:
                        _, shortened, _ = calc_length_from_frac(
                            sources[i], source_prefix_frac, bpe_symbol
                        )
                        source_line = shortened + "\n"
                    hypo_file.write(hypo_line)
                    source_file.write(source_line)
                    target_file.write(targets[i])


def calc_length_from_frac(bpe_sentence, prefix_frac, bpe_symbol):
    """Take the first ``prefix_frac`` of a sentence, counted in words (not bpe tokens).

    Returns ``(num_words, prefix, num_bpe_tokens)``: the rounded-up number of
    words kept, the prefix text, and the number of whitespace-separated tokens
    in that prefix.
    """
    # return number of words, (not bpe tokens) that we want
    no_bpe_sen = remove_bpe(bpe_sentence, bpe_symbol)
    len_sen = len(no_bpe_sen.split())

    num_words = math.ceil(len_sen * prefix_frac)
    prefix = get_prefix_no_bpe(bpe_sentence, bpe_symbol, num_words)
    num_bpe_tokens = len(prefix.split())
    return num_words, prefix, num_bpe_tokens


def get_prefix(sentence, prefix_len):
    """assuming no bpe, gets the prefix of the sentence with prefix_len words"""
    tokens = sentence.strip("\n").split()
    if prefix_len >= len(tokens):
        return sentence.strip("\n")
    return " ".join(tokens[:prefix_len])


def get_prefix_no_bpe(sentence, bpe_symbol, prefix_len):
    """Return the first ``prefix_len`` words of ``sentence``, keeping bpe pieces whole."""
    if bpe_symbol is None:
        return get_prefix(sentence, prefix_len)
    return " ".join(get_prefix_from_len(sentence.split(), bpe_symbol, prefix_len))


def get_prefix_from_len(sentence, bpe_symbol, prefix_len):
    """get the prefix of sentence with bpe, with prefix len in terms of words, not bpe tokens

    ``sentence`` is a list of tokens. Every token in the first ``prefix_len``
    that contains the bpe marker does not finish a word, so that many extra
    tokens are taken from the remainder, recursively.
    """
    marker = bpe_symbol.strip(" ")
    head = sentence[:prefix_len]
    bpe_count = sum(marker in t for t in head)
    if bpe_count == 0:
        return head
    return head + get_prefix_from_len(sentence[prefix_len:], bpe_symbol, bpe_count)


def get_num_bpe_tokens_from_len(sentence, bpe_symbol, prefix_len):
    """given a prefix length in terms of words, return the number of bpe tokens"""
    prefix = get_prefix_no_bpe(sentence, bpe_symbol, prefix_len)
    assert len(remove_bpe(prefix, bpe_symbol).split()) <= prefix_len
    # split() rather than split(" "): an empty prefix has zero tokens, and
    # irregular whitespace must not be counted as tokens.
    return len(prefix.split())


def make_right_to_left(line):
    """Return ``line`` with its whitespace-separated tokens in reverse order."""
    return " ".join(reversed(line.split()))


def remove_bpe(line, bpe_symbol):
    """Strip ``bpe_symbol`` from ``line`` and return it terminated by one newline.

    With ``bpe_symbol=None`` nothing is merged; the line is only normalised.
    """
    line = line.replace("\n", "")
    if bpe_symbol is not None:
        # the padding space lets a symbol like "@@ " match at the end of the line
        line = (line + " ").replace(bpe_symbol, "")
    return line.rstrip() + "\n"


def remove_bpe_dict(pred_dict, bpe_symbol):
    """Apply :func:`remove_bpe` to every value (a string or a list of strings)."""
    new_dict = {}
    for i, value in pred_dict.items():
        if isinstance(value, list):
            new_dict[i] = [remove_bpe(elem, bpe_symbol) for elem in value]
        else:
            new_dict[i] = remove_bpe(value, bpe_symbol)
    return new_dict


def parse_bleu_scoring(line):
    """Extract the number following "BLEU4 = " from ``line``."""
    res = re.search(r"(BLEU4 = )\d+[.]\d+", line)
    assert res is not None, line
    return float(res.group()[len("BLEU4 = "):])


def get_full_from_prefix(hypo_prefix, hypos):
    """given a hypo prefix, recover the first hypo from the list of complete hypos beginning with that prefix

    Raises:
        ValueError: if no hypothesis starts with the prefix.
    """
    hypo_prefix = hypo_prefix.strip("\n")
    for hypo in hypos:
        if hypo[: len(hypo_prefix)] == hypo_prefix:
            return hypo
    # no match found
    raise ValueError("no hypothesis starts with prefix {!r}".format(hypo_prefix))


def get_score(
    a,
    b,
    c,
    target_len,
    bitext_score1,
    bitext_score2=None,
    lm_score=None,
    lenpen=None,
    src_len=None,
    tgt_len=None,
    bitext1_backwards=False,
    bitext2_backwards=False,
    normalize=False,
):
    """Combine the model scores into one reranking score.

    The score is ``a * bitext_score1 + b * bitext_score2 + c * lm_score``; a
    missing ``bitext_score2`` or ``lm_score`` contributes 0. With
    ``normalize`` each bitext term is divided by ``src_len`` for a backwards
    model and ``tgt_len`` otherwise, and the LM term by ``src_len``. When
    ``lenpen`` is given the result is divided by ``target_len ** lenpen``.
    """
    bitext1_norm = src_len if bitext1_backwards else tgt_len

    if bitext_score2 is not None:
        bitext2_norm = src_len if bitext2_backwards else tgt_len
    else:
        bitext2_norm = 1
        bitext_score2 = 0

    if lm_score is None:
        # lm_score defaults to None; without this the sum below raises TypeError
        lm_score = 0

    if normalize:
        # NOTE(review): the LM term is divided by src_len, as in the original
        # code - confirm that tgt_len was not intended.
        score = (
            a * bitext_score1 / bitext1_norm
            + b * bitext_score2 / bitext2_norm
            + c * lm_score / src_len
        )
    else:
        score = a * bitext_score1 + b * bitext_score2 + c * lm_score

    if lenpen is not None:
        score /= (target_len) ** float(lenpen)

    return score
class BitextOutput(object):
    # Per-sentence sources, hypotheses, scores and lengths read from the
    # output file of one rescoring model.

    def __init__(
        self,
        output_file,
        backwards,
        right_to_left,
        bpe_symbol,
        prefix_len=None,
        target_prefix_frac=None,
        source_prefix_frac=None,
    ):
        """process output from rescoring

        The file is parsed with ``reprocess``. Scores are recomputed as sums
        of positional scores, and all text has ``bpe_symbol`` removed. When
        ``backwards`` is true the parsed source and hypothesis dicts swap
        roles. When ``right_to_left`` is true (and ``backwards`` is false)
        the token order of source, target and hypothesis is reversed.
        """
        source, hypo, score, target, pos_score = reprocess(output_file)
        # the prefix fraction applies to whatever the scorer saw as its hypothesis
        if backwards:
            self.hypo_fracs = source_prefix_frac
        else:
            self.hypo_fracs = target_prefix_frac

        # remove length penalty so we can use raw scores
        score, num_bpe_tokens = get_score_from_pos(
            pos_score, prefix_len, hypo, bpe_symbol, self.hypo_fracs, backwards
        )
        source_lengths = {}
        target_lengths = {}

        assert hypo.keys() == source.keys(), "key mismatch"
        if backwards:
            # after the swap `source` maps id -> list and `hypo` maps id -> str
            tmp = hypo
            hypo = source
            source = tmp
        # The loop replaces each entry in place: list values are reduced to
        # their first element and bpe is stripped from the text.
        for i in source:
            # since we are reranking, there should only be one hypo per source sentence
            if backwards:
                len_src = len(source[i][0].split())
                # record length without <eos>
                if len_src == num_bpe_tokens[i][0] - 1:
                    source_lengths[i] = num_bpe_tokens[i][0] - 1
                else:
                    source_lengths[i] = num_bpe_tokens[i][0]

                target_lengths[i] = len(hypo[i].split())

                source[i] = remove_bpe(source[i][0], bpe_symbol)
                target[i] = remove_bpe(target[i], bpe_symbol)
                hypo[i] = remove_bpe(hypo[i], bpe_symbol)

                score[i] = float(score[i][0])
                pos_score[i] = pos_score[i][0]

            else:
                len_tgt = len(hypo[i][0].split())
                # record length without <eos>
                if len_tgt == num_bpe_tokens[i][0] - 1:
                    target_lengths[i] = num_bpe_tokens[i][0] - 1
                else:
                    target_lengths[i] = num_bpe_tokens[i][0]

                source_lengths[i] = len(source[i].split())

                if right_to_left:
                    source[i] = remove_bpe(make_right_to_left(source[i]), bpe_symbol)
                    target[i] = remove_bpe(make_right_to_left(target[i]), bpe_symbol)
                    hypo[i] = remove_bpe(make_right_to_left(hypo[i][0]), bpe_symbol)
                    score[i] = float(score[i][0])
                    pos_score[i] = pos_score[i][0]
                else:
                    assert (
                        len(hypo[i]) == 1
                    ), "expected only one hypothesis per source sentence"
                    source[i] = remove_bpe(source[i], bpe_symbol)
                    target[i] = remove_bpe(target[i], bpe_symbol)
                    hypo[i] = remove_bpe(hypo[i][0], bpe_symbol)
                    score[i] = float(score[i][0])
                    pos_score[i] = pos_score[i][0]

        self.rescore_source = source
        self.rescore_hypo = hypo
        self.rescore_score = score
        self.rescore_target = target
        self.rescore_pos_score = pos_score
        self.backwards = backwards
        self.right_to_left = right_to_left
        self.target_lengths = target_lengths
        self.source_lengths = source_lengths


class BitextOutputFromGen(object):
    # The n-best list of the generation model, kept both per sentence id and
    # flattened to one entry per hypothesis.

    def __init__(
        self,
        predictions_bpe_file,
        bpe_symbol=None,
        nbest=False,
        prefix_len=None,
        target_prefix_frac=None,
    ):
        """Parse an n-best list.

        ``predictions_bpe_file`` is read with ``reprocess_nbest`` when
        ``nbest`` is true and with ``reprocess`` otherwise.
        """
        if nbest:
            (
                pred_source,
                pred_hypo,
                pred_score,
                pred_target,
                pred_pos_score,
            ) = reprocess_nbest(predictions_bpe_file)
        else:
            pred_source, pred_hypo, pred_score, pred_target, pred_pos_score = reprocess(
                predictions_bpe_file
            )

        assert len(pred_source) == len(pred_hypo)
        assert len(pred_source) == len(pred_score)
        assert len(pred_source) == len(pred_target)
        assert len(pred_source) == len(pred_pos_score)

        # remove length penalty so we can use raw scores
        pred_score, num_bpe_tokens = get_score_from_pos(
            pred_pos_score, prefix_len, pred_hypo, bpe_symbol, target_prefix_frac, False
        )

        # dicts keyed by sentence id, text still bpe-encoded
        self.source = pred_source
        self.target = pred_target
        self.score = pred_score
        self.pos_score = pred_pos_score
        self.hypo = pred_hypo
        self.target_lengths = {}
        self.source_lengths = {}

        # the same text with bpe removed
        self.no_bpe_source = remove_bpe_dict(pred_source.copy(), bpe_symbol)
        self.no_bpe_hypo = remove_bpe_dict(pred_hypo.copy(), bpe_symbol)
        self.no_bpe_target = remove_bpe_dict(pred_target.copy(), bpe_symbol)

        # indexes to match those from the rescoring models
        self.rescore_source = {}
        self.rescore_target = {}
        self.rescore_pos_score = {}
        self.rescore_hypo = {}
        self.rescore_score = {}
        self.num_hypos = {}
        self.backwards = False
        self.right_to_left = False

        # one running index per (sentence, hypothesis) pair, in sorted id order
        index = 0

        for i in sorted(pred_source.keys()):
            for j in range(len(pred_hypo[i])):

                # lengths are token counts of the bpe-encoded text
                self.target_lengths[index] = len(self.hypo[i][j].split())
                self.source_lengths[index] = len(self.source[i].split())

                self.rescore_source[index] = self.no_bpe_source[i]
                self.rescore_target[index] = self.no_bpe_target[i]
                self.rescore_hypo[index] = self.no_bpe_hypo[i][j]
                self.rescore_score[index] = float(pred_score[i][j])
                self.rescore_pos_score[index] = pred_pos_score[i][j]
                self.num_hypos[index] = len(pred_hypo[i])
                index += 1
def get_score_from_pos(
    pos_score_dict, prefix_len, hypo_dict, bpe_symbol, hypo_frac, backwards
):
    """Sum positional scores into one score per hypothesis.

    With ``prefix_len`` (forward models only) or ``hypo_frac`` only the
    positional scores of the corresponding hypothesis prefix are summed;
    otherwise all of them are.

    Returns ``(score_dict, num_bpe_tokens_dict)``; both map each key of
    ``pos_score_dict`` to a list with one entry per hypothesis.
    """
    score_dict = {}
    num_bpe_tokens_dict = {}
    assert prefix_len is None or hypo_frac is None
    for key in pos_score_dict:
        score_dict[key] = []
        num_bpe_tokens_dict[key] = []
        for i, pos_scores in enumerate(pos_score_dict[key]):
            if prefix_len is not None and not backwards:
                num_bpe_tokens = get_num_bpe_tokens_from_len(
                    hypo_dict[key][i], bpe_symbol, prefix_len
                )
                score_dict[key].append(sum(pos_scores[:num_bpe_tokens]))
                num_bpe_tokens_dict[key].append(num_bpe_tokens)
            elif hypo_frac is not None:
                _, _, hypo_prefix_len = calc_length_from_frac(
                    hypo_dict[key][i], hypo_frac, bpe_symbol
                )
                score_dict[key].append(sum(pos_scores[:hypo_prefix_len]))
                num_bpe_tokens_dict[key].append(hypo_prefix_len)
            else:
                score_dict[key].append(sum(pos_scores))
                num_bpe_tokens_dict[key].append(len(pos_scores))
    return score_dict, num_bpe_tokens_dict


class LMOutput(object):
    # Language-model scores read from a score file via parse_lm.

    def __init__(
        self,
        lm_score_file,
        lm_dict=None,
        prefix_len=None,
        bpe_symbol=None,
        target_prefix_frac=None,
    ):
        """Parse ``lm_score_file`` with :func:`parse_lm` and store the results."""
        (
            lm_sentences,
            lm_sen_scores,
            lm_sen_pos_scores,
            lm_no_bpe_sentences,
            lm_bpe_tokens,
        ) = parse_lm(
            lm_score_file,
            prefix_len=prefix_len,
            bpe_symbol=bpe_symbol,
            target_prefix_frac=target_prefix_frac,
        )

        self.sentences = lm_sentences
        self.score = lm_sen_scores
        self.pos_score = lm_sen_pos_scores
        self.lm_dict = lm_dict
        self.no_bpe_sentences = lm_no_bpe_sentences
        self.bpe_tokens = lm_bpe_tokens


def parse_lm(input_file, prefix_len=None, bpe_symbol=None, target_prefix_frac=None):
    """parse output of eval_lm

    The first 7 and the last 2 lines of the file are dropped. A remaining
    line is used when its first token is a number (the sentence id); the
    rest of the line is read as alternating ``token [score]`` pairs, the last
    pair being excluded from the sentence text.

    Returns ``(sentences, sen_scores, sen_pos_scores, no_bpe_sentences,
    num_bpe_tokens_dict)``, all keyed by sentence id.
    """
    with open(input_file, "r") as f:
        text = f.readlines()
        text = text[7:]
        cleaned_text = text[:-2]

        sentences = {}
        sen_scores = {}
        sen_pos_scores = {}
        no_bpe_sentences = {}
        num_bpe_tokens_dict = {}
        for line in cleaned_text:
            tokens = line.split()
            # a blank line has no tokens; skip it instead of failing on tokens[0]
            if not tokens or not tokens[0].isdigit():
                continue
            line_id = int(tokens[0])
            # scores are written as "[<float>]"; strip the brackets
            scores = [float(x[1:-1]) for x in tokens[2::2]]
            sentences[line_id] = " ".join(tokens[1::2][:-1]) + "\n"
            if bpe_symbol is not None:
                # exclude <eos> symbol to match output from generate.py
                bpe_sen = " ".join(tokens[1::2][:-1]) + "\n"
                no_bpe_sen = remove_bpe(bpe_sen, bpe_symbol)
                no_bpe_sentences[line_id] = no_bpe_sen

            if prefix_len is not None:
                num_bpe_tokens = get_num_bpe_tokens_from_len(
                    bpe_sen, bpe_symbol, prefix_len
                )
                sen_scores[line_id] = sum(scores[:num_bpe_tokens])
                num_bpe_tokens_dict[line_id] = num_bpe_tokens
            elif target_prefix_frac is not None:
                _, _, target_prefix_len = calc_length_from_frac(
                    bpe_sen, target_prefix_frac, bpe_symbol
                )
                sen_scores[line_id] = sum(scores[:target_prefix_len])
                num_bpe_tokens_dict[line_id] = target_prefix_len
            else:
                sen_scores[line_id] = sum(scores)
                num_bpe_tokens_dict[line_id] = len(scores)

            sen_pos_scores[line_id] = scores

    return sentences, sen_scores, sen_pos_scores, no_bpe_sentences, num_bpe_tokens_dict


def get_directories(
    data_dir_name,
    num_rescore,
    gen_subset,
    fw_name,
    shard_id,
    num_shards,
    sampling=False,
    prefix_len=None,
    target_prefix_frac=None,
    source_prefix_frac=None,
):
    """Build the directory names used for one n-best list.

    Everything lives under ``rerank_data/<data_dir_name>/<nbest id>`` next to
    this module.

    Returns ``(pre_gen, left_to_right_preprocessed_dir,
    right_to_left_preprocessed_dir, backwards_preprocessed_dir,
    lm_preprocessed_dir)``.
    """
    nbest_file_id = (
        "nbest_"
        + str(num_rescore)
        + "_subset_"
        + gen_subset
        + "_fw_name_"
        + fw_name
        + "_shard_"
        + str(shard_id)
        + "_of_"
        + str(num_shards)
    )

    if sampling:
        nbest_file_id += "_sampling"

    # the directory containing all information for this nbest list
    pre_gen = (
        os.path.join(os.path.dirname(__file__))
        + "/rerank_data/"
        + data_dir_name
        + "/"
        + nbest_file_id
    )
    # the directory to store the preprocessed nbest list, for left to right rescoring
    left_to_right_preprocessed_dir = pre_gen + "/left_to_right_preprocessed"
    if source_prefix_frac is not None:
        left_to_right_preprocessed_dir = (
            left_to_right_preprocessed_dir + "/prefix_frac" + str(source_prefix_frac)
        )
    # the directory to store the preprocessed nbest list, for right to left rescoring
    right_to_left_preprocessed_dir = pre_gen + "/right_to_left_preprocessed"
    # the directory to store the preprocessed nbest list, for backwards rescoring
    backwards_preprocessed_dir = pre_gen + "/backwards"
    if target_prefix_frac is not None:
        backwards_preprocessed_dir = (
            backwards_preprocessed_dir + "/prefix_frac" + str(target_prefix_frac)
        )
    elif prefix_len is not None:
        backwards_preprocessed_dir = (
            backwards_preprocessed_dir + "/prefix_" + str(prefix_len)
        )

    # the directory to store the preprocessed nbest list, for rescoring with P(T)
    lm_preprocessed_dir = pre_gen + "/lm_preprocessed"

    return (
        pre_gen,
        left_to_right_preprocessed_dir,
        right_to_left_preprocessed_dir,
        backwards_preprocessed_dir,
        lm_preprocessed_dir,
    )
def _binarize_for_lm(trainpref, lm_dict, destdir):
    """Binarize the text file ``trainpref`` with the LM dictionary into ``destdir``."""
    preprocess_lm_param = [
        "--only-source",
        "--trainpref",
        trainpref,
        "--srcdict",
        lm_dict,
        "--destdir",
        destdir,
    ]
    preprocess_parser = options.get_preprocessing_parser()
    input_args = preprocess_parser.parse_args(preprocess_lm_param)
    preprocess.main(input_args)


def _eval_lm_to_file(data_dir, language_model, batch_size, lm_score_file, max_tokens):
    """Run ``eval_lm`` on ``data_dir`` and capture its stdout in ``lm_score_file``.

    ``max_tokens`` is passed as ``--max-tokens`` unless it is None.
    """
    eval_lm_param = [
        data_dir,
        "--path",
        language_model,
        "--output-word-probs",
        "--batch-size",
        str(batch_size),
    ]
    if max_tokens is not None:
        eval_lm_param += ["--max-tokens", max_tokens]
    eval_lm_param += [
        "--sample-break-mode",
        "eos",
        "--gen-subset",
        "train",
    ]

    eval_lm_parser = options.get_eval_lm_parser()
    input_args = options.parse_args_and_arch(eval_lm_parser, eval_lm_param)

    with open(lm_score_file, "w") as f:
        with redirect_stdout(f):
            eval_lm.main(input_args)


def lm_scoring(
    preprocess_directory,
    bpe_status,
    gen_output,
    pre_gen,
    cur_lm_dict,
    cur_lm_name,
    cur_language_model,
    cur_lm_bpe_code,
    batch_size,
    lm_score_file,
    target_lang,
    source_lang,
    prefix_len=None,
):
    """Score the n-best hypotheses with a language model.

    ``bpe_status`` selects how the hypotheses are prepared:

    * ``"no bpe"``: the hypotheses are written without bpe and scored as is;
    * ``"shared"``: the existing ``<pre_gen>/rescore_data.<target_lang>``
      file is scored directly;
    * ``"different"``: the hypotheses are written without bpe and re-encoded
      with ``cur_lm_bpe_code`` through subword-nmt before scoring.

    Any other value does nothing. The ``eval_lm`` output is written to
    ``lm_score_file``.
    """
    if prefix_len is not None:
        assert (
            bpe_status == "different"
        ), "bpe status must be different to use prefix len"
    if bpe_status == "no bpe":
        # run lm on output without bpe
        # The file suffixes follow the actual language pair so that the hypo
        # file written here is the one binarized below. Hard-coded ".de" /
        # ".en" suffixes only line up when target_lang is "en".
        rescore_file = pre_gen + "/rescore_data_no_bpe."
        write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            rescore_file + source_lang,
            rescore_file + target_lang,
            pre_gen + "/reference_file_no_bpe",
        )

        _binarize_for_lm(rescore_file + target_lang, cur_lm_dict, preprocess_directory)
        _eval_lm_to_file(
            preprocess_directory, cur_language_model, batch_size, lm_score_file, "1024"
        )

    elif bpe_status == "shared":
        _binarize_for_lm(
            pre_gen + "/rescore_data." + target_lang, cur_lm_dict, preprocess_directory
        )
        # this branch has never passed --max-tokens
        _eval_lm_to_file(
            preprocess_directory, cur_language_model, batch_size, lm_score_file, None
        )

    elif bpe_status == "different":
        rescore_file = pre_gen + "/rescore_data_no_bpe"
        rescore_bpe = pre_gen + "/rescore_data_new_bpe"

        rescore_file += "."
        rescore_bpe += "."

        write_reprocessed(
            gen_output.no_bpe_source,
            gen_output.no_bpe_hypo,
            gen_output.no_bpe_target,
            rescore_file + source_lang,
            rescore_file + target_lang,
            pre_gen + "/reference_file_no_bpe",
            bpe_symbol=None,
        )

        # apply LM bpe to nbest list
        bpe_src_param = [
            "-c",
            cur_lm_bpe_code,
            "--input",
            rescore_file + target_lang,
            "--output",
            rescore_bpe + target_lang,
        ]
        subprocess.call(
            [
                "python",
                os.path.join(
                    os.path.dirname(__file__), "subword-nmt/subword_nmt/apply_bpe.py"
                ),
            ]
            + bpe_src_param,
            shell=False,
        )
        # uncomment to use fastbpe instead of subword-nmt bpe
        # bpe_src_param = [rescore_bpe+target_lang, rescore_file+target_lang, cur_lm_bpe_code]
        # subprocess.call(["/private/home/edunov/fastBPE/fast", "applybpe"] + bpe_src_param, shell=False)

        _binarize_for_lm(rescore_bpe + target_lang, cur_lm_dict, preprocess_directory)
        _eval_lm_to_file(
            preprocess_directory, cur_language_model, batch_size, lm_score_file, "1024"
        )


def rescore_file_name(
    nbest_dir,
    prefix_len,
    scorer_name,
    lm_file=False,
    target_prefix_frac=None,
    source_prefix_frac=None,
    backwards=None,
):
    """Return the path of the score file for one scorer.

    A prefix marker is appended after the ``.txt`` suffix: for a backwards
    scorer ``prefix_len`` (or else ``target_prefix_frac``) if given, otherwise
    ``source_prefix_frac`` if given.
    """
    if lm_file:
        score_file = nbest_dir + "/lm_score_translations_model_" + scorer_name + ".txt"
    else:
        score_file = nbest_dir + "/" + scorer_name + "_score_translations.txt"
    if backwards:
        if prefix_len is not None:
            score_file += "prefix_len" + str(prefix_len)
        elif target_prefix_frac is not None:
            score_file += "target_prefix_frac" + str(target_prefix_frac)
    else:
        if source_prefix_frac is not None:
            score_file += "source_prefix_frac" + str(source_prefix_frac)
    return score_file
We also provided our own implementations for several popular non-autoregressive-based models as reference:<br> * [Non-Autoregressive Neural Machine Translation (Gu et al., 2017)](https://arxiv.org/abs/1711.02281)<br> * [Deterministic Non-Autoregressive Neural Sequence Modeling by Iterative Refinement (Lee et al., 2018)](https://arxiv.org/abs/1802.06901)<br> * [Insertion Transformer: Flexible Sequence Generation via Insertion Operations (Stern et al., 2019)](https://arxiv.org/abs/1902.03249)<br> * [Mask-Predict: Parallel Decoding of Conditional Masked Language Models (Ghazvininejad et al., 2019)](https://arxiv.org/abs/1904.09324v2)<br> * [Fast Structured Decoding for Sequence Models (Sun et al., 2019)](https://arxiv.org/abs/1910.11555) ## Dataset First, follow the [instructions to download and preprocess the WMT'14 En-De dataset](../translation#wmt14-english-to-german-convolutional). Make sure to learn a joint vocabulary by passing the `--joined-dictionary` option to `fairseq-preprocess`. ### Knowledge Distillation Following [Gu et al. 2019](https://arxiv.org/abs/1905.11006), [knowledge distillation](https://arxiv.org/abs/1606.07947) from an autoregressive model can effectively simplify the training data distribution, which is sometimes essential for NAT-based models to learn good translations. The easiest way of performing distillation is to follow the [instructions of training a standard transformer model](../translation) on the same data, and then decode the training set to produce a distillation dataset for NAT. ### Download We also provided the preprocessed [original](http://dl.fbaipublicfiles.com/nat/original_dataset.zip) and [distillation](http://dl.fbaipublicfiles.com/nat/distill_dataset.zip) datasets. Please build the binarized dataset on your own. ## Train a model Then we can train a nonautoregressive model using the `translation_lev` task and a new criterion `nat_loss`. Use the `--noise` flag to specify the input noise used on the target sentences. 
By default, we run the task for *Levenshtein Transformer*, with `--noise='random_delete'`. Full scripts to run other models can also be found [here](./scripts.md). The following command will train a *Levenshtein Transformer* on the binarized dataset. ```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ --noise random_delete \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ## Translate Once a model is trained, we can generate translations using an `iterative_refinement_generator`, which starts from the model's initial output and iteratively reads and greedily refines the translation until (1) the model predicts the same translations for two consecutive iterations; or (2) the generator reaches the maximum iterations (`--iter-decode-max-iter`). Use `--print-step` to check the actual number of iterations for each sentence. For *Levenshtein Transformer*, it sometimes helps to apply a `--iter-decode-eos-penalty` (typically, 0~3) to penalize the model finishing generation too early and generating too short translations. For example, to generate with `--iter-decode-max-iter=9`: ```bash fairseq-generate \ data-bin/wmt14_en_de_distill \ --gen-subset test \ --task translation_lev \ --path checkpoints/checkpoint_best.pt \ --iter-decode-max-iter 9 \ --iter-decode-eos-penalty 0 \ --beam 1 --remove-bpe \ --print-step \ --batch-size 400 ``` At the end of generation, we can see the tokenized BLEU score for the translation.
## Advanced Decoding Methods ### Ensemble The NAT models use special implementations of [ensembling](https://github.com/fairinternal/fairseq-py/blob/b98d88da52f2f21f1b169bab8c70c1c4ca19a768/fairseq/sequence_generator.py#L522) to support iterative refinement and a variety of parallel operations in different models, while it shares the same API as standard autoregressive models as follows: ```bash fairseq-generate \ data-bin/wmt14_en_de_distill \ --gen-subset test \ --task translation_lev \ --path checkpoint_1.pt:checkpoint_2.pt:checkpoint_3.pt \ --iter-decode-max-iter 9 \ --iter-decode-eos-penalty 0 \ --beam 1 --remove-bpe \ --print-step \ --batch-size 400 ``` We use ``:`` to split multiple models. Note that, not all NAT models support ensembling for now. ### Length-beam For models that predict lengths before decoding (e.g. the vanilla NAT, Mask-Predict, etc), it is possible to improve the translation quality by varying the target lengths around the predicted value, and translating the same example multiple times in parallel. We can select the best translation with the highest scores defined by your model's output. Note that, not all models support length beams. For models which dynamically change the lengths (e.g. *Insertion Transformer*, *Levenshtein Transformer*), the same trick does not apply. ### Re-ranking If the model generates multiple translations with length beam, we can also introduce an autoregressive model to rerank the translations considering scoring from an autoregressive model is much faster than decoding from that. 
For example, to generate translations with length beam and reranking, ```bash fairseq-generate \ data-bin/wmt14_en_de_distill \ --gen-subset test \ --task translation_lev \ --path checkpoints/checkpoint_best.pt:at_checkpoints/checkpoint_best.pt \ --iter-decode-max-iter 9 \ --iter-decode-eos-penalty 0 \ --iter-decode-with-beam 9 \ --iter-decode-with-external-reranker \ --beam 1 --remove-bpe \ --print-step \ --batch-size 100 ``` Note that we need to make sure the autoregressive model shares the same vocabulary as our target non-autoregressive model. ## Citation ```bibtex @incollection{NIPS2019_9297, title = {Levenshtein Transformer}, author = {Gu, Jiatao and Wang, Changhan and Zhao, Junbo}, booktitle = {Advances in Neural Information Processing Systems 32}, editor = {H. Wallach and H. Larochelle and A. Beygelzimer and F. d\textquotesingle Alch\'{e}-Buc and E. Fox and R. Garnett}, pages = {11179--11189}, year = {2019}, publisher = {Curran Associates, Inc.}, url = {http://papers.nips.cc/paper/9297-levenshtein-transformer.pdf} } ``` ```bibtex @article{zhou2019understanding, title={Understanding Knowledge Distillation in Non-autoregressive Machine Translation}, author={Zhou, Chunting and Neubig, Graham and Gu, Jiatao}, journal={arXiv preprint arXiv:1911.02727}, year={2019} } ``` ================================================ FILE: examples/nonautoregressive_translation/scripts.md ================================================ # Examples of Training scripts for Non-autoregressive Machine Translation models ### Non-autoregressive Transformer (NAT, Gu et al., 2017) Note that we need to have an additional module to perform "length prediction" (`--length-loss-factor`) before generating the whole sequence. 
```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nonautoregressive_transformer \ --noise full_mask \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --pred-length-offset \ --length-loss-factor 0.1 \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ### Fast Structured Decoding for Sequence Models (NAT-CRF, Sun et al., 2019) Note that we implemented a low-rank approximated CRF model by setting `--crf-lowrank-approx=32` and `--crf-beam-approx=64` as described in the original paper. All other settings are the same as the vanilla NAT model.
```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch nacrf_transformer \ --noise full_mask \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --pred-length-offset \ --length-loss-factor 0.1 \ --word-ins-loss-factor 0.5 \ --crf-lowrank-approx 32 \ --crf-beam-approx 64 \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ### Non-autoregressive Transformer with Iterative Refinement (iNAT, Lee et al., 2018) Note that `--train-step` means how many iterations of refinement we used during training, and `--dae-ratio` controls the ratio of denoising auto-encoder training described in the original paper. 
```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch iterative_nonautoregressive_transformer \ --noise full_mask \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --pred-length-offset \ --length-loss-factor 0.1 \ --train-step 4 \ --dae-ratio 0.5 \ --stochastic-approx \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ### Insertion Transformer (InsT, Stern et al., 2019) Note that we need to specify the "slot-loss" (uniform or balanced tree) described in the original paper. Here we use `--label-tau` to control the temperature. 
```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch insertion_transformer \ --noise random_delete \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ### Mask Predict (CMLM, Ghazvininejad et al., 2019) ```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch cmlm_transformer \ --noise random_mask \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --apply-bert-init \ --log-format 'simple' --log-interval 100 \ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ### Levenshtein Transformer (LevT, Gu et al., 2019) ```bash fairseq-train \ data-bin/wmt14_en_de_distill \ --save-dir checkpoints \ --ddp-backend=legacy_ddp \ --task translation_lev \ --criterion nat_loss \ --arch levenshtein_transformer \ --noise random_delete \ --share-all-embeddings \ --optimizer adam --adam-betas '(0.9,0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --stop-min-lr '1e-09' --warmup-updates 10000 \ --warmup-init-lr '1e-07' --label-smoothing 0.1 \ --dropout 0.3 --weight-decay 0.01 \ --decoder-learned-pos \ --encoder-learned-pos \ --apply-bert-init \ --log-format 'simple' --log-interval 100 
\ --fixed-validation-seed 7 \ --max-tokens 8000 \ --save-interval-updates 10000 \ --max-update 300000 ``` ================================================ FILE: examples/normformer/README.md ================================================ ### NormFormer This is the code for the ["NormFormer: Improved Transformer Pretraining with Extra Normalization"](https://arxiv.org/abs/2110.09456) - 2021-10-19: Commands for CLM Experiments - Coming soon: Commands for MLM experiments If you have any issues or questions please post a github issue and tag `@sshleifer`. ### Data - To preprocess language modeling data, see [here](https://github.com/pytorch/fairseq/blob/d0fbcb0baef6f6ff3425ded62d8daea0e8b12114/examples/language_model/README.md#1-preprocess-the-data). - The replication commands below expect `$DATA` to be the path to the binarized data directory. - Note that NormFormer results in Table 2 use a much larger private dataset, and to get good results you should adapt the pre-processing instructions to your dataset and compare to a baseline on the same data, rather than Table 2. - The code uses `FSDP`, which requires `pip install fairscale>=0.4.0`. ### Modify existing Command To modify an existing `fairseq-train` command to use NormFormer, simply add the following flags: ```bash fairseq-train ... \ --scale-attn --scale-fc --scale-heads ``` - you probably also want to increase your learning rate - if your model is small, you may want to add `--scale-resids` ### Exact Training Commands - Note that NormFormer results in Table 2 use a much larger private dataset, and to get good results you should adapt the pre-processing instructions to your dataset. The full commands are functions defined here, so to run them you must `source examples/normformer/train_lm.sh`. - We default `--distributed-world-size 8`. 
You should adjust `--update-freq` and `--batch-size` and such that the effective batch size is (1024x1024x0.5) tokens for 125M and 355M, and (1024x1024) for 1.3B parameter and above. For small models, `--update-freq`=256/`global_bs`. For large models, `--update-freq`=512/`global_bs`, where `global_bs` = `--batch-size` * `--distributed-world-size` - The small models will all train on as few as 8 GPUs. ```bash train_125M --lr 6e-4 # GPT-3 Replicated train_125M --lr 1e-3 # stronger high-lr baseline train_125M --lr 3e-3 --scale-attn --scale-fc --scale-heads # No scale-resids train_125M --lr 3e-3 --scale-attn --scale-fc --scale-heads --scale-resids # Best command ``` ```bash train_355M --lr 6e-4 # GPT-3 Replicated train_355M --lr 1e-3 # stronger high-lr baseline train_355M --lr 1e-3 --scale-attn --scale-fc --scale-heads # No scale-resids train_355M --lr 1e-3 --scale-attn --scale-fc --scale-heads --scale-resids # Slightly better ``` ```bash train_1.3B --lr 2e-4 # GPT-3 Replicated train_1.3B --lr 6e-4 # stronger high-lr baseline train_1.3B --lr 6e-4 --scale-attn --scale-fc --scale-heads # NormFormer ``` ```bash train_2.7B --lr 1.6e-4 # GPT-3 Replicated train_2.7B --lr 1.6e-4 --activation-fn relu_squared # stronger Relu^2 baseline train_2.7B --lr 6e-4 --activation-fn relu_squared --scale-attn --scale-fc --scale-heads # NormFormer 2.7B ``` ### Citation ```bibtex @misc{shleifer2021normformer, title={NormFormer: Improved Transformer Pretraining with Extra Normalization}, author={Sam Shleifer and Jason Weston and Myle Ott}, year={2021}, eprint={2110.09456}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ================================================ FILE: examples/normformer/train_lm.sh ================================================
#!/usr/bin/env bash
# Shell functions wrapping fairseq-train for the NormFormer language-model
# experiments. Source this file, export DATA (path to the binarized data
# directory), then call one of the train_<size> functions with extra flags.

# Shared fairseq-train invocation. Caller-supplied flags ("$@") are appended
# after the defaults below.
# Fails with a message instead of passing an empty data path when DATA is
# unset or empty.
train_common () {
  fairseq-train "${DATA:?DATA must be set to the binarized data directory}" \
    --combine-val \
    --train-subset train \
    --num-workers 2 \
    --validate-interval-updates 1000 \
    --save-interval-updates 1000 \
    --no-epoch-checkpoints \
    --ddp-backend fully_sharded \
    --memory-efficient-fp16 \
    --fp16-init-scale 4 \
    --checkpoint-activations \
    --arch transformer_lm_gpt \
    --activation-fn gelu \
    --share-decoder-input-output-embed \
    --task language_modeling \
    --sample-break-mode none \
    --tokens-per-sample 2048 \
    --optimizer adam --adam-betas "(0.9, 0.98)" \
    --adam-eps 1e-08 \
    --clip-norm 0.0 \
    --lr-scheduler polynomial_decay \
    --warmup-updates 750 \
    --dropout 0.1 \
    --attention-dropout 0.1 \
    --weight-decay 0.01 \
    --batch-size 16 \
    --update-freq 2 \
    --required-batch-size-multiple 1 \
    --total-num-update 572204 \
    --max-update 572204 \
    --seed 1 \
    --log-format json --log-interval 1 \
    --distributed-world-size 8 --distributed-port 13177 \
    "$@"
}

# 12 layers, 768-dim embeddings, 3072-dim FFN, 12 heads.
train_125M () {
  train_common --decoder-layers 12 \
    --decoder-embed-dim 768 \
    --decoder-ffn-embed-dim 3072 \
    --decoder-attention-heads 12 "$@"
}

# 24 layers, 1024-dim embeddings, 4096-dim FFN, 16 heads; dropout flags set to 0.0.
train_355M () {
  train_common --decoder-layers 24 \
    --decoder-embed-dim 1024 \
    --decoder-ffn-embed-dim 4096 \
    --decoder-attention-heads 16 \
    --dropout 0.0 \
    --attention-dropout 0.0 \
    "$@"
}

# 24 layers, 2048-dim embeddings, 8192-dim FFN, 32 heads; passes batch size 4,
# update-freq 16 and 286102 total updates.
train_1.3B () {
  train_common --decoder-layers 24 \
    --decoder-embed-dim 2048 \
    --decoder-ffn-embed-dim 8192 \
    --decoder-attention-heads 32 \
    --batch-size 4 \
    --update-freq 16 \
    --total-num-update 286102 \
    --max-update 286102 \
    "$@"
}

# 32 layers, 2560-dim embeddings, 10240-dim FFN, 32 heads; passes batch size 4,
# update-freq 16 and 286102 total updates.
train_2.7B () {
  train_common --decoder-layers 32 \
    --decoder-embed-dim 2560 \
    --decoder-ffn-embed-dim 10240 \
    --decoder-attention-heads 32 \
    --batch-size 4 \
    --update-freq 16 \
    --total-num-update 286102 \
    --max-update 286102 \
    "$@"
}

================================================ FILE: examples/operators/alignment_train_cpu.cpp ================================================ /** * Copyright 2017-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the license found in the * LICENSE file in the root directory of this source tree.
*/ #include <torch/extension.h> // @manual=//caffe2:torch_extension #include <algorithm> namespace { template <typename T> void exclusiveCumprod( const T* p_choose, T* cumprod_1mp, uint32_t bsz, uint32_t tgt_len, uint32_t src_len) { // cumprod_1mp = 1 - p_choose for (uint32_t b = 0; b < bsz; b++) { for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { for (uint32_t src = 0; src < src_len; src++) { uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; cumprod_1mp[idx] = 1 - p_choose[idx]; } } } // Implementing exclusive cumprod in the innermost dimension // cumprod_1mp = cumprod(1 - p_choose) // There is cumprod in pytorch, however there is no exclusive mode. // cumprod(x) = [x1, x1x2, x2x3x4, ..., prod_{i=1}^n x_i] // exclusive means // cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] for (uint32_t b = 0; b < bsz; b++) { for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { uint32_t idx_offset = b * tgt_len * src_len + tgt * src_len; T prev = cumprod_1mp[idx_offset]; // index [b][tgt][0] cumprod_1mp[idx_offset] = (T)1.0; T curr; for (uint32_t src = 1; src < src_len; src++) { uint32_t idx = idx_offset + src; curr = cumprod_1mp[idx]; cumprod_1mp[idx] = cumprod_1mp[idx - 1] * prev; prev = curr; } } } } template <typename T> void clamp( const T* cumprod_1mp, T* cumprod_1mp_clamp, uint32_t bsz, uint32_t tgt_len, uint32_t src_len, T min_val, T max_val) { for (uint32_t b = 0; b < bsz; b++) { for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { for (uint32_t src = 0; src < src_len; src++) { uint32_t idx = b * tgt_len * src_len + tgt * src_len + src; if (cumprod_1mp[idx] < min_val) { cumprod_1mp_clamp[idx] = min_val; } else if (cumprod_1mp[idx] > max_val) { cumprod_1mp_clamp[idx] = max_val; } else { cumprod_1mp_clamp[idx] = cumprod_1mp[idx]; } } } } } template <typename T> void alignmentTrainCPUImpl( const T* p_choose, T* alpha, uint32_t bsz, uint32_t tgt_len, uint32_t src_len, float eps) { // p_choose: bsz , tgt_len, src_len // cumprod_1mp: bsz , tgt_len, src_len // 
cumprod_1mp_clamp : bsz, tgt_len, src_len // alpha: bsz + 1, tgt_len, src_len uint32_t elements = bsz * tgt_len * src_len; T* cumprod_1mp = new T[elements]; T* cumprod_1mp_clamp = new T[elements]; exclusiveCumprod<T>(p_choose, cumprod_1mp, bsz, tgt_len, src_len); clamp<T>( cumprod_1mp, cumprod_1mp_clamp, bsz, tgt_len, src_len, (T)eps, (T)1.0); // ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) // Initialize alpha [:, 0, 0] for (uint32_t b = 0; b < bsz; b++) { alpha[b * tgt_len * src_len] = 1.0; } for (uint32_t tgt = 0; tgt < tgt_len; tgt++) { for (uint32_t b = 0; b < bsz; b++) { uint32_t alpha_idx, inout_idx; T prev_scan = 0, curr_scan, out; for (uint32_t src = 0; src < src_len; src++) { // Apply scan/cumsum if (tgt == 0) { // alpha index is [b][tgt][src] alpha_idx = b * tgt_len * src_len + src; } else { // alpha index is [b][tgt-1][src] alpha_idx = b * tgt_len * src_len + (tgt - 1) * src_len + src; } // input index is [b][tgt][src] inout_idx = b * tgt_len * src_len + tgt * src_len + src; curr_scan = prev_scan + alpha[alpha_idx] / cumprod_1mp_clamp[inout_idx]; out = curr_scan * p_choose[inout_idx] * cumprod_1mp[inout_idx]; alpha[inout_idx] = std::min<T>(std::max<T>(out, 0), 1.0); prev_scan = curr_scan; } } } free(cumprod_1mp); free(cumprod_1mp_clamp); } void alignmentTrainCPU( const torch::Tensor& p_choose, torch::Tensor& alpha, float eps) { uint32_t bsz = p_choose.size(0); uint32_t tgt_len = p_choose.size(1); uint32_t src_len = p_choose.size(2); AT_DISPATCH_FLOATING_TYPES_AND2( torch::ScalarType::Half, torch::ScalarType::BFloat16, p_choose.scalar_type(), "alignmentCPUImpl", [&]() { alignmentTrainCPUImpl<scalar_t>( p_choose.data_ptr<scalar_t>(), alpha.data_ptr<scalar_t>(), bsz, tgt_len, src_len, eps); }); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def( "alignment_train_cpu", &alignmentTrainCPU, "expected_alignment_from_p_choose (CPU)"); } } // namespace ================================================ FILE: examples/operators/alignment_train_cuda.cpp 
================================================
/**
 * Copyright 2017-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "alignment_train_cuda.h"
#include "utils.h"

namespace {

// Python-facing entry point: runs CHECK_INPUT on both tensors, then forwards
// to alignmentTrainCUDAWrapper, which fills `alpha` in place.
// NOTE(review): CHECK_INPUT comes from utils.h (not visible here); presumably
// it validates device/contiguity — confirm.
void alignmentTrainCUDA(
    const torch::Tensor& p_choose,
    torch::Tensor& alpha,
    float eps) {
  CHECK_INPUT(p_choose);
  CHECK_INPUT(alpha);

  alignmentTrainCUDAWrapper(p_choose, alpha, eps);
}

// Exposes the function to Python as `alignment_train_cuda`.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
      "alignment_train_cuda",
      &alignmentTrainCUDA,
      "expected_alignment_from_p_choose (CUDA)");
}

} // namespace
================================================ FILE: examples/operators/alignment_train_cuda.h ================================================
/**
 * Copyright 2017-present, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under the license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <torch/extension.h> // @manual=//caffe2:torch_extension

// Declaration of the CUDA implementation's host-side entry point, so the
// binding translation unit can call it.
void alignmentTrainCUDAWrapper(
    const torch::Tensor& p_choose,
    torch::Tensor& alpha,
    float eps);
================================================ FILE: examples/operators/alignment_train_kernel.cu ================================================ /** * Copyright 2017-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the license found in the * LICENSE file in the root directory of this source tree.
*/

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h> // @manual=//caffe2/aten:ATen-cu
#include <cuda_runtime.h>
#include <algorithm> // std::min/max
#include <cub/cub.cuh>

#include "alignment_train_cuda.h"
#include "utils.h"

namespace {

// The thread block length in threads along the X dimension
constexpr int BLOCK_DIM_X = 128;
// The thread block length in threads along the Y dimension
constexpr int BLOCK_DIM_Y = 8;
// The thread block length in threads for scan operation
constexpr int SCAN_BLOCK = 512;

// Wraps a CUDA runtime call (or cudaGetLastError()) and reports failures
// with the call site's file and line.
#define gpuErrchk(ans) \
  { gpuAssert((ans), __FILE__, __LINE__); }

// Prints the CUDA error string to stderr and, when `abort` is true,
// terminates the process with the error code as exit status.
inline void
gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
  if (code != cudaSuccess) {
    fprintf(
        stderr,
        "\nGPUassert: %s %s %d\n",
        cudaGetErrorString(code),
        file,
        line);
    if (abort)
      exit(code);
  }
}

// Binary product functor passed to the block scan as its scan operator.
template <typename T>
struct Prod {
  /// prod operator, returns <tt>a * b</tt>
  __host__ __device__ __forceinline__ T
  operator()(const T& a, const T& b) const {
    return a * b;
  }
};

// Carries the running product across successive tiles scanned by one block.
template <typename T>
struct BlockPrefixProdCallbackOp {
  // Running prefix
  T running_total;

  // Constructor
  __device__ BlockPrefixProdCallbackOp(T running_total)
      : running_total(running_total) {}

  // Callback operator to be entered by the first warp of threads in the block.
  // Thread-0 is responsible for returning a value for seeding the block-wide
  // scan.
  __device__ T operator()(const T block_aggregate) {
    T old_prefix = running_total;
    running_total *= block_aggregate;
    return old_prefix;
  }
};

// Carries the running sum across successive tiles scanned by one block.
template <typename T>
struct BlockPrefixSumCallbackOp {
  // Running prefix
  T running_total;

  // Constructor
  __device__ BlockPrefixSumCallbackOp(T running_total)
      : running_total(running_total) {}

  // Callback operator to be entered by the first warp of threads in the block.
  // Thread-0 is responsible for returning a value for seeding the block-wide
  // scan.
  __device__ T operator()(const T block_aggregate) {
    T old_prefix = running_total;
    running_total += block_aggregate;
    return old_prefix;
  }
};

// cumprod_1mp[b][tgt][src] = 1 - p_choose[b][tgt][src].
// Blocks step over the batch dimension by gridDim.x; within a block,
// threadIdx.y steps over tgt and threadIdx.x over src.
template <typename T>
__global__ void oneMinusPKernel(
    const T* __restrict__ p_choose,
    T* __restrict__ cumprod_1mp,
    uint32_t bsz,
    uint32_t tgt_len,
    uint32_t src_len) {
  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) {
    for (uint32_t tgt = threadIdx.y; tgt < tgt_len; tgt += blockDim.y) {
      for (uint32_t src = threadIdx.x; src < src_len; src += blockDim.x) {
        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src;
        cumprod_1mp[idx] = 1 - p_choose[idx];
      }
    }
  }
}

// In-place exclusive product scan of cumprod_1mp along the innermost (src)
// dimension. Each block owns one [b][tgt] row at a time (blockIdx.y steps
// over b, blockIdx.x over tgt) and walks the row in tiles of blockDim.x
// elements, carrying the running product between tiles in prefix_op.
template <typename T, int TPB>
__global__ void innermostScanKernel(
    T* __restrict__ cumprod_1mp,
    uint32_t bsz,
    uint32_t tgt_len,
    uint32_t src_len) {
  for (uint32_t b = blockIdx.y; b < bsz; b += gridDim.y) {
    for (uint32_t tgt = blockIdx.x; tgt < tgt_len; tgt += gridDim.x) {
      // Specialize BlockScan for a 1D block of TPB threads on type T
      typedef cub::BlockScan<T, TPB> BlockScan;
      // Allocate shared memory for BlockScan
      __shared__ typename BlockScan::TempStorage temp_storage;
      // Initialize running total
      BlockPrefixProdCallbackOp<T> prefix_op(1);

      const uint32_t tid = threadIdx.x;
      for (uint32_t block_src = 0; block_src < src_len;
           block_src += blockDim.x) {
        uint32_t src = block_src + tid;
        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src;
        // Threads past the end of the row contribute a padding value; their
        // scanned result is never written back (see the bounds check below).
        T thread_data = (src < src_len) ? cumprod_1mp[idx] : (T)0;

        // Collectively compute the block-wide exclusive prefix product
        BlockScan(temp_storage)
            .ExclusiveScan(thread_data, thread_data, Prod<T>(), prefix_op);
        // Barrier before temp_storage is reused by the next tile's scan.
        __syncthreads();

        // write the scanned value to output
        if (src < src_len) {
          cumprod_1mp[idx] = thread_data;
        }
      }
    }
  }
}

// cumprod_1mp_clamp = cumprod_1mp clamped element-wise to [min_val, max_val].
// Same thread-to-element mapping as oneMinusPKernel.
template <typename T>
__global__ void clampKernel(
    const T* __restrict__ cumprod_1mp,
    T* __restrict__ cumprod_1mp_clamp,
    uint32_t bsz,
    uint32_t tgt_len,
    uint32_t src_len,
    T min_val,
    T max_val) {
  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) {
    for (uint32_t tgt = threadIdx.y; tgt < tgt_len; tgt += blockDim.y) {
      for (uint32_t src = threadIdx.x; src < src_len; src += blockDim.x) {
        uint32_t idx = b * tgt_len * src_len + tgt * src_len + src;
        if (cumprod_1mp[idx] < min_val) {
          cumprod_1mp_clamp[idx] = min_val;
        } else if (cumprod_1mp[idx] > max_val) {
          cumprod_1mp_clamp[idx] = max_val;
        } else {
          cumprod_1mp_clamp[idx] = cumprod_1mp[idx];
        }
      }
    }
  }
}

// Seeds the recurrence: sets the first element of every batch entry to 1.
template <typename T>
__global__ void initAlphaCUDAKernel(
    T* alpha,
    uint32_t bsz,
    uint32_t tgt_len,
    uint32_t src_len) {
  // alpha[:, 0, 0] = 1.0
  for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) {
    alpha[b * tgt_len * src_len] = (T)1.0;
  }
}

template <typename T, int TPB> __global__ void alignmentTrainCUDAKernel( const T* __restrict__ p_choose, const T* __restrict__ cumprod_1mp, const T* __restrict__ cumprod_1mp_clamp, T* __restrict__ alpha, uint32_t bsz, uint32_t tgt_len, uint32_t src_len, uint32_t tgt) { for (uint32_t b = blockIdx.x; b < bsz; b += gridDim.x) { // Specialize BlockScan for a 1D block of TPB threads on type T typedef cub::BlockScan<T, TPB> BlockScan; // Allocate shared memory for BlockScan __shared__ typename BlockScan::TempStorage temp_storage; // Initialize running total BlockPrefixSumCallbackOp<T> prefix_op(0); uint32_t b_offset = b * tgt_len * src_len; const uint32_t tid = threadIdx.x; for (uint32_t block_src = 0; block_src < src_len; block_src += blockDim.x) { uint32_t src = block_src + tid; // Obtain a segment of
// Fills cumprod_1mp with the exclusive cumulative product of (1 - p_choose)
// along the innermost (src_len) dimension, as described by the step comments
// below (the two kernels it launches are defined elsewhere in this file).
//
// p_choose / cumprod_1mp: device buffers indexed as [bsz][tgt_len][src_len].
// max_grid_x / max_grid_y: upper bounds for the launch grid dimensions.
// stream: CUDA stream the kernels are enqueued on.
template <typename T>
void exclusiveCumprod(
    const T* p_choose,
    T* cumprod_1mp,
    uint32_t bsz,
    uint32_t tgt_len,
    uint32_t src_len,
    uint32_t max_grid_x,
    uint32_t max_grid_y,
    cudaStream_t& stream) {
  // An empty tensor needs no work, and a zero-sized grid is a launch error.
  if (bsz == 0 || tgt_len == 0 || src_len == 0) {
    return;
  }

  // cumprod_1mp = 1 - p_choose
  // Grid sizes are integer counts: compare them as uint32_t. Routing them
  // through T (e.g. Half / BFloat16) would round or overflow the values.
  dim3 grid(std::min<uint32_t>(max_grid_x, bsz), 1, 1);
  dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
  oneMinusPKernel<T><<<grid, block, 0, stream>>>(
      p_choose, cumprod_1mp, bsz, tgt_len, src_len);
  gpuErrchk(cudaGetLastError());

  // scan on the innermost dimension of cumprod_1mp
  // cumprod_1mp = cumprod(cumprod_1mp)
  dim3 grid_scan(
      std::min<uint32_t>(max_grid_x, tgt_len),
      std::min<uint32_t>(max_grid_y, bsz),
      1);
  innermostScanKernel<T, SCAN_BLOCK><<<grid_scan, SCAN_BLOCK, 0, stream>>>(
      cumprod_1mp, bsz, tgt_len, src_len);
  gpuErrchk(cudaGetLastError());
}

// Computes alpha in place from p_choose on the current CUDA stream.
//
// p_choose: input device buffer, indexed as [bsz][tgt_len][src_len].
// alpha:    output device buffer with the same extents; alpha[:, 0, 0] is set
//           to 1 and each target step is then derived from the previous one.
//           NOTE(review): the remaining entries of alpha are read before being
//           written for tgt == 0, so the caller presumably zero-initializes
//           it - confirm against the caller.
// eps:      lower clamp bound applied to cumprod(1 - p_choose) before it is
//           used as a divisor.
template <typename T>
void alignmentTrainCUDAImpl(
    const T* p_choose,
    T* alpha,
    uint32_t bsz,
    uint32_t tgt_len,
    uint32_t src_len,
    float eps) {
  // p_choose: bsz , tgt_len, src_len
  // cumprod_1mp: bsz , tgt_len, src_len
  // cumprod_1mp_clamp : bsz, tgt_len, src_len
  // alpha: bsz, tgt_len, src_len

  // Nothing to compute for an empty tensor. Without this guard the kernels
  // below are launched with a zero-sized grid (a launch error) or, when only
  // src_len is 0, initAlphaCUDAKernel writes alpha[0] of an empty buffer.
  if (bsz == 0 || tgt_len == 0 || src_len == 0) {
    return;
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  uint32_t max_grid_x = at::cuda::getCurrentDeviceProperties()->maxGridSize[0];
  uint32_t max_grid_y = at::cuda::getCurrentDeviceProperties()->maxGridSize[1];

  // Implementing exclusive cumprod.
  // cumprod_1mp = cumprod(1 - p_choose)
  // There is cumprod in pytorch, however there is no exclusive mode.
  // cumprod(x) = [x1, x1x2, x1x2x3, ..., prod_{i=1}^n x_i]
  // exclusive means
  // cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i]

  // Widen before multiplying so the allocation size is not computed in
  // 32-bit arithmetic.
  const size_t elements = static_cast<size_t>(bsz) * tgt_len * src_len;
  T* cumprod_1mp;
  gpuErrchk(cudaMalloc(&cumprod_1mp, elements * sizeof(T)));
  exclusiveCumprod<T>(
      p_choose,
      cumprod_1mp,
      bsz,
      tgt_len,
      src_len,
      max_grid_x,
      max_grid_y,
      stream);

  // clamp cumprod_1mp to the range [eps, 1.0]
  T* cumprod_1mp_clamp;
  gpuErrchk(cudaMalloc(&cumprod_1mp_clamp, elements * sizeof(T)));
  // Integer min: see the note in exclusiveCumprod about not using T here.
  dim3 grid_clamp(std::min<uint32_t>(max_grid_x, bsz), 1, 1);
  dim3 block_clamp(BLOCK_DIM_X, BLOCK_DIM_Y, 1);
  clampKernel<T><<<grid_clamp, block_clamp, 0, stream>>>(
      cumprod_1mp, cumprod_1mp_clamp, bsz, tgt_len, src_len, (T)eps, (T)1.0);
  gpuErrchk(cudaGetLastError());

  // ai = p_i * cumprod(1 - pi) * cumsum(a_i / cumprod(1 - pi))
  dim3 grid_init(std::min<uint32_t>(max_grid_x, bsz), 1, 1);
  initAlphaCUDAKernel<T>
      <<<grid_init, 1, 0, stream>>>(alpha, bsz, tgt_len, src_len);
  gpuErrchk(cudaGetLastError());

  // One launch per target step: step i reads the alpha row written by step
  // i - 1, so the steps are issued in order on the same stream.
  const uint32_t grid = std::min<uint32_t>(bsz, max_grid_x);
  for (uint32_t i = 0; i < tgt_len; i++) {
    alignmentTrainCUDAKernel<T, SCAN_BLOCK><<<grid, SCAN_BLOCK, 0, stream>>>(
        p_choose,
        cumprod_1mp,
        cumprod_1mp_clamp,
        alpha,
        bsz,
        tgt_len,
        src_len,
        i);
    gpuErrchk(cudaGetLastError());
  }

  gpuErrchk(cudaFree(cumprod_1mp));
  gpuErrchk(cudaFree(cumprod_1mp_clamp));
}
// Raises a TORCH_CHECK error unless `x` is a CUDA tensor.
// Queries Tensor::is_cuda() directly instead of going through the deprecated
// Tensor::type().
#define CHECK_CUDA(x) TORCH_CHECK((x).is_cuda(), #x " must be a CUDA tensor")

// Raises a TORCH_CHECK error unless `x` is contiguous.
#define CHECK_CONTIGUOUS(x) \
  TORCH_CHECK((x).is_contiguous(), #x " must be contiguous")

// Runs both checks above on `x`. Wrapped in do { } while (0) so the macro
// expands to exactly one statement and stays correct when used as the body of
// an unbraced if / else. Use as `CHECK_INPUT(t);`.
#define CHECK_INPUT(x)   \
  do {                   \
    CHECK_CUDA(x);       \
    CHECK_CONTIGUOUS(x); \
  } while (0)
Paraphrase ```bash python examples/paraphraser/paraphrase.py \ --en2fr paraphraser.en-fr \ --fr2en paraphraser.fr-en.hMoEup # Example input: # The new date for the Games, postponed for a year in response to the coronavirus pandemic, gives athletes time to recalibrate their training schedules. # Example outputs: # Delayed one year in response to the coronavirus pandemic, the new date of the Games gives athletes time to rebalance their training schedule. # The new date of the Games, which was rescheduled one year in response to the coronavirus (CV) pandemic, gives athletes time to rebalance their training schedule. # The new date of the Games, postponed one year in response to the coronavirus pandemic, provides athletes with time to rebalance their training schedule. # The Games' new date, postponed one year in response to the coronavirus pandemic, gives athletes time to rebalance their training schedule. # The new Games date, postponed one year in response to the coronavirus pandemic, gives the athletes time to rebalance their training schedule. # The new date of the Games, which was postponed one year in response to the coronavirus pandemic, gives the athletes time to rebalance their training schedule. # The new date of the Games, postponed one year in response to the coronavirus pandemic, gives athletes time to rebalance their training schedule. # The new date of the Games, postponed one year in response to the coronavirus pandemic, gives athletes time to re-balance their training schedule. # The new date of the Games, postponed one year in response to the coronavirus pandemic, gives the athletes time to rebalance their schedule of training. # The new date of the Games, postponed one year in response to the pandemic of coronavirus, gives the athletes time to rebalance their training schedule. 
def main():
    """Paraphrase input lines by round-trip translation (en -> fr -> en).

    Parses the command line, loads the en2fr model and the fr2en
    mixture-of-experts model, then reads lines from the given files (stdin by
    default). Every non-empty line is translated to French once and back to
    English once per expert; each resulting paraphrase is printed on its own
    line.

    Raises:
        RuntimeError: if the translation_moe user directory does not exist.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--en2fr", required=True, help="path to en2fr model")
    parser.add_argument(
        "--fr2en", required=True, help="path to fr2en mixture of experts model"
    )
    parser.add_argument(
        "--user-dir", help="path to fairseq examples/translation_moe/src directory"
    )
    parser.add_argument(
        "--num-experts",
        type=int,
        default=10,
        help="(keep at 10 unless using a different model)",
    )
    parser.add_argument(
        "files",
        nargs="*",
        default=["-"],
        help='input files to paraphrase; "-" for stdin',
    )
    args = parser.parse_args()

    # Default to <examples>/translation_moe/src, resolved relative to this file.
    if args.user_dir is None:
        script_path = os.path.abspath(__file__)
        examples_dir = os.path.dirname(os.path.dirname(script_path))  # examples/
        args.user_dir = os.path.join(examples_dir, "translation_moe", "src")

    if not os.path.exists(args.user_dir):
        raise RuntimeError(
            "cannot find fairseq examples/translation_moe/src "
            "(tried looking here: {})".format(args.user_dir)
        )
    logging.info("found user_dir:" + args.user_dir)

    logging.info("loading en2fr model from:" + args.en2fr)
    en2fr = TransformerModel.from_pretrained(
        model_name_or_path=args.en2fr,
        tokenizer="moses",
        bpe="sentencepiece",
    )
    en2fr = en2fr.eval()

    logging.info("loading fr2en model from:" + args.fr2en)
    fr2en = TransformerModel.from_pretrained(
        model_name_or_path=args.fr2en,
        tokenizer="moses",
        bpe="sentencepiece",
        user_dir=args.user_dir,
        task="translation_moe",
    )
    fr2en = fr2en.eval()

    def gen_paraphrases(en):
        # Translate to French once, then decode back with every expert.
        fr = en2fr.translate(en)
        paraphrases = []
        for expert in range(args.num_experts):
            paraphrases.append(
                fr2en.translate(fr, inference_step_args={"expert": expert})
            )
        return paraphrases

    logging.info("Type the input sentence and press return:")
    for raw_line in fileinput.input(args.files):
        sentence = raw_line.strip()
        if not sentence:
            continue
        for paraphrase in gen_paraphrases(sentence):
            print(paraphrase)
(.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) `dynamicconv.no_glu.wmt16.en-de` | DynamicConv (without GLUs) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) `lightconv.glu.wmt16.en-de` | LightConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) `dynamicconv.glu.wmt16.en-de` | DynamicConv | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014 (shared vocab): <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) `lightconv.glu.wmt14.en-fr` | LightConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) 
`dynamicconv.glu.wmt14.en-fr` | DynamicConv | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) `lightconv.glu.wmt17.zh-en` | LightConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2) `dynamicconv.glu.wmt17.zh-en` | DynamicConv | [WMT17 Chinese-English](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz) <br> newstest2017: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.zh-en.newstest2017.tar.bz2) ### Memory-Efficient CUDA Kernels Since the PyTorch implementations of Light/Dynamic conv are quite memory intensive, we have developed CUDA kernels that implement the light and dynamic convolution operator in a memory-efficient and performant manner. For large sequence lengths, these kernels save about 50% memory compared to the PyTorch equivalent. To install the kernels, use the commands below. Once installed, they will automatically be used in place of the PyTorch implementations whenever a light or dynamic convolution is used. 
```sh # to install lightconv cd fairseq/modules/lightconv_layer python cuda_function_gen.py python setup.py install # to install dynamicconv cd fairseq/modules/dynamicconv_layer python cuda_function_gen.py python setup.py install ``` ### Example usage (torch.hub) We require a few additional Python dependencies for preprocessing: ```bash pip install sacremoses subword_nmt ``` Interactive translation via PyTorch Hub: ```python import torch import fairseq # List available models torch.hub.list('pytorch/fairseq') # [..., 'lightconv.glu.wmt17.zh-en', ... ] # Load a LightConv model trained on WMT'17 Zh-En zh2en = torch.hub.load('pytorch/fairseq', 'lightconv.glu.wmt17.zh-en', tokenizer='moses', bpe='subword_nmt') # The underlying model is available under the *models* attribute assert isinstance(zh2en.models[0], fairseq.models.lightconv.LightConvModel) # Translate a sentence zh2en.translate('你好 世界') # 'Hello World' ``` Loading custom models: ```python from fairseq.models.lightconv import LightConvModel en2fr = LightConvModel.from_pretrained( '/path/to/checkpoints', checkpoint_file='checkpoint_best.pt', data_name_or_path='data-bin/wmt14_en_fr', bpe='subword_nmt', bpe_codes='data-bin/wmt14_en_fr/en.code' ) en2fr.translate('Hello world!') # 'Bonjour le monde' ``` ### Preprocessing the training datasets Please follow the instructions in [`examples/translation/README.md`](../translation/README.md) to preprocess the data. ### Training and evaluation options: To use the model without GLU, please set `--encoder-glu 0 --decoder-glu 0`. For LightConv, please use `--encoder-conv-type lightweight --decoder-conv-type lightweight`, otherwise the default is DynamicConv. For best BLEU results, lenpen may need to be manually tuned. To use the CUDA kernels, first install the PyTorch modules using the commands above. Once the CUDA modules are installed, they will automatically be used instead of the PyTorch modules.
### IWSLT14 De-En Training and evaluating DynamicConv (without GLU) on a GPU: ```sh # Training SAVE="save/dynamic_conv_iwslt" mkdir -p $SAVE CUDA_VISIBLE_DEVICES=0 $(which fairseq-train) data-bin/iwslt14.tokenized.de-en \ --clip-norm 0 --optimizer adam --lr 0.0005 \ --source-lang de --target-lang en --max-tokens 4000 --no-progress-bar \ --log-interval 100 --stop-min-lr '1e-09' --weight-decay 0.0001 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --lr-scheduler inverse_sqrt \ --ddp-backend=legacy_ddp \ --max-update 50000 --warmup-updates 4000 --warmup-init-lr '1e-07' \ --adam-betas '(0.9, 0.98)' --keep-last-epochs 10 \ -a lightconv_iwslt_de_en --save-dir $SAVE \ --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \ --encoder-glu 0 --decoder-glu 0 python scripts/average_checkpoints.py --inputs $SAVE \ --num-epoch-checkpoints 10 --output "${SAVE}/checkpoint_last10_avg.pt" # Evaluation CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/iwslt14.tokenized.de-en --path "${SAVE}/checkpoint_last10_avg.pt" --batch-size 128 --beam 4 --remove-bpe --lenpen 1 --gen-subset test --quiet ``` ### WMT16 En-De Training and evaluating DynamicConv (with GLU) on WMT16 En-De using cosine scheduler on one machine with 8 V100 GPUs: ```sh # Training SAVE="save/dynamic_conv_wmt16en2de" mkdir -p $SAVE python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ data-bin/wmt16_en_de_bpe32k --fp16 --log-interval 100 --no-progress-bar \ --max-update 30000 --share-all-embeddings --optimizer adam \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 20000 \ --arch lightconv_wmt_en_de_big 
--save-dir $SAVE \ --dropout 0.3 --attention-dropout 0.1 --weight-dropout 0.1 \ --encoder-glu 1 --decoder-glu 1 # Evaluation CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt16.en-de.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.5 --gen-subset test > wmt16_gen.txt bash scripts/compound_split_bleu.sh wmt16_gen.txt ``` ### WMT14 En-Fr Training DynamicConv (with GLU) on WMT14 En-Fr using cosine scheduler on one machine with 8 V100 GPUs: ```sh # Training SAVE="save/dynamic_conv_wmt14en2fr" mkdir -p $SAVE python -m torch.distributed.launch --nproc_per_node 8 $(which fairseq-train) \ data-bin/wmt14_en_fr --fp16 --log-interval 100 --no-progress-bar \ --max-update 30000 --share-all-embeddings --optimizer adam \ --adam-betas '(0.9, 0.98)' --clip-norm 0.0 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --stop-min-lr 1e-09 --update-freq 16 --attention-dropout 0.1 --keep-last-epochs 10 \ --ddp-backend=legacy_ddp --max-tokens 3584 \ --lr-scheduler cosine --warmup-init-lr 1e-7 --warmup-updates 10000 \ --lr-shrink 1 --lr 0.001 --min-lr 1e-7 --warmup-init-lr 1e-07 \ --t-mult 1 --lr-period-updates 70000 \ --arch lightconv_wmt_en_fr_big --save-dir $SAVE \ --dropout 0.1 --attention-dropout 0.1 --weight-dropout 0.1 \ --encoder-glu 1 --decoder-glu 1 # Evaluation CUDA_VISIBLE_DEVICES=0 fairseq-generate data-bin/wmt14.en-fr.joined-dict.newstest2014 --path "${SAVE}/checkpoint_best.pt" --batch-size 128 --beam 5 --remove-bpe --lenpen 0.9 --gen-subset test ``` ================================================ FILE: examples/pointer_generator/README.md ================================================ # Transformer with Pointer-Generator Network This page describes the `transformer_pointer_generator` model that incorporates a pointing mechanism in the Transformer model that facilitates copying of input words to the output. This architecture is described in [Enarvi et al. 
(2020)](https://www.aclweb.org/anthology/2020.nlpmc-1.4/). ## Background The pointer-generator network was introduced in [See et al. (2017)](https://arxiv.org/abs/1704.04368) for RNN encoder-decoder attention models. A similar mechanism can be incorporated in a Transformer model by reusing one of the many attention distributions for pointing. The attention distribution over the input words is interpolated with the normal output distribution over the vocabulary words. This allows the model to generate words that appear in the input, even if they don't appear in the vocabulary, helping especially with small vocabularies. ## Implementation The mechanism for copying out-of-vocabulary words from the input has been implemented differently to See et al. In their [implementation](https://github.com/abisee/pointer-generator) they convey the word identities through the model in order to be able to produce words that appear in the input sequence but not in the vocabulary. A different approach was taken in the Fairseq implementation to keep it self-contained in the model file, avoiding any changes to the rest of the code base. Copying out-of-vocabulary words is possible by pre-processing the input and post-processing the output. This is described in detail in the next section. ## Usage The training and evaluation procedure is outlined below. You can also find a more detailed example for the XSum dataset on [this page](README.xsum.md). ##### 1. Create a vocabulary and extend it with source position markers The pointing mechanism is especially helpful with small vocabularies, if we are able to recover the identities of any out-of-vocabulary words that are copied from the input. For this purpose, the model allows extending the vocabulary with special tokens that can be used in place of `<unk>` tokens to identify different input positions. For example, the user may add `<unk-0>`, `<unk-1>`, `<unk-2>`, etc. to the end of the vocabulary, after the normal words. 
Below is an example of how to create a vocabulary of 10000 most common words and add 1000 input position markers. ```bash vocab_size=10000 position_markers=1000 export LC_ALL=C cat train.src train.tgt | tr -s '[:space:]' '\n' | sort | uniq -c | sort -k1,1bnr -k2 | head -n "$((vocab_size - 4))" | awk '{ print $2 " " $1 }' >dict.pg.txt python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt ``` ##### 2. Preprocess the text data The idea is that any `<unk>` tokens in the text are replaced with `<unk-0>` if it appears in the first input position, `<unk-1>` if it appears in the second input position, and so on. This can be achieved using the `preprocess.py` script that is provided in this directory. ##### 3. Train a model The number of these special tokens is given to the model with the `--source-position-markers` argument—the model simply maps all of these to the same word embedding as `<unk>`. The attention distribution that is used for pointing is selected using the `--alignment-heads` and `--alignment-layer` command-line arguments in the same way as with the `transformer_align` model. ##### 4. Generate text and postprocess it When using the model to generate text, you want to preprocess the input text in the same way that training data was processed, replacing out-of-vocabulary words with `<unk-N>` tokens. If any of these tokens are copied to the output, the actual words can be retrieved from the unprocessed input text. Any `<unk-N>` token should be replaced with the word at position N in the original input sequence. This can be achieved using the `postprocess.py` script. ================================================ FILE: examples/pointer_generator/README.xsum.md ================================================ ## Training a pointer-generator model on the Extreme Summarization dataset ##### 1. 
Download the Extreme Summarization data and preprocess it Follow the instructions [here](https://github.com/EdinburghNLP/XSum) to obtain the original Extreme Summarization dataset. You should have six files, {train,validation,test}.{document,summary}. ##### 2. Create a vocabulary and extend it with source position markers ```bash vocab_size=10000 position_markers=1000 export LC_ALL=C cat train.document train.summary | tr -s '[:space:]' '\n' | sort | uniq -c | sort -k1,1bnr -k2 | head -n "$((vocab_size - 4))" | awk '{ print $2 " " $1 }' >dict.pg.txt python3 -c "[print('<unk-{}> 0'.format(n)) for n in range($position_markers)]" >>dict.pg.txt ``` This creates the file dict.pg.txt that contains the 10k most frequent words, followed by 1k source position markers: ``` the 4954867 . 4157552 , 3439668 to 2212159 a 1916857 of 1916820 and 1823350 ... <unk-0> 0 <unk-1> 0 <unk-2> 0 <unk-3> 0 <unk-4> 0 ... ``` ##### 3. Preprocess the text data ```bash ./preprocess.py --source train.document --target train.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out train.pg.src --target-out train.pg.tgt ./preprocess.py --source validation.document --target validation.summary --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out valid.pg.src --target-out valid.pg.tgt ./preprocess.py --source test.document --vocab <(cut -d' ' -f1 dict.pg.txt) --source-out test.pg.src ``` The data should now contain `<unk-N>` tokens in place of out-of-vocabulary words. ##### 4. Binarize the dataset: ```bash fairseq-preprocess \ --source-lang src \ --target-lang tgt \ --trainpref train.pg \ --validpref valid.pg \ --destdir bin \ --workers 60 \ --srcdict dict.pg.txt \ --joined-dictionary ``` ##### 5.
Train a model ```bash total_updates=20000 warmup_updates=500 lr=0.001 max_tokens=4096 update_freq=4 pointer_layer=-2 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 fairseq-train bin \ --user-dir examples/pointer_generator/pointer_generator_src \ --max-tokens "$max_tokens" \ --task translation \ --source-lang src --target-lang tgt \ --truncate-source \ --layernorm-embedding \ --share-all-embeddings \ --encoder-normalize-before \ --decoder-normalize-before \ --required-batch-size-multiple 1 \ --arch transformer_pointer_generator \ --alignment-layer "$pointer_layer" \ --alignment-heads 1 \ --source-position-markers 1000 \ --criterion label_smoothed_cross_entropy \ --label-smoothing 0.1 \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.01 --optimizer adam --adam-betas "(0.9, 0.999)" --adam-eps 1e-08 \ --clip-norm 0.1 \ --lr-scheduler inverse_sqrt --lr "$lr" --max-update "$total_updates" --warmup-updates "$warmup_updates" \ --update-freq "$update_freq" \ --skip-invalid-size-inputs-valid-test ``` Above we specify that our dictionary contains 1000 source position markers, and that we want to use one attention head from the penultimate decoder layer for pointing. It should run in 5.5 hours on one node with eight 32GB V100 GPUs. The logged messages confirm that dictionary indices above 10000 will be mapped to the `<unk>` embedding: ``` 2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [src] dictionary: 11000 types 2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | [tgt] dictionary: 11000 types 2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.src 2020-09-24 20:43:53 | INFO | fairseq.data.data_utils | loaded 11332 examples from: bin/valid.src-tgt.tgt 2020-09-24 20:43:53 | INFO | fairseq.tasks.translation | bin valid src-tgt 11332 examples 2020-09-24 20:43:53 | INFO | fairseq.models.transformer_pg | dictionary indices from 10000 to 10999 will be mapped to 3 ``` ##### 6.
Summarize the test sequences ```bash batch_size=32 beam_size=6 max_length=60 length_penalty=1.0 fairseq-interactive bin \ --user-dir examples/pointer_generator/pointer_generator_src \ --batch-size "$batch_size" \ --task translation \ --source-lang src --target-lang tgt \ --path checkpoints/checkpoint_last.pt \ --input test.pg.src \ --buffer-size 200 \ --max-len-a 0 \ --max-len-b "$max_length" \ --lenpen "$length_penalty" \ --beam "$beam_size" \ --skip-invalid-size-inputs-valid-test | tee generate.out grep ^H generate.out | cut -f 3- >generate.hyp ``` Now you should have the generated sequences in `generate.hyp`. They contain `<unk-N>` tokens that the model has copied from the source sequence. In order to retrieve the original words, we need the unprocessed source sequences from `test.document`. ##### 7. Process the generated output Since we skipped too long inputs when producing `generate.hyp`, we also have to skip too long sequences now that we read `test.document`. ```bash ./postprocess.py \ --source <(awk 'NF<1024' test.document) \ --target generate.hyp \ --target-out generate.hyp.processed ``` Now you'll find the final sequences from `generate.hyp.processed`, with `<unk-N>` replaced with the original word from the source sequence. ##### An example of a summarized sequence The original source document in `test.document`: > de roon moved to teesside in june 2016 for an initial # 8.8 m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page .
The preprocessed source document in `test.pg.src`: > de \<unk-1> moved to \<unk-4> in june 2016 for an initial # \<unk-12> m fee and played 33 premier league games last term . the netherlands international , 26 , scored five goals in 36 league and cup games during his spell at boro . meanwhile , manager garry monk confirmed the championship club 's interest in signing chelsea midfielder lewis baker . `` he 's a target and one of many that we 've had throughout the summer months , '' said monk . find all the latest football transfers on our dedicated page . The generated summary in `generate.hyp`: > middlesbrough striker \<unk> de \<unk-1> has joined spanish side \<unk> on a season-long loan . The generated summary after postprocessing in `generate.hyp.processed`: > middlesbrough striker \<unk> de roon has joined spanish side \<unk> on a season-long loan . ================================================ FILE: examples/pointer_generator/pointer_generator_src/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import transformer_pg # noqa ================================================ FILE: examples/pointer_generator/pointer_generator_src/transformer_pg.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree.
import logging
from typing import Any, Dict, Optional, List, Tuple

import torch
import torch.nn as nn
from fairseq import utils
from fairseq.models import register_model, register_model_architecture
from fairseq.models.transformer import (
    DEFAULT_MAX_SOURCE_POSITIONS,
    DEFAULT_MAX_TARGET_POSITIONS,
    TransformerDecoder,
    TransformerEncoder,
    TransformerModel,
    base_architecture,
)
from torch import Tensor


logger = logging.getLogger(__name__)


@register_model("transformer_pointer_generator")
class TransformerPointerGeneratorModel(TransformerModel):
    """
    Transformer (`"Attention Is All You Need" (Vaswani et al, 2017)
    <https://arxiv.org/abs/1706.03762>`_) extended with the pointer-generator
    mechanism of `"Get To The Point: Summarization with Pointer-Generator
    Networks" (See et al, 2017) <https://arxiv.org/abs/1704.04368>`_.

    Args:
        encoder (TransformerPointerGeneratorEncoder): the encoder
        decoder (TransformerPointerGeneratorDecoder): the decoder

    The Transformer pointer-generator model provides the following named
    architectures and command-line arguments:

    .. argparse::
        :ref: fairseq.models.transformer_pointer_generator_parser
        :prog:
    """

    @staticmethod
    def add_args(parser):
        """Register the Transformer options plus the pointer-generator ones."""
        TransformerModel.add_args(parser)

        # (flag, add_argument keyword arguments), registered in this order.
        pointer_options = (
            (
                "--alignment-heads",
                dict(
                    type=int,
                    metavar="N",
                    help="number of attention heads to be used for pointing",
                ),
            ),
            (
                "--alignment-layer",
                dict(
                    type=int,
                    metavar="I",
                    help="layer number to be used for pointing (0 "
                    "corresponding to the bottommost layer)",
                ),
            ),
            (
                "--source-position-markers",
                dict(
                    type=int,
                    metavar="N",
                    help="dictionary includes N additional items that "
                    "represent an OOV token at a particular input "
                    "position",
                ),
            ),
            (
                "--force-generation",
                dict(
                    type=float,
                    metavar="P",
                    default=None,
                    help="set the vocabulary distribution weight to P, "
                    "instead of predicting it from the input (1.0 "
                    "corresponding to generation, 0.0 to pointing)",
                ),
            ),
        )
        for flag, spec in pointer_options:
            parser.add_argument(flag, **spec)

    @classmethod
    def build_model(cls, args, task):
        """Create the embeddings, encoder and decoder and assemble the model.

        Raises:
            ValueError: if the task's source and target dictionaries differ,
                or if ``--share-all-embeddings`` is combined with mismatching
                embedding dimensions or a separate decoder embedding path.
        """
        # Checkpoints written by older versions may lack newer arguments.
        base_architecture(args)

        # An explicit list of layers to keep determines the layer count.
        for side in ("encoder", "decoder"):
            kept_layers = getattr(args, side + "_layers_to_keep")
            if kept_layers:
                setattr(args, side + "_layers", len(kept_layers.split(",")))

        for attr, fallback in (
            ("max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS),
            ("max_target_positions", DEFAULT_MAX_TARGET_POSITIONS),
        ):
            if getattr(args, attr, None) is None:
                setattr(args, attr, fallback)
        # By default there is one position marker per possible source position.
        if getattr(args, "source_position_markers", None) is None:
            args.source_position_markers = args.max_source_positions

        source_vocab = task.source_dictionary
        target_vocab = task.target_dictionary
        if source_vocab != target_vocab:
            raise ValueError("Pointer-generator requires a joined dictionary")

        def build_embedding(dictionary, embed_dim, path=None):
            # The trailing ``source_position_markers`` dictionary entries are
            # position-specific OOV tokens. They get no embedding rows of their
            # own; the embedding layer maps them to the <unk> row, while the
            # distinct ids let the original words be restored afterwards.
            total_types = len(dictionary)
            embedded_types = total_types - args.source_position_markers
            unk_index = dictionary.unk()
            pad_index = dictionary.pad()
            logger.info(
                "dictionary indices from {0} to {1} will be mapped to {2}".format(
                    embedded_types, total_types - 1, unk_index
                )
            )
            table = Embedding(embedded_types, embed_dim, pad_index, unk_index)
            if path:
                # Initialise from pretrained vectors when a path is given.
                pretrained = utils.parse_embedding(path)
                utils.load_embedding(pretrained, dictionary, table)
            return table

        share_all = args.share_all_embeddings
        if share_all:
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )

        encoder_embed_tokens = build_embedding(
            source_vocab, args.encoder_embed_dim, args.encoder_embed_path
        )
        if share_all:
            # One table serves the encoder input, decoder input and output.
            decoder_embed_tokens = encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            decoder_embed_tokens = build_embedding(
                target_vocab, args.decoder_embed_dim, args.decoder_embed_path
            )

        return cls(
            args,
            cls.build_encoder(args, source_vocab, encoder_embed_tokens),
            cls.build_decoder(args, target_vocab, decoder_embed_tokens),
        )

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        """Return the pointer-generator encoder for this model."""
        return TransformerPointerGeneratorEncoder(args, src_dict, embed_tokens)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        """Return the pointer-generator decoder for this model."""
        return TransformerPointerGeneratorDecoder(args, tgt_dict, embed_tokens)
class TransformerPointerGeneratorEncoder(TransformerEncoder):
    """
    Transformer encoder made of *args.encoder_layers* layers, each a
    :class:`TransformerEncoderLayer`.

    Compared with the plain encoder, this variant also puts the source tokens
    into its output, because the decoder needs them for pointing and they are
    not handed to it in any other way.
    """

    def forward(
        self,
        src_tokens,
        src_lengths: Optional[Tensor] = None,
        return_all_hiddens: bool = False,
        token_embeddings: Optional[Tensor] = None
    ):
        """
        Encode the source with the parent implementation and attach the source
        tokens to the result.

        Handing the tokens to the decoder's `forward()` directly might be
        cleaner, but that would require changes to `SequenceGenerator`.

        Args:
            src_tokens (torch.LongTensor): tokens in the source language of
                shape `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).
            token_embeddings (torch.Tensor, optional): precomputed embeddings
                default `None` will recompute embeddings

        Returns:
            dict: every value is a list (an empty list stands for None):
                - **encoder_out** (Tensor): the last encoder layer's
                  output of shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
                  of shape `(batch, src_len, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
                - **src_tokens** (Tensor): input token ids of shape
                  `(batch, src_len)`
                - **src_lengths**: always an empty list
        """
        parent_out = self.forward_scriptable(
            src_tokens, src_lengths, return_all_hiddens, token_embeddings
        )

        # A dictionary is returned because the PyTorch Mobile lite interpreter
        # cannot return a NamedTuple from `forward`. TorchScript does not allow
        # mixed value types, so every value is a list.
        return {
            "encoder_out": parent_out["encoder_out"],  # T x B x C
            "encoder_padding_mask": parent_out["encoder_padding_mask"],  # B x T
            "encoder_embedding": parent_out["encoder_embedding"],  # B x T x C
            "encoder_states": parent_out["encoder_states"],  # List[T x B x C]
            "src_tokens": [src_tokens],  # B x T
            "src_lengths": [],
        }
class TransformerPointerGeneratorDecoder(TransformerDecoder):
    """
    Transformer decoder made of *args.decoder_layers* layers, each a
    :class:`TransformerDecoderLayer`.

    In the output layer the pointer-generator variant blends the vocabulary
    distribution with an attention distribution over the source tokens.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
    """

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(args, dictionary, embed_tokens, no_encoder_attn=False)

        # Decoder layer, and number of its attention heads to average, that
        # provide the alignment used for pointing.
        self.alignment_layer = args.alignment_layer
        self.alignment_heads = args.alignment_heads

        # The generation probability (interpolation coefficient) is predicted
        # from the current decoder input embedding concatenated with the
        # decoder output, which has size output_embed_dim.
        gate_in_features = embed_tokens.embedding_dim + self.output_embed_dim
        self.project_p_gens = nn.Linear(gate_in_features, 1)
        nn.init.zeros_(self.project_p_gens.bias)

        # The dictionary may end with one OOV entry per input position so that
        # copied words can be restored from the original source text.
        self.num_oov_types = args.source_position_markers
        self.num_types = len(dictionary)
        self.num_embeddings = self.num_types - self.num_oov_types
        self.force_p_gen = args.force_generation

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        alignment_layer: Optional[int] = 0,
        alignment_heads: Optional[int] = 1,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict, optional): dictionary used for storing
                state during :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False)
            alignment_layer (int, optional): 0-based index of the layer to be
                used for pointing (default: 0); not read here, the value
                stored on the decoder is used instead
            alignment_heads (int, optional): number of attention heads to be
                used for pointing (default: 1); not read here, the value
                stored on the decoder is used instead

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        # The stock Transformer model does not forward alignment_layer and
        # alignment_heads correctly, hence the instance attributes.
        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            alignment_layer=self.alignment_layer,
            alignment_heads=self.alignment_heads,
        )
        if features_only:
            return x, extra

        # The input tokens are embedded a second time for the generation
        # probability, which avoids reimplementing extract_features().
        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        prev_output_embed = self.embed_tokens(prev_output_tokens)
        prev_output_embed.mul_(self.embed_scale)
        gate_inputs = torch.cat((prev_output_embed, x), 2)
        p_gens = torch.sigmoid(self.project_p_gens(gate_inputs).float())

        # TorchScript needs proof that encoder_out and attn are not None since
        # `output_layer()` is declared to take tensors.
        attn: Optional[Tensor] = extra["attn"][0]
        assert encoder_out is not None
        assert attn is not None
        x = self.output_layer(x, attn, encoder_out["src_tokens"][0], p_gens)
        return x, extra

    def output_layer(
        self,
        features: Tensor,
        attn: Tensor,
        src_tokens: Tensor,
        p_gens: Tensor
    ) -> Tensor:
        """
        Project features to the vocabulary size and mix with the attention
        distributions.

        Returns a tensor of shape `(batch_size, output_length, num_types)`
        holding probabilities (not logits).
        """
        if self.force_p_gen is not None:
            p_gens = self.force_p_gen

        # Project back to the vocabulary unless adaptive softmax is in use.
        logits = (
            features
            if self.adaptive_softmax is not None
            else self.output_projection(features)
        )

        batch_size, output_length = logits.shape[0], logits.shape[1]
        assert logits.shape[2] == self.num_embeddings
        assert src_tokens.shape[0] == batch_size
        src_length = src_tokens.shape[1]

        # Generation part: softmax over the embedded vocabulary weighted by
        # p_gen, zero-padded so it spans the position-marker entries too.
        vocab_probs = self.get_normalized_probs_scriptable(
            (logits, None), log_probs=False, sample=None
        )
        vocab_probs = vocab_probs * p_gens
        marker_zeros = vocab_probs.new_zeros(
            (batch_size, output_length, self.num_oov_types)
        )
        vocab_probs = torch.cat((vocab_probs, marker_zeros), 2)
        assert vocab_probs.shape[2] == self.num_types

        # Pointing part: every attention weight, scaled by (1 - p_gen), is
        # accumulated at the vocabulary id of the source token it attends to.
        copy_weights = attn.float() * (1 - p_gens)
        token_index = src_tokens.unsqueeze(1).expand(
            batch_size, output_length, src_length
        )
        copy_probs = copy_weights.new_zeros(
            (batch_size, output_length, self.num_types)
        )
        copy_probs.scatter_add_(2, token_index, copy_weights.float())

        # Final distributions, [batch_size, output_length, num_types].
        return vocab_probs + copy_probs

    def get_normalized_probs(
        self,
        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
        log_probs: bool,
        sample: Optional[Dict[str, Tensor]] = None,
    ):
        """
        Get normalized probabilities (or log probs) from a net's output.
        Pointer-generator network output is already normalized.
        """
        probs = net_output[0]
        if not log_probs:
            return probs
        # Clamp away exact zeros before taking the logarithm.
        return probs.clamp(1e-10, 1.0).log()
class Embedding(nn.Embedding):
    r"""A simple lookup table that stores embeddings of a fixed dictionary and size.
    This module is often used to store word embeddings and retrieve them using indices.
    The input to the module is a list of indices, and the output is the corresponding
    word embeddings. This subclass differs from the standard PyTorch Embedding class by
    allowing additional vocabulary entries that will be mapped to the unknown token
    embedding.
    Args:
        num_embeddings (int): size of the dictionary of embeddings
        embedding_dim (int): the size of each embedding vector
        padding_idx (int, optional): Pads the output with the embedding vector at
                         :attr:`padding_idx` (initialized to zeros) whenever it
                         encounters the index. With ``None`` no row is zeroed.
        unk_idx (int): Maps all token indices that are greater than or equal to
                         num_embeddings to this index.
        max_norm (float, optional): forwarded to :class:`torch.nn.Embedding`. The
                         default is ``inf`` rather than ``None`` so that the module
                         can be exported to TorchScript (see the comment below).
    Attributes:
        weight (Tensor): the learnable weights of the module of shape
                         (num_embeddings, embedding_dim), initialized here from
                         a normal distribution with mean 0 and standard deviation
                         ``embedding_dim ** -0.5``
    Shape:
        - Input: :math:`(*)`, LongTensor of arbitrary shape containing the indices to extract
        - Output: :math:`(*, H)`, where `*` is the input shape and :math:`H=\text{embedding\_dim}`
    .. note::
        Keep in mind that only a limited number of optimizers support
        sparse gradients: currently it's :class:`optim.SGD` (`CUDA` and `CPU`),
        :class:`optim.SparseAdam` (`CUDA` and `CPU`) and :class:`optim.Adagrad` (`CPU`)
    .. note::
        With :attr:`padding_idx` set, the embedding vector at
        :attr:`padding_idx` is initialized to all zeros. However, note that this
        vector can be modified afterwards, e.g., using a customized
        initialization method, and thus changing the vector used to pad the
        output. The gradient for this vector from :class:`~torch.nn.Embedding`
        is always zero.
    """
    __constants__ = ["unk_idx"]

    # Torchscript: Inheriting from Embedding class produces an error when exporting to Torchscript
    # -> RuntimeError: Unable to cast Python instance to C++ type (compile in debug mode for details
    # It's happening because max_norm attribute from nn.Embedding is None by default and it cannot be
    # cast to a C++ type
    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        padding_idx: Optional[int],
        unk_idx: int,
        max_norm: Optional[float] = float("inf"),
    ):
        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx, max_norm=max_norm)
        self.unk_idx = unk_idx
        nn.init.normal_(self.weight, mean=0, std=embedding_dim ** -0.5)
        # Only zero the padding row when there is one: indexing the weight
        # with None yields a view of the *whole* matrix, so an unguarded
        # constant_() call would wipe every embedding.
        if padding_idx is not None:
            nn.init.constant_(self.weight[padding_idx], 0)

    def forward(self, input):
        """Look up embeddings, treating ids >= num_embeddings as ``unk_idx``."""
        input = torch.where(
            input >= self.num_embeddings, torch.ones_like(input) * self.unk_idx, input
        )
        return nn.functional.embedding(
            input, self.weight, self.padding_idx, self.max_norm,
            self.norm_type, self.scale_grad_by_freq, self.sparse
        )
@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator"
)
def transformer_pointer_generator(args):
    """Base architecture: Transformer defaults plus pointing defaults.

    Unless already set, one attention head of the last decoder layer is used
    for pointing. A negative ``alignment_layer`` counts from the top of the
    decoder and is converted to a 0-based index once the number of decoder
    layers is known.
    """
    if not hasattr(args, "alignment_heads"):
        args.alignment_heads = 1
    if not hasattr(args, "alignment_layer"):
        args.alignment_layer = -1
    base_architecture(args)
    if args.alignment_layer < 0:
        args.alignment_layer += args.decoder_layers


@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_iwslt_de_en"
)
def transformer_pointer_generator_iwslt_de_en(args):
    """Variant with 512-dim embeddings, 1024-dim FFN, 4 heads and 6 layers."""
    defaults = (
        ("encoder_embed_dim", 512),
        ("encoder_ffn_embed_dim", 1024),
        ("encoder_attention_heads", 4),
        ("encoder_layers", 6),
        ("decoder_embed_dim", 512),
        ("decoder_ffn_embed_dim", 1024),
        ("decoder_attention_heads", 4),
        ("decoder_layers", 6),
    )
    # Values the user already supplied take precedence over these defaults.
    for name, value in defaults:
        setattr(args, name, getattr(args, name, value))
    transformer_pointer_generator(args)


@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de"
)
def transformer_pointer_generator_wmt_en_de(args):
    """Alias of the base pointer-generator architecture."""
    transformer_pointer_generator(args)


@register_model_architecture(
    "transformer_pointer_generator",
    "transformer_pointer_generator_vaswani_wmt_en_de_big",
)
def transformer_pointer_generator_vaswani_wmt_en_de_big(args):
    """Variant with 1024-dim embeddings, 4096-dim FFN, 16 heads, dropout 0.3."""
    defaults = (
        ("encoder_embed_dim", 1024),
        ("encoder_ffn_embed_dim", 4096),
        ("encoder_attention_heads", 16),
        ("encoder_normalize_before", False),
        ("decoder_embed_dim", 1024),
        ("decoder_ffn_embed_dim", 4096),
        ("decoder_attention_heads", 16),
        ("dropout", 0.3),
    )
    # Values the user already supplied take precedence over these defaults.
    for name, value in defaults:
        setattr(args, name, getattr(args, name, value))
    transformer_pointer_generator(args)
@register_model_architecture(
    "transformer_pointer_generator",
    "transformer_pointer_generator_vaswani_wmt_en_fr_big",
)
def transformer_pointer_generator_vaswani_wmt_en_fr_big(args):
    """The big en-de variant with dropout defaulting to 0.1."""
    if not hasattr(args, "dropout"):
        args.dropout = 0.1
    transformer_pointer_generator_vaswani_wmt_en_de_big(args)


@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big"
)
def transformer_pointer_generator_wmt_en_de_big(args):
    """The big en-de variant with attention dropout defaulting to 0.1."""
    if not hasattr(args, "attention_dropout"):
        args.attention_dropout = 0.1
    transformer_pointer_generator_vaswani_wmt_en_de_big(args)


@register_model_architecture(
    "transformer_pointer_generator", "transformer_pointer_generator_wmt_en_de_big_t2t"
)
def transformer_pointer_generator_wmt_en_de_big_t2t(args):
    """The big en-de variant with the tensor2tensor default parameters."""
    defaults = (
        ("encoder_normalize_before", True),
        ("decoder_normalize_before", True),
        ("attention_dropout", 0.1),
        ("activation_dropout", 0.1),
    )
    # Values the user already supplied take precedence over these defaults.
    for name, value in defaults:
        setattr(args, name, getattr(args, name, value))
    transformer_pointer_generator_vaswani_wmt_en_de_big(args)
import argparse
import re
import sys


class OOVIndexError(IndexError):
    """Raised when an ``<unk-N>`` tag points past the end of its source line.

    Attributes:
        source_pos: the position N taken from the offending tag.
        source_seq: the raw source line.
        target_seq: the raw target line containing the tag.
    """

    def __init__(self, pos, source_seq, target_seq):
        super().__init__(
            "A <unk-N> tag in the target sequence refers to a position that is "
            "outside the source sequence. Most likely there was a mismatch in "
            "provided source and target sequences. Otherwise this would mean that "
            "the pointing mechanism somehow attended to a position that is past "
            "the actual sequence end."
        )
        self.source_pos = pos
        self.source_seq = source_seq
        self.target_seq = target_seq


def replace_oovs(source_in, target_in, target_out):
    """Replaces <unk-N> tokens in the target text with the corresponding word in
    the source text.

    Args:
        source_in: iterable of source lines (whitespace-tokenized text).
        target_in: iterable of target lines, parallel to ``source_in``.
        target_out: writable text stream; one line is written per pair.

    Raises:
        OOVIndexError: if a tag refers to a position outside its source line.
    """
    oov_re = re.compile(r"^<unk-([0-9]+)>$")

    for source_seq, target_seq in zip(source_in, target_in):
        target_seq_out = []

        pos_to_word = source_seq.strip().split()
        for token in target_seq.strip().split():
            m = oov_re.match(token)
            if m:
                pos = int(m.group(1))
                if pos >= len(pos_to_word):
                    raise OOVIndexError(pos, source_seq, target_seq)
                token_out = pos_to_word[pos]
            else:
                token_out = token
            target_seq_out.append(token_out)
        target_out.write(" ".join(target_seq_out) + "\n")


def main():
    """Command-line entry point: parse the arguments and rewrite the target file."""
    parser = argparse.ArgumentParser(
        description="Replaces <unk-N> tokens in target sequences with words from "
        "the corresponding position in the source sequence."
    )
    parser.add_argument(
        "--source", type=str, help="text file with source sequences", required=True
    )
    parser.add_argument(
        "--target", type=str, help="text file with target sequences", required=True
    )
    parser.add_argument(
        "--target-out",
        type=str,
        help="where to write target sequences without <unk-N> " "entries",
        required=True,
    )
    args = parser.parse_args()

    # Each file is opened exactly once, inside the context manager, so every
    # handle is closed even if replace_oovs() raises.
    with open(args.source, "r", encoding="utf-8") as source_in, open(
        args.target, "r", encoding="utf-8"
    ) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out:
        replace_oovs(source_in, target_in, target_out)


if __name__ == "__main__":
    try:
        main()
    except OOVIndexError as e:
        # All diagnostics go to stderr so that stdout stays clean.
        print(e, file=sys.stderr)
        print("Source sequence:", e.source_seq.strip(), file=sys.stderr)
        print("Target sequence:", e.target_seq.strip(), file=sys.stderr)
        print(
            "Source sequence length:",
            len(e.source_seq.strip().split()),
            file=sys.stderr,
        )
        print("The offending tag points to:", e.source_pos, file=sys.stderr)
        sys.exit(2)
import argparse
from contextlib import ExitStack
from itertools import zip_longest


def replace_oovs(source_in, target_in, vocabulary, source_out, target_out):
    """Replaces out-of-vocabulary words in source and target text with <unk-N>,
    where N is the position of the word in the source sequence.

    A word that occurs several times in one source sequence is always tagged
    with the position of its first occurrence. Target words are only tagged
    when they are out-of-vocabulary words of the matching source sequence.

    Args:
        source_in: iterable of source lines (whitespace-tokenized text).
        target_in: iterable of target lines parallel to ``source_in``, or None.
        vocabulary: container of in-vocabulary words; a set gives O(1) lookups.
        source_out: writable text stream for the processed source lines.
        target_out: writable text stream for the processed target lines, or
            None. One line is written per source sequence (an empty line when
            no target sequence is available), keeping the outputs line-aligned.

    Raises:
        ValueError: if ``target_in`` has more sequences than ``source_in``.
    """

    def format_unk(pos):
        return "<unk-{}>".format(pos)

    if target_in is None:
        target_in = []

    for seq_num, (source_seq, target_seq) in enumerate(
        zip_longest(source_in, target_in)
    ):
        if source_seq is None:
            # zip_longest pads the shorter input with None; a missing source
            # line means the two files are not parallel.
            raise ValueError(
                "target has more sequences than source "
                "(no source sequence for target line {})".format(seq_num + 1)
            )

        word_to_pos = {}
        source_seq_out = []
        for position, token in enumerate(source_seq.strip().split()):
            if token in vocabulary:
                token_out = token
            else:
                # The first occurrence decides the position used in the tag.
                token_out = format_unk(word_to_pos.setdefault(token, position))
            source_seq_out.append(token_out)
        source_out.write(" ".join(source_seq_out) + "\n")

        target_seq_out = []
        if target_seq is not None:
            target_seq_out = [
                format_unk(word_to_pos[token]) if token in word_to_pos else token
                for token in target_seq.strip().split()
            ]
        if target_out is not None:
            target_out.write(" ".join(target_seq_out) + "\n")


def main():
    """Command-line entry point: parse the arguments and process the files."""
    parser = argparse.ArgumentParser(
        description="Replaces out-of-vocabulary words in both source and target "
        "sequences with tokens that indicate the position of the word "
        "in the source sequence."
    )
    parser.add_argument(
        "--source", type=str, help="text file with source sequences", required=True
    )
    parser.add_argument(
        "--target", type=str, help="text file with target sequences", default=None
    )
    parser.add_argument("--vocab", type=str, help="vocabulary file", required=True)
    parser.add_argument(
        "--source-out",
        type=str,
        help="where to write source sequences with <unk-N> entries",
        required=True,
    )
    parser.add_argument(
        "--target-out",
        type=str,
        help="where to write target sequences with <unk-N> entries",
        default=None,
    )
    args = parser.parse_args()

    with open(args.vocab, encoding="utf-8") as vocab:
        # A set makes the per-token membership test O(1) instead of O(|vocab|).
        vocabulary = frozenset(vocab.read().splitlines())

    # ExitStack closes every file that was opened, including the optional
    # target files, even when processing fails half-way.
    with ExitStack() as stack:

        def open_text(path, mode):
            return stack.enter_context(open(path, mode, encoding="utf-8"))

        # Inputs are opened first so that a missing input does not leave
        # behind freshly truncated output files.
        source_in = open_text(args.source, "r")
        target_in = open_text(args.target, "r") if args.target is not None else None
        source_out = open_text(args.source_out, "w")
        target_out = (
            open_text(args.target_out, "w") if args.target_out is not None else None
        )
        replace_oovs(source_in, target_in, vocabulary, source_out, target_out)


if __name__ == "__main__":
    main()
) parser.add_argument( "--source", type=str, help="text file with source sequences", required=True ) parser.add_argument( "--target", type=str, help="text file with target sequences", default=None ) parser.add_argument("--vocab", type=str, help="vocabulary file", required=True) parser.add_argument( "--source-out", type=str, help="where to write source sequences with <unk-N> entries", required=True, ) parser.add_argument( "--target-out", type=str, help="where to write target sequences with <unk-N> entries", default=None, ) args = parser.parse_args() with open(args.vocab, encoding="utf-8") as vocab: vocabulary = vocab.read().splitlines() target_in = ( open(args.target, "r", encoding="utf-8") if args.target is not None else None ) target_out = ( open(args.target_out, "w", encoding="utf-8") if args.target_out is not None else None ) with open(args.source, "r", encoding="utf-8") as source_in, open( args.source_out, "w", encoding="utf-8" ) as source_out: replace_oovs(source_in, target_in, vocabulary, source_out, target_out) if target_in is not None: target_in.close() if target_out is not None: target_out.close() if __name__ == "__main__": main() ================================================ FILE: examples/quant_noise/README.md ================================================ # Training with Quantization Noise for Extreme Model Compression ({Fan\*, Stock\*} *et al.*, 2020) This page contains information for how to train and quantize models with Quantization Noise, for both scalar quantization like `int8` and Iterative Product Quantization. Check out our paper [here](https://arxiv.org/abs/2004.07320). Looking for pretrained models? They will be added shortly. Looking for code to train vision models? We are working on open sourcing our code as part of ClassyVision. Please check back, but note that both the Scalar and Iterative Product Quantization counterparts of the `nn.Conv2d` module are already included in this release. 
**Contents**: - [Walk through the code](#walk-through-the-code) - [Reproduce NLP Results](#looking-to-reproduce-the-nlp-results-in-the-paper) - [Reproduce Vision Results](#looking-to-reproduce-the-vision-results-in-the-paper) ## Citation ```bibtex @article{fan2020training, title={Training with Quantization Noise for Extreme Model Compression}, author={Angela Fan* and Pierre Stock* and Benjamin Graham and Edouard Grave and Remi Gribonval and Herve Jegou and Armand Joulin}, year={2020}, eprint={2004.07320}, archivePrefix={arXiv}, primaryClass={cs.ML} } ``` ## Walk through the code Training a model with Quant-Noise improves the performance in subsequent inference-time quantization by training models to be robust to quantization. This technique is useful for both scalar and product quantization methods, as well as multiple domains. We detail below our approach to train, quantize models and integrate our code to quantize your favorite models. ### Scalar Quantization Unlike the section [Iterative Product Quantization](#iterative-product-quantization) which gives state-of-the-art compression, this section showcases the usefulness of our approach for simple scalar quantization baselines such as int8 using on-GPU Fake Quantization. #### Training Scalar quantization with Quant-Noise consists in randomly quantizing a proportion `p` of the weights during training. Scalar quantization is implemented [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quantization/scalar) under the form of Fake Quantization, meaning that we emulate int8 on GPU by quantizing and de-quantizing both the weights and the activations. We rely on PyTorch's [quantization primitives](https://github.com/pytorch/pytorch/tree/master/torch/quantization). To train a model with Quant-Noise, add the following flag: ``` --quant-noise-scalar 0.5 ``` Large values of noise make the network easier to quantize but may result in higher non-quantized test and validation perplexities.
#### Quantization When evaluating a network, all quantized modules and activation hooks automatically switch to `p=1`, so the validation accuracy reported by Fairseq is actually the quantized one; there is nothing more to do. #### Integration with your own code Looking to quantize your own models with Quant-Noise + Scalar Quantization? - Use the function `quantize_model_` implemented [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quantization/scalar/utils.py) to (1) replace all your modules by their quantized counterparts and (2) add hooks to those modules to quantize the activations. - Then, perform your training as usual. Note that in `eval()` mode, the network is always fully quantized (weights and activations) by default (`p=1`). ### Iterative Product Quantization Iterative Product Quantization with Quant-Noise proceeds in two steps. First, a model must be trained uncompressed with Quant-Noise. Second, the model must be quantized with iPQ. Note that we implement here the simplest form of noise, which consists in randomly dropping a proportion `p` of blocks, and that worked as well as assigning those blocks to their current centroid. #### Training To train a model with Quant-Noise, add the following flags: ``` --quant-noise-pq 0.1 --quant-noise-pq-block-size 8 ``` `quant-noise-pq` controls how much dropout is applied to the blocks of the weight matrix. `quant-noise-pq-block-size` controls the size of the weight matrix blocks. We recommend training with 0.05 to 0.2 Quant-Noise, a range that worked well in our experiments. For the block-size, we recommend training with block-size of 8. Note that the block size must divide `input_features` (i.e. `input_features` must be a multiple of the block size), see the size checks [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quant_noise.py). Large block sizes result in higher compression ratio but may induce a loss in accuracy.
We currently support training Transformer based models, such as sequence-to-sequence, language models, and BERT architectures. The `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quant_noise.py) wraps a module. It splits a weight matrix into blocks and applies random dropout to these blocks. In the Transformer architectures, quant-noise is applied to the input and output embeddings, the attention, and the FFN. Quant-Noise can also be combined with **LayerDrop** (see [here](https://github.com/pytorch/fairseq/tree/main/examples/layerdrop)) to add its pruning effect to the quantized model and make the model even smaller. We recommend training with LayerDrop 0.1 or 0.2. #### Quantization We implement an improved version of product quantization from Stock et al, **iPQ**, described [here](https://arxiv.org/abs/1907.05686), see code with old API [here](https://github.com/facebookresearch/kill-the-bits). Note that we improved the iPQ API in terms of both compute speed and usability as described below. For the particular case of PQ, quantization is made sequentially. We recommend first quantizing the FFNs, then the EMBs, and finally the ATTNs. Quantization is done in two sub-steps: - First, perform `n` steps of Product Quantization (generally `n=20` is enough). - Then, finetune the obtained centroids. #### Integration with your own code Looking to quantize your own models with Quant-Noise + iPQ? - First wrap your modules with the `quant_noise` function [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quant_noise.py), which is module-agnostic and train your favorite model. - Then, quantize your trained model using the code [here](https://github.com/pytorch/fairseq/tree/main/fairseq/modules/quantization/pq). This can be done *without any changes to your training loop*. Below is an example code for integration. 
Note that we tried our approach only on Transformers and various Convolutional Models such as EfficientNets. ```python from fairseq.modules.quantization.pq import quantize_model_, SizeTracker # get configuration parameters n_centroids_config = config["n_centroids"] block_sizes_config = config["block_sizes"] layers_to_quantize = config["layers_to_quantize"] # size tracker for keeping track of assignments, centroids and non-compressed sizes size_tracker = SizeTracker(model) # Quantize model by stages for step in range(len(layers_to_quantize)): # quantize model in-place quantized_layers = quantize_model_( model, size_tracker, layers_to_quantize, block_sizes_config, n_centroids_config, step=step, ) logger.info(f"Finetuning stage {step}, quantized layers: {quantized_layers}") logger.info(f"{size_tracker}") # Don't forget to re-create/update trainer/optimizer since model parameters have changed optimizer = ... # Finetune the centroids with your usual training loop for a few epochs trainer.train_epoch() ``` ## Looking to reproduce the NLP results in the paper? We detail below how to reproduce the state-of-the-art results reported in the paper for Quant-Noise + Iterative Product Quantization. ### Training with Quant-Noise To **train** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/roberta).
The following command can be used to train a RoBERTa Base + QuantNoise model: ```bash TOTAL_UPDATES=125000 WARMUP_UPDATES=10000 PEAK_LR=0.0005 TOKENS_PER_SAMPLE=512 MAX_POSITIONS=512 MAX_SENTENCES=16 UPDATE_FREQ=2 DATA_DIR=/path/to/data/here fairseq-train $DATA_DIR \ --task masked_lm --criterion masked_lm --arch roberta_base \ --sample-break-mode complete \ --tokens-per-sample $TOKENS_PER_SAMPLE --max-positions $MAX_POSITIONS \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-6 \ --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $PEAK_LR \ --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_UPDATES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.01 \ --batch-size $MAX_SENTENCES \ --update-freq $UPDATE_FREQ --max-update $TOTAL_UPDATES \ --save-dir checkpoint/roberta \ --ddp-backend legacy_ddp --encoder-layerdrop 0.2 \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 --untie-weights-roberta ``` To **finetune** RoBERTa + QuantNoise, we followed this setting [here](https://github.com/pytorch/fairseq/blob/main/examples/roberta/README.glue.md). 
The following command can be used to finetune a RoBERTa Base + QuantNoise model on the RTE dataset: ```bash TOTAL_NUM_UPDATES=2036 WARMUP_UPDATES=122 LR=2e-05 NUM_CLASSES=2 MAX_SENTENCES=16 ROBERTA_PATH=/path/to/roberta_quantnoise/model.pt fairseq-train /path/to/rte/data/ \ --restore-file $ROBERTA_PATH \ --max-positions 512 \ --batch-size $MAX_SENTENCES \ --max-tokens 4400 \ --task sentence_prediction \ --reset-optimizer --reset-dataloader --reset-meters \ --required-batch-size-multiple 1 \ --init-token 0 --separator-token 2 \ --arch roberta_large \ --criterion sentence_prediction \ --num-classes $NUM_CLASSES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --ddp-backend legacy_ddp \ --quant-noise-pq 0.2 --quant-noise-pq-block-size 8 ``` To **train** Language Models on Wikitext-103, we followed this setting [here](https://github.com/pytorch/fairseq/tree/main/examples/language_model). 
The following command can be used to train a Transformer + QuantNoise model on Wikitext-103: ```bash fairseq-train --task language_modeling /path/to/wikitext-103/data \ --save-dir checkpoints/transformer_wikitext-103 \ --adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \ --adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \ --tie-adaptive-proj --tie-adaptive-weights \ --arch transformer_lm_gbw \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --clip-norm 0.1 --criterion adaptive_loss \ --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 \ --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 1.0 --t-mult 2.0 \ --max-tokens 3072 --tokens-per-sample 3072 --momentum 0.99 --optimizer nag \ --sample-break-mode none --update-freq 3 \ --warmup-init-lr 1e-07 --warmup-updates 16000 \ --weight-decay 0 --seed 1 --stop-min-lr 1e-09 \ --quant-noise-pq 0.05 --quant-noise-pq-block-size 8 ``` To **evaluate** this model, note that you need to use the `fairseq-eval-lm` command. The following command can be used to evaluate: ```bash fairseq-eval-lm /path/to/wikitext-103/data --path /path/to/model/checkpoint \ --sample-break-mode complete \ --max-tokens 3072 \ --context-window 2560 \ --softmax-batch 1024 \ --gen-subset valid ``` and change the `--gen-subset` to `test` if you would like to evaluate on the test set instead. ### Iterative Product Quantization To quantize the finetuned RoBERTa model, we use this command on 1 GPU. This should run in a day. 
```bash TOTAL_NUM_UPDATES=6108 # 2036 updates for each iteration WARMUP_UPDATES=122 LR=2e-05 NUM_CLASSES=2 MAX_SENTENCES=16 fairseq-train --task sentence_prediction /path/to/data/ \ --restore-file $ROBERTA_PATH \ --save-dir checkpoints/roberta_finetuned \ --max-positions 512 \ --batch-size $MAX_SENTENCES \ --max-tokens 4400 \ --init-token 0 --separator-token 2 \ --arch roberta_large \ --criterion sentence_prediction \ --num-classes $NUM_CLASSES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 --lr-scheduler polynomial_decay \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --no-progress-bar --skip-invalid-size-inputs-valid-test --ddp-backend legacy_ddp \ --quantization-config-path /path/to/config/yaml ``` To quantize the trained Language Model, we use this command on 8 V100 23GB GPUs. This should run in a couple of hours. ```bash fairseq-train --task language_modeling /path/to/wikitext-103/data \ --save-dir checkpoints/transformer_wikitext-103 \ --adaptive-input --adaptive-input-cutoff 20000,60000 --adaptive-input-factor 4 \ --adaptive-softmax-cutoff 20000,60000 --adaptive-softmax-dropout 0.2 --adaptive-softmax-factor 4.0 \ --arch transformer_lm_gbw \ --attention-dropout 0.1 --dropout 0.2 --relu-dropout 0.1 \ --bucket-cap-mb 25 --char-embedder-highway-layers 2 --character-embedding-dim 4 \ --clip-norm 0.1 --criterion adaptive_loss \ --ddp-backend legacy_ddp \ --decoder-attention-heads 8 --decoder-embed-dim 1024 --decoder-ffn-embed-dim 4096 --decoder-input-dim 1024 --decoder-layers 16 --decoder-normalize-before --decoder-output-dim 1024 \ --fp16 --keep-last-epochs -1 \ --min-lr 0.0001 --lr-period-updates 270000 --lr-scheduler cosine --lr-shrink 0.75 --lr 0.05 --stop-min-lr 1e-09 \ --max-tokens 2944 --tokens-per-sample 2944\ --momentum 0.99 --no-epoch-checkpoints --no-progress-bar --optimizer nag --required-batch-size-multiple 8 \ 
--sample-break-mode none --t-mult 2.0 --skip-invalid-size-inputs-valid-test \ --tie-adaptive-proj --tie-adaptive-weights --update-freq 3 --weight-decay 0 --seed 1 \ --log-interval 100 --no-progress-bar --skip-invalid-size-inputs-valid-test \ --restore-file path/to/trained/lm/with/quant/noise \ --max-update 13500 --quantization-config-path /path/to/config/yaml ``` If you have less capacity or if your distributed training freezes, try reducing `--max-tokens` and `--tokens-per-sample` (this may reduce the quantized accuracy a bit). ### Remarks We try to keep the open-sourced code as readable and as easy-to-plug as possible. Therefore, we did not test it for the following cases: - Scalar quantization with RoBERTa. - Quantization with iPQ and `int8` combined. If you have trouble adapting it, we will be more than happy to help! ## Looking to reproduce the Vision results in the paper? We are working on open sourcing our code as part of ClassyVision. Please check back. ## Having an issue or have a question? Please open an issue in this repository with the details of your question. Thanks! ================================================ FILE: examples/quant_noise/transformer_quantization_config.yaml ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
# This file defines example configuration arguments for quantizing # a transformer model with product quantization # Number of Centroids for Product Quantization, by default 256 (byte-aligned) n_centroids: Linear: key: in_features value: {"*": 256} Embedding: key: embedding_dim value: {"*": 256} # Block Sizes for Product Quantization # We suggest: 8 for FFN, 4 for ATTN, 4 for embedding projections, 8 for embeddings block_sizes: Linear: key: fuzzy_name value: {fc: 8, attn: 4, emb: 4} Embedding: key: fuzzy_name value: {emb: 8} # Layers to Quantize Sequentially # We suggest: first FFN, then EMB, then ATTN layers_to_quantize: - decoder\\.layers\\.\d+\\.fc[12] - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01] - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj) ================================================ FILE: examples/roberta/README.custom_classification.md ================================================ # Finetuning RoBERTa on a custom classification task This example shows how to finetune RoBERTa on the IMDB dataset, but should illustrate the process for most classification tasks. ### 1) Get the data ```bash wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz tar zxvf aclImdb_v1.tar.gz ``` ### 2) Format data `IMDB` data has one data-sample in each file; the Python code snippet below converts it into one file each for train and valid for ease of processing. 
```python import argparse import os import random from glob import glob random.seed(0) def main(args): for split in ['train', 'test']: samples = [] for class_label in ['pos', 'neg']: fnames = glob(os.path.join(args.datadir, split, class_label) + '/*.txt') for fname in fnames: with open(fname) as fin: line = fin.readline() samples.append((line, 1 if class_label == 'pos' else 0)) random.shuffle(samples) out_fname = 'train' if split == 'train' else 'dev' f1 = open(os.path.join(args.datadir, out_fname + '.input0'), 'w') f2 = open(os.path.join(args.datadir, out_fname + '.label'), 'w') for sample in samples: f1.write(sample[0] + '\n') f2.write(str(sample[1]) + '\n') f1.close() f2.close() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--datadir', default='aclImdb') args = parser.parse_args() main(args) ``` ### 3) BPE encode Run `multiprocessing_bpe_encoder`, you can also do this in previous step for each sample but that might be slower. ```bash # Download encoder.json and vocab.bpe wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' for SPLIT in train dev; do python -m examples.roberta.multiprocessing_bpe_encoder \ --encoder-json encoder.json \ --vocab-bpe vocab.bpe \ --inputs "aclImdb/$SPLIT.input0" \ --outputs "aclImdb/$SPLIT.input0.bpe" \ --workers 60 \ --keep-empty done ``` ### 4) Preprocess data ```bash # Download fairseq dictionary. 
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' fairseq-preprocess \ --only-source \ --trainpref "aclImdb/train.input0.bpe" \ --validpref "aclImdb/dev.input0.bpe" \ --destdir "IMDB-bin/input0" \ --workers 60 \ --srcdict dict.txt fairseq-preprocess \ --only-source \ --trainpref "aclImdb/train.label" \ --validpref "aclImdb/dev.label" \ --destdir "IMDB-bin/label" \ --workers 60 ``` ### 5) Run training ```bash TOTAL_NUM_UPDATES=7812 # 10 epochs through IMDB for bsz 32 WARMUP_UPDATES=469 # 6 percent of the number of updates LR=1e-05 # Peak LR for polynomial LR scheduler. HEAD_NAME=imdb_head # Custom name for the classification head. NUM_CLASSES=2 # Number of classes for the classification task. MAX_SENTENCES=8 # Batch size. ROBERTA_PATH=/path/to/roberta.large/model.pt CUDA_VISIBLE_DEVICES=0 fairseq-train IMDB-bin/ \ --restore-file $ROBERTA_PATH \ --max-positions 512 \ --batch-size $MAX_SENTENCES \ --max-tokens 4400 \ --task sentence_prediction \ --reset-optimizer --reset-dataloader --reset-meters \ --required-batch-size-multiple 1 \ --init-token 0 --separator-token 2 \ --arch roberta_large \ --criterion sentence_prediction \ --classification-head-name $HEAD_NAME \ --num-classes $NUM_CLASSES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --max-epoch 10 \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --shorten-method "truncate" \ --find-unused-parameters \ --update-freq 4 ``` The above command will finetune RoBERTa-large with an effective batch-size of 32 sentences (`--batch-size=8 --update-freq=4`). The expected `best-validation-accuracy` after 10 epochs is ~96.5%. 
If you run out of GPU memory, try decreasing `--batch-size` and increase `--update-freq` to compensate. ### 6) Load model using hub interface Now we can load the trained model checkpoint using the RoBERTa hub interface. Assuming your checkpoints are stored in `checkpoints/`: ```python from fairseq.models.roberta import RobertaModel roberta = RobertaModel.from_pretrained( 'checkpoints', checkpoint_file='checkpoint_best.pt', data_name_or_path='IMDB-bin' ) roberta.eval() # disable dropout ``` Finally you can make predictions using the `imdb_head` (or whatever you set `--classification-head-name` to during training): ```python label_fn = lambda label: roberta.task.label_dictionary.string( [label + roberta.task.label_dictionary.nspecial] ) tokens = roberta.encode('Best movie this year') pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item()) assert pred == '1' # positive tokens = roberta.encode('Worst movie ever') pred = label_fn(roberta.predict('imdb_head', tokens).argmax().item()) assert pred == '0' # negative ``` ================================================ FILE: examples/roberta/README.glue.md ================================================ # Finetuning RoBERTa on GLUE tasks ### 1) Download the data from GLUE website (https://gluebenchmark.com/tasks) using following commands: ```bash wget https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py python download_glue_data.py --data_dir glue_data --tasks all ``` ### 2) Preprocess GLUE task data: ```bash ./examples/roberta/preprocess_GLUE_tasks.sh glue_data <glue_task_name> ``` `glue_task_name` is one of the following: `{ALL, QQP, MNLI, QNLI, MRPC, RTE, STS-B, SST-2, CoLA}` Use `ALL` for preprocessing all the glue tasks. 
### 3) Fine-tuning on GLUE task: Example fine-tuning cmd for `RTE` task ```bash ROBERTA_PATH=/path/to/roberta/model.pt CUDA_VISIBLE_DEVICES=0 fairseq-hydra-train --config-dir examples/roberta/config/finetuning --config-name rte \ task.data=RTE-bin checkpoint.restore_file=$ROBERTA_PATH ``` There are additional config files for each of the GLUE tasks in the examples/roberta/config/finetuning directory. **Note:** a) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. b) All the settings in above table are suggested settings based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. ### Inference on GLUE task After training the model as mentioned in previous step, you can perform inference with checkpoints in `checkpoints/` directory using following python code snippet: ```python from fairseq.models.roberta import RobertaModel roberta = RobertaModel.from_pretrained( 'checkpoints/', checkpoint_file='checkpoint_best.pt', data_name_or_path='RTE-bin' ) label_fn = lambda label: roberta.task.label_dictionary.string( [label + roberta.task.label_dictionary.nspecial] ) ncorrect, nsamples = 0, 0 roberta.cuda() roberta.eval() with open('glue_data/RTE/dev.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[1], tokens[2], tokens[3] tokens = roberta.encode(sent1, sent2) prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() prediction_label = label_fn(prediction) ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) ``` ================================================ FILE: examples/roberta/README.md 
================================================ # RoBERTa: A Robustly Optimized BERT Pretraining Approach https://arxiv.org/abs/1907.11692 ## Introduction RoBERTa iterates on BERT's pretraining procedure, including training the model longer, with bigger batches over more data; removing the next sentence prediction objective; training on longer sequences; and dynamically changing the masking pattern applied to the training data. See the associated paper for more details. ### What's New: - December 2020: German model (GottBERT) is available: [GottBERT](https://github.com/pytorch/fairseq/tree/main/examples/gottbert). - January 2020: Italian model (UmBERTo) is available from Musixmatch Research: [UmBERTo](https://github.com/musixmatchresearch/umberto). - November 2019: French model (CamemBERT) is available: [CamemBERT](https://github.com/pytorch/fairseq/tree/main/examples/camembert). - November 2019: Multilingual encoder (XLM-RoBERTa) is available: [XLM-R](https://github.com/pytorch/fairseq/tree/main/examples/xlmr). - September 2019: TensorFlow and TPU support via the [transformers library](https://github.com/huggingface/transformers). - August 2019: RoBERTa is now supported in the [pytorch-transformers library](https://github.com/huggingface/pytorch-transformers). - August 2019: Added [tutorial for finetuning on WinoGrande](https://github.com/pytorch/fairseq/tree/main/examples/roberta/wsc#roberta-training-on-winogrande-dataset). - August 2019: Added [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). 
## Pre-trained models Model | Description | # params | Download ---|---|---|--- `roberta.base` | RoBERTa using the BERT-base architecture | 125M | [roberta.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.base.tar.gz) `roberta.large` | RoBERTa using the BERT-large architecture | 355M | [roberta.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz) `roberta.large.mnli` | `roberta.large` finetuned on [MNLI](http://www.nyu.edu/projects/bowman/multinli) | 355M | [roberta.large.mnli.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.mnli.tar.gz) `roberta.large.wsc` | `roberta.large` finetuned on [WSC](wsc/README.md) | 355M | [roberta.large.wsc.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.wsc.tar.gz) ## Results **[GLUE (Wang et al., 2019)](https://gluebenchmark.com/)** _(dev set, single model, single-task finetuning)_ Model | MNLI | QNLI | QQP | RTE | SST-2 | MRPC | CoLA | STS-B ---|---|---|---|---|---|---|---|--- `roberta.base` | 87.6 | 92.8 | 91.9 | 78.7 | 94.8 | 90.2 | 63.6 | 91.2 `roberta.large` | 90.2 | 94.7 | 92.2 | 86.6 | 96.4 | 90.9 | 68.0 | 92.4 `roberta.large.mnli` | 90.2 | - | - | - | - | - | - | - **[SuperGLUE (Wang et al., 2019)](https://super.gluebenchmark.com/)** _(dev set, single model, single-task finetuning)_ Model | BoolQ | CB | COPA | MultiRC | RTE | WiC | WSC ---|---|---|---|---|---|---|--- `roberta.large` | 86.9 | 98.2 | 94.0 | 85.7 | 89.5 | 75.6 | - `roberta.large.wsc` | - | - | - | - | - | - | 91.3 **[SQuAD (Rajpurkar et al., 2018)](https://rajpurkar.github.io/SQuAD-explorer/)** _(dev set, no additional data used)_ Model | SQuAD 1.1 EM/F1 | SQuAD 2.0 EM/F1 ---|---|--- `roberta.large` | 88.9/94.6 | 86.5/89.4 **[RACE (Lai et al., 2017)](http://www.qizhexie.com/data/RACE_leaderboard.html)** _(test set)_ Model | Accuracy | Middle | High ---|---|---|--- `roberta.large` | 83.2 | 86.5 | 81.3 **[HellaSwag (Zellers et al., 2019)](https://rowanzellers.com/hellaswag/)** 
_(test set)_ Model | Overall | In-domain | Zero-shot | ActivityNet | WikiHow ---|---|---|---|---|--- `roberta.large` | 85.2 | 87.3 | 83.1 | 74.6 | 90.9 **[Commonsense QA (Talmor et al., 2019)](https://www.tau-nlp.org/commonsenseqa)** _(test set)_ Model | Accuracy ---|--- `roberta.large` (single model) | 72.1 `roberta.large` (ensemble) | 72.5 **[Winogrande (Sakaguchi et al., 2019)](https://arxiv.org/abs/1907.10641)** _(test set)_ Model | Accuracy ---|--- `roberta.large` | 78.1 **[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)** _(TRANSLATE-TEST)_ Model | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | ur ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- `roberta.large.mnli` | 91.3 | 82.91 | 84.27 | 81.24 | 81.74 | 83.13 | 78.28 | 76.79 | 76.64 | 74.17 | 74.05 | 77.5 | 70.9 | 66.65 | 66.81 ## Example usage ##### Load RoBERTa from torch.hub (PyTorch >= 1.1): ```python import torch roberta = torch.hub.load('pytorch/fairseq', 'roberta.large') roberta.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Load RoBERTa (for PyTorch 1.0 or custom models): ```python # Download roberta.large model wget https://dl.fbaipublicfiles.com/fairseq/models/roberta.large.tar.gz tar -xzvf roberta.large.tar.gz # Load the model in fairseq from fairseq.models.roberta import RobertaModel roberta = RobertaModel.from_pretrained('/path/to/roberta.large', checkpoint_file='model.pt') roberta.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Apply Byte-Pair Encoding (BPE) to input text: ```python tokens = roberta.encode('Hello world!') assert tokens.tolist() == [0, 31414, 232, 328, 2] roberta.decode(tokens) # 'Hello world!' 
``` ##### Extract features from RoBERTa: ```python # Extract the last layer's features last_layer_features = roberta.extract_features(tokens) assert last_layer_features.size() == torch.Size([1, 5, 1024]) # Extract all layer's features (layer 0 is the embedding layer) all_layers = roberta.extract_features(tokens, return_all_hiddens=True) assert len(all_layers) == 25 assert torch.all(all_layers[-1] == last_layer_features) ``` ##### Use RoBERTa for sentence-pair classification tasks: ```python # Download RoBERTa already finetuned for MNLI roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') roberta.eval() # disable dropout for evaluation # Encode a pair of sentences and make a prediction tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.') roberta.predict('mnli', tokens).argmax() # 0: contradiction # Encode another pair of sentences tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.') roberta.predict('mnli', tokens).argmax() # 2: entailment ``` ##### Register a new (randomly initialized) classification head: ```python roberta.register_classification_head('new_task', num_classes=3) logprobs = roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>) ``` ##### Batched prediction: ```python import torch from fairseq.data.data_utils import collate_tokens roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli') roberta.eval() batch_of_pairs = [ ['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'], ['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'], ['potatoes are awesome.', 'I like to run.'], ['Mars is very far from earth.', 'Mars is very close.'], ] batch = collate_tokens( [roberta.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1 ) logprobs = roberta.predict('mnli', batch) print(logprobs.argmax(dim=1)) # tensor([0, 2, 1, 
0]) ``` ##### Using the GPU: ```python roberta.cuda() roberta.predict('new_task', tokens) # tensor([[-1.1050, -1.0672, -1.1245]], device='cuda:0', grad_fn=<LogSoftmaxBackward>) ``` ## Advanced usage #### Filling masks: RoBERTa can be used to fill `<mask>` tokens in the input. Some examples from the [Natural Questions dataset](https://ai.google.com/research/NaturalQuestions/): ```python roberta.fill_mask('The first Star wars movie came out in <mask>', topk=3) # [('The first Star wars movie came out in 1977', 0.9504708051681519, ' 1977'), ('The first Star wars movie came out in 1978', 0.009986862540245056, ' 1978'), ('The first Star wars movie came out in 1979', 0.009574787691235542, ' 1979')] roberta.fill_mask('Vikram samvat calender is official in <mask>', topk=3) # [('Vikram samvat calender is official in India', 0.21878819167613983, ' India'), ('Vikram samvat calender is official in Delhi', 0.08547237515449524, ' Delhi'), ('Vikram samvat calender is official in Gujarat', 0.07556215673685074, ' Gujarat')] roberta.fill_mask('<mask> is the common currency of the European Union', topk=3) # [('Euro is the common currency of the European Union', 0.9456493854522705, 'Euro'), ('euro is the common currency of the European Union', 0.025748178362846375, 'euro'), ('€ is the common currency of the European Union', 0.011183084920048714, '€')] ``` #### Pronoun disambiguation (Winograd Schema Challenge): RoBERTa can be used to disambiguate pronouns. First install spaCy and download the English-language model: ```bash pip install spacy python -m spacy download en_core_web_lg ``` Next load the `roberta.large.wsc` model and call the `disambiguate_pronoun` function. 
The pronoun should be surrounded by square brackets (`[]`) and the query referent surrounded by underscores (`_`), or left blank to return the predicted candidate text directly: ```python roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.wsc', user_dir='examples/roberta/wsc') roberta.cuda() # use the GPU (optional) roberta.disambiguate_pronoun('The _trophy_ would not fit in the brown suitcase because [it] was too big.') # True roberta.disambiguate_pronoun('The trophy would not fit in the brown _suitcase_ because [it] was too big.') # False roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] feared violence.') # 'The city councilmen' roberta.disambiguate_pronoun('The city councilmen refused the demonstrators a permit because [they] advocated violence.') # 'demonstrators' ``` See the [RoBERTA Winograd Schema Challenge (WSC) README](wsc/README.md) for more details on how to train this model. #### Extract features aligned to words: By default RoBERTa outputs one feature vector per BPE token. You can instead realign the features to match [spaCy's word-level tokenization](https://spacy.io/usage/linguistic-features#tokenization) with the `extract_features_aligned_to_words` method. This will compute a weighted average of the BPE-level features for each word and expose them in spaCy's `Token.vector` attribute: ```python doc = roberta.extract_features_aligned_to_words('I said, "hello RoBERTa."') assert len(doc) == 10 for tok in doc: print('{:10}{} (...)'.format(str(tok), tok.vector[:5])) # <s> tensor([-0.1316, -0.0386, -0.0832, -0.0477, 0.1943], grad_fn=<SliceBackward>) (...) # I tensor([ 0.0559, 0.1541, -0.4832, 0.0880, 0.0120], grad_fn=<SliceBackward>) (...) # said tensor([-0.1565, -0.0069, -0.8915, 0.0501, -0.0647], grad_fn=<SliceBackward>) (...) # , tensor([-0.1318, -0.0387, -0.0834, -0.0477, 0.1944], grad_fn=<SliceBackward>) (...) 
# " tensor([-0.0486, 0.1818, -0.3946, -0.0553, 0.0981], grad_fn=<SliceBackward>) (...) # hello tensor([ 0.0079, 0.1799, -0.6204, -0.0777, -0.0923], grad_fn=<SliceBackward>) (...) # RoBERTa tensor([-0.2339, -0.1184, -0.7343, -0.0492, 0.5829], grad_fn=<SliceBackward>) (...) # . tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...) # " tensor([-0.1341, -0.1203, -0.1012, -0.0621, 0.1892], grad_fn=<SliceBackward>) (...) # </s> tensor([-0.0930, -0.0392, -0.0821, 0.0158, 0.0649], grad_fn=<SliceBackward>) (...) ``` #### Evaluating the `roberta.large.mnli` model: Example python code snippet to evaluate accuracy on the MNLI `dev_matched` set. ```python label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'} ncorrect, nsamples = 0, 0 roberta.cuda() roberta.eval() with open('glue_data/MNLI/dev_matched.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[8], tokens[9], tokens[-1] tokens = roberta.encode(sent1, sent2) prediction = roberta.predict('mnli', tokens).argmax().item() prediction_label = label_map[prediction] ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) # Expected output: 0.9060 ``` ## Finetuning - [Finetuning on GLUE](README.glue.md) - [Finetuning on custom classification tasks (e.g., IMDB)](README.custom_classification.md) - [Finetuning on Winograd Schema Challenge (WSC)](wsc/README.md) - [Finetuning on Commonsense QA (CQA)](commonsense_qa/README.md) ## Pretraining using your own data See the [tutorial for pretraining RoBERTa using your own data](README.pretraining.md). 
## Citation ```bibtex @article{liu2019roberta, title = {RoBERTa: A Robustly Optimized BERT Pretraining Approach}, author = {Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov}, journal={arXiv preprint arXiv:1907.11692}, year = {2019}, } ``` ================================================ FILE: examples/roberta/README.pretraining.md ================================================ # Pretraining RoBERTa using your own data This tutorial will walk you through pretraining RoBERTa over your own data. ### 1) Preprocess the data Data should be preprocessed following the [language modeling format](/examples/language_model), i.e. each document should be separated by an empty line (only useful with `--sample-break-mode complete_doc`). Lines will be concatenated as a 1D text stream during training. We'll use the [WikiText-103 dataset](https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/) to demonstrate how to preprocess raw text data with the GPT-2 BPE. Of course this dataset is quite small, so the resulting pretrained model will perform poorly, but it gives the general idea. 
First download the dataset: ```bash wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip unzip wikitext-103-raw-v1.zip ``` Next encode it with the GPT-2 BPE: ```bash mkdir -p gpt2_bpe wget -O gpt2_bpe/encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json wget -O gpt2_bpe/vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe for SPLIT in train valid test; do \ python -m examples.roberta.multiprocessing_bpe_encoder \ --encoder-json gpt2_bpe/encoder.json \ --vocab-bpe gpt2_bpe/vocab.bpe \ --inputs wikitext-103-raw/wiki.${SPLIT}.raw \ --outputs wikitext-103-raw/wiki.${SPLIT}.bpe \ --keep-empty \ --workers 60; \ done ``` Finally preprocess/binarize the data using the GPT-2 fairseq dictionary: ```bash wget -O gpt2_bpe/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt fairseq-preprocess \ --only-source \ --srcdict gpt2_bpe/dict.txt \ --trainpref wikitext-103-raw/wiki.train.bpe \ --validpref wikitext-103-raw/wiki.valid.bpe \ --testpref wikitext-103-raw/wiki.test.bpe \ --destdir data-bin/wikitext-103 \ --workers 60 ``` ### 2) Train RoBERTa base ```bash DATA_DIR=data-bin/wikitext-103 fairseq-hydra-train -m --config-dir examples/roberta/config/pretraining \ --config-name base task.data=$DATA_DIR ``` **Note:** You can optionally resume training the released RoBERTa base model by adding `checkpoint.restore_file=/path/to/roberta.base/model.pt`. **Note:** The above command assumes training on 8x32GB V100 GPUs. Each GPU uses a batch size of 16 sequences (`dataset.batch_size`) and accumulates gradients to further increase the batch size by 16x (`optimization.update_freq`), for a total batch size of 2048 sequences. If you have fewer GPUs or GPUs with less memory you may need to reduce `dataset.batch_size` and increase `optimization.update_freq` to compensate. Alternatively if you have more GPUs you can decrease `optimization.update_freq` accordingly to increase training speed. 
**Note:** The learning rate and batch size are tightly connected and need to be adjusted together. We generally recommend increasing the learning rate as you increase the batch size according to the following table (although it's also dataset dependent, so don't rely on the following values too closely): batch size | peak learning rate ---|--- 256 | 0.0001 2048 | 0.0005 8192 | 0.0007 ### 3) Load your pretrained model ```python from fairseq.models.roberta import RobertaModel roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'path/to/data') assert isinstance(roberta.model, torch.nn.Module) ``` ================================================ FILE: examples/roberta/README.race.md ================================================ # Finetuning RoBERTa on RACE tasks ### 1) Download the data from RACE website (http://www.cs.cmu.edu/~glai1/data/race/) ### 2) Preprocess RACE data: ```bash python ./examples/roberta/preprocess_RACE.py --input-dir <input-dir> --output-dir <extracted-data-dir> ./examples/roberta/preprocess_RACE.sh <extracted-data-dir> <output-dir> ``` ### 3) Fine-tuning on RACE: ```bash MAX_EPOCH=5 # Number of training epochs. LR=1e-05 # Peak LR for fixed LR scheduler. NUM_CLASSES=4 MAX_SENTENCES=1 # Batch size per GPU. UPDATE_FREQ=8 # Accumulate gradients to simulate training on 8 GPUs. 
DATA_DIR=/path/to/race-output-dir ROBERTA_PATH=/path/to/roberta/model.pt CUDA_VISIBLE_DEVICES=0,1 fairseq-train $DATA_DIR --ddp-backend=legacy_ddp \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --task sentence_ranking \ --num-classes $NUM_CLASSES \ --init-token 0 --separator-token 2 \ --max-option-length 128 \ --max-positions 512 \ --shorten-method "truncate" \ --arch roberta_large \ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --criterion sentence_ranking \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler fixed --lr $LR \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --batch-size $MAX_SENTENCES \ --required-batch-size-multiple 1 \ --update-freq $UPDATE_FREQ \ --max-epoch $MAX_EPOCH ``` **Note:** a) As contexts in RACE are relatively long, we are using a smaller batch size per GPU while increasing update-freq to achieve a larger effective batch size. b) Above cmd-args and hyperparams are tested on one Nvidia `V100` GPU with `32gb` of memory for each task. Depending on the GPU memory resources available to you, you can increase `--update-freq` and reduce `--batch-size`. c) The setting in above command is based on our hyperparam search within a fixed search space (for careful comparison across models). You might be able to find better metrics with wider hyperparam search. 
### 4) Evaluation: ``` DATA_DIR=/path/to/race-output-dir # data directory used during training MODEL_PATH=/path/to/checkpoint_best.pt # path to the finetuned model checkpoint PREDS_OUT=preds.tsv # output file path to save prediction TEST_SPLIT=test # can be test (Middle) or test1 (High) fairseq-validate \ $DATA_DIR \ --valid-subset $TEST_SPLIT \ --path $MODEL_PATH \ --batch-size 1 \ --task sentence_ranking \ --criterion sentence_ranking \ --save-predictions $PREDS_OUT ``` ================================================ FILE: examples/roberta/commonsense_qa/README.md ================================================ # Finetuning RoBERTa on Commonsense QA We follow a similar approach to [finetuning RACE](../README.race.md). Specifically for each question we construct five inputs, one for each of the five candidate answer choices. Each input is constructed by concatenating the question and candidate answer. We then encode each input and pass the resulting "[CLS]" representations through a fully-connected layer to predict the correct answer. We train with a standard cross-entropy loss. We also found it helpful to prepend a prefix of `Q:` to the question and `A:` to the answer. The complete input format is: ``` <s> Q: Where would I not want a fox? </s> A: hen house </s> ``` Our final submission is based on a hyperparameter search over the learning rate (1e-5, 2e-5, 3e-5), batch size (8, 16), number of training steps (2000, 3000, 4000) and random seed. We selected the model with the best performance on the development set after 100 trials. ### 1) Download data from the Commonsense QA website (https://www.tau-nlp.org/commonsenseqa) ```bash bash examples/roberta/commonsense_qa/download_cqa_data.sh ``` ### 2) Finetune ```bash MAX_UPDATES=3000 # Number of training steps. WARMUP_UPDATES=150 # Linearly increase LR over this many steps. LR=1e-05 # Peak LR for polynomial LR scheduler. MAX_SENTENCES=16 # Batch size. SEED=1 # Random seed. 
ROBERTA_PATH=/path/to/roberta/model.pt DATA_DIR=data/CommonsenseQA # we use the --user-dir option to load the task from # the examples/roberta/commonsense_qa directory: FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/commonsense_qa CUDA_VISIBLE_DEVICES=0 fairseq-train --fp16 --ddp-backend=legacy_ddp \ $DATA_DIR \ --user-dir $FAIRSEQ_USER_DIR \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --task commonsense_qa --init-token 0 --bpe gpt2 \ --arch roberta_large --max-positions 512 \ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --criterion sentence_ranking --num-classes 5 \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $LR \ --warmup-updates $WARMUP_UPDATES --total-num-update $MAX_UPDATES \ --batch-size $MAX_SENTENCES \ --max-update $MAX_UPDATES \ --log-format simple --log-interval 25 \ --seed $SEED ``` The above command assumes training on 1 GPU with 32GB of RAM. For GPUs with less memory, decrease `--batch-size` and increase `--update-freq` accordingly to compensate. 
### 3) Evaluate ```python import json import torch from fairseq.models.roberta import RobertaModel from examples.roberta import commonsense_qa # load the Commonsense QA task roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'data/CommonsenseQA') roberta.eval() # disable dropout roberta.cuda() # use the GPU (optional) nsamples, ncorrect = 0, 0 with open('data/CommonsenseQA/valid.jsonl') as h: for line in h: example = json.loads(line) scores = [] for choice in example['question']['choices']: input = roberta.encode( 'Q: ' + example['question']['stem'], 'A: ' + choice['text'], no_separator=True ) score = roberta.predict('sentence_classification_head', input, return_logits=True) scores.append(score) pred = torch.cat(scores).argmax() answer = ord(example['answerKey']) - ord('A') nsamples += 1 if pred == answer: ncorrect += 1 print('Accuracy: ' + str(ncorrect / float(nsamples))) # Accuracy: 0.7846027846027847 ``` The above snippet is not batched, which makes it quite slow. See [instructions for batched prediction with RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta#batched-prediction). ================================================ FILE: examples/roberta/commonsense_qa/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import commonsense_qa_task # noqa ================================================ FILE: examples/roberta/commonsense_qa/commonsense_qa_task.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@register_task("commonsense_qa")
class CommonsenseQATask(LegacyFairseqTask):
    """Task to finetune RoBERTa for Commonsense QA.

    Each example is expanded into ``--num-classes`` inputs, one per candidate
    answer, of the form ``<s> Q: <question> </s> A: <answer> </s>``. The model
    scores every input with a single-logit classification head (see
    :meth:`build_model`), and the task must be paired with the
    ``sentence_ranking`` criterion (enforced in :meth:`setup_task`).
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
        )
        parser.add_argument(
            "--init-token",
            type=int,
            default=None,
            help="add token at the beginning of each batch item",
        )
        parser.add_argument("--num-classes", type=int, default=5)

    def __init__(self, args, vocab):
        """Store the vocabulary, register ``<mask>`` and build the BPE encoder.

        Args:
            args: parsed command-line arguments for the task
            vocab: dictionary used to binarize the BPE-encoded text
        """
        super().__init__(args)
        self.vocab = vocab
        self.mask = vocab.add_symbol("<mask>")

        self.bpe = encoders.build_bpe(args)

    @classmethod
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        The ``<mask>`` symbol is added to the loaded dictionary.

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol("<mask>")
        return dictionary

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Create the task after checking the criterion and loading ``dict.txt``.

        The dictionary is read from ``<args.data>/dict.txt``.
        """
        assert (
            args.criterion == "sentence_ranking"
        ), "Must set --criterion=sentence_ranking"

        # load data and label dictionaries
        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
        print("| dictionary: {} types".format(len(vocab)))

        return cls(args, vocab)

    def load_dataset(
        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
    ):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            epoch, combine, return_only: accepted but not used by this method
            data_path (str, optional): path of the ``.jsonl`` file to read;
                defaults to ``<args.data>/<split>.jsonl``

        Returns:
            the loaded dataset, which is also stored in ``self.datasets[split]``

        Raises:
            FileNotFoundError: if the ``.jsonl`` file does not exist
        """

        def binarize(s, append_bos=False):
            """BPE-encode (when a BPE is configured) and binarize one string."""
            if self.bpe is not None:
                s = self.bpe.encode(s)
            tokens = self.vocab.encode_line(
                s,
                append_eos=True,
                add_if_not_exist=False,
            ).long()
            if append_bos and self.args.init_token is not None:
                tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
            return tokens

        if data_path is None:
            data_path = os.path.join(self.args.data, split + ".jsonl")
        if not os.path.exists(data_path):
            raise FileNotFoundError("Cannot find data: {}".format(data_path))

        # One list of token tensors / lengths per candidate answer position.
        src_tokens = [[] for i in range(self.args.num_classes)]
        src_lengths = [[] for i in range(self.args.num_classes)]
        labels = []

        # Read explicitly as UTF-8 so decoding does not depend on the
        # platform's default locale encoding.
        with open(data_path, encoding="utf-8") as h:
            for line in h:
                example = json.loads(line.strip())
                # Splits without answers simply produce no labels.
                if "answerKey" in example:
                    label = ord(example["answerKey"]) - ord("A")
                    labels.append(label)
                question = example["question"]["stem"]
                assert len(example["question"]["choices"]) == self.args.num_classes
                # format: `<s> Q: Where would I not want a fox? </s> A: hen house </s>`
                question = "Q: " + question
                question_toks = binarize(question, append_bos=True)
                for i, choice in enumerate(example["question"]["choices"]):
                    src = "A: " + choice["text"]
                    src_bin = torch.cat([question_toks, binarize(src)])
                    src_tokens[i].append(src_bin)
                    src_lengths[i].append(len(src_bin))
        assert all(
            len(src_tokens[0]) == len(src_tokens[i])
            for i in range(self.args.num_classes)
        )
        assert len(src_tokens[0]) == len(src_lengths[0])
        # Either every example is labelled or none is.
        assert len(labels) == 0 or len(labels) == len(src_tokens[0])

        for i in range(self.args.num_classes):
            src_lengths[i] = np.array(src_lengths[i])
            src_tokens[i] = ListDataset(src_tokens[i], src_lengths[i])
            src_lengths[i] = ListDataset(src_lengths[i])

        dataset = {
            "id": IdDataset(),
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens[0], reduce=True),
        }

        # The per-choice inputs are exposed as net_input1 .. net_inputN.
        for i in range(self.args.num_classes):
            dataset.update(
                {
                    "net_input{}".format(i + 1): {
                        "src_tokens": RightPadDataset(
                            src_tokens[i],
                            pad_idx=self.source_dictionary.pad(),
                        ),
                        "src_lengths": src_lengths[i],
                    }
                }
            )

        if len(labels) > 0:
            dataset.update({"target": RawLabelDataset(labels)})

        # An example's size is the element-wise maximum over its choices.
        dataset = NestedDictionaryDataset(
            dataset,
            sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])],
        )

        with data_utils.numpy_seed(self.args.seed):
            dataset = SortDataset(
                dataset,
                # shuffle
                sort_order=[np.random.permutation(len(dataset))],
            )

        print("| Loaded {} with {} samples".format(split, len(dataset)))

        self.datasets[split] = dataset
        return self.datasets[split]

    def build_model(self, args, from_checkpoint=False):
        """Build the model and register a one-logit classification head.

        The head is named ``sentence_classification_head`` and is created
        with ``num_classes=1``.
        """
        from fairseq import models

        model = models.build_model(args, self)

        model.register_classification_head(
            "sentence_classification_head",
            num_classes=1,
        )

        return model

    @property
    def source_dictionary(self):
        """Dictionary used for the model inputs."""
        return self.vocab

    @property
    def target_dictionary(self):
        """Same dictionary as :attr:`source_dictionary`."""
        return self.vocab
reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 320 optimization: clip_norm: 0.0 lr: [1e-05] max_update: 5336 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/mnli.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 3 max_positions: 512 checkpoint: restore_file: ??? reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 7432 optimization: clip_norm: 0.0 lr: [1e-05] max_update: 123873 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/mrpc.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? 
init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 checkpoint: restore_file: ??? reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 137 optimization: clip_norm: 0.0 lr: [1e-05] max_update: 2296 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/qnli.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 checkpoint: restore_file: ??? 
reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 1986 optimization: clip_norm: 0.0 lr: [1e-05] max_update: 33112 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/qqp.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 checkpoint: restore_file: ??? reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 28318 optimization: clip_norm: 0.0 lr: [1e-05] max_update: 113272 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/rte.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? 
init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 checkpoint: restore_file: ??? reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 122 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 2036 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/roberta/config/finetuning/run_config/slurm_1g.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: '_' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/roberta_ft/${env:PREFIX}/${hydra.job.config_name}/${env:SUFFIX} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir}/submitit timeout_min: 1000 cpus_per_task: 8 gpus_per_node: 1 tasks_per_node: 1 mem_gb: 60 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 exclude: learnfair1381,learnfair5192,learnfair2304 ================================================ FILE: examples/roberta/config/finetuning/run_config/slurm_1g_aws.yaml 
================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: '_' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /fsx-wav2vec/${env:USER}/roberta_ft/${env:PREFIX}/${hydra.job.config_name}/${env:SUFFIX} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir}/submitit timeout_min: 1000 cpus_per_task: 8 gpus_per_node: 1 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: learnfair,wav2vec max_num_timeout: 30 ================================================ FILE: examples/roberta/config/finetuning/sst_2.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? init_token: 0 separator_token: 2 num_classes: 2 max_positions: 512 checkpoint: restore_file: ??? reset_optimizer: true reset_dataloader: true reset_meters: true best_checkpoint_metric: accuracy maximize_best_checkpoint_metric: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction dataset: batch_size: 32 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 1256 optimization: clip_norm: 0.0 lr: [1e-05] max_update: 20935 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/finetuning/sts_b.yaml ================================================ # @package _group_ common: fp16: true fp16_init_scale: 4 threshold_loss_scale: 1 fp16_scale_window: 128 log_format: json log_interval: 200 task: _name: sentence_prediction data: ??? 
init_token: 0 separator_token: 2 num_classes: 1 max_positions: 512 checkpoint: restore_file: ??? reset_optimizer: true reset_dataloader: true reset_meters: true no_epoch_checkpoints: true distributed_training: find_unused_parameters: true distributed_world_size: 1 criterion: _name: sentence_prediction regression_target: true dataset: batch_size: 16 required_batch_size_multiple: 1 max_tokens: 4400 optimizer: _name: adam weight_decay: 0.1 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 214 optimization: clip_norm: 0.0 lr: [2e-05] max_update: 3598 max_epoch: 10 model: _name: roberta dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/pretraining/base.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: no_epoch_checkpoints: true task: _name: masked_lm data: ??? sample_break_mode: complete tokens_per_sample: 512 criterion: masked_lm dataset: batch_size: 16 ignore_unused_valid_subsets: true optimizer: _name: adam weight_decay: 0.01 adam_betas: (0.9,0.98) adam_eps: 1e-06 lr_scheduler: _name: polynomial_decay warmup_updates: 10000 optimization: clip_norm: 0 lr: [0.0005] max_update: 125000 update_freq: [16] model: _name: roberta max_positions: 512 dropout: 0.1 attention_dropout: 0.1 ================================================ FILE: examples/roberta/config/pretraining/run_config/local.yaml ================================================ # @package _global_ hydra: sweep: dir: ${env:PWD}/tmp_dbg/${now:%H-%M-%S} distributed_training: distributed_world_size: 1 nprocs_per_node: 1 distributed_port: -1 common: log_interval: 1 dataset: num_workers: 0 ================================================ FILE: examples/roberta/config/pretraining/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' 
item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/roberta/config/pretraining/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.local_cache_path - task.data - task.post_save_script - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir - model.model_path sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec max_num_timeout: 30 ================================================ FILE: examples/roberta/config/pretraining/run_config/slurm_3.yaml 
================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: examples/roberta/config/pretraining/run_config/slurm_4.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 4 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb,ib4 max_num_timeout: 30 ================================================ FILE: 
def main():
    """
    Helper script to encode raw text with the GPT-2 BPE using multiple processes.

    The encoder.json and vocab.bpe files can be obtained here:
    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe

    Inputs are read in lockstep (line ``k`` of every input forms one group).
    A group is written to the matching outputs only when it passes the
    filter; filtered groups are counted and reported on stderr.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--encoder-json",
        help="path to encoder.json",
    )
    parser.add_argument(
        "--vocab-bpe",
        type=str,
        help="path to vocab.bpe",
    )
    parser.add_argument(
        "--inputs",
        nargs="+",
        default=["-"],
        help="input files to filter/encode",
    )
    parser.add_argument(
        "--outputs",
        nargs="+",
        default=["-"],
        help="path to save encoded outputs",
    )
    parser.add_argument(
        "--keep-empty",
        action="store_true",
        help="keep empty lines",
    )
    parser.add_argument("--workers", type=int, default=20)
    args = parser.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    with contextlib.ExitStack() as stack:
        # "-" selects stdin/stdout; real files are closed by the ExitStack.
        inputs = [
            stack.enter_context(open(in_path, "r", encoding="utf-8"))
            if in_path != "-"
            else sys.stdin
            for in_path in args.inputs
        ]
        outputs = [
            stack.enter_context(open(out_path, "w", encoding="utf-8"))
            if out_path != "-"
            else sys.stdout
            for out_path in args.outputs
        ]

        encoder = MultiprocessingEncoder(args)
        stats = Counter()
        # Manage the pool with a context manager so the worker processes are
        # always shut down, including when encoding raises part-way through.
        with Pool(args.workers, initializer=encoder.initializer) as pool:
            encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)

            for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
                if filt == "PASS":
                    for enc_line, output_h in zip(enc_lines, outputs):
                        print(enc_line, file=output_h)
                else:
                    stats["num_filtered_" + filt] += 1
                if i % 10000 == 0:
                    print("processed {} lines".format(i), file=sys.stderr)

        for k, v in stats.most_common():
            print("[{}] filtered {} lines".format(k, v), file=sys.stderr)


class MultiprocessingEncoder(object):
    """GPT-2 BPE encoder whose state lives in a per-process global.

    ``initializer`` is meant to be used as a ``multiprocessing.Pool``
    initializer: it builds the BPE once per worker and stores it in the
    module-level ``bpe`` global, which ``encode`` and ``decode`` then use.
    """

    def __init__(self, args):
        # args must provide encoder_json, vocab_bpe and keep_empty.
        self.args = args

    def initializer(self):
        """Create the process-wide BPE encoder from the configured files."""
        global bpe
        bpe = get_encoder(self.args.encoder_json, self.args.vocab_bpe)

    def encode(self, line):
        """Return the BPE ids of ``line`` as a list of strings."""
        global bpe
        ids = bpe.encode(line)
        return list(map(str, ids))

    def decode(self, tokens):
        """Return the text decoded from an iterable of BPE ids."""
        global bpe
        return bpe.decode(tokens)

    def encode_lines(self, lines):
        """
        Encode a set of lines. All lines will be encoded together.

        Returns ``["EMPTY", None]`` as soon as an empty (after stripping)
        line is seen and ``--keep-empty`` is not set; otherwise returns
        ``["PASS", encoded]`` with one space-joined id string per line.
        """
        enc_lines = []
        for line in lines:
            line = line.strip()
            if len(line) == 0 and not self.args.keep_empty:
                return ["EMPTY", None]
            tokens = self.encode(line)
            enc_lines.append(" ".join(tokens))
        return ["PASS", enc_lines]

    def decode_lines(self, lines):
        """Decode lines of space-separated ids; returns ``["PASS", decoded]``."""
        dec_lines = []
        for line in lines:
            tokens = map(int, line.strip().split())
            dec_lines.append(self.decode(tokens))
        return ["PASS", dec_lines]


if __name__ == "__main__":
    main()
elif [ "$TASK" = "QNLI" ] then INPUT_COLUMNS=( 2 3 ) TEST_INPUT_COLUMNS=( 2 3 ) LABEL_COLUMN=4 elif [ "$TASK" = "MRPC" ] then INPUT_COLUMNS=( 4 5 ) TEST_INPUT_COLUMNS=( 4 5 ) LABEL_COLUMN=1 elif [ "$TASK" = "RTE" ] then INPUT_COLUMNS=( 2 3 ) TEST_INPUT_COLUMNS=( 2 3 ) LABEL_COLUMN=4 elif [ "$TASK" = "STS-B" ] then INPUT_COLUMNS=( 8 9 ) TEST_INPUT_COLUMNS=( 8 9 ) LABEL_COLUMN=10 # Following are single sentence tasks. elif [ "$TASK" = "SST-2" ] then INPUT_COLUMNS=( 1 ) TEST_INPUT_COLUMNS=( 2 ) LABEL_COLUMN=2 INPUT_COUNT=1 elif [ "$TASK" = "CoLA" ] then INPUT_COLUMNS=( 4 ) TEST_INPUT_COLUMNS=( 2 ) LABEL_COLUMN=2 INPUT_COUNT=1 fi # Strip out header and filter lines that don't have expected number of fields. rm -rf "$TASK_DATA_FOLDER/processed" mkdir -p "$TASK_DATA_FOLDER/processed" for SPLIT in $SPLITS do # CoLA train and dev doesn't have header. if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]] then cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; else tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; fi # Remove unformatted lines from train and dev files for QQP dataset. 
if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]] then awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; else cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv"; fi rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp"; done # Split into input0, input1 and label for SPLIT in $SPLITS do for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) do if [[ "$SPLIT" != test* ]] then COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]} else COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]} fi cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE"; done if [[ "$SPLIT" != test* ]] then if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ] then cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; else cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label"; fi fi # BPE encode. for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) do LANG="input$INPUT_TYPE" echo "BPE encoding $SPLIT/$LANG" python -m examples.roberta.multiprocessing_bpe_encoder \ --encoder-json encoder.json \ --vocab-bpe vocab.bpe \ --inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$LANG" \ --outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$LANG" \ --workers 60 \ --keep-empty; done done # Remove output directory. 
rm -rf "$TASK-bin" DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG" TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG" if [ "$TASK" = "MNLI" ] then DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG" TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG" fi # Run fairseq preprocessing: for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1))) do LANG="input$INPUT_TYPE" fairseq-preprocess \ --only-source \ --trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \ --validpref "${DEVPREF//LANG/$LANG}" \ --testpref "${TESTPREF//LANG/$LANG}" \ --destdir "$TASK-bin/$LANG" \ --workers 60 \ --srcdict dict.txt; done if [[ "$TASK" != "STS-B" ]] then fairseq-preprocess \ --only-source \ --trainpref "$TASK_DATA_FOLDER/processed/train.label" \ --validpref "${DEVPREF//LANG/label}" \ --destdir "$TASK-bin/label" \ --workers 60; else # For STS-B output range is converted to be between: [0.0, 1.0] mkdir -p "$TASK-bin/label" awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label" awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label" fi done ================================================ FILE: examples/roberta/preprocess_RACE.py ================================================ #!/usr/bin/env python # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. 
class InputExample:
    """One RACE question.

    Stores the three values exactly as given: ``paragraph`` (the article
    text), ``qa_list`` (one question+option string per answer option) and
    ``label`` (index of the correct option).
    """

    def __init__(self, paragraph, qa_list, label):
        self.paragraph = paragraph
        self.qa_list = qa_list
        self.label = label


def get_examples(data_dir, set_type):
    """
    Extract paragraph and question-answer list from each json file

    Args:
        data_dir: root directory containing ``<split>/<level>/`` folders.
        set_type: either ``"<split>"`` (both "middle" and "high" levels are
            read) or ``"<split>-<level>"`` (only that level is read).

    Returns:
        A list of :class:`InputExample`, one per question.
    """
    examples = []

    levels = ["middle", "high"]
    set_type_c = set_type.split("-")
    if len(set_type_c) == 2:
        levels = [set_type_c[1]]
        set_type = set_type_c[0]

    for level in levels:
        cur_dir = os.path.join(data_dir, set_type, level)
        for filename in os.listdir(cur_dir):
            cur_path = os.path.join(cur_dir, filename)
            # JSON is UTF-8; do not depend on the platform default encoding.
            with open(cur_path, "r", encoding="utf-8") as f:
                cur_data = json.load(f)

            answers = cur_data["answers"]
            options = cur_data["options"]
            questions = cur_data["questions"]
            # Collapse newlines and runs of whitespace so that every article
            # fits on a single output line.
            context = cur_data["article"].replace("\n", " ")
            context = re.sub(r"\s+", " ", context)

            for i, answer in enumerate(answers):
                label = ord(answer) - ord("A")
                question = questions[i]
                qa_list = []
                for j in range(4):
                    option = options[i][j]
                    if "_" in question:
                        # cloze-style question: fill the blank(s)
                        qa_cat = question.replace("_", option)
                    else:
                        qa_cat = " ".join([question, option])
                    qa_cat = re.sub(r"\s+", " ", qa_cat)
                    qa_list.append(qa_cat)
                examples.append(InputExample(context, qa_list, label))

    return examples


def main():
    """
    Helper script to extract paragraphs questions and answers from RACE datasets.

    For every split it writes ``<split>.input0`` (paragraphs),
    ``<split>.input1`` .. ``<split>.input4`` (one file per answer option)
    and ``<split>.label`` into ``--output-dir``.
    """
    # Local import: only this entry point needs it.
    import contextlib

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-dir",
        help="input directory for downloaded RACE dataset",
    )
    parser.add_argument(
        "--output-dir",
        help="output directory for extracted data",
    )
    args = parser.parse_args()

    os.makedirs(args.output_dir, exist_ok=True)

    for set_type in ["train", "dev", "test-middle", "test-high"]:
        examples = get_examples(args.input_dir, set_type)
        out_prefix = os.path.join(args.output_dir, set_type)

        # ExitStack closes every output file even if a write fails midway.
        with contextlib.ExitStack() as stack:

            def open_output(suffix):
                return stack.enter_context(
                    open(out_prefix + suffix, "w", encoding="utf-8")
                )

            qa_files = [open_output(".input" + str(i + 1)) for i in range(4)]
            outf_context = open_output(".input0")
            outf_label = open_output(".label")

            for example in examples:
                outf_context.write(example.paragraph + "\n")
                for qa_file, qa_text in zip(qa_files, example.qa_list):
                    qa_file.write(qa_text + "\n")
                outf_label.write(str(example.label) + "\n")
# data should be downloaded and processed with preprocess_RACE.py
if [[ $# -ne 2 ]]; then
  echo "Run as following:"
  echo "./examples/roberta/preprocess_RACE.sh <race_data_folder> <output_folder>"
  exit 1
fi

RACE_DATA_FOLDER=$1
OUT_DATA_FOLDER=$2

# download bpe encoder.json, vocabulary and fairseq dictionary
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'

SPLITS="train dev test-middle test-high"
INPUT_TYPES="input0 input1 input2 input3 input4"

# BPE-encode every (input type, split) pair; output goes next to the input
# with a ".bpe" suffix.
for INPUT_TYPE in $INPUT_TYPES
do
  for SPLIT in $SPLITS
  do
    echo "BPE encoding $SPLIT/$INPUT_TYPE"
    python -m examples.roberta.multiprocessing_bpe_encoder \
      --encoder-json encoder.json \
      --vocab-bpe vocab.bpe \
      --inputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE" \
      --outputs "$RACE_DATA_FOLDER/$SPLIT.$INPUT_TYPE.bpe" \
      --workers 10 \
      --keep-empty;
  done
done

# Binarize each input type into its own sub-directory of the output folder.
# NOTE: no LANG assignment here. It was never read, and LANG is the locale
# variable inherited by child processes, so overwriting it with
# "inputinputN" handed fairseq-preprocess an invalid locale.
for INPUT_TYPE in $INPUT_TYPES
do
  fairseq-preprocess \
    --only-source \
    --trainpref "$RACE_DATA_FOLDER/train.$INPUT_TYPE.bpe" \
    --validpref "$RACE_DATA_FOLDER/dev.$INPUT_TYPE.bpe" \
    --testpref "$RACE_DATA_FOLDER/test-middle.$INPUT_TYPE.bpe,$RACE_DATA_FOLDER/test-high.$INPUT_TYPE.bpe" \
    --destdir "$OUT_DATA_FOLDER/$INPUT_TYPE" \
    --workers 10 \
    --srcdict dict.txt;
done

# Labels are copied as plain text; test-middle/test-high become test/test1,
# matching the order given to --testpref above.
rm -rf "$OUT_DATA_FOLDER/label"
mkdir -p "$OUT_DATA_FOLDER/label"
cp "$RACE_DATA_FOLDER/train.label" "$OUT_DATA_FOLDER/label/"
cp "$RACE_DATA_FOLDER/dev.label" "$OUT_DATA_FOLDER/label/valid.label"
cp "$RACE_DATA_FOLDER/test-middle.label" "$OUT_DATA_FOLDER/label/test.label"
cp "$RACE_DATA_FOLDER/test-high.label" "$OUT_DATA_FOLDER/label/test1.label"
[SuperGLUE](https://super.gluebenchmark.com/). Note that there is high variance in the results. For our GLUE/SuperGLUE submission we swept over the learning rate (1e-5, 2e-5, 3e-5), batch size (16, 32, 64) and total number of updates (500, 1000, 2000, 3000), as well as the random seed. Out of ~100 runs we chose the best 7 models and ensembled them. **Approach:** The instructions below use a slightly different loss function than what's described in the original RoBERTa arXiv paper. In particular, [Kocijan et al. (2019)](https://arxiv.org/abs/1905.06290) introduce a margin ranking loss between `(query, candidate)` pairs with tunable hyperparameters alpha and beta. This is supported in our code as well with the `--wsc-margin-alpha` and `--wsc-margin-beta` arguments. However, we achieved slightly better (and more robust) results on the development set by instead using a single cross entropy loss term over the log-probabilities for the query and all mined candidates. **The candidates are mined using spaCy from each input sentence in isolation, so the approach remains strictly pointwise.** This reduces the number of hyperparameters and our best model achieved 92.3% development set accuracy, compared to ~90% accuracy for the margin loss. Later versions of the RoBERTa arXiv paper will describe this updated formulation. ### 1) Download the WSC data from the SuperGLUE website: ```bash wget https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip unzip WSC.zip # we also need to copy the RoBERTa dictionary into the same directory wget -O WSC/dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt ``` ### 2) Finetune over the provided training data: ```bash TOTAL_NUM_UPDATES=2000 # Total number of training steps. WARMUP_UPDATES=250 # Linearly increase LR over this many steps. LR=2e-05 # Peak LR for polynomial LR scheduler. MAX_SENTENCES=16 # Batch size per GPU. SEED=1 # Random seed.
ROBERTA_PATH=/path/to/roberta/model.pt # we use the --user-dir option to load the task and criterion # from the examples/roberta/wsc directory: FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train WSC/ \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task wsc --criterion wsc --wsc-cross-entropy \ --arch roberta_large --bpe gpt2 --max-positions 512 \ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ --lr-scheduler polynomial_decay --lr $LR \ --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ --batch-size $MAX_SENTENCES \ --max-update $TOTAL_NUM_UPDATES \ --log-format simple --log-interval 100 \ --seed $SEED ``` The above command assumes training on 4 GPUs, but you can achieve the same results on a single GPU by adding `--update-freq=4`. ### 3) Evaluate ```python from fairseq.models.roberta import RobertaModel from examples.roberta.wsc import wsc_utils # also loads WSC task and criterion roberta = RobertaModel.from_pretrained('checkpoints', 'checkpoint_best.pt', 'WSC/') roberta.cuda() nsamples, ncorrect = 0, 0 for sentence, label in wsc_utils.jsonl_iterator('WSC/val.jsonl', eval=True): pred = roberta.disambiguate_pronoun(sentence) nsamples += 1 if pred == label: ncorrect += 1 print('Accuracy: ' + str(ncorrect / float(nsamples))) # Accuracy: 0.9230769230769231 ``` ## RoBERTa training on WinoGrande dataset We have also provided `winogrande` task and criterion for finetuning on the [WinoGrande](https://mosaic.allenai.org/projects/winogrande) like datasets where there are always two candidates and one is correct. 
It is a more efficient implementation for such cases. ```bash TOTAL_NUM_UPDATES=23750 # Total number of training steps. WARMUP_UPDATES=2375 # Linearly increase LR over this many steps. LR=1e-05 # Peak LR for polynomial LR scheduler. MAX_SENTENCES=32 # Batch size per GPU. SEED=1 # Random seed. ROBERTA_PATH=/path/to/roberta/model.pt # we use the --user-dir option to load the task and criterion # from the examples/roberta/wsc directory: FAIRSEQ_PATH=/path/to/fairseq FAIRSEQ_USER_DIR=${FAIRSEQ_PATH}/examples/roberta/wsc cd fairseq CUDA_VISIBLE_DEVICES=0 fairseq-train winogrande_1.0/ \ --restore-file $ROBERTA_PATH \ --reset-optimizer --reset-dataloader --reset-meters \ --no-epoch-checkpoints --no-last-checkpoints --no-save-optimizer-state \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --valid-subset val \ --fp16 --ddp-backend legacy_ddp \ --user-dir $FAIRSEQ_USER_DIR \ --task winogrande --criterion winogrande \ --wsc-margin-alpha 5.0 --wsc-margin-beta 0.4 \ --arch roberta_large --bpe gpt2 --max-positions 512 \ --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 \ --optimizer adam --adam-betas '(0.9, 0.98)' --adam-eps 1e-06 \ --lr-scheduler polynomial_decay --lr $LR \ --warmup-updates $WARMUP_UPDATES --total-num-update $TOTAL_NUM_UPDATES \ --batch-size $MAX_SENTENCES \ --max-update $TOTAL_NUM_UPDATES \ --log-format simple --log-interval 100 ``` ================================================ FILE: examples/roberta/wsc/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import wsc_criterion # noqa from . import wsc_task # noqa ================================================ FILE: examples/roberta/wsc/wsc_criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates.
@register_criterion("wsc")
class WSCCriterion(LegacyFairseqCriterion):
    """Criterion for the ``wsc`` task.

    Each instance is scored by masking a span (the query or a candidate) in
    the input and averaging the model's log-probabilities of the original
    tokens over the masked positions. A prediction is positive when the
    query scores at least as high as every candidate. The loss is either a
    margin loss or, with ``--wsc-cross-entropy``, a cross entropy over the
    query and candidate scores.
    """

    def __init__(self, args, task):
        super().__init__(args, task)
        if self.args.save_predictions is not None:
            self.prediction_h = open(self.args.save_predictions, "w")
        else:
            self.prediction_h = None
        self.bpe = encoders.build_bpe(args.bpe)
        self.tokenizer = encoders.build_tokenizer(args.tokenizer)

    def __del__(self):
        # getattr: __init__ may have raised before prediction_h was assigned,
        # in which case the attribute does not exist on this instance.
        prediction_h = getattr(self, "prediction_h", None)
        if prediction_h is not None:
            prediction_h.close()

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        parser.add_argument("--wsc-margin-alpha", type=float, metavar="A", default=1.0)
        parser.add_argument("--wsc-margin-beta", type=float, metavar="B", default=0.0)
        parser.add_argument(
            "--wsc-cross-entropy",
            action="store_true",
            help="use cross entropy formulation instead of margin loss",
        )
        parser.add_argument(
            "--save-predictions", metavar="FILE", help="file to save predictions to"
        )

    def get_masked_input(self, tokens, mask):
        """Return a copy of ``tokens`` with the positions selected by ``mask``
        replaced by the task's mask symbol."""
        masked_tokens = tokens.clone()
        masked_tokens[mask] = self.task.mask
        return masked_tokens

    def get_lprobs(self, model, tokens, mask):
        """Score each row of ``tokens``: the mean, over the masked positions,
        of the log-probability the model assigns to the original token."""
        logits, _ = model(src_tokens=self.get_masked_input(tokens, mask))
        lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
        # pick the log-probability of the original token at every position
        scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
        mask = mask.type_as(scores)
        # average over masked positions only
        scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
        return scores

    def get_loss(self, query_lprobs, cand_lprobs):
        """Return the training loss for one query against its candidates."""
        if self.args.wsc_cross_entropy:
            # scores act as logits; the query sits at index 0 and is the target
            return F.cross_entropy(
                torch.cat([query_lprobs, cand_lprobs]).unsqueeze(0),
                query_lprobs.new([0]).long(),
            )
        else:
            # -query + alpha * max(0, cand - query + beta), summed over candidates
            return (
                -query_lprobs
                + self.args.wsc_margin_alpha
                * (cand_lprobs - query_lprobs + self.args.wsc_margin_beta).clamp(min=0)
            ).sum()

    def forward(self, model, sample, reduce=True):
        """Compute the loss and accuracy counts for ``sample``.

        Returns a tuple ``(loss, sample_size, logging_output)``.
        """
        # compute loss and accuracy
        loss, nloss = 0.0, 0
        ncorrect, nqueries = 0, 0
        for i, label in enumerate(sample["labels"]):
            query_lprobs = self.get_lprobs(
                model,
                sample["query_tokens"][i].unsqueeze(0),
                sample["query_masks"][i].unsqueeze(0),
            )
            cand_lprobs = self.get_lprobs(
                model,
                sample["candidate_tokens"][i],
                sample["candidate_masks"][i],
            )

            # positive iff the query scores at least as high as every candidate
            pred = (query_lprobs >= cand_lprobs).all().item()

            if label is not None:
                label = 1 if label else 0
                ncorrect += 1 if pred == label else 0
                nqueries += 1

                if label:
                    # only compute a loss for positive instances
                    nloss += 1
                    loss += self.get_loss(query_lprobs, cand_lprobs)

            sample_id = sample["id"][i].item()
            if self.prediction_h is not None:
                print(
                    "{}\t{}\t{}".format(sample_id, pred, label), file=self.prediction_h
                )

        if nloss == 0:
            # no positive instance in this batch: loss is still the float 0.0,
            # so return a tensor the caller can treat like any other loss
            loss = torch.tensor(0.0, requires_grad=True)

        sample_size = nqueries if nqueries > 0 else 1
        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["nsentences"],
            "sample_size": sample_size,
            "ncorrect": ncorrect,
            "nqueries": nqueries,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

        agg_output = {
            # per-sample loss, divided by log(2); guard against an empty
            # aggregate instead of raising ZeroDivisionError
            "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size,
        }

        ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
        nqueries = sum(log.get("nqueries", 0) for log in logging_outputs)
        if nqueries > 0:
            agg_output["accuracy"] = ncorrect / float(nqueries)

        return agg_output
WinograndeCriterion(WSCCriterion): def forward(self, model, sample, reduce=True): # compute loss and accuracy query_lprobs = self.get_lprobs( model, sample["query_tokens"], sample["query_masks"], ) cand_lprobs = self.get_lprobs( model, sample["candidate_tokens"], sample["candidate_masks"], ) pred = query_lprobs >= cand_lprobs loss = self.get_loss(query_lprobs, cand_lprobs) sample_size = sample["query_tokens"].size(0) ncorrect = pred.sum().item() logging_output = { "loss": utils.item(loss.data) if reduce else loss.data, "ntokens": sample["ntokens"], "nsentences": sample["nsentences"], "sample_size": sample_size, "ncorrect": ncorrect, "nqueries": sample_size, } return loss, sample_size, logging_output ================================================ FILE: examples/roberta/wsc/wsc_task.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import json import os import tempfile import numpy as np import torch import torch.nn.functional as F from fairseq import utils from fairseq.data import ( Dictionary, IdDataset, ListDataset, NestedDictionaryDataset, NumelDataset, NumSamplesDataset, PadDataset, SortDataset, data_utils, encoders, ) from fairseq.tasks import LegacyFairseqTask, register_task from . 
@register_task("wsc")
class WSCTask(LegacyFairseqTask):
    """Task to finetune RoBERTa for Winograd Schemas.

    Examples are read from ``<split>.jsonl`` through
    ``wsc_utils.jsonl_iterator``. For every example the pronoun is replaced
    in turn by the query and by each mined candidate noun phrase; each
    variant is binarized together with a boolean mask marking the tokens of
    the substituted span.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "data", metavar="DIR", help="path to data directory; we load <split>.jsonl"
        )
        parser.add_argument(
            "--init-token",
            type=int,
            default=None,
            help="add token at the beginning of each batch item",
        )

    def __init__(self, args, vocab):
        """Store the vocabulary, register ``<mask>`` and build the BPE and
        tokenizer objects from ``args``."""
        super().__init__(args)
        self.vocab = vocab
        # index of the mask symbol; used to overwrite the scored span
        self.mask = vocab.add_symbol("<mask>")

        self.bpe = encoders.build_bpe(args)
        self.tokenizer = encoders.build_tokenizer(args)

        # hack to handle GPT-2 BPE, which includes leading spaces
        if args.bpe == "gpt2":
            self.leading_space = True
            self.trailing_space = False
        else:
            self.leading_space = False
            self.trailing_space = True

    @classmethod
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        The ``<mask>`` symbol is added to the loaded dictionary.

        Args:
            filename (str): the filename
        """
        dictionary = Dictionary.load(filename)
        dictionary.add_symbol("<mask>")
        return dictionary

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Build the task from ``args``; requires ``--criterion=wsc`` and a
        ``dict.txt`` inside ``args.data``."""
        assert args.criterion == "wsc", "Must set --criterion=wsc"

        # load data and label dictionaries
        vocab = cls.load_dictionary(os.path.join(args.data, "dict.txt"))
        print("| dictionary: {} types".format(len(vocab)))

        return cls(args, vocab)

    def binarize(self, s: str, append_eos: bool = False):
        """Convert the string ``s`` to a tensor of vocabulary indices.

        Applies the tokenizer and the BPE when they are set, encodes with the
        vocabulary without adding unknown symbols, and prepends
        ``args.init_token`` when it is given.
        """
        if self.tokenizer is not None:
            s = self.tokenizer.encode(s)
        if self.bpe is not None:
            s = self.bpe.encode(s)
        tokens = self.vocab.encode_line(
            s,
            append_eos=append_eos,
            add_if_not_exist=False,
        ).long()
        if self.args.init_token is not None:
            tokens = torch.cat([tokens.new([self.args.init_token]), tokens])
        return tokens

    def binarize_with_mask(self, txt, prefix, suffix, leading_space, trailing_space):
        """Binarize ``prefix + txt + suffix`` and mark the tokens of ``txt``.

        Returns:
            ``(toks, mask)`` where ``mask`` is a boolean tensor shaped like
            ``toks`` that is true on the span attributed to ``txt``.
        """
        toks = self.binarize(
            prefix + leading_space + txt + trailing_space + suffix,
            append_eos=True,
        )
        mask = torch.zeros_like(toks, dtype=torch.bool)
        # The span position is derived by binarizing the pieces separately:
        # it starts after the tokens of the prefix and is as long as the
        # tokens of (leading space + txt).
        # NOTE(review): binarize() prepends init_token on every call, so with
        # --init-token set mask_size appears to count one token too many;
        # confirm before relying on it.
        mask_start = len(self.binarize(prefix))
        mask_size = len(self.binarize(leading_space + txt))
        mask[mask_start : mask_start + mask_size] = 1
        return toks, mask

    def load_dataset(
        self, split, epoch=1, combine=False, data_path=None, return_only=False, **kwargs
    ):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            data_path (str, optional): jsonl file to read; defaults to
                ``<args.data>/<split>.jsonl``
            return_only (bool): when true, return the dataset without
                storing it in ``self.datasets``

        ``epoch``, ``combine`` and ``**kwargs`` are accepted but not used here.

        Raises:
            FileNotFoundError: if the jsonl file does not exist.
        """
        if data_path is None:
            data_path = os.path.join(self.args.data, split + ".jsonl")
        if not os.path.exists(data_path):
            raise FileNotFoundError("Cannot find data: {}".format(data_path))

        query_tokens = []
        query_masks = []
        query_lengths = []
        candidate_tokens = []
        candidate_masks = []
        candidate_lengths = []
        labels = []

        for sentence, pronoun_span, query, label in wsc_utils.jsonl_iterator(data_path):
            # text on either side of the pronoun; the pronoun itself is dropped
            # and replaced by the query / each candidate below
            prefix = sentence[: pronoun_span.start].text
            suffix = sentence[pronoun_span.end :].text_with_ws

            # spaCy spans include trailing spaces, but we need to know about
            # leading spaces for the GPT-2 BPE
            leading_space = (
                " " if sentence[: pronoun_span.start].text_with_ws.endswith(" ") else ""
            )
            trailing_space = " " if pronoun_span.text_with_ws.endswith(" ") else ""

            # get noun phrases, excluding pronouns and anything overlapping with the query
            cand_spans = wsc_utils.filter_noun_chunks(
                wsc_utils.extended_noun_chunks(sentence),
                exclude_pronouns=True,
                exclude_query=query,
                exact_match=False,
            )

            if query is not None:
                query_toks, query_mask = self.binarize_with_mask(
                    query, prefix, suffix, leading_space, trailing_space
                )
                query_len = len(query_toks)
            else:
                # no query given (inference without a marked query)
                query_toks, query_mask, query_len = None, None, 0

            query_tokens.append(query_toks)
            query_masks.append(query_mask)
            query_lengths.append(query_len)

            cand_toks, cand_masks = [], []
            for cand_span in cand_spans:
                toks, mask = self.binarize_with_mask(
                    cand_span.text,
                    prefix,
                    suffix,
                    leading_space,
                    trailing_space,
                )
                cand_toks.append(toks)
                cand_masks.append(mask)

            # collate candidates
            cand_toks = data_utils.collate_tokens(cand_toks, pad_idx=self.vocab.pad())
            cand_masks = data_utils.collate_tokens(cand_masks, pad_idx=0)
            assert cand_toks.size() == cand_masks.size()

            candidate_tokens.append(cand_toks)
            candidate_masks.append(cand_masks)
            candidate_lengths.append(cand_toks.size(1))

            labels.append(label)

        query_lengths = np.array(query_lengths)
        query_tokens = ListDataset(query_tokens, query_lengths)
        query_masks = ListDataset(query_masks, query_lengths)

        candidate_lengths = np.array(candidate_lengths)
        candidate_tokens = ListDataset(candidate_tokens, candidate_lengths)
        candidate_masks = ListDataset(candidate_masks, candidate_lengths)

        labels = ListDataset(labels, [1] * len(labels))

        dataset = {
            "id": IdDataset(),
            "query_tokens": query_tokens,
            "query_masks": query_masks,
            "candidate_tokens": candidate_tokens,
            "candidate_masks": candidate_masks,
            "labels": labels,
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(query_tokens, reduce=True),
        }

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[query_lengths],
        )

        # random permutation drawn under args.seed, used as the sort order
        with data_utils.numpy_seed(self.args.seed):
            shuffle = np.random.permutation(len(query_tokens))
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

        if return_only:
            return dataset

        self.datasets[split] = dataset
        return self.datasets[split]

    def build_dataset_for_inference(self, sample_json):
        """Build a dataset from a single json sample by writing it to a
        temporary file and loading that file with :meth:`load_dataset`."""
        # buffering=0 so the bytes are on disk when load_dataset reopens the
        # file by name
        with tempfile.NamedTemporaryFile(buffering=0) as h:
            h.write((json.dumps(sample_json) + "\n").encode("utf-8"))
            dataset = self.load_dataset(
                "disambiguate_pronoun",
                data_path=h.name,
                return_only=True,
            )
        return dataset

    def disambiguate_pronoun(self, model, sentence, use_cuda=False):
        """Resolve the bracketed pronoun in ``sentence`` with ``model``.

        ``sentence`` marks the pronoun with ``[...]`` and optionally a query
        with ``_..._`` (see ``wsc_utils.convert_sentence_to_json``).

        Returns:
            With a query: ``True`` when the query scores at least as high as
            every mined candidate, else ``False``. Without a query: the
            decoded text of the best-scoring candidate.
        """
        sample_json = wsc_utils.convert_sentence_to_json(sentence)
        dataset = self.build_dataset_for_inference(sample_json)
        sample = dataset.collater([dataset[0]])
        if use_cuda:
            sample = utils.move_to_cuda(sample)

        def get_masked_input(tokens, mask):
            # copy of tokens with the scored span replaced by <mask>
            masked_tokens = tokens.clone()
            masked_tokens[mask.bool()] = self.mask
            return masked_tokens

        def get_lprobs(tokens, mask):
            # mean log-probability of the original tokens over masked positions
            logits, _ = model(src_tokens=get_masked_input(tokens, mask))
            lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float)
            scores = lprobs.gather(2, tokens.unsqueeze(-1)).squeeze(-1)
            mask = mask.type_as(scores)
            scores = (scores * mask).sum(dim=-1) / mask.sum(dim=-1)
            return scores

        cand_lprobs = get_lprobs(
            sample["candidate_tokens"][0],
            sample["candidate_masks"][0],
        )
        if sample["query_tokens"][0] is not None:
            query_lprobs = get_lprobs(
                sample["query_tokens"][0].unsqueeze(0),
                sample["query_masks"][0].unsqueeze(0),
            )
            return (query_lprobs >= cand_lprobs).all().item() == 1
        else:
            # no query: return the text of the highest-scoring candidate
            best_idx = cand_lprobs.argmax().item()
            full_cand = sample["candidate_tokens"][0][best_idx]
            mask = sample["candidate_masks"][0][best_idx]
            toks = full_cand[mask.bool()]
            return self.bpe.decode(self.source_dictionary.string(toks)).strip()

    @property
    def source_dictionary(self):
        """The task vocabulary."""
        return self.vocab

    @property
    def target_dictionary(self):
        """The task vocabulary (same object as the source dictionary)."""
        return self.vocab
Args: split (str): name of the split (e.g., train, valid, test) """ if data_path is None: data_path = os.path.join(self.args.data, split + ".jsonl") if not os.path.exists(data_path): raise FileNotFoundError("Cannot find data: {}".format(data_path)) query_tokens = [] query_masks = [] query_lengths = [] candidate_tokens = [] candidate_masks = [] candidate_lengths = [] itr = wsc_utils.winogrande_jsonl_iterator(data_path, eval=(split == "test")) for sample in itr: sentence, pronoun_span, query, cand_text = sample prefix = sentence[: pronoun_span[0]].rstrip() suffix = sentence[pronoun_span[1] :] leading_space = " " if sentence[: pronoun_span[0]].endswith(" ") else "" trailing_space = "" if query is not None: query_toks, query_mask = self.binarize_with_mask( query, prefix, suffix, leading_space, trailing_space, ) query_len = len(query_toks) else: query_toks, query_mask, query_len = None, None, 0 query_tokens.append(query_toks) query_masks.append(query_mask) query_lengths.append(query_len) cand_toks, cand_mask = self.binarize_with_mask( cand_text, prefix, suffix, leading_space, trailing_space, ) candidate_tokens.append(cand_toks) candidate_masks.append(cand_mask) candidate_lengths.append(cand_toks.size(0)) query_lengths = np.array(query_lengths) def get_pad_dataset_fn(tokens, length, pad_idx): return PadDataset( ListDataset(tokens, length), pad_idx=pad_idx, left_pad=False, ) query_tokens = get_pad_dataset_fn(query_tokens, query_lengths, self.vocab.pad()) query_masks = get_pad_dataset_fn(query_masks, query_lengths, 0) candidate_lengths = np.array(candidate_lengths) candidate_tokens = get_pad_dataset_fn( candidate_tokens, candidate_lengths, self.vocab.pad() ) candidate_masks = get_pad_dataset_fn(candidate_masks, candidate_lengths, 0) dataset = { "id": IdDataset(), "query_tokens": query_tokens, "query_masks": query_masks, "candidate_tokens": candidate_tokens, "candidate_masks": candidate_masks, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(query_tokens, 
def convert_sentence_to_json(sentence):
    """Convert a marked-up sentence into a WSC-style json sample.

    The pronoun must be wrapped in ``[...]``; an optional query may be
    wrapped in ``_..._``. Example::

        "The trophy did not fit in _the suitcase_ because [it] was too small."

    Returns:
        A dict with ``idx``, the plain ``text`` (markers removed) and a
        ``target`` dict holding the text and space-separated word index of
        the query (``span1_*``, ``None`` without a query) and of the pronoun
        (``span2_*``).
    """

    def word_index(prefix):
        # Number of space-separated words before the marker. An empty prefix
        # must give 0: "".split(" ") is [""], which would otherwise count as
        # one word and misalign a marker at the start of the sentence.
        stripped = prefix.rstrip()
        return len(stripped.split(" ")) if stripped else 0

    if "_" in sentence:
        prefix, rest = sentence.split("_", 1)
        query, rest = rest.split("_", 1)
        query_index = word_index(prefix)
    else:
        query, query_index = None, None

    prefix, rest = sentence.split("[", 1)
    pronoun, rest = rest.split("]", 1)
    pronoun_index = word_index(prefix)

    sentence = sentence.replace("_", "").replace("[", "").replace("]", "")

    return {
        "idx": 0,
        "text": sentence,
        "target": {
            "span1_index": query_index,
            "span1_text": query,
            "span2_index": pronoun_index,
            "span2_text": pronoun,
        },
    }
def jsonl_iterator(input_fname, positive_only=False, ngram_order=3, eval=False):
    """Iterate over the WSC samples stored one json object per line.

    Each sample's text is split on spaces, detokenized around the pronoun
    and parsed with spaCy; the pronoun is then located as a span in the
    parsed sentence.

    Args:
        input_fname: path of the jsonl file.
        positive_only: skip samples that carry a false ``label``.
        ngram_order: not used by this function.
        eval: selects the output format (see Yields).

    Yields:
        ``eval=False``: ``(sentence, pronoun_span, query, label)`` where
        ``sentence`` is the parsed sentence and ``label`` is ``None`` when
        the sample has none.
        ``eval=True``: ``(text, label)`` where ``text`` is a string with the
        pronoun wrapped in ``[]`` and the query wrapped in ``_``.

    Raises:
        Exception: if the pronoun is found neither at ``span2_index`` nor at
            the following token.
    """
    detok = get_detokenizer()
    nlp = get_spacy_nlp()

    with open(input_fname) as fin:
        for line in fin:
            sample = json.loads(line.strip())

            if positive_only and "label" in sample and not sample["label"]:
                # only consider examples where the query is correct
                continue

            target = sample["target"]

            # clean up the query
            query = target["span1_text"]
            if query is not None:
                if "\n" in query:
                    # samples whose query contains a newline are skipped
                    continue
                if query.endswith(".") or query.endswith(","):
                    query = query[:-1]

            # split tokens
            tokens = sample["text"].split(" ")

            def strip_pronoun(x):
                # drop trailing punctuation/quotes before comparing
                return x.rstrip('.,"')

            # find the pronoun
            pronoun_idx = target["span2_index"]
            pronoun = strip_pronoun(target["span2_text"])
            if strip_pronoun(tokens[pronoun_idx]) != pronoun:
                # hack: sometimes the index is misaligned
                if strip_pronoun(tokens[pronoun_idx + 1]) == pronoun:
                    pronoun_idx += 1
                else:
                    raise Exception("Misaligned pronoun!")
            assert strip_pronoun(tokens[pronoun_idx]) == pronoun

            # split tokens before and after the pronoun
            before = tokens[:pronoun_idx]
            after = tokens[pronoun_idx + 1 :]

            # the GPT BPE attaches leading spaces to tokens, so we keep track
            # of whether we need spaces before or after the pronoun
            leading_space = " " if pronoun_idx > 0 else ""
            trailing_space = " " if len(after) > 0 else ""

            # detokenize
            before = detok.detokenize(before, return_str=True)
            pronoun = detok.detokenize([pronoun], return_str=True)
            after = detok.detokenize(after, return_str=True)

            # hack: when the pronoun ends in a period (or comma), move the
            # punctuation to the "after" part
            if pronoun.endswith(".") or pronoun.endswith(","):
                after = pronoun[-1] + trailing_space + after
                pronoun = pronoun[:-1]

            # hack: when the "after" part begins with a comma or period, remove
            # the trailing space
            if after.startswith(".") or after.startswith(","):
                trailing_space = ""

            # parse sentence with spacy
            sentence = nlp(before + leading_space + pronoun + trailing_space + after)

            # find pronoun span
            # `start` is the character offset of the pronoun in the parsed text
            start = len(before + leading_space)
            first_pronoun_tok = find_token(sentence, start_pos=start)
            pronoun_span = find_span(sentence, pronoun, start=first_pronoun_tok.i)
            assert pronoun_span.text == pronoun

            if eval:
                # convert to format where pronoun is surrounded by "[]" and
                # query is surrounded by "_"
                query_span = find_span(sentence, query)
                query_with_ws = "_{}_{}".format(
                    query_span.text,
                    (" " if query_span.text_with_ws.endswith(" ") else ""),
                )
                pronoun_with_ws = "[{}]{}".format(
                    pronoun_span.text,
                    (" " if pronoun_span.text_with_ws.endswith(" ") else ""),
                )
                # order the two spans by position so the text can be rebuilt
                # left to right
                if query_span.start < pronoun_span.start:
                    first = (query_span, query_with_ws)
                    second = (pronoun_span, pronoun_with_ws)
                else:
                    first = (pronoun_span, pronoun_with_ws)
                    second = (query_span, query_with_ws)
                sentence = (
                    sentence[: first[0].start].text_with_ws
                    + first[1]
                    + sentence[first[0].end : second[0].start].text_with_ws
                    + second[1]
                    + sentence[second[0].end :].text
                )
                yield sentence, sample.get("label", None)
            else:
                yield sentence, pronoun_span, query, sample.get("label", None)
cand = option2 if sample["answer"] == "1" else option1 yield sentence, pronoun_span, query, cand def filter_noun_chunks( chunks, exclude_pronouns=False, exclude_query=None, exact_match=False ): if exclude_pronouns: chunks = [ np for np in chunks if (np.lemma_ != "-PRON-" and not all(tok.pos_ == "PRON" for tok in np)) ] if exclude_query is not None: excl_txt = [exclude_query.lower()] filtered_chunks = [] for chunk in chunks: lower_chunk = chunk.text.lower() found = False for excl in excl_txt: if ( not exact_match and (lower_chunk in excl or excl in lower_chunk) ) or lower_chunk == excl: found = True break if not found: filtered_chunks.append(chunk) chunks = filtered_chunks return chunks ================================================ FILE: examples/rxf/README.md ================================================ [Better Fine-Tuning by Reducing Representational Collapse](https://arxiv.org/abs/2008.03156) ===================== This repo contains the code to replicate all experiments from the _Better Fine-Tuning by Reducing Representational Collapse_ paper excluding the probing results. The R3F sentence prediction criterion is registered as `sentence_prediction_r3f` while the label smoothing version of it is implemented as `label_smoothed_cross_entropy_r3f`. The R4F version of the sentence prediction criterion can be achieved by applying spectral norm to the classification head via the `--spectral-norm-classification-head` parameter. ## Hyper-parameters Our methods introduce 3 new hyper-parameters; `--eps` which sets the standard deviation or range of the distribution we're sampling from, `--r3f-lambda` which controls the combining of logistic loss and noisy KL loss and `--noise-type` which controls which parametric distribution we use ('normal', 'uniform'). For example to run R3F on RTE from GLUE ``` TOTAL_NUM_UPDATES=3120 WARMUP_UPDATES=187 LR=1e-05 NUM_CLASSES=2 MAX_SENTENCES=8 # Batch size. 
ROBERTA_PATH=/path/to/roberta/model.pt CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin \ --restore-file $ROBERTA_PATH \ --max-positions 512 \ --max-sentences $MAX_SENTENCES \ --max-tokens 4400 \ --task sentence_prediction \ --reset-optimizer --reset-dataloader --reset-meters \ --required-batch-size-multiple 1 \ --init-token 0 --separator-token 2 \ --arch roberta_large \ --criterion sentence_prediction_r3f \ --num-classes $NUM_CLASSES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric \ --noise-type uniform --r3f-lambda 0.7 \ --user-dir examples/rxf/rxf_src ``` ## Citation ```bibtex @article{aghajanyan2020better, title={Better Fine-Tuning by Reducing Representational Collapse}, author={Aghajanyan, Armen and Shrivastava, Akshat and Gupta, Anchit and Goyal, Naman and Zettlemoyer, Luke and Gupta, Sonal}, journal={arXiv preprint arXiv:2008.03156}, year={2020} } ``` ================================================ FILE: examples/rxf/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import rxf_src # noqa ================================================ FILE: examples/rxf/rxf_src/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . 
@register_criterion("label_smoothed_cross_entropy_r3f")
class LabelSmoothedCrossEntropyR3FCriterion(FairseqCriterion):
    """Label-smoothed cross entropy with an R3F symmetric-KL regularizer.

    During training the token embeddings are perturbed with sampled noise, the
    model is run a second time on the noised embeddings, and the symmetric KL
    divergence between the clean and noised output distributions (scaled by
    ``r3f_lambda``) is added to the label-smoothed loss.
    """

    def __init__(
        self, task, sentence_avg, label_smoothing, eps, r3f_lambda, noise_type
    ):
        super().__init__(task)
        self.sentence_avg = sentence_avg
        self.label_smoothing = label_smoothing
        self.eps = eps
        self.r3f_lambda = r3f_lambda
        self.noise_type = noise_type

        # Pick the noise distribution: N(0, eps) or U(-eps, eps).
        if noise_type == "normal":
            self.noise_sampler = torch.distributions.normal.Normal(
                loc=0.0, scale=eps
            )
        elif noise_type == "uniform":
            self.noise_sampler = torch.distributions.uniform.Uniform(
                low=-eps, high=eps
            )
        else:
            raise Exception(f"unrecognized noise type {self.noise_type}")

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--label-smoothing', default=0., type=float, metavar='D',
                            help='epsilon for label smoothing, 0 means no label smoothing')
        parser.add_argument('--eps', type=float, default=1e-5,
                            help='noise eps')
        parser.add_argument('--r3f-lambda', type=float, default=1.0,
                            help='lambda for combining logistic loss and noisy KL loss')
        parser.add_argument('--noise-type', type=str, default='normal',
                            choices=['normal', 'uniform'],
                            help='type of noises')
        # fmt: on

    def _get_symm_kl(self, noised_logits, input_logits):
        """Return KL(input || noised) + KL(noised || input), summed over all
        elements and divided by the size of the first dimension of the logits.
        Softmaxes are computed in float32."""
        noised_lprobs = F.log_softmax(noised_logits, dim=-1, dtype=torch.float32)
        input_probs = F.softmax(input_logits, dim=-1, dtype=torch.float32)
        kl_noised = F.kl_div(noised_lprobs, input_probs, reduction="sum")

        input_lprobs = F.log_softmax(input_logits, dim=-1, dtype=torch.float32)
        noised_probs = F.softmax(noised_logits, dim=-1, dtype=torch.float32)
        kl_input = F.kl_div(input_lprobs, noised_probs, reduction="sum")

        return (kl_noised + kl_input) / noised_logits.size(0)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_input = sample["net_input"]
        token_embeddings = model.encoder.embed_tokens(net_input["src_tokens"])
        input_logits, extra = model(**net_input)
        loss, nll_loss = self.compute_loss(
            model, (input_logits, extra), sample, reduce=reduce
        )

        if self.sentence_avg:
            sample_size = sample["target"].size(0)
        else:
            sample_size = sample["ntokens"]

        if model.training:
            # Second forward pass on noise-perturbed embeddings.
            noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
                token_embeddings
            )
            noised_embeddings = token_embeddings.clone() + noise
            noised_logits, _ = model(**net_input, token_embeddings=noised_embeddings)

            # Scale by sample_size so the regularizer is normalized the same
            # way as the summed loss.
            symm_kl = self._get_symm_kl(noised_logits, input_logits) * sample_size
            loss = loss + self.r3f_lambda * symm_kl

        logging_output = {
            "loss": loss.data,
            "nll_loss": nll_loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
        }
        if model.training:
            logging_output["symm_kl"] = (
                utils.item(symm_kl.data) if reduce else symm_kl.data
            )

        return loss, sample_size, logging_output

    def compute_loss(self, model, net_output, sample, reduce=True):
        """Return ``(loss, nll_loss)`` from the label-smoothed NLL of the
        flattened log-probabilities against the flattened targets."""
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        flat_lprobs = lprobs.view(-1, lprobs.size(-1))
        flat_target = model.get_targets(sample, net_output).view(-1, 1)
        return label_smoothed_nll_loss(
            flat_lprobs,
            flat_target,
            self.label_smoothing,
            ignore_index=self.padding_idx,
            reduce=reduce,
        )

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        def total(key):
            return sum(log.get(key, 0) for log in logging_outputs)

        loss_sum = total("loss")
        nll_loss_sum = total("nll_loss")
        ntokens = total("ntokens")
        sample_size = total("sample_size")
        symm_kl_sum = total("symm_kl")

        metrics.log_scalar("symm_kl", symm_kl_sum / sample_size, sample_size, round=3)
        # Losses are reported in base 2.
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        metrics.log_scalar(
            "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
        )
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
        )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True improves distributed training speed.
        """
        return True
@register_criterion("sentence_prediction_r3f")
class SentencePredictionR3F(FairseqCriterion):
    """Sentence-prediction criterion with an R3F symmetric-KL regularizer.

    During training the token embeddings are perturbed with sampled noise and
    the symmetric KL divergence between the clean and noised classification
    outputs is computed. For classification targets it is scaled by
    ``r3f_lambda`` and added to the NLL loss; for regression targets only the
    MSE loss is optimized (the KL term is still logged).
    """

    def __init__(
        self,
        task,
        eps,
        r3f_lambda,
        noise_type,
        classification_head_name,
        regression_target,
    ):
        super().__init__(task)
        self.eps = eps
        self.r3f_lambda = r3f_lambda
        self.noise_type = noise_type
        self.classification_head_name = classification_head_name
        self.regression_target = regression_target
        # Pick the noise distribution: N(0, eps) or U(-eps, eps).
        if self.noise_type in {"normal"}:
            self.noise_sampler = torch.distributions.normal.Normal(
                loc=0.0, scale=self.eps
            )
        elif self.noise_type == "uniform":
            self.noise_sampler = torch.distributions.uniform.Uniform(
                low=-self.eps, high=self.eps
            )
        else:
            raise Exception(f"unrecognized noise type {self.noise_type}")

    @staticmethod
    def add_args(parser):
        """Add criterion-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--eps', type=float, default=1e-5,
                            help='noise eps')
        parser.add_argument('--r3f-lambda', type=float, default=1.0,
                            help='lambda for combining logistic loss and noisy KL loss')
        parser.add_argument('--noise-type', type=str, default='uniform',
                            choices=['normal', 'uniform'],
                            help='type of noises for RXF methods')
        parser.add_argument('--classification-head-name',
                            default='sentence_classification_head',
                            help='name of the classification head to use')
        parser.add_argument('--regression-target', action='store_true')
        # fmt: on

    def _get_symm_kl(self, noised_logits, input_logits):
        """Return KL(input || noised) + KL(noised || input), summed over all
        elements and divided by the size of the first dimension of the logits.
        Softmaxes are computed in float32."""
        return (
            F.kl_div(
                F.log_softmax(noised_logits, dim=-1, dtype=torch.float32),
                F.softmax(input_logits, dim=-1, dtype=torch.float32),
                reduction="sum",
            )
            + F.kl_div(
                F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
                F.softmax(noised_logits, dim=-1, dtype=torch.float32),
                reduction="sum",
            )
        ) / noised_logits.size(0)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        assert (
            hasattr(model, "classification_heads")
            and self.classification_head_name in model.classification_heads
        ), "model must provide sentence classification head for --criterion=sentence_prediction_r3f"

        token_embeddings = model.encoder.sentence_encoder.embed_tokens(
            sample["net_input"]["src_tokens"]
        )
        input_logits, _ = model(
            **sample["net_input"],
            features_only=True,
            classification_head_name=self.classification_head_name,
            token_embeddings=token_embeddings,
        )
        if model.training and self.noise_sampler:
            # Second forward pass on noise-perturbed (detached) embeddings.
            noise = self.noise_sampler.sample(sample_shape=token_embeddings.shape).to(
                token_embeddings
            )
            noised_embeddings = token_embeddings.detach().clone() + noise

            noised_logits, _ = model(
                **sample["net_input"],
                features_only=True,
                classification_head_name=self.classification_head_name,
                token_embeddings=noised_embeddings,
            )
            symm_kl = self._get_symm_kl(noised_logits, input_logits)
        else:
            symm_kl = 0

        targets = model.get_targets(sample, [input_logits]).view(-1)
        sample_size = targets.numel()

        if not self.regression_target:
            loss = F.nll_loss(
                F.log_softmax(input_logits, dim=-1, dtype=torch.float32),
                targets,
                reduction="sum",
            )
            if model.training:
                # Scale by sample_size so the regularizer is normalized the
                # same way as the summed NLL loss.
                symm_kl = symm_kl * sample_size
                loss = loss + self.r3f_lambda * symm_kl
        else:
            # Flatten instead of squeeze(): squeeze() would turn a batch of one
            # into a 0-dim tensor that no longer matches targets' shape (1,).
            logits = input_logits.reshape(-1).float()
            targets = targets.float()
            loss = F.mse_loss(logits, targets, reduction="sum")

        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample_size,
            "sample_size": sample_size,
        }

        if not self.regression_target:
            preds = input_logits.max(dim=1)[1]
            logging_output.update(ncorrect=(preds == targets).sum().item())

            if model.training and self.noise_sampler:
                logging_output.update(
                    symm_kl=utils.item(symm_kl.data) if reduce else symm_kl.data
                )
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training.

        Returns a dict with the base-2 ``loss`` and ``symm_kl`` averaged over
        ``sample_size``, the summed ``ntokens``/``nsentences``/``sample_size``,
        ``accuracy`` when ``ncorrect`` was logged, and a per-token ``nll_loss``
        when ``sample_size`` differs from ``ntokens``.
        """
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        symm_kl_sum = sum(log.get("symm_kl", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

        agg_output = {
            "loss": loss_sum / sample_size / math.log(2),
            "symm_kl": symm_kl_sum / sample_size,
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size,
        }

        if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]:
            ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs)
            agg_output.update(accuracy=ncorrect / nsentences)

        if sample_size != ntokens:
            agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
        return agg_output
## Pre-trained models Model | Description | Dataset | Download ---|---|---|--- `transformer.wmt14.en-fr` | Transformer <br> ([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) `transformer.wmt16.en-de` | Transformer <br> ([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) ## Training a new model on WMT'16 En-De First download the [preprocessed WMT'16 En-De data provided by Google](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8). Then: ##### 1. Extract the WMT'16 En-De data ```bash TEXT=wmt16_en_de_bpe32k mkdir -p $TEXT tar -xzvf wmt16_en_de.tar.gz -C $TEXT ``` ##### 2. Preprocess the dataset with a joined dictionary ```bash fairseq-preprocess \ --source-lang en --target-lang de \ --trainpref $TEXT/train.tok.clean.bpe.32000 \ --validpref $TEXT/newstest2013.tok.bpe.32000 \ --testpref $TEXT/newstest2014.tok.bpe.32000 \ --destdir data-bin/wmt16_en_de_bpe32k \ --nwordssrc 32768 --nwordstgt 32768 \ --joined-dictionary \ --workers 20 ``` ##### 3. 
Train a model ```bash fairseq-train \ data-bin/wmt16_en_de_bpe32k \ --arch transformer_vaswani_wmt_en_de_big --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \ --dropout 0.3 --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --max-tokens 3584 \ --fp16 ``` Note that the `--fp16` flag requires that you have CUDA 9.1 or greater and a Volta GPU or newer. ***IMPORTANT:*** You will get better performance by training with big batches and increasing the learning rate. If you want to train the above model with big batches (assuming your machine has 8 GPUs): - add `--update-freq 16` to simulate training on 8x16=128 GPUs - increase the learning rate; 0.001 works well for big batches ##### 4. Evaluate Now we can evaluate our trained model. Note that the original [Attention Is All You Need](https://arxiv.org/abs/1706.03762) paper used a couple of tricks to achieve better BLEU scores. We use these same tricks in the Scaling NMT paper, so it's important to apply them when reproducing our results. First, use the [average_checkpoints.py](/scripts/average_checkpoints.py) script to average the last few checkpoints. Averaging the last 5-10 checkpoints is usually good, but you may need to adjust this depending on how long you've trained: ```bash python scripts/average_checkpoints.py \ --inputs /path/to/checkpoints \ --num-epoch-checkpoints 10 \ --output checkpoint.avg10.pt ``` Next, generate translations using a beam width of 4 and length penalty of 0.6: ```bash fairseq-generate \ data-bin/wmt16_en_de_bpe32k \ --path checkpoint.avg10.pt \ --beam 4 --lenpen 0.6 --remove-bpe > gen.out ``` Finally, we apply the ["compound splitting" script](/scripts/compound_split_bleu.sh) to add spaces around dashes. For example "Café-Liebhaber" would become three tokens: "Café - Liebhaber".
This typically results in larger BLEU scores, but it is not appropriate to compare these inflated scores to work which does not include this trick. This trick was used in the [original AIAYN code](https://github.com/tensorflow/tensor2tensor/blob/fc9335c0203685cbbfe2b30c92db4352d8f60779/tensor2tensor/utils/get_ende_bleu.sh), so we used it in the Scaling NMT paper as well. That said, it's strongly advised to report [sacrebleu](https://github.com/mjpost/sacrebleu) scores instead. To compute "compound split" tokenized BLEU (not recommended!): ```bash bash scripts/compound_split_bleu.sh gen.out # BLEU4 = 29.29, 60.3/35.0/22.8/15.3 (BP=1.000, ratio=1.004, syslen=64763, reflen=64496) ``` To compute detokenized BLEU with sacrebleu (preferred): ```bash bash scripts/sacrebleu.sh wmt14/full en de gen.out # BLEU+case.mixed+lang.en-de+numrefs.1+smooth.exp+test.wmt14/full+tok.13a+version.1.4.3 = 28.6 59.3/34.3/22.1/14.9 (BP = 1.000 ratio = 1.016 hyp_len = 63666 ref_len = 62688) ``` ## Citation ```bibtex @inproceedings{ott2018scaling, title = {Scaling Neural Machine Translation}, author = {Ott, Myle and Edunov, Sergey and Grangier, David and Auli, Michael}, booktitle = {Proceedings of the Third Conference on Machine Translation (WMT)}, year = 2018, } ``` ================================================ FILE: examples/shuffled_word_order/README.finetuning.md ================================================ # Fine-tuning details For each task (GLUE and PAWS), we perform hyperparam search for each model, and report the mean and standard deviation across 5 seeds of the best model. First, get the datasets following the instructions in [RoBERTa fine-tuning README](../roberta/README.glue.md). 
Alternatively, you can use [huggingface datasets](https://huggingface.co/docs/datasets/) to get the task data: ```python from datasets import load_dataset import pandas as pd from pathlib import Path key2file = { "paws": { "loc": "paws_data", "columns": ["id", "sentence1", "sentence2", "label"], "train": "train.tsv", "validation": "dev.tsv", "test": "test.tsv" } } task_data = load_dataset("paws", "labeled_final") task_config = key2file["paws"] save_path = Path(task_config["loc"]) save_path.mkdir(exist_ok=True, parents=True) for key, fl in task_config.items(): if key in ["loc", "columns"]: continue print(f"Reading {key}") columns = task_config["columns"] df = pd.DataFrame(task_data[key]) print(df.columns) df = df[columns] print(f"Got {len(df)} records") save_loc = save_path / fl print(f"Saving to : {save_loc}") df.to_csv(save_loc, sep="\t", header=None, index=None) ``` - Preprocess using RoBERTa GLUE preprocessing script, while keeping in mind the column numbers for `sentence1`, `sentence2` and `label` (which is 0,1,2 if you save the data according to the above example.) - Then, fine-tuning is performed similarly to RoBERTa (for example, in case of RTE): ```bash TOTAL_NUM_UPDATES=30875 # 10 epochs through RTE for bsz 16 WARMUP_UPDATES=1852 # 6 percent of the number of updates LR=2e-05 # Peak LR for polynomial LR scheduler. NUM_CLASSES=2 MAX_SENTENCES=16 # Batch size. 
SHUFFLED_ROBERTA_PATH=/path/to/shuffled_roberta/model.pt CUDA_VISIBLE_DEVICES=0 fairseq-train RTE-bin/ \ --restore-file $SHUFFLED_ROBERTA_PATH \ --max-positions 512 \ --batch-size $MAX_SENTENCES \ --max-tokens 4400 \ --task sentence_prediction \ --reset-optimizer --reset-dataloader --reset-meters \ --required-batch-size-multiple 1 \ --init-token 0 --separator-token 2 \ --arch roberta_large \ --criterion sentence_prediction \ --num-classes $NUM_CLASSES \ --dropout 0.1 --attention-dropout 0.1 \ --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler polynomial_decay --lr $LR --total-num-update $TOTAL_NUM_UPDATES --warmup-updates $WARMUP_UPDATES \ --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 \ --max-epoch 10 \ --find-unused-parameters \ --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric; ``` - `TOTAL_NUM_UPDATES` is computed based on the `--batch_size` value and the dataset size. - `WARMUP_UPDATES` is computed as 6% of `TOTAL_NUM_UPDATES` - Best hyperparam of `--lr` and `--batch_size` is reported below: ## `--lr` | | name | RTE | MRPC | SST-2 | CoLA | QQP | QNLI | MNLI | PAWS | | --: | :----------- | ----: | ----: | ----: | ----: | ----: | ----: | ----: | ----: | | 0 | original | 2e-05 | 2e-05 | 1e-05 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 2e-05 | | 1 | n_1 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | | 2 | n_2 | 2e-05 | 2e-05 | 1e-05 | 1e-05 | 2e-05 | 1e-05 | 1e-05 | 3e-05 | | 3 | n_3 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | 3e-05 | 1e-05 | 1e-05 | 2e-05 | | 4 | n_4 | 3e-05 | 1e-05 | 2e-05 | 2e-05 | 2e-05 | 1e-05 | 1e-05 | 2e-05 | | 5 | r512 | 1e-05 | 3e-05 | 2e-05 | 2e-05 | 3e-05 | 2e-05 | 3e-05 | 2e-05 | | 6 | rand_corpus | 2e-05 | 1e-05 | 3e-05 | 1e-05 | 3e-05 | 3e-05 | 3e-05 | 2e-05 | | 7 | rand_uniform | 2e-05 | 1e-05 | 3e-05 | 2e-05 | 3e-05 | 3e-05 | 3e-05 | 1e-05 | | 8 | rand_init | 1e-05 | 1e-05 | 3e-05 | 1e-05 | 1e-05 | 1e-05 | 2e-05 | 1e-05 | 
| 9 | no_pos | 1e-05 | 3e-05 | 2e-05 | 1e-05 | 1e-05 | 1e-05 | 1e-05 | 1e-05 | ## `--batch_size` | | name | RTE | MRPC | SST-2 | CoLA | QQP | QNLI | MNLI | PAWS | | --: | :----------- | --: | ---: | ----: | ---: | --: | ---: | ---: | ---: | | 0 | orig | 16 | 16 | 32 | 16 | 16 | 32 | 32 | 16 | | 1 | n_1 | 32 | 32 | 16 | 32 | 32 | 16 | 32 | 16 | | 2 | n_2 | 32 | 16 | 32 | 16 | 32 | 32 | 16 | 32 | | 3 | n_3 | 32 | 32 | 16 | 32 | 32 | 16 | 32 | 32 | | 4 | n_4 | 32 | 16 | 32 | 16 | 32 | 32 | 32 | 32 | | 5 | r512 | 32 | 16 | 16 | 32 | 32 | 16 | 16 | 16 | | 6 | rand_corpus | 16 | 16 | 16 | 16 | 32 | 16 | 16 | 32 | | 7 | rand_uniform | 16 | 32 | 16 | 16 | 32 | 16 | 16 | 16 | | 8 | rand_init | 16 | 16 | 32 | 16 | 16 | 16 | 32 | 16 | | 9 | no_pos | 16 | 32 | 16 | 16 | 32 | 16 | 16 | 16 | - Perform inference similar to RoBERTa as well: ```python from fairseq.models.roberta import RobertaModel roberta = RobertaModel.from_pretrained( 'checkpoints/', checkpoint_file='checkpoint_best.pt', data_name_or_path='PAWS-bin' ) label_fn = lambda label: roberta.task.label_dictionary.string( [label + roberta.task.label_dictionary.nspecial] ) ncorrect, nsamples = 0, 0 roberta.cuda() roberta.eval() with open('paws_data/dev.tsv') as fin: fin.readline() for index, line in enumerate(fin): tokens = line.strip().split('\t') sent1, sent2, target = tokens[0], tokens[1], tokens[2] tokens = roberta.encode(sent1, sent2) prediction = roberta.predict('sentence_classification_head', tokens).argmax().item() prediction_label = label_fn(prediction) ncorrect += int(prediction_label == target) nsamples += 1 print('| Accuracy: ', float(ncorrect)/float(nsamples)) ``` ================================================ FILE: examples/shuffled_word_order/README.md ================================================ # Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little [https://arxiv.org/abs/2104.06644](https://arxiv.org/abs/2104.06644) ## Introduction In this work, we 
pre-train [RoBERTa](../roberta) base on various word shuffled variants of BookWiki corpus (16GB). We observe that a word shuffled pre-trained model achieves surprisingly good scores on GLUE, PAWS and several parametric probing tasks. Please read our paper for more details on the experiments. ## Pre-trained models | Model | Description | Download | | ------------------------------------- | -------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | | `roberta.base.orig` | RoBERTa (base) trained on natural corpus | [roberta.base.orig.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.orig.tar.gz) | | `roberta.base.shuffle.n1` | RoBERTa (base) trained on n=1 gram sentence word shuffled data | [roberta.base.shuffle.n1.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.tar.gz) | | `roberta.base.shuffle.n2` | RoBERTa (base) trained on n=2 gram sentence word shuffled data | [roberta.base.shuffle.n2.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n2.tar.gz) | | `roberta.base.shuffle.n3` | RoBERTa (base) trained on n=3 gram sentence word shuffled data | [roberta.base.shuffle.n3.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n3.tar.gz) | | `roberta.base.shuffle.n4` | RoBERTa (base) trained on n=4 gram sentence word shuffled data | [roberta.base.shuffle.n4.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n4.tar.gz) | | `roberta.base.shuffle.512` | RoBERTa (base) trained on unigram 512 word block shuffled data | [roberta.base.shuffle.512.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.512.tar.gz) | | `roberta.base.shuffle.corpus` | RoBERTa (base) trained on unigram corpus word shuffled data | 
[roberta.base.shuffle.corpus.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus.tar.gz) | | `roberta.base.shuffle.corpus_uniform` | RoBERTa (base) trained on unigram corpus word shuffled data, where all words are uniformly sampled | [roberta.base.shuffle.corpus_uniform.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus_uniform.tar.gz) | | `roberta.base.nopos` | RoBERTa (base) without positional embeddings, trained on natural corpus | [roberta.base.nopos.tar.gz](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.nopos.tar.gz) | ## Results [GLUE (Wang et al, 2019)](https://gluebenchmark.com/) & [PAWS (Zhang et al, 2019)](https://github.com/google-research-datasets/paws) _(dev set, single model, single-task fine-tuning, median of 5 seeds)_ | name | CoLA | MNLI | MRPC | PAWS | QNLI | QQP | RTE | SST-2 | | :----------------------------------- | ----: | ----: | ----: | ----: | ----: | ----: | ----: | ----: | | `roberta.base.orig` | 61.4 | 86.11 | 89.19 | 94.46 | 92.53 | 91.26 | 74.64 | 93.92 | | `roberta.base.shuffle.n1` | 35.15 | 82.64 | 86 | 89.97 | 89.02 | 91.01 | 69.02 | 90.47 | | `roberta.base.shuffle.n2` | 54.37 | 83.43 | 86.24 | 93.46 | 90.44 | 91.36 | 70.83 | 91.79 | | `roberta.base.shuffle.n3` | 48.72 | 83.85 | 86.36 | 94.05 | 91.69 | 91.24 | 70.65 | 92.02 | | `roberta.base.shuffle.n4` | 58.64 | 83.77 | 86.98 | 94.32 | 91.69 | 91.4 | 70.83 | 92.48 | | `roberta.base.shuffle.512` | 12.76 | 77.52 | 79.61 | 84.77 | 85.19 | 90.2 | 56.52 | 86.34 | | `roberta.base.shuffle.corpus` | 0 | 71.9 | 70.52 | 58.52 | 71.11 | 85.52 | 53.99 | 83.35 | | `roberta.base.shuffle.corpus_random` | 9.19 | 72.33 | 70.76 | 58.42 | 77.76 | 85.93 | 53.99 | 84.04 | | `roberta.base.nopos` | 0 | 63.5 | 72.73 | 57.08 | 77.72 | 87.87 | 54.35 | 83.24 | For more results on probing tasks, please refer to [our paper](https://arxiv.org/abs/2104.06644). 
## Example Usage Follow the same usage as in [RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta) to load and test your models: ```python # Download roberta.base.shuffle.n1 model wget https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.tar.gz tar -xzvf roberta.base.shuffle.n1.tar.gz # Copy the dictionary files cd roberta.base.shuffle.n1.tar.gz wget -O dict.txt https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt && wget -O encoder.json https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json && wget -O vocab.bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe cd .. # Load the model in fairseq from fairseq.models.roberta import RobertaModel roberta = RobertaModel.from_pretrained('/path/to/roberta.base.shuffle.n1', checkpoint_file='model.pt') roberta.eval() # disable dropout (or leave in train mode to finetune) ``` We have also provided a [Google Colab](https://colab.research.google.com/drive/1IJDVfNVWdvRfLjphQKBGzmob84t-OXpm) notebook to demonstrate the loading of the model. The models were trained on top of Fairseq from the following commit: [62cff008ebeeed855093837507d5e6bf52065ee6](https://github.com/pytorch/fairseq/commit/62cff008ebeeed855093837507d5e6bf52065ee6). **Note**: The model trained without positional embeddings (`roberta.base.nopos`) is a modified `RoBERTa` model, where the positional embeddings are not used. Thus, the typical `from_pretrained` method on fairseq version of RoBERTa will not be able to load the above model weights. To do so, construct a new `RoBERTaModel` object by setting the flag `use_positional_embeddings` to `False` (or [in the latest code](https://github.com/pytorch/fairseq/blob/main/fairseq/models/roberta/model.py#L543), set `no_token_positional_embeddings` to `True`), and then load the individual weights. ## Fine-tuning Evaluation We provide the trained fine-tuned models on MNLI here for each model above for quick evaluation (1 seed for each model). 
Please refer to [finetuning details](README.finetuning.md) for the parameters of these models. Follow [RoBERTa](https://github.com/pytorch/fairseq/tree/main/examples/roberta) instructions to evaluate these models. | Model | MNLI M Dev Accuracy | Link | | :----------------------------------------- | :------------------ | :--------------------------------------------------------------------------------------------------------------- | | `roberta.base.orig.mnli` | 86.14 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.orig.mnli.tar.gz) | | `roberta.base.shuffle.n1.mnli` | 82.55 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n1.mnli.tar.gz) | | `roberta.base.shuffle.n2.mnli` | 83.21 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n2.mnli.tar.gz) | | `roberta.base.shuffle.n3.mnli` | 83.89 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n3.mnli.tar.gz) | | `roberta.base.shuffle.n4.mnli` | 84.00 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.n4.mnli.tar.gz) | | `roberta.base.shuffle.512.mnli` | 77.22 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.512.mnli.tar.gz) | | `roberta.base.shuffle.corpus.mnli` | 71.88 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus.mnli.tar.gz) | | `roberta.base.shuffle.corpus_uniform.mnli` | 72.46 | [Download](https://dl.fbaipublicfiles.com/unnatural_pretraining/roberta.base.shuffle.corpus_uniform.mnli.tar.gz) | ## Citation ```bibtex @misc{sinha2021masked, title={Masked Language Modeling and the Distributional Hypothesis: Order Word Matters Pre-training for Little}, author={Koustuv Sinha and Robin Jia and Dieuwke Hupkes and Joelle Pineau and Adina Williams and Douwe Kiela}, year={2021}, eprint={2104.06644}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` ## Contact For 
questions and comments, please reach out to Koustuv Sinha (koustuv.sinha@mail.mcgill.ca). ================================================ FILE: examples/simultaneous_translation/README.md ================================================ # Simultaneous Translation Examples of simultaneous translation in fairseq - [English-to-Japanese text-to-text wait-k model](docs/enja-waitk.md) - [English-to-German text-to-text monotonic multihead attention model](docs/ende-mma.md) - [English-to-German speech-to-text simultaneous translation model](../speech_to_text/docs/simulst_mustc_example.md) ================================================ FILE: examples/simultaneous_translation/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import models # noqa ================================================ FILE: examples/simultaneous_translation/docs/ende-mma.md ================================================ # Simultaneous Machine Translation This directory contains the code for the paper [Monotonic Multihead Attention](https://openreview.net/forum?id=Hyg96gBKPS) ## Prepare Data [Please follow the instructions to download and preprocess the WMT'15 En-De dataset.](https://github.com/pytorch/fairseq/tree/simulastsharedtask/examples/translation#prepare-wmt14en2desh) Another example of training an English to Japanese model can be found [here](enja-waitk.md). ## Training - MMA-IL ```shell fairseq-train \ data-bin/wmt15_en_de_32k \ --simul-type infinite_lookback \ --user-dir $FAIRSEQ/examples/simultaneous_translation \ --mass-preservation \ --criterion latency_augmented_label_smoothed_cross_entropy \ --latency-weight-avg 0.1 \ --max-update 50000 \ --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ --optimizer adam --adam-betas '(0.9, 0.98)' \ --lr-scheduler 'inverse_sqrt' \
--warmup-init-lr 1e-7 --warmup-updates 4000 \ --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ --dropout 0.3 \ --label-smoothing 0.1\ --max-tokens 3584 ``` - MMA-H ```shell fairseq-train \ data-bin/wmt15_en_de_32k \ --simul-type hard_aligned \ --user-dir $FAIRSEQ/examples/simultaneous_translation \ --mass-preservation \ --criterion latency_augmented_label_smoothed_cross_entropy \ --latency-weight-var 0.1 \ --max-update 50000 \ --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ --optimizer adam --adam-betas '(0.9, 0.98)' \ --lr-scheduler 'inverse_sqrt' \ --warmup-init-lr 1e-7 --warmup-updates 4000 \ --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ --dropout 0.3 \ --label-smoothing 0.1\ --max-tokens 3584 ``` - wait-k ```shell fairseq-train \ data-bin/wmt15_en_de_32k \ --simul-type waitk \ --waitk-lagging 3 \ --user-dir $FAIRSEQ/examples/simultaneous_translation \ --mass-preservation \ --criterion latency_augmented_label_smoothed_cross_entropy \ --max-update 50000 \ --arch transformer_monotonic_iwslt_de_en save_dir_key=lambda \ --optimizer adam --adam-betas '(0.9, 0.98)' \ --lr-scheduler 'inverse_sqrt' \ --warmup-init-lr 1e-7 --warmup-updates 4000 \ --lr 5e-4 --stop-min-lr 1e-9 --clip-norm 0.0 --weight-decay 0.0001\ --dropout 0.3 \ --label-smoothing 0.1\ --max-tokens 3584 ``` ================================================ FILE: examples/simultaneous_translation/docs/enja-waitk.md ================================================ # An example of English to Japanese Simultaneous Translation System This is an example of training and evaluating a transformer *wait-k* English to Japanese simultaneous text-to-text translation model. ## Data Preparation This section introduces the data preparation for training and evaluation.
If you only want to evaluate the model, please jump to [Inference & Evaluation](#inference--evaluation). For illustration, we only use the following subsets of the available data from the [WMT20 news translation task](http://www.statmt.org/wmt20/translation-task.html), which results in 7,815,391 sentence pairs. - News Commentary v16 - Wiki Titles v3 - WikiMatrix V1 - Japanese-English Subtitle Corpus - The Kyoto Free Translation Task Corpus We use the WMT20 development data as the development set. Training a `transformer_vaswani_wmt_en_de_big` model on this amount of data results in 17.3 BLEU with greedy search and 19.7 with beam (10) search. Note that better performance can be achieved with the full WMT training data. We use the [sentencepiece](https://github.com/google/sentencepiece) toolkit to tokenize the data with a vocabulary size of 32000. Additionally, we filter out the sentences longer than 200 words after tokenization. Assuming the tokenized text data is saved at `${DATA_DIR}`, we prepare the data binary with the following command. ```bash fairseq-preprocess \ --source-lang en --target-lang ja \ --trainpref ${DATA_DIR}/train \ --validpref ${DATA_DIR}/dev \ --testpref ${DATA_DIR}/test \ --destdir ${WMT20_ENJA_DATA_BIN} \ --nwordstgt 32000 --nwordssrc 32000 \ --workers 20 ``` ## Simultaneous Translation Model Training To train a wait-k `(k=10)` model, run: ```bash fairseq-train ${WMT20_ENJA_DATA_BIN} \ --save-dir ${SAVEDIR} --simul-type waitk \ --waitk-lagging 10 \ --max-epoch 70 \ --arch transformer_monotonic_vaswani_wmt_en_de_big \ --optimizer adam \ --adam-betas '(0.9, 0.98)' \ --lr-scheduler inverse_sqrt \ --warmup-init-lr 1e-07 \ --warmup-updates 4000 \ --lr 0.0005 \ --stop-min-lr 1e-09 \ --clip-norm 10.0 \ --dropout 0.3 \ --weight-decay 0.0 \ --criterion label_smoothed_cross_entropy \ --label-smoothing 0.1 \ --max-tokens 3584 ``` This command is for training on 8 GPUs. Equivalently, the model can be trained on one GPU with `--update-freq 8`.
## Inference & Evaluation First of all, install [SimulEval](https://github.com/facebookresearch/SimulEval) for evaluation. ```bash git clone https://github.com/facebookresearch/SimulEval.git cd SimulEval pip install -e . ``` The following command is for the evaluation. It assumes the source and reference files are `${SRC_FILE}` and `${TGT_FILE}`, and that the sentencepiece model file for English is saved at `${SRC_SPM_PATH}`. ```bash simuleval \ --source ${SRC_FILE} \ --target ${TGT_FILE} \ --data-bin ${WMT20_ENJA_DATA_BIN} \ --sacrebleu-tokenizer ja-mecab \ --eval-latency-unit char \ --no-space \ --src-splitter-type sentencepiecemodel \ --src-splitter-path ${SRC_SPM_PATH} \ --agent ${FAIRSEQ}/examples/simultaneous_translation/eval/agents/simul_t2t_enja.py \ --model-path ${SAVEDIR}/${CHECKPOINT_FILENAME} \ --output ${OUTPUT} \ --scores ``` The `--data-bin` should be the same as in the previous sections if you prepared the data from scratch. If you only want to run the evaluation, a prepared data directory can be found [here](https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_databin.tgz) and a pretrained checkpoint (wait-k=10 model) can be downloaded from [here](https://dl.fbaipublicfiles.com/simultaneous_translation/wmt20_enja_medium_wait10_ckpt.pt). The output should look like this: ```bash { "Quality": { "BLEU": 11.442253287568398 }, "Latency": { "AL": 8.6587861866951, "AP": 0.7863304776251316, "DAL": 9.477850951194764 } } ``` The latency is evaluated by characters (`--eval-latency-unit`) on the target side. The translation quality (BLEU) is evaluated with `sacrebleu` using the `MeCab` tokenizer (`--sacrebleu-tokenizer ja-mecab`). `--no-space` indicates that no space is added when merging the predicted words. If the `--output ${OUTPUT}` option is used, the detailed log and scores will be stored under the `${OUTPUT}` directory.
class SimulTransTextAgentJA(TextAgent):
    """
    Simultaneous Translation
    Text agent for Japanese

    The agent loads a fairseq checkpoint, splits incoming source words into
    SentencePiece units, re-encodes the source after every read, and lets the
    monotonic decoder decide between READ_ACTION and WRITE_ACTION.
    """

    def __init__(self, args):
        """Build the agent from the parsed command-line ``args``."""
        # Whether use gpu
        self.gpu = getattr(args, "gpu", False)

        # Max len
        self.max_len = args.max_len

        # Load Model
        self.load_model_vocab(args)

        # build word splitter
        self.build_word_splitter(args)

        self.eos = DEFAULT_EOS

    def initialize_states(self, states):
        """Attach an empty incremental-state dict to a fresh ``states``."""
        states.incremental_states = dict()
        states.incremental_states["online"] = dict()

    def to_device(self, tensor):
        """Move ``tensor`` to CUDA when ``self.gpu`` is set, else to CPU."""
        if self.gpu:
            return tensor.cuda()
        else:
            return tensor.cpu()

    def load_model_vocab(self, args):
        """Load the checkpoint at ``args.model_path`` and its dictionaries.

        Raises:
            IOError: if ``args.model_path`` does not exist.
        """
        filename = args.model_path
        if not os.path.exists(filename):
            raise IOError("Model file not found: {}".format(filename))

        state = checkpoint_utils.load_checkpoint_to_cpu(filename)

        # Point the task stored in the checkpoint at the local data directory.
        task_args = state["cfg"]["task"]
        task_args.data = args.data_bin

        task = tasks.setup_task(task_args)

        # build model for ensemble
        # The full state dict is loaded below, so the pretrained-component
        # paths recorded at training time are cleared before building.
        state["cfg"]["model"].load_pretrained_encoder_from = None
        state["cfg"]["model"].load_pretrained_decoder_from = None

        self.model = task.build_model(state["cfg"]["model"])
        self.model.load_state_dict(state["model"], strict=True)
        self.model.eval()
        self.model.share_memory()

        if self.gpu:
            self.model.cuda()

        # Set dictionary
        self.dict = {}
        self.dict["tgt"] = task.target_dictionary
        self.dict["src"] = task.source_dictionary

    @staticmethod
    def add_args(parser):
        """Register the agent's command-line options on ``parser``."""
        # fmt: off
        parser.add_argument('--model-path', type=str, required=True,
                            help='path to your pretrained model.')
        parser.add_argument("--data-bin", type=str, required=True,
                            help="Path of data binary")
        parser.add_argument("--max-len", type=int, default=100,
                            help="Max length of translation")
        parser.add_argument("--tgt-splitter-type", type=str, default="SentencePiece",
                            help="Subword splitter type for target text.")
        parser.add_argument("--tgt-splitter-path", type=str, default=None,
                            help="Subword splitter model path for target text.")
        parser.add_argument("--src-splitter-type", type=str, default="SentencePiece",
                            help="Subword splitter type for source text.")
        parser.add_argument("--src-splitter-path", type=str, default=None,
                            help="Subword splitter model path for source text.")
        # fmt: on
        return parser

    def build_word_splitter(self, args):
        """Load a SentencePiece model per side that has a splitter path."""
        self.spm = {}
        for lang in ['src', 'tgt']:
            if getattr(args, f'{lang}_splitter_type', None):
                path = getattr(args, f'{lang}_splitter_path', None)
                if path:
                    self.spm[lang] = spm.SentencePieceProcessor()
                    self.spm[lang].Load(path)

    def segment_to_units(self, segment, states):
        """Split a full word (segment) into subwords (units)."""
        return self.spm['src'].EncodeAsPieces(segment)

    def update_model_encoder(self, states):
        """Re-encode all source units read so far into ``states.encoder_states``."""
        if len(states.units.source) == 0:
            return

        src_indices = [
            self.dict['src'].index(x)
            for x in states.units.source.value
        ]

        if states.finish_read():
            # Append the eos index once the whole source has been read.
            # The source dictionary is used here: these are source-side
            # indices (the original took the EOS from the target dictionary).
            src_indices += [self.dict["src"].eos_index]

        src_indices = self.to_device(
            torch.LongTensor(src_indices).unsqueeze(0)
        )
        src_lengths = self.to_device(
            torch.LongTensor([src_indices.size(1)])
        )

        states.encoder_states = self.model.encoder(src_indices, src_lengths)

        torch.cuda.empty_cache()

    def update_states_read(self, states):
        """Refresh the encoder states; happens after a read action."""
        self.update_model_encoder(states)

    def units_to_segment(self, units, states):
        """Turn the latest predicted unit into an output segment.

        Returns ``DEFAULT_EOS`` on end of sentence or when the target exceeds
        ``max_len``, ``None`` for a bare BOS-prefix unit, and otherwise the
        unit with a leading BOS prefix stripped.
        """
        # Merge sub words (units) to full word (segment).
        # For Japanese, we can directly send
        # the untokenized token to server except the BOS token
        # with following option
        # --sacrebleu-tokenizer ja-mecab
        # --eval-latency-unit char
        # --no-space
        token = units.value.pop()

        if (
            token == self.dict["tgt"].eos_word
            or len(states.segments.target) > self.max_len
        ):
            return DEFAULT_EOS

        if BOS_PREFIX == token:
            return None
        # startswith() instead of token[0]: an empty unit must not raise.
        if token.startswith(BOS_PREFIX):
            return token[1:]
        else:
            return token

    def policy(self, states):
        """Run one decoder step and return READ_ACTION or WRITE_ACTION."""
        if not getattr(states, "encoder_states", None):
            # No encoder states, read a token first
            return READ_ACTION

        # encode previous predicted target tokens
        tgt_indices = self.to_device(
            torch.LongTensor(
                [self.model.decoder.dictionary.eos()]
                + [
                    self.dict['tgt'].index(x)
                    for x in states.units.target.value
                    if x is not None
                ]
            ).unsqueeze(0)
        )

        # Current steps
        states.incremental_states["steps"] = {
            "src": states.encoder_states["encoder_out"][0].size(0),
            "tgt": 1 + len(states.units.target),
        }

        # Online only means the reading is not finished
        states.incremental_states["online"]["only"] = (
            torch.BoolTensor([not states.finish_read()])
        )

        x, outputs = self.model.decoder.forward(
            prev_output_tokens=tgt_indices,
            encoder_out=states.encoder_states,
            incremental_state=states.incremental_states,
        )

        # Keep the decoder features so predict() can turn them into a token.
        states.decoder_out = x

        torch.cuda.empty_cache()

        if outputs.action == 0:
            return READ_ACTION
        else:
            return WRITE_ACTION

    def predict(self, states):
        """Greedily pick the next target token from ``states.decoder_out``."""
        # Predict target token from decoder states
        decoder_states = states.decoder_out

        lprobs = self.model.get_normalized_probs(
            [decoder_states[:, -1:]], log_probs=True
        )

        index = lprobs.argmax(dim=-1)[0, 0].item()

        if index != self.dict['tgt'].eos_index:
            token = self.dict['tgt'].string([index])
        else:
            token = self.dict['tgt'].eos_word

        return token
@register_model("convtransformer_simul_trans")
class SimulConvTransformerModel(ConvTransformerModel):
    """
    Conv-transformer speech model whose decoder is a
    ``TransformerMonotonicDecoder``.

    Implementation of the paper:

    SimulMT to SimulST: Adapting Simultaneous Text Translation to
    End-to-End Simultaneous Speech Translation

    https://www.aclweb.org/anthology/2020.aacl-main.58.pdf
    """

    @staticmethod
    def add_args(parser):
        """Add the parent model's options plus ``--train-monotonic-only``."""
        ConvTransformerModel.add_args(parser)
        parser.add_argument(
            "--train-monotonic-only",
            action="store_true",
            default=False,
            help="Only train monotonic attention",
        )

    @classmethod
    def build_decoder(cls, args, task, embed_tokens):
        """Build the monotonic decoder, optionally loading pretrained weights."""
        # Imported at call time, as in the original implementation.
        from examples.simultaneous_translation.models.transformer_monotonic_attention import (
            TransformerMonotonicDecoder,
        )

        monotonic_decoder = TransformerMonotonicDecoder(
            args, task.tgt_dict, embed_tokens
        )

        pretrained_path = getattr(args, "load_pretrained_decoder_from", None)
        if not pretrained_path:
            return monotonic_decoder

        return checkpoint_utils.load_pretrained_component_from_model(
            component=monotonic_decoder, checkpoint=pretrained_path
        )
@register_model("convtransformer_emformer")
class ConvtransformerEmformer(SimulConvTransformerModel):
    """Simultaneous conv-transformer model that uses
    ``ConvTransformerEmformerEncoder`` as its encoder."""

    @staticmethod
    def add_args(parser):
        """Add the parent options plus the segment / memory options."""
        super(ConvtransformerEmformer, ConvtransformerEmformer).add_args(parser)

        parser.add_argument(
            "--segment-length",
            type=int,
            metavar="N",
            help="length of each segment (not including left context / right context)",
        )
        parser.add_argument(
            "--segment-left-context",
            type=int,
            help="length of left context in a segment",
        )
        parser.add_argument(
            "--segment-right-context",
            type=int,
            help="length of right context in a segment",
        )
        # The help text used to be a copy of the right-context description.
        # The value is forwarded as ``max_memory_size`` to the encoder layer.
        parser.add_argument(
            "--max-memory-size",
            type=int,
            default=-1,
            help="maximum memory size passed to the Emformer encoder layers",
        )
        parser.add_argument(
            "--amtrf-tanh-on-mem",
            default=False,
            action="store_true",
            help="whether to use tanh on memory vector",
        )

    @classmethod
    def build_encoder(cls, args):
        """Build the Emformer encoder, optionally loading pretrained weights."""
        encoder = ConvTransformerEmformerEncoder(args)
        if getattr(args, "load_pretrained_encoder_from", None):
            encoder = checkpoint_utils.load_pretrained_component_from_model(
                component=encoder, checkpoint=args.load_pretrained_encoder_from
            )
        return encoder
class TransformerMonotonicEncoder(TransformerEncoder):
    """Transformer encoder whose layer stack is replaced with
    ``TransformerMonotonicEncoderLayer`` instances."""

    def __init__(self, args, dictionary, embed_tokens):
        super().__init__(args, dictionary, embed_tokens)

        self.dictionary = dictionary

        # Swap the layers created by the parent constructor for
        # ``args.encoder_layers`` monotonic encoder layers.
        monotonic_layers = [
            TransformerMonotonicEncoderLayer(args)
            for _ in range(args.encoder_layers)
        ]
        self.layers = nn.ModuleList(monotonic_layers)
def clean_cache(
    self,
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
    end_id: Optional[int] = None,
):
    """
    Prune the incremental state of the first ``end_id`` decoder layers.

    The cache exists because a decoder forward pass ran without producing
    a prediction, so the self-attention key/value were still written to
    the incremental state.

    ``end_id`` is the exclusive upper bound on the layer index; when it is
    ``None`` every layer is pruned.
    """
    limit = len(self.layers) if end_id is None else end_id

    for position, layer in enumerate(self.layers):
        if position >= limit:
            # Layers at or beyond the limit are left untouched.
            break
        layer.prune_incremental_state(incremental_state)
@register_model_architecture(
    "transformer_monotonic", "transformer_monotonic_vaswani_wmt_en_fr_big"
)
def transformer_monotonic_vaswani_wmt_en_fr_big(args):
    """Set defaults for the monotonic "big" En-Fr transformer architecture.

    The previous body called this very function, so selecting the
    architecture recursed until ``RecursionError``.

    Args:
        args: namespace of model hyper-parameters; missing attributes are
            filled in place.
    """
    # NOTE(review): this is intended to mirror fairseq's
    # ``transformer_vaswani_wmt_en_fr_big`` (dropout default 0.1 on top of
    # the En-De big defaults) - confirm against the base transformer module.
    args.dropout = getattr(args, "dropout", 0.1)
    transformer_monotonic_vaswani_wmt_en_de_big(args)
import os import importlib from fairseq import registry ( build_monotonic_attention, register_monotonic_attention, MONOTONIC_ATTENTION_REGISTRY, _, ) = registry.setup_registry("--simul-type") for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): model_name = file[: file.find(".py")] importlib.import_module( "examples.simultaneous_translation.modules." + model_name ) ================================================ FILE: examples/simultaneous_translation/modules/fixed_pre_decision.py ================================================ from functools import partial import torch from torch import Tensor import math import torch.nn.functional as F from . import register_monotonic_attention from .monotonic_multihead_attention import ( MonotonicAttention, MonotonicInfiniteLookbackAttention, WaitKAttention ) from typing import Dict, Optional def fixed_pooling_monotonic_attention(monotonic_attention): def create_model(monotonic_attention, klass): class FixedStrideMonotonicAttention(monotonic_attention): def __init__(self, args): self.waitk_lagging = 0 self.num_heads = 0 self.noise_mean = 0.0 self.noise_var = 0.0 super().__init__(args) self.pre_decision_type = args.fixed_pre_decision_type self.pre_decision_ratio = args.fixed_pre_decision_ratio self.pre_decision_pad_threshold = args.fixed_pre_decision_pad_threshold assert self.pre_decision_ratio > 1 if args.fixed_pre_decision_type == "average": self.pooling_layer = torch.nn.AvgPool1d( kernel_size=self.pre_decision_ratio, stride=self.pre_decision_ratio, ceil_mode=True, ) elif args.fixed_pre_decision_type == "last": def last(key): if key.size(2) < self.pre_decision_ratio: return key else: k = key[ :, :, self.pre_decision_ratio - 1:: self.pre_decision_ratio, ].contiguous() if key.size(-1) % self.pre_decision_ratio != 0: k = torch.cat([k, key[:, :, -1:]], dim=-1).contiguous() return k self.pooling_layer = last else: raise NotImplementedError @staticmethod def 
add_args(parser): super( FixedStrideMonotonicAttention, FixedStrideMonotonicAttention ).add_args(parser) parser.add_argument( "--fixed-pre-decision-ratio", type=int, required=True, help=( "Ratio for the fixed pre-decision," "indicating how many encoder steps will start" "simultaneous decision making process." ), ) parser.add_argument( "--fixed-pre-decision-type", default="average", choices=["average", "last"], help="Pooling type", ) parser.add_argument( "--fixed-pre-decision-pad-threshold", type=float, default=0.3, help="If a part of the sequence has pad" ",the threshold the pooled part is a pad.", ) def insert_zeros(self, x): bsz_num_heads, tgt_len, src_len = x.size() stride = self.pre_decision_ratio weight = F.pad(torch.ones(1, 1, 1).to(x), (stride - 1, 0)) x_upsample = F.conv_transpose1d( x.view(-1, src_len).unsqueeze(1), weight, stride=stride, padding=0, ) return x_upsample.squeeze(1).view(bsz_num_heads, tgt_len, -1) def p_choose( self, query: Optional[Tensor], key: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, ): assert key is not None assert query is not None src_len = key.size(0) tgt_len = query.size(0) batch_size = query.size(1) key_pool = self.pooling_layer(key.transpose(0, 2)).transpose(0, 2) if key_padding_mask is not None: key_padding_mask_pool = ( self.pooling_layer(key_padding_mask.unsqueeze(0).float()) .squeeze(0) .gt(self.pre_decision_pad_threshold) ) # Make sure at least one element is not pad key_padding_mask_pool[:, 0] = 0 else: key_padding_mask_pool = None if incremental_state is not None: # The floor instead of ceil is used for inference # But make sure the length key_pool at least 1 if ( max(1, math.floor(key.size(0) / self.pre_decision_ratio)) ) < key_pool.size(0): key_pool = key_pool[:-1] if key_padding_mask_pool is not None: key_padding_mask_pool = key_padding_mask_pool[:-1] p_choose_pooled = self.p_choose_from_qk( query, key_pool, 
key_padding_mask_pool, incremental_state=incremental_state, ) # Upsample, interpolate zeros p_choose = self.insert_zeros(p_choose_pooled) if p_choose.size(-1) < src_len: # Append zeros if the upsampled p_choose is shorter than src_len p_choose = torch.cat( [ p_choose, torch.zeros( p_choose.size(0), tgt_len, src_len - p_choose.size(-1) ).to(p_choose) ], dim=2 ) else: # can be larger than src_len because we used ceil before p_choose = p_choose[:, :, :src_len] p_choose[:, :, -1] = p_choose_pooled[:, :, -1] assert list(p_choose.size()) == [ batch_size * self.num_heads, tgt_len, src_len, ] return p_choose FixedStrideMonotonicAttention.__name__ = klass.__name__ return FixedStrideMonotonicAttention return partial(create_model, monotonic_attention) @register_monotonic_attention("waitk_fixed_pre_decision") @fixed_pooling_monotonic_attention(WaitKAttention) class WaitKAttentionFixedStride: pass @register_monotonic_attention("hard_aligned_fixed_pre_decision") @fixed_pooling_monotonic_attention(MonotonicAttention) class MonotonicAttentionFixedStride: pass @register_monotonic_attention("infinite_lookback_fixed_pre_decision") @fixed_pooling_monotonic_attention(MonotonicInfiniteLookbackAttention) class MonotonicInfiniteLookbackAttentionFixedStride: pass ================================================ FILE: examples/simultaneous_translation/modules/monotonic_multihead_attention.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import math import torch from torch import Tensor import torch.nn as nn from examples.simultaneous_translation.utils.p_choose_strategy import ( learnable_p_choose, waitk_p_choose ) from examples.simultaneous_translation.utils.monotonic_attention import ( expected_alignment_from_p_choose, expected_soft_attention, mass_preservation, ) from fairseq.modules import MultiheadAttention from . import register_monotonic_attention from typing import Dict, Optional @register_monotonic_attention("hard_aligned") class MonotonicAttention(MultiheadAttention): """ Abstract class of monotonic attentions """ k_in_proj: Dict[str, nn.Linear] q_in_proj: Dict[str, nn.Linear] def __init__(self, args): super().__init__( embed_dim=args.decoder_embed_dim, num_heads=args.decoder_attention_heads, kdim=getattr(args, "encoder_embed_dim", None), vdim=getattr(args, "encoder_embed_dim", None), dropout=args.attention_dropout, encoder_decoder_attention=True, ) self.soft_attention = False self.eps = getattr(args, "attention_eps", True) self.mass_preservation = getattr(args, "mass_preservation", True) self.noise_type = args.noise_type self.noise_mean = args.noise_mean self.noise_var = args.noise_var self.energy_bias_init = args.energy_bias_init self.energy_bias = ( nn.Parameter(self.energy_bias_init * torch.ones([1])) if args.energy_bias is True else 0 ) self.k_in_proj = {"monotonic": self.k_proj} self.q_in_proj = {"monotonic": self.q_proj} self.chunk_size = None @staticmethod def add_args(parser): # fmt: off parser.add_argument('--no-mass-preservation', action="store_false", dest="mass_preservation", help='Do not stay on the last token when decoding') parser.add_argument('--mass-preservation', action="store_true", dest="mass_preservation", help='Stay on the last token when decoding') parser.set_defaults(mass_preservation=True) parser.add_argument('--noise-var', type=float, default=1.0, help='Variance of discretness noise') parser.add_argument('--noise-mean', type=float, default=0.0, help='Mean of 
discretness noise') parser.add_argument('--noise-type', type=str, default="flat", help='Type of discretness noise') parser.add_argument('--energy-bias', action="store_true", default=False, help='Bias for energy') parser.add_argument('--energy-bias-init', type=float, default=-2.0, help='Initial value of the bias for energy') parser.add_argument('--attention-eps', type=float, default=1e-6, help='Epsilon when calculating expected attention') def energy_from_qk( self, query: Tensor, key: Tensor, energy_type: str, key_padding_mask: Optional[Tensor] = None, bias: int = 0 ): """ Compute energy from query and key q_func_value is a tuple looks like (q_proj_func, q_tensor) q_tensor size: bsz, tgt_len, emb_dim k_tensor size: bsz, src_len, emb_dim key_padding_mask size: bsz, src_len attn_mask: bsz, src_len """ length, bsz, _ = query.size() q = self.q_in_proj[energy_type].forward(query) q = ( q.contiguous() .view(length, bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) q = q * self.scaling length, bsz, _ = key.size() k = self.k_in_proj[energy_type].forward(key) k = ( k.contiguous() .view(length, bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) energy = torch.bmm(q, k.transpose(1, 2)) + bias if key_padding_mask is not None: energy = energy.masked_fill( key_padding_mask.unsqueeze(1).to(torch.bool), - float("inf") ) return energy def p_choose_from_qk(self, query, key, key_padding_mask, incremental_states=None): monotonic_energy = self.energy_from_qk( query, key, "monotonic", key_padding_mask=key_padding_mask, bias=self.energy_bias, ) p_choose = learnable_p_choose( monotonic_energy, self.noise_mean, self.noise_var, self.training ) return p_choose def p_choose(self, query, key, key_padding_mask, incremental_states=None): return self.p_choose_from_qk(self, query, key, key_padding_mask) def monotonic_attention_process_infer( self, query: Optional[Tensor], key: Optional[Tensor], incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], ): """ Monotonic 
attention at inference time Notice that this function is designed for simuleval not sequence_generator """ assert query is not None assert key is not None if query.size(1) != 1: raise RuntimeError( "Simultaneous translation models don't support batch decoding." ) # 1. compute stepwise probability p_choose = self.p_choose( query, key, None, incremental_state ).squeeze(1) # 2. Compute the alpha src_len = key.size(0) # Maximum steps allows in this iteration max_steps = src_len - 1 if self.mass_preservation else src_len monotonic_cache = self._get_monotonic_buffer(incremental_state) # Step for each head monotonic_step = monotonic_cache.get( 'head_step', p_choose.new_zeros(1, self.num_heads).long() ) assert monotonic_step is not None finish_read = monotonic_step.eq(max_steps) p_choose_i = torch.tensor(1) while finish_read.sum().item() < self.num_heads: # p_choose: self.num_heads, src_len # only choose the p at monotonic steps # p_choose_i: 1, self.num_heads p_choose_i = ( p_choose.gather( 1, monotonic_step .clamp(0, src_len - 1), ) ) read_one_step = ( (p_choose_i < 0.5) .type_as(monotonic_step) .masked_fill(finish_read, 0) ) # 1 x bsz # sample actions on unfinished seq # 0 means stay, finish reading # 1 means leave, continue reading monotonic_step += read_one_step finish_read = monotonic_step.eq(max_steps) | (read_one_step == 0) # p_choose at last steps p_choose_i = ( p_choose.gather( 1, monotonic_step .clamp(0, src_len - 1), ) ) monotonic_cache["head_step"] = monotonic_step # Whether a head is looking for new input monotonic_cache["head_read"] = ( monotonic_step.eq(max_steps) & (p_choose_i < 0.5) ) self._set_monotonic_buffer(incremental_state, monotonic_cache) # 2. Update alpha alpha = ( p_choose .new_zeros([self.num_heads, src_len]) .scatter( 1, (monotonic_step) .view(self.num_heads, 1).clamp(0, src_len - 1), 1 ) ) if not self.mass_preservation: alpha = alpha.masked_fill( (monotonic_step == max_steps) .view(self.num_heads, 1), 0 ) # 4. 
Compute Beta if self.soft_attention: monotonic_step = monotonic_step.t() beta_mask = torch.arange(src_len).expand_as(alpha).gt(monotonic_step).unsqueeze(1) # If it's soft attention just do softmax on current context soft_energy = self.energy_from_qk( query, key, "soft" ) beta = torch.nn.functional.softmax( soft_energy.masked_fill(beta_mask, -float("inf")), dim=-1 ) # It could happen that a head doesn't move at all beta = beta.masked_fill(monotonic_step.eq(0).unsqueeze(1), 0) else: # If it's hard attention just select the last state beta = alpha return p_choose, alpha, beta def monotonic_attention_process_train( self, query: Optional[Tensor], key: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, ): """ Calculating monotonic attention process for training Including: stepwise probability: p_choose expected hard alignment: alpha expected soft attention: beta """ assert query is not None assert key is not None # 1. compute stepwise probability p_choose = self.p_choose_from_qk(query, key, key_padding_mask) # 2. compute expected_alignment alpha = expected_alignment_from_p_choose( p_choose, key_padding_mask, eps=self.eps, ) if self.mass_preservation: alpha = mass_preservation( alpha, key_padding_mask ) # 3. 
compute expected soft attention (soft aligned model only) if self.soft_attention: soft_energy = self.energy_from_qk( query, key, "soft", key_padding_mask=None, ) beta = expected_soft_attention( alpha, soft_energy, padding_mask=key_padding_mask, chunk_size=self.chunk_size, eps=self.eps, ) else: beta = alpha soft_energy = alpha return p_choose, alpha, beta, soft_energy def forward( self, query: Optional[Tensor], key: Optional[Tensor], value: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, attn_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, need_weights: bool = True, static_kv: bool = False, need_head_weights: bool = False, ): """ query: tgt_len, bsz, embed_dim key: src_len, bsz, embed_dim value: src_len, bsz, embed_dim """ assert attn_mask is None assert query is not None assert key is not None assert value is not None tgt_len, bsz, embed_dim = query.size() src_len = value.size(0) if key_padding_mask is not None: assert not key_padding_mask[:, 0].any(), ( "Only right padding is supported." 
) key_padding_mask = ( key_padding_mask .unsqueeze(1) .expand([bsz, self.num_heads, src_len]) .contiguous() .view(-1, src_len) ) if incremental_state is not None: # Inference ( p_choose, alpha, beta ) = self.monotonic_attention_process_infer( query, key, incremental_state ) soft_energy = beta else: # Train ( p_choose, alpha, beta, soft_energy ) = self.monotonic_attention_process_train( query, key, key_padding_mask ) v = self.v_proj(value) length, bsz, _ = v.size() v = ( v.contiguous() .view(length, bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) attn = torch.bmm(beta.type_as(v), v) attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) attn = self.out_proj(attn) p_choose = p_choose.view(bsz, self.num_heads, tgt_len, src_len) alpha = alpha.view(bsz, self.num_heads, tgt_len, src_len) beta = beta.view(bsz, self.num_heads, tgt_len, src_len) return attn, { "p_choose": p_choose, "alpha": alpha, "beta": beta, "soft_energy": soft_energy, } def _get_monotonic_buffer(self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]): maybe_incremental_state = self.get_incremental_state( incremental_state, 'monotonic', ) if maybe_incremental_state is None: typed_empty_dict: Dict[str, Optional[Tensor]] = {} return typed_empty_dict else: return maybe_incremental_state def _set_monotonic_buffer(self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], buffer: Dict[str, Optional[Tensor]]): self.set_incremental_state( incremental_state, 'monotonic', buffer, ) @register_monotonic_attention("infinite_lookback") class MonotonicInfiniteLookbackAttention( MonotonicAttention ): def __init__(self, args): super().__init__(args) self.soft_attention = True self.init_soft_attention() def init_soft_attention(self): self.k_proj_soft = nn.Linear(self.kdim, self.embed_dim, bias=True) self.q_proj_soft = nn.Linear(self.embed_dim, self.embed_dim, bias=True) self.k_in_proj["soft"] = self.k_proj_soft self.q_in_proj["soft"] = self.q_proj_soft if 
self.qkv_same_dim: # Empirically observed the convergence to be much better with # the scaled initialization nn.init.xavier_uniform_( self.k_in_proj["soft"].weight, gain=1 / math.sqrt(2) ) nn.init.xavier_uniform_( self.q_in_proj["soft"].weight, gain=1 / math.sqrt(2) ) else: nn.init.xavier_uniform_(self.k_in_proj["soft"].weight) nn.init.xavier_uniform_(self.q_in_proj["soft"].weight) @register_monotonic_attention("waitk") class WaitKAttention( MonotonicInfiniteLookbackAttention ): """ STACL: Simultaneous Translation with Implicit Anticipation and Controllable Latency using Prefix-to-Prefix Framework https://www.aclweb.org/anthology/P19-1289/ """ def __init__(self, args): super().__init__(args) self.q_in_proj["soft"] = self.q_in_proj["monotonic"] self.k_in_proj["soft"] = self.k_in_proj["monotonic"] self.waitk_lagging = args.waitk_lagging assert self.waitk_lagging > 0, ( f"Lagging has to been larger than 0, get {self.waitk_lagging}." ) @staticmethod def add_args(parser): super( MonotonicInfiniteLookbackAttention, MonotonicInfiniteLookbackAttention ).add_args(parser) parser.add_argument( "--waitk-lagging", type=int, required=True, help="Wait K lagging" ) def p_choose_from_qk( self, query: Optional[Tensor], key: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, ): assert query is not None assert key is not None p_choose = waitk_p_choose( tgt_len=query.size(0), src_len=key.size(0), bsz=query.size(1) * self.num_heads, waitk_lagging=self.waitk_lagging, key_padding_mask=key_padding_mask, incremental_state=incremental_state, ) return p_choose.to(query) @register_monotonic_attention("chunkwise") class ChunkwiseAttention( MonotonicInfiniteLookbackAttention ): def __init__(self, args): super().__init__(args) self.chunk_size = args.mocha_chunk_size assert self.chunk_size > 1 @staticmethod def add_args(parser): super( MonotonicInfiniteLookbackAttention ).add_args(parser) parser.add_argument( 
"--mocha-chunk-size", type=int, required=True, help="Mocha chunk size" ) ================================================ FILE: examples/simultaneous_translation/modules/monotonic_transformer_layer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from fairseq.modules import TransformerDecoderLayer, TransformerEncoderLayer from . import build_monotonic_attention from typing import Dict, Optional, List from torch import Tensor import torch class TransformerMonotonicEncoderLayer(TransformerEncoderLayer): def forward(self, x, encoder_padding_mask): seq_len, _, _ = x.size() attn_mask = x.new_ones([seq_len, seq_len]).triu(1) attn_mask = attn_mask.masked_fill(attn_mask.bool(), float("-inf")) return super().forward(x, encoder_padding_mask, attn_mask) class TransformerMonotonicDecoderLayer(TransformerDecoderLayer): def __init__(self, args): super().__init__(args) assert args.simul_type is not None, "A --simul-type is needed." 
self.encoder_attn = build_monotonic_attention(args) def prune_incremental_state( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ): input_buffer = self.self_attn._get_input_buffer(incremental_state) for key in ["prev_key", "prev_value"]: input_buffer_key = input_buffer[key] assert input_buffer_key is not None if input_buffer_key.size(2) > 1: input_buffer[key] = input_buffer_key[:, :, :-1, :] else: typed_empty_dict: Dict[str, Optional[Tensor]] = {} input_buffer = typed_empty_dict break assert incremental_state is not None self.self_attn._set_input_buffer(incremental_state, input_buffer) def forward( self, x, encoder_out: Optional[Tensor] = None, encoder_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, prev_self_attn_state: Optional[List[Tensor]] = None, prev_attn_state: Optional[List[Tensor]] = None, self_attn_mask: Optional[Tensor] = None, self_attn_padding_mask: Optional[Tensor] = None, need_attn: bool = False, need_head_weights: bool = False, ): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor, optional): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. need_attn (bool, optional): return attention weights need_head_weights (bool, optional): return attention weights for each head (default: return average over heads). 
Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ if need_head_weights: need_attn = True residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) if prev_self_attn_state is not None: prev_key, prev_value = prev_self_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_self_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] assert incremental_state is not None self.self_attn._set_input_buffer(incremental_state, saved_state) _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) if self.cross_self_attention and not ( incremental_state is not None and _self_attn_input_buffer is not None and "prev_key" in _self_attn_input_buffer ): if self_attn_mask is not None: assert encoder_out is not None self_attn_mask = torch.cat( (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 ) if self_attn_padding_mask is not None: if encoder_padding_mask is None: assert encoder_out is not None encoder_padding_mask = self_attn_padding_mask.new_zeros( encoder_out.size(1), encoder_out.size(0) ) self_attn_padding_mask = torch.cat( (encoder_padding_mask, self_attn_padding_mask), dim=1 ) assert encoder_out is not None y = torch.cat((encoder_out, x), dim=0) else: y = x x, attn = self.self_attn( query=x, key=y, value=y, key_padding_mask=self_attn_padding_mask, incremental_state=incremental_state, need_weights=False, attn_mask=self_attn_mask, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.self_attn_layer_norm(x) assert self.encoder_attn is not None residual = x if self.normalize_before: x = self.encoder_attn_layer_norm(x) if prev_attn_state is not None: prev_key, prev_value = prev_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_attn_state) >= 3: saved_state["prev_key_padding_mask"] = 
prev_attn_state[2] assert incremental_state is not None self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.encoder_attn_layer_norm(x) residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) x = self.fc2(x) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) if self.onnx_trace and incremental_state is not None: saved_state = self.self_attn._get_input_buffer(incremental_state) assert saved_state is not None if self_attn_padding_mask is not None: self_attn_state = [ saved_state["prev_key"], saved_state["prev_value"], saved_state["prev_key_padding_mask"], ] else: self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] return x, attn, self_attn_state return x, attn, None ================================================ FILE: examples/simultaneous_translation/tests/test_alignment_train.py ================================================ import unittest import numpy as np import torch import hypothesis.strategies as st from hypothesis import assume, given, settings from torch.testing._internal.common_utils import TestCase from examples.simultaneous_translation.utils.functions import exclusive_cumprod TEST_CUDA = torch.cuda.is_available() class AlignmentTrainTest(TestCase): def _test_custom_alignment_train_ref(self, p_choose, eps): cumprod_1mp = exclusive_cumprod(1 - p_choose, dim=2, eps=eps) cumprod_1mp_clamp = torch.clamp(cumprod_1mp, eps, 1.0) bsz = p_choose.size(0) tgt_len = p_choose.size(1) src_len = 
p_choose.size(2) alpha_0 = p_choose.new_zeros([bsz, 1, src_len]) alpha_0[:, :, 0] = 1.0 previous_alpha = [alpha_0] for i in range(tgt_len): # p_choose: bsz , tgt_len, src_len # cumprod_1mp_clamp : bsz, tgt_len, src_len # previous_alpha[i]: bsz, 1, src_len # alpha_i: bsz, src_len alpha_i = ( p_choose[:, i] * cumprod_1mp[:, i] * torch.cumsum( previous_alpha[i][:, 0] / cumprod_1mp_clamp[:, i], dim=1 ) ).clamp(0, 1.0) previous_alpha.append(alpha_i.unsqueeze(1)) # alpha: bsz * num_heads, tgt_len, src_len alpha = torch.cat(previous_alpha[1:], dim=1) return alpha def _test_custom_alignment_train_impl(self, p_choose, alpha, eps): if p_choose.is_cuda: from alignment_train_cuda_binding import alignment_train_cuda # @manual=//deeplearning/projects/fairseq-py:alignment_train_cuda_binding alignment_train_cuda(p_choose, alpha, eps) else: from alignment_train_cpu_binding import alignment_train_cpu # @manual=//deeplearning/projects/fairseq-py:alignment_train_cpu_binding alignment_train_cpu(p_choose, alpha, eps) @settings(deadline=None) @given( bsz=st.integers(1, 100), tgt_len=st.integers(1, 100), src_len=st.integers(1, 550), device=st.sampled_from(["cpu", "cuda"]), ) def test_alignment_train(self, bsz, tgt_len, src_len, device): eps = 1e-6 assume(device == "cpu" or TEST_CUDA) p_choose = torch.rand(bsz, tgt_len, src_len, device=device) # run the alignment with the custom operator alpha_act = p_choose.new_zeros([bsz, tgt_len, src_len]) self._test_custom_alignment_train_impl(p_choose, alpha_act, eps) # runu the alignment with the ref implementation alpha_ref = self._test_custom_alignment_train_ref(p_choose, eps) # verify the results alpha_act = alpha_act.cpu().detach().numpy() alpha_ref = alpha_ref.cpu().detach().numpy() np.testing.assert_allclose( alpha_act, alpha_ref, atol=1e-3, rtol=1e-3, ) if __name__ == "__main__": unittest.main() ================================================ FILE: examples/simultaneous_translation/tests/test_text_models.py 
================================================ import argparse import unittest from typing import Any, Dict import torch from examples.simultaneous_translation.models import ( transformer_monotonic_attention ) from tests.test_roberta import FakeTask DEFAULT_CONFIG = { "attention_eps": 1e-6, "mass_preservation": True, "noise_type": "flat", "noise_mean": 0.0, "noise_var": 1.0, "energy_bias_init": -2, "energy_bias": True } PAD_INDEX = 1 def generate_config(overrides_kv): new_dict = {key: value for key, value in DEFAULT_CONFIG.items()} for key, value in overrides_kv.items(): new_dict[key] = value return new_dict def make_sample_with_padding(longer_src=False) -> Dict[str, Any]: tokens_1 = torch.LongTensor( [ [2, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15, 2], [ 2, 11, 12, 14, 15, 10, 11, 12, 13, 14, 15, 2, PAD_INDEX, PAD_INDEX ], ] ) tokens_2 = torch.LongTensor( [ [2, 11, 12, 13, 14, 2, PAD_INDEX, PAD_INDEX], [2, 11, 22, 33, 2, PAD_INDEX, PAD_INDEX, PAD_INDEX] ] ) if longer_src: src_tokens = tokens_1[:, 1:] prev_output_tokens = tokens_2 else: src_tokens = tokens_2[:, 1:8] prev_output_tokens = tokens_1 src_lengths = src_tokens.ne(PAD_INDEX).sum(dim=1).long() sample = { "net_input": { "src_tokens": src_tokens, "prev_output_tokens": prev_output_tokens, "src_lengths": src_lengths, }, "target": prev_output_tokens[:, 1:], } return sample def build_transformer_monotonic_attention(**extra_args: Any): overrides = { # Use characteristics dimensions "encoder_embed_dim": 12, "encoder_ffn_embed_dim": 14, "decoder_embed_dim": 12, "decoder_ffn_embed_dim": 14, # Disable dropout so we have comparable tests. 
"dropout": 0, "attention_dropout": 0, "activation_dropout": 0, "encoder_layerdrop": 0, } overrides.update(extra_args) # Overrides the defaults from the parser args = argparse.Namespace(**overrides) transformer_monotonic_attention.monotonic_tiny_architecture(args) torch.manual_seed(0) task = FakeTask(args) return ( transformer_monotonic_attention .TransformerModelSimulTrans .build_model(args, task) ) def expected_alignment_formula( p_choose, mass_perservation=True, padding_mask=None ): # Online and Linear-Time Attention by Enforcing Monotonic Alignments # https://arxiv.org/pdf/1704.00784.pdf # Eq 18, 19 bsz, tgt_len, src_len = p_choose.size() alpha = torch.zeros_like(p_choose) if padding_mask is not None: bsz_pad = padding_mask.size(0) num_heads = int(bsz / bsz_pad) padding_mask = ( padding_mask .unsqueeze(1) .expand([bsz_pad, num_heads, src_len]) .contiguous() .view(-1, src_len) ) p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0) for bsz_i in range(bsz): for i in range(tgt_len): for j in range(src_len): if i == 0: if j == 0: # First source token alpha[bsz_i, i, j] = p_choose[bsz_i, i, j] else: # First target token alpha[bsz_i, i, j] = ( p_choose[bsz_i, i, j] * torch.prod( 1 - p_choose[bsz_i, i, :j] ) ) else: alpha[bsz_i, i, j] = alpha[bsz_i, i - 1, j] for k in range(j): alpha[bsz_i, i, j] += ( alpha[bsz_i, i - 1, k] * torch.prod( 1 - p_choose[bsz_i, i, k:j] ) ) alpha[bsz_i, i, j] *= p_choose[bsz_i, i, j] alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0) if mass_perservation: alpha = mass_perservation_formula(alpha, False, padding_mask) return alpha def mass_perservation_formula(alpha, left_padding=False, padding_mask=None): if padding_mask is None or alpha.size(-1) == 1: if alpha.size(-1) > 1: alpha[:, :, -1] = 1 - alpha[:, :, :-1].sum(dim=-1) return alpha src_lens = (padding_mask.logical_not()).sum(dim=1).long() bsz, tgt_len, src_len = alpha.size() assert ( not left_padding or (left_padding and (not padding_mask[:, 0].any())) ) alpha = 
alpha.masked_fill(padding_mask.unsqueeze(1), 0) for bsz_i in range(bsz): if left_padding: alpha[bsz_i, :, -1] = ( 1 - alpha[bsz_i, :, :-1].sum(dim=-1) ) else: alpha[bsz_i, :, src_lens[bsz_i] - 1] = ( 1 - alpha[bsz_i, :, :src_lens[bsz_i] - 1].sum(dim=-1) ) return alpha def expected_soft_attention_formula( alpha, soft_energy, padding_mask=None, chunksize=1e10, ): # Monotonic Infinite Lookback Attention for Simultaneous Machine Translation # https://arxiv.org/pdf/1906.05218.pdf # Eq 14 # Monotonic Chunkwise Attention # https://arxiv.org/abs/1712.05382 # Eq 17 bsz, tgt_len, src_len = alpha.size() beta = torch.zeros_like(alpha) if padding_mask is not None: bsz_pad = padding_mask.size(0) num_heads = int(bsz / bsz_pad) # Expanding for potential head dimension padding_mask = ( padding_mask .unsqueeze(1) .expand([bsz_pad, num_heads, src_len]) .contiguous() .view(-1, src_len) ) soft_energy = soft_energy.masked_fill(padding_mask.unsqueeze(1), float('-inf')) for bsz_i in range(bsz): for i in range(tgt_len): for j in range(src_len): for k in range(j, min([src_len, j + chunksize])): if not padding_mask[bsz_i, j]: beta[bsz_i, i, j] += ( alpha[bsz_i, i, k] * torch.exp(soft_energy[bsz_i, i, j]) / torch.sum(torch.exp(soft_energy[bsz_i, i, max([0, k - chunksize + 1]):k + 1])) ) return beta class MonotonicAttentionTestAbstractClass(object): def test_forward(self): sample = make_sample_with_padding() out, _ = self.model.forward(**sample["net_input"]) loss = out.sum() loss.backward() def test_p_choose(self): sample = make_sample_with_padding() _, extra_out = self.model.forward(**sample["net_input"]) for item in extra_out.attn_list: p_choose = item["p_choose"] self.assertTrue(p_choose.le(1.0).all()) self.assertTrue(p_choose.ge(0.0).all()) def test_expected_alignment(self): for longer_src in [True, False]: sample = make_sample_with_padding(longer_src) _, extra_out = self.model.forward(**sample["net_input"]) for item in extra_out.attn_list: p_choose = item["p_choose"] alpha_system = 
item["alpha"] self.assertTrue(p_choose.size() == alpha_system.size()) bsz, num_head, tgt_len, src_len = alpha_system.size() alpha_system = alpha_system.view(-1, tgt_len, src_len) p_choose = p_choose.view(-1, tgt_len, src_len) alpha_real = expected_alignment_formula( p_choose, self.model.decoder.layers[0].encoder_attn.mass_preservation, sample["net_input"]["src_tokens"].eq(PAD_INDEX) ) self.assertTrue( torch.abs(alpha_system - alpha_real).le(5e-5).all(), ) class HardMonotonicAttentionTestCase( unittest.TestCase, MonotonicAttentionTestAbstractClass ): def setUp(self): self.model = build_transformer_monotonic_attention( **generate_config({"simul_type": "hard_aligned"}) ) class InfiniteLookbackTestCase( unittest.TestCase, MonotonicAttentionTestAbstractClass ): def setUp(self): self.model = build_transformer_monotonic_attention( **generate_config( { "simul_type": "infinite_lookback" } ) ) self.model.train() def test_fp16_for_long_input(self): sample = { "net_input": { "src_tokens": torch.LongTensor([7] * 1000 + [2]).cuda().unsqueeze(0), "prev_output_tokens": torch.LongTensor([7] * 1000 + [2]).cuda().unsqueeze(0), "src_lengths": torch.LongTensor([1000]).cuda(), }, "target": torch.LongTensor([2] + [7] * 1000).unsqueeze(0).cuda() } self.model.cuda().half() _, extra_out = self.model.forward(**sample["net_input"]) for item in extra_out.attn_list: for key in ["p_choose", "alpha", "beta", "soft_energy"]: self.assertFalse(torch.isnan(item[key]).any()) def test_expected_attention(self): for longer_src in [True, False]: sample = make_sample_with_padding(longer_src) _, extra_out = self.model.forward(**sample["net_input"]) for item in extra_out.attn_list: p_choose = item["p_choose"] alpha_system = item["alpha"] beta_system = item["beta"] soft_energy_system = item["soft_energy"] self.assertTrue(beta_system.size() == alpha_system.size()) self.assertTrue(p_choose.size() == alpha_system.size()) bsz, num_head, tgt_len, src_len = alpha_system.size() alpha_system = alpha_system.view(-1, 
tgt_len, src_len) beta_system = beta_system.view(-1, tgt_len, src_len) p_choose = p_choose.view(-1, tgt_len, src_len) soft_energy_system = soft_energy_system.view(-1, tgt_len, src_len) alpha_real = expected_alignment_formula( p_choose, self.model.decoder.layers[0].encoder_attn.mass_preservation, sample["net_input"]["src_tokens"].eq(PAD_INDEX) ) beta_real = expected_soft_attention_formula( alpha_real, soft_energy_system, sample["net_input"]["src_tokens"].eq(PAD_INDEX), chunksize=getattr( self.model.decoder.layers[0].encoder_attn, "chunk_size", int(1e10) ) or int(1e10) ) self.assertTrue( torch.abs(beta_system - beta_real).le(1e-5).all(), ) class ChunkwiswTestCase( InfiniteLookbackTestCase ): def setUp(self): self.model = build_transformer_monotonic_attention( **generate_config( { "simul_type": "chunkwise", "mocha_chunk_size": 3 } ) ) class WaitkTestCase(InfiniteLookbackTestCase): def setUp(self): self.model = build_transformer_monotonic_attention( **generate_config( { "simul_type": "waitk", "waitk_lagging": 3, } ) ) def check_waitk(self, p_choose, lagging, padding_mask): bsz, tgt_len, src_len = p_choose.size() for bsz_i in range(bsz): for i in range(tgt_len): for j in range(src_len): if not padding_mask[bsz_i, j]: if j - i == lagging - 1: self.assertTrue(p_choose[bsz_i, i, j] == 1) else: self.assertTrue(p_choose[bsz_i, i, j] == 0) def test_waitk_p_choose(self): for longer_src in [True, False]: for k in [1, 3, 10, 20, 100]: sample = make_sample_with_padding(longer_src) model = build_transformer_monotonic_attention( **generate_config( { "simul_type": "waitk", "waitk_lagging": k, } ) ) model.train() _, extra_out = model.forward(**sample["net_input"]) for item in extra_out.attn_list: p_choose = item["p_choose"] bsz, num_heads, tgt_len, src_len = p_choose.size() padding_mask = sample["net_input"]["src_tokens"].eq(PAD_INDEX) padding_mask = ( padding_mask .unsqueeze(1) .expand([bsz, num_heads, src_len]) .contiguous() .view(-1, src_len) ) p_choose = p_choose.view(bsz * 
num_heads, tgt_len, src_len) self.check_waitk(p_choose, k, padding_mask) ================================================ FILE: examples/simultaneous_translation/utils/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import importlib import os # automatically import any Python files in the criterions/ directory for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): module = file[: file.find(".py")] importlib.import_module("examples.simultaneous_translation.utils." + module) ================================================ FILE: examples/simultaneous_translation/utils/functions.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch def prob_check(tensor, eps=1e-10): assert not torch.isnan(tensor).any(), ( "Nan in a probability tensor." ) # Add the eps here to prevent errors introduced by precision assert tensor.le(1.0 + eps).all() and tensor.ge(0.0 - eps).all(), ( "Incorrect values in a probability tensor" ", 0.0 <= tensor <= 1.0" ) def exclusive_cumprod(tensor, dim: int, eps: float = 1e-10): """ Implementing exclusive cumprod. There is cumprod in pytorch, however there is no exclusive mode. 
cumprod(x) = [x1, x1x2, x2x3x4, ..., prod_{i=1}^n x_i] exclusive means cumprod(x) = [1, x1, x1x2, x1x2x3, ..., prod_{i=1}^{n-1} x_i] """ tensor_size = list(tensor.size()) tensor_size[dim] = 1 return_tensor = safe_cumprod( torch.cat([torch.ones(tensor_size).type_as(tensor), tensor], dim=dim), dim=dim, eps=eps, ) if dim == 0: return return_tensor[:-1] elif dim == 1: return return_tensor[:, :-1] elif dim == 2: return return_tensor[:, :, :-1] else: raise RuntimeError( "Cumprod on dimension 3 and more is not implemented" ) def safe_cumprod(tensor, dim: int, eps: float = 1e-10): """ An implementation of cumprod to prevent precision issue. cumprod(x) = [x1, x1x2, x1x2x3, ....] = [exp(log(x1)), exp(log(x1) + log(x2)), exp(log(x1) + log(x2) + log(x3)), ...] = exp(cumsum(log(x))) """ if (tensor + eps < 0).any().item(): raise RuntimeError( "Safe cumprod can only take non-negative tensors as input." "Consider use torch.cumprod if you want to calculate negative values." ) log_tensor = torch.log(tensor + eps) cumsum_log_tensor = torch.cumsum(log_tensor, dim) exp_cumsum_log_tensor = torch.exp(cumsum_log_tensor) return exp_cumsum_log_tensor def moving_sum(x, start_idx: int, end_idx: int): """ From MONOTONIC CHUNKWISE ATTENTION https://arxiv.org/pdf/1712.05382.pdf Equation (18) x = [x_1, x_2, ..., x_N] MovingSum(x, start_idx, end_idx)_n = Sigma_{m=n−(start_idx−1)}^{n+end_idx-1} x_m for n in {1, 2, 3, ..., N} x : src_len, batch_size start_idx : start idx end_idx : end idx Example src_len = 5 batch_size = 3 x = [[ 0, 5, 10], [ 1, 6, 11], [ 2, 7, 12], [ 3, 8, 13], [ 4, 9, 14]] MovingSum(x, 3, 1) = [[ 0, 5, 10], [ 1, 11, 21], [ 3, 18, 33], [ 6, 21, 36], [ 9, 24, 39]] MovingSum(x, 1, 3) = [[ 3, 18, 33], [ 6, 21, 36], [ 9, 24, 39], [ 7, 17, 27], [ 4, 9, 14]] """ # TODO: Make dimension configurable assert start_idx > 0 and end_idx > 0 batch_size, tgt_len, src_len = x.size() x = x.view(-1, src_len).unsqueeze(1) # batch_size, 1, src_len moving_sum_weight = torch.ones([1, 1, end_idx + 
start_idx - 1]).type_as(x) moving_sum = torch.nn.functional.conv1d( x, moving_sum_weight, padding=start_idx + end_idx - 1 ).squeeze(1) moving_sum = moving_sum[:, end_idx:-start_idx] assert src_len == moving_sum.size(1) assert batch_size * tgt_len == moving_sum.size(0) moving_sum = moving_sum.view(batch_size, tgt_len, src_len) return moving_sum ================================================ FILE: examples/simultaneous_translation/utils/monotonic_attention.py ================================================ from typing import Optional import torch from torch import Tensor from examples.simultaneous_translation.utils.functions import ( exclusive_cumprod, prob_check, moving_sum, ) def expected_alignment_from_p_choose( p_choose: Tensor, padding_mask: Optional[Tensor] = None, eps: float = 1e-6 ): """ Calculating expected alignment for from stepwise probability Reference: Online and Linear-Time Attention by Enforcing Monotonic Alignments https://arxiv.org/pdf/1704.00784.pdf q_ij = (1 − p_{ij−1})q_{ij−1} + a+{i−1j} a_ij = p_ij q_ij Parallel solution: ai = p_i * cumprod(1 − pi) * cumsum(a_i / cumprod(1 − pi)) ============================================================ Expected input size p_choose: bsz, tgt_len, src_len """ prob_check(p_choose) # p_choose: bsz, tgt_len, src_len bsz, tgt_len, src_len = p_choose.size() dtype = p_choose.dtype p_choose = p_choose.float() if padding_mask is not None: p_choose = p_choose.masked_fill(padding_mask.unsqueeze(1), 0.0) if p_choose.is_cuda: p_choose = p_choose.contiguous() from alignment_train_cuda_binding import alignment_train_cuda as alignment_train else: from alignment_train_cpu_binding import alignment_train_cpu as alignment_train alpha = p_choose.new_zeros([bsz, tgt_len, src_len]) alignment_train(p_choose, alpha, eps) # Mix precision to prevent overflow for fp16 alpha = alpha.type(dtype) prob_check(alpha) return alpha def expected_soft_attention( alpha: Tensor, soft_energy: Tensor, padding_mask: Optional[Tensor] = None, 
chunk_size: Optional[int] = None, eps: float = 1e-10 ): """ Function to compute expected soft attention for monotonic infinite lookback attention from expected alignment and soft energy. Reference: Monotonic Chunkwise Attention https://arxiv.org/abs/1712.05382 Monotonic Infinite Lookback Attention for Simultaneous Machine Translation https://arxiv.org/abs/1906.05218 alpha: bsz, tgt_len, src_len soft_energy: bsz, tgt_len, src_len padding_mask: bsz, src_len left_padding: bool """ if padding_mask is not None: alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0) soft_energy = soft_energy.masked_fill( padding_mask.unsqueeze(1), -float("inf") ) prob_check(alpha) dtype = alpha.dtype alpha = alpha.float() soft_energy = soft_energy.float() soft_energy = soft_energy - soft_energy.max(dim=2, keepdim=True)[0] exp_soft_energy = torch.exp(soft_energy) + eps if chunk_size is not None: # Chunkwise beta = ( exp_soft_energy * moving_sum( alpha / (eps + moving_sum(exp_soft_energy, chunk_size, 1)), 1, chunk_size ) ) else: # Infinite lookback # Notice that infinite lookback is a special case of chunkwise # where chunksize = inf inner_items = alpha / (eps + torch.cumsum(exp_soft_energy, dim=2)) beta = ( exp_soft_energy * torch.cumsum(inner_items.flip(dims=[2]), dim=2) .flip(dims=[2]) ) if padding_mask is not None: beta = beta.masked_fill( padding_mask.unsqueeze(1).to(torch.bool), 0.0) # Mix precision to prevent overflow for fp16 beta = beta.type(dtype) beta = beta.clamp(0, 1) prob_check(beta) return beta def mass_preservation( alpha: Tensor, padding_mask: Optional[Tensor] = None, left_padding: bool = False ): """ Function to compute the mass perservation for alpha. This means that the residual weights of alpha will be assigned to the last token. 
Reference: Monotonic Infinite Lookback Attention for Simultaneous Machine Translation https://arxiv.org/abs/1906.05218 alpha: bsz, tgt_len, src_len padding_mask: bsz, src_len left_padding: bool """ prob_check(alpha) if padding_mask is not None: if not left_padding: assert not padding_mask[:, 0].any(), ( "Find padding on the beginning of the sequence." ) alpha = alpha.masked_fill(padding_mask.unsqueeze(1), 0.0) if left_padding or padding_mask is None: residuals = 1 - alpha[:, :, :-1].sum(dim=-1).clamp(0, 1) alpha[:, :, -1] = residuals else: # right padding _, tgt_len, src_len = alpha.size() residuals = 1 - alpha.sum(dim=-1, keepdim=True).clamp(0, 1) src_lens = src_len - padding_mask.sum(dim=1, keepdim=True) src_lens = src_lens.expand(-1, tgt_len).contiguous() # add back the last value residuals += alpha.gather(2, src_lens.unsqueeze(2) - 1) alpha = alpha.scatter(2, src_lens.unsqueeze(2) - 1, residuals) prob_check(alpha) return alpha ================================================ FILE: examples/simultaneous_translation/utils/p_choose_strategy.py ================================================ from typing import Optional, Dict from torch import Tensor import torch def waitk_p_choose( tgt_len: int, src_len: int, bsz: int, waitk_lagging: int, key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None ): max_src_len = src_len if incremental_state is not None: # Retrieve target length from incremental states # For inference the length of query is always 1 max_tgt_len = incremental_state["steps"]["tgt"] assert max_tgt_len is not None max_tgt_len = int(max_tgt_len) else: max_tgt_len = tgt_len if max_src_len < waitk_lagging: if incremental_state is not None: max_tgt_len = 1 return torch.zeros( bsz, max_tgt_len, max_src_len ) # Assuming the p_choose looks like this for wait k=3 # src_len = 6, max_tgt_len = 5 # [0, 0, 1, 0, 0, 0, 0] # [0, 0, 0, 1, 0, 0, 0] # [0, 0, 0, 0, 1, 0, 0] # [0, 0, 0, 0, 0, 1, 0] # [0, 0, 0, 
0, 0, 0, 1] # linearize the p_choose matrix: # [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0...] # The indices of linearized matrix that equals 1 is # 2 + 6 * 0 # 3 + 6 * 1 # ... # n + src_len * n + k - 1 = n * (src_len + 1) + k - 1 # n from 0 to max_tgt_len - 1 # # First, generate the indices (activate_indices_offset: bsz, max_tgt_len) # Second, scatter a zeros tensor (bsz, max_tgt_len * src_len) # with activate_indices_offset # Third, resize the tensor to (bsz, max_tgt_len, src_len) activate_indices_offset = ( ( torch.arange(max_tgt_len) * (max_src_len + 1) + waitk_lagging - 1 ) .unsqueeze(0) .expand(bsz, max_tgt_len) .long() ) if key_padding_mask is not None: if key_padding_mask[:, 0].any(): # Left padding activate_indices_offset += ( key_padding_mask.sum(dim=1, keepdim=True) ) # Need to clamp the indices that are too large activate_indices_offset = ( activate_indices_offset .clamp( 0, min( [ max_tgt_len, max_src_len - waitk_lagging + 1 ] ) * max_src_len - 1 ) ) p_choose = torch.zeros(bsz, max_tgt_len * max_src_len) p_choose = p_choose.scatter( 1, activate_indices_offset, 1.0 ).view(bsz, max_tgt_len, max_src_len) if key_padding_mask is not None: p_choose = p_choose.to(key_padding_mask) p_choose = p_choose.masked_fill(key_padding_mask.unsqueeze(1), 0) if incremental_state is not None: p_choose = p_choose[:, -1:] return p_choose.float() def learnable_p_choose( energy, noise_mean: float = 0.0, noise_var: float = 0.0, training: bool = True ): """ Calculating step wise prob for reading and writing 1 to read, 0 to write energy: bsz, tgt_len, src_len """ noise = 0 if training: # add noise here to encourage discretness noise = ( torch.normal(noise_mean, noise_var, energy.size()) .type_as(energy) .to(energy.device) ) p_choose = torch.sigmoid(energy + noise) # p_choose: bsz * self.num_heads, tgt_len, src_len return p_choose ================================================ FILE: examples/speech_recognition/README.md ================================================ ### 2021 
Update: We are merging this example into the [S2T framework](../speech_to_text), which supports more generic speech-to-text tasks (e.g. speech translation) and more flexible data processing pipelines. Please stay tuned.

# Speech Recognition
`examples/speech_recognition` implements the ASR task in Fairseq, along with the features, datasets, models and loss functions needed to train and run inference with the model described in [Transformers with convolutional context for ASR (Abdelrahman Mohamed et al., 2019)](https://arxiv.org/abs/1904.11660).


## Additional dependencies
On top of the main fairseq dependencies, there are a couple of additional requirements.

1) Please follow the instructions to install [torchaudio](https://github.com/pytorch/audio). This is required to compute audio fbank features.
2) [Sclite](http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/sclite.htm#sclite_name_0) is used to measure WER. Sclite can be downloaded and installed from source from the sctk package [here](http://www.openslr.org/4/). Training and inference do not require the Sclite dependency.
3) [sentencepiece](https://github.com/google/sentencepiece) is required in order to create a dataset with word-piece targets.

## Preparing librispeech data
```
./examples/speech_recognition/datasets/prepare-librispeech.sh $DIR_TO_SAVE_RAW_DATA $DIR_FOR_PREPROCESSED_DATA
```

## Training librispeech data
```
python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 80 --task speech_recognition --arch vggtransformer_2 --optimizer adadelta --lr 1.0 --adadelta-eps 1e-8 --adadelta-rho 0.95 --clip-norm 10.0 --max-tokens 5000 --log-format json --log-interval 1 --criterion cross_entropy_acc --user-dir examples/speech_recognition/
```

## Inference for librispeech
`$SET` can be `test_clean` or `test_other`.
Any checkpoint in `$MODEL_PATH` can be selected. 
In this example we are working with `checkpoint_last.pt`.
```
python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --max-tokens 25000 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --beam 20 --results-path $RES_DIR --batch-size 40 --gen-subset $SET --user-dir examples/speech_recognition/
```

## Computing WER for librispeech
```
sclite -r ${RES_DIR}/ref.word-checkpoint_last.pt-${SET}.txt -h ${RES_DIR}/hypo.word-checkpoint_last.pt-${SET}.txt -i rm -o all stdout > $RES_REPORT
```
The `Sum/Avg` row of the first table in the report contains the WER.

## Using flashlight (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) components
[flashlight](https://github.com/facebookresearch/flashlight) now has integration with fairseq. Currently this includes:

* AutoSegmentationCriterion (ASG)
* flashlight-style Conv/GLU model
* flashlight's beam search decoder

To use these, follow the instructions on [this page](https://github.com/flashlight/flashlight/tree/e16682fa32df30cbf675c8fe010f929c61e3b833/bindings/python) to install the Python bindings. **Flashlight v0.3.2** must be used to install the bindings. Running:
```
git clone --branch v0.3.2 https://github.com/flashlight/flashlight
```
will properly clone and check out this version.

## Training librispeech data (flashlight style, Conv/GLU + ASG loss)
Training command:
```
python train.py $DIR_FOR_PREPROCESSED_DATA --save-dir $MODEL_PATH --max-epoch 100 --task speech_recognition --arch w2l_conv_glu_enc --batch-size 4 --optimizer sgd --lr 0.3,0.8 --momentum 0.8 --clip-norm 0.2 --max-tokens 50000 --log-format json --log-interval 100 --num-workers 0 --sentence-avg --criterion asg_loss --asg-transitions-init 5 --max-replabel 2 --linseg-updates 8789 --user-dir examples/speech_recognition
```

Note that ASG loss currently doesn't do well with word-pieces. You should prepare a dataset with character targets by setting `nbpe=31` in `prepare-librispeech.sh`.

## Inference for librispeech (flashlight decoder, n-gram LM)
Inference command:
```
python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder kenlm --kenlm-model $KENLM_MODEL_PATH --lexicon $LEXICON_PATH --beam 200 --beam-threshold 15 --lm-weight 1.5 --word-score 1.5 --sil-weight -0.3 --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition
```

`$KENLM_MODEL_PATH` should be a standard n-gram language model file. `$LEXICON_PATH` should be a flashlight-style lexicon (a list of known words and their spellings). For ASG inference, a lexicon line should look like this (note the repetition labels):
```
doorbell D O 1 R B E L 1 ▁
```
For CTC inference with word-pieces, repetition labels are not used and the lexicon should contain the most common spellings for each word (one can use sentencepiece's `NBestEncodeAsPieces` for this):
```
doorbell ▁DOOR BE LL
doorbell ▁DOOR B E LL
doorbell ▁DO OR BE LL
doorbell ▁DOOR B EL L
doorbell ▁DOOR BE L L
doorbell ▁DO OR B E LL
doorbell ▁DOOR B E L L
doorbell ▁DO OR B EL L
doorbell ▁DO O R BE LL
doorbell ▁DO OR BE L L
```
Lowercase vs. uppercase matters: the *word* should match the case of the n-gram language model (i.e. `$KENLM_MODEL_PATH`), while the *spelling* should match the case of the token dictionary (i.e. `$DIR_FOR_PREPROCESSED_DATA/dict.txt`).

## Inference for librispeech (flashlight decoder, viterbi only) Inference command: ``` python examples/speech_recognition/infer.py $DIR_FOR_PREPROCESSED_DATA --task speech_recognition --seed 1 --nbest 1 --path $MODEL_PATH/checkpoint_last.pt --gen-subset $SET --results-path $RES_DIR --w2l-decoder viterbi --criterion asg_loss --max-replabel 2 --user-dir examples/speech_recognition ``` ================================================ FILE: examples/speech_recognition/__init__.py ================================================ from . import criterions, models, tasks # noqa ================================================ FILE: examples/speech_recognition/criterions/ASG_loss.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from examples.speech_recognition.data.replabels import pack_replabels from fairseq import utils from fairseq.criterions import FairseqCriterion, register_criterion @register_criterion("asg_loss") class ASGCriterion(FairseqCriterion): @staticmethod def add_args(parser): group = parser.add_argument_group("ASG Loss") group.add_argument( "--asg-transitions-init", help="initial diagonal value of transition matrix", type=float, default=0.0, ) group.add_argument( "--max-replabel", help="maximum # of replabels", type=int, default=2 ) group.add_argument( "--linseg-updates", help="# of training updates to use LinSeg initialization", type=int, default=0, ) group.add_argument( "--hide-linseg-messages", help="hide messages about LinSeg initialization", action="store_true", ) def __init__( self, task, silence_token, asg_transitions_init, max_replabel, linseg_updates, hide_linseg_messages, ): from flashlight.lib.sequence.criterion import ASGLoss, CriterionScaleMode super().__init__(task) self.tgt_dict = task.target_dictionary self.eos = 
self.tgt_dict.eos() self.silence = ( self.tgt_dict.index(silence_token) if silence_token in self.tgt_dict else None ) self.max_replabel = max_replabel num_labels = len(self.tgt_dict) self.asg = ASGLoss(num_labels, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT) self.asg.trans = torch.nn.Parameter( asg_transitions_init * torch.eye(num_labels), requires_grad=True ) self.linseg_progress = torch.nn.Parameter( torch.tensor([0], dtype=torch.int), requires_grad=False ) self.linseg_maximum = linseg_updates self.linseg_message_state = "none" if hide_linseg_messages else "start" @classmethod def build_criterion(cls, args, task): return cls( task, args.silence_token, args.asg_transitions_init, args.max_replabel, args.linseg_updates, args.hide_linseg_messages, ) def linseg_step(self): if not self.training: return False if self.linseg_progress.item() < self.linseg_maximum: if self.linseg_message_state == "start": print("| using LinSeg to initialize ASG") self.linseg_message_state = "finish" self.linseg_progress.add_(1) return True elif self.linseg_message_state == "finish": print("| finished LinSeg initialization") self.linseg_message_state = "none" return False def replace_eos_with_silence(self, tgt): if tgt[-1] != self.eos: return tgt elif self.silence is None or (len(tgt) > 1 and tgt[-2] == self.silence): return tgt[:-1] else: return tgt[:-1] + [self.silence] def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. 
Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ net_output = model(**sample["net_input"]) emissions = net_output["encoder_out"].transpose(0, 1).contiguous() B = emissions.size(0) T = emissions.size(1) device = emissions.device target = torch.IntTensor(B, T) target_size = torch.IntTensor(B) using_linseg = self.linseg_step() for b in range(B): initial_target_size = sample["target_lengths"][b].item() if initial_target_size == 0: raise ValueError("target size cannot be zero") tgt = sample["target"][b, :initial_target_size].tolist() tgt = self.replace_eos_with_silence(tgt) tgt = pack_replabels(tgt, self.tgt_dict, self.max_replabel) tgt = tgt[:T] if using_linseg: tgt = [tgt[t * len(tgt) // T] for t in range(T)] target[b][: len(tgt)] = torch.IntTensor(tgt) target_size[b] = len(tgt) loss = self.asg.forward(emissions, target.to(device), target_size.to(device)) if reduce: loss = torch.sum(loss) sample_size = ( sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"] ) logging_output = { "loss": utils.item(loss.data) if reduce else loss.data, "ntokens": sample["ntokens"], "nsentences": sample["target"].size(0), "sample_size": sample_size, } return loss, sample_size, logging_output @staticmethod def aggregate_logging_outputs(logging_outputs): """Aggregate logging outputs from data parallel training.""" loss_sum = sum(log.get("loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) agg_output = { "loss": loss_sum / nsentences, "ntokens": ntokens, "nsentences": nsentences, "sample_size": sample_size, } return agg_output ================================================ FILE: examples/speech_recognition/criterions/__init__.py 
================================================ import importlib import os # ASG loss requires flashlight bindings files_to_skip = set() try: import flashlight.lib.sequence.criterion except ImportError: files_to_skip.add("ASG_loss.py") for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_") and file not in files_to_skip: criterion_name = file[: file.find(".py")] importlib.import_module( "examples.speech_recognition.criterions." + criterion_name ) ================================================ FILE: examples/speech_recognition/criterions/cross_entropy_acc.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from __future__ import absolute_import, division, print_function, unicode_literals import logging import math import torch import torch.nn.functional as F from fairseq import utils from fairseq.criterions import FairseqCriterion, register_criterion @register_criterion("cross_entropy_acc") class CrossEntropyWithAccCriterion(FairseqCriterion): def __init__(self, task, sentence_avg): super().__init__(task) self.sentence_avg = sentence_avg def compute_loss(self, model, net_output, target, reduction, log_probs): # N, T -> N * T target = target.view(-1) lprobs = model.get_normalized_probs(net_output, log_probs=log_probs) if not hasattr(lprobs, "batch_first"): logging.warning( "ERROR: we need to know whether " "batch first for the net output; " "you need to set batch_first attribute for the return value of " "model.get_normalized_probs. Now, we assume this is true, but " "in the future, we will raise exception instead. 
" ) batch_first = getattr(lprobs, "batch_first", True) if not batch_first: lprobs = lprobs.transpose(0, 1) # N, T, D -> N * T, D lprobs = lprobs.view(-1, lprobs.size(-1)) loss = F.nll_loss( lprobs, target, ignore_index=self.padding_idx, reduction=reduction ) return lprobs, loss def get_logging_output(self, sample, target, lprobs, loss): target = target.view(-1) mask = target != self.padding_idx correct = torch.sum( lprobs.argmax(1).masked_select(mask) == target.masked_select(mask) ) total = torch.sum(mask) sample_size = ( sample["target"].size(0) if self.sentence_avg else sample["ntokens"] ) logging_output = { "loss": utils.item(loss.data), # * sample['ntokens'], "ntokens": sample["ntokens"], "nsentences": sample["target"].size(0), "sample_size": sample_size, "correct": utils.item(correct.data), "total": utils.item(total.data), "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(), } return sample_size, logging_output def forward(self, model, sample, reduction="sum", log_probs=True): """Computes the cross entropy with accuracy metric for the given sample. This is similar to CrossEntropyCriterion in fairseq, but also computes accuracy metrics as part of logging Args: logprobs (Torch.tensor) of shape N, T, D i.e. batchsize, timesteps, dimensions targets (Torch.tensor) of shape N, T i.e batchsize, timesteps Returns: tuple: With three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training TODO: * Currently this Criterion will only work with LSTMEncoderModels or FairseqModels which have decoder, or Models which return TorchTensor as net_output. We need to make a change to support all FairseqEncoder models. 
""" net_output = model(**sample["net_input"]) target = model.get_targets(sample, net_output) lprobs, loss = self.compute_loss( model, net_output, target, reduction, log_probs ) sample_size, logging_output = self.get_logging_output( sample, target, lprobs, loss ) return loss, sample_size, logging_output @staticmethod def aggregate_logging_outputs(logging_outputs): """Aggregate logging outputs from data parallel training.""" correct_sum = sum(log.get("correct", 0) for log in logging_outputs) total_sum = sum(log.get("total", 0) for log in logging_outputs) loss_sum = sum(log.get("loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) nframes = sum(log.get("nframes", 0) for log in logging_outputs) agg_output = { "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0, # if args.sentence_avg, then sample_size is nsentences, then loss # is per-sentence loss; else sample_size is ntokens, the loss # becomes per-output token loss "ntokens": ntokens, "nsentences": nsentences, "nframes": nframes, "sample_size": sample_size, "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0, "correct": correct_sum, "total": total_sum, # total is the number of validate tokens } if sample_size != ntokens: agg_output["nll_loss"] = loss_sum / ntokens / math.log(2) # loss: per output token loss # nll_loss: per sentence loss return agg_output ================================================ FILE: examples/speech_recognition/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class AsrDataset(FairseqDataset):
    """
    Speech dataset pairing audio files with their target transcriptions.

    Filterbank features are computed from the audio file each time an item
    is fetched.

    Args:
        aud_paths: (List[str]): A list of str with paths to audio files.
        aud_durations_ms (List[int]): A list of int containing the durations
            of audio files.
        tgt (List[torch.LongTensor]): A list of LongTensors containing the
            indices of target transcriptions.
        tgt_dict (~fairseq.data.Dictionary): target vocabulary.
        ids (List[str]): A list of utterance IDs.
        speakers (List[str]): A list of speakers corresponding to utterances.
        num_mel_bins (int): Number of triangular mel-frequency bins
            (default: 80)
        frame_length (float): Frame length in milliseconds (default: 25.0)
        frame_shift (float): Frame shift in milliseconds (default: 10.0)
    """

    def __init__(
        self,
        aud_paths,
        aud_durations_ms,
        tgt,
        tgt_dict,
        ids,
        speakers,
        num_mel_bins=80,
        frame_length=25.0,
        frame_shift=10.0,
    ):
        assert frame_length > 0
        assert frame_shift > 0
        assert all(x > frame_length for x in aud_durations_ms)

        # Number of frames each utterance yields for the given framing.
        self.frame_sizes = []
        for duration in aud_durations_ms:
            self.frame_sizes.append(
                int(1 + (duration - frame_length) / frame_shift)
            )

        # All per-utterance lists must be non-empty and aligned.
        num_utts = len(aud_paths)
        assert num_utts > 0
        assert num_utts == len(aud_durations_ms)
        assert num_utts == len(tgt)
        assert num_utts == len(ids)
        assert num_utts == len(speakers)

        self.aud_paths = aud_paths
        self.tgt_dict = tgt_dict
        self.tgt = tgt
        self.ids = ids
        self.speakers = speakers
        self.num_mel_bins = num_mel_bins
        self.frame_length = frame_length
        self.frame_shift = frame_shift

        # Features live at position 0 and targets at position 1 of each
        # sample's "data" list (see __getitem__).
        self.s2s_collater = Seq2SeqCollater(
            0,
            1,
            pad_index=self.tgt_dict.pad(),
            eos_index=self.tgt_dict.eos(),
            move_eos_to_beginning=True,
        )

    def __getitem__(self, index):
        """Return ``{"id": index, "data": [features, target]}`` for one utterance.

        Raises:
            FileNotFoundError: if the audio file does not exist.
        """
        # Imported lazily so the module can be loaded without torchaudio.
        import torchaudio
        import torchaudio.compliance.kaldi as kaldi

        target = self.tgt[index] if self.tgt is not None else None

        audio_path = self.aud_paths[index]
        if not os.path.exists(audio_path):
            raise FileNotFoundError("Audio file not found: {}".format(audio_path))

        waveform, _sample_rate = torchaudio.load_wav(audio_path)
        fbank = kaldi.fbank(
            waveform,
            num_mel_bins=self.num_mel_bins,
            frame_length=self.frame_length,
            frame_shift=self.frame_shift,
        )
        normalized = data_utils.apply_mv_norm(fbank)

        return {"id": index, "data": [normalized.detach(), target]}

    def __len__(self):
        return len(self.aud_paths)

    def collater(self, samples):
        """Merge a list of samples to form a mini-batch.

        Args:
            samples (List[dict]): items as returned by ``__getitem__``

        Returns:
            dict: a mini-batch suitable for forwarding with a Model
        """
        return self.s2s_collater.collate(samples)

    def num_tokens(self, index):
        """Return the number of frames of the example at ``index``."""
        return self.frame_sizes[index]

    def size(self, index):
        """Return an example's size as a float or tuple. This value is used when
        filtering a dataset with ``--max-positions``."""
        num_frames = self.frame_sizes[index]
        target_len = len(self.tgt[index]) if self.tgt is not None else 0
        return (num_frames, target_len)

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        return np.arange(len(self))
class Seq2SeqCollater(object):
    """
    Implements collate function mainly for seq2seq tasks
    This expects each sample to contain feature (src_tokens) and
    targets.
    This collator is also used for aligned training task.

    Args:
        feature_index (int): position of the features in ``sample["data"]``
        label_index (int): position of the target in ``sample["data"]``
        pad_index (int): padding symbol used when batching targets
        eos_index (int): end-of-sentence symbol used when batching targets
        move_eos_to_beginning (bool): forwarded to ``collate_tokens`` when
            building ``prev_output_tokens``
    """

    def __init__(
        self,
        feature_index=0,
        label_index=1,
        pad_index=1,
        eos_index=2,
        move_eos_to_beginning=True,
    ):
        self.feature_index = feature_index
        self.label_index = label_index
        self.pad_index = pad_index
        self.eos_index = eos_index
        self.move_eos_to_beginning = move_eos_to_beginning

    def _collate_frames(self, frames):
        """Convert a list of 2d frames into a padded 3d tensor
        Args:
            frames (list): list of 2d frames of size L[i]*f_dim. Where L[i] is
                length of i-th frame and f_dim is static dimension of features
        Returns:
            3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
        """
        len_max = max(frame.size(0) for frame in frames)
        f_dim = frames[0].size(1)
        res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)

        for i, v in enumerate(frames):
            res[i, : v.size(0)] = v

        return res

    def collate(self, samples):
        """
        utility function to collate samples into batch for speech recognition.

        Args:
            samples (List[dict]): each with an ``"id"`` and a ``"data"`` list
                holding the features and the target at the configured
                positions. Samples whose features are ``None`` are skipped.

        Returns:
            dict: the mini-batch, sorted by descending number of frames, or
            an empty dict when there is no valid sample.
        """
        if len(samples) == 0:
            return {}

        # parse samples into torch tensors
        parsed_samples = []
        for s in samples:
            # skip invalid samples
            if s["data"][self.feature_index] is None:
                continue
            source = s["data"][self.feature_index]
            if isinstance(source, (np.ndarray, np.generic)):
                source = torch.from_numpy(source)
            target = s["data"][self.label_index]
            if isinstance(target, (np.ndarray, np.generic)):
                target = torch.from_numpy(target).long()
            elif isinstance(target, list):
                target = torch.LongTensor(target)

            parsed_sample = {"id": s["id"], "source": source, "target": target}
            parsed_samples.append(parsed_sample)
        samples = parsed_samples

        # Every sample may have been filtered out above; treat that exactly
        # like an empty input instead of failing on max() of an empty list.
        if not samples:
            return {}

        ids = torch.LongTensor([s["id"] for s in samples])
        frames = self._collate_frames([s["source"] for s in samples])
        # sort samples by descending number of frames
        frames_lengths = torch.LongTensor([s["source"].size(0) for s in samples])
        frames_lengths, sort_order = frames_lengths.sort(descending=True)
        ids = ids.index_select(0, sort_order)
        frames = frames.index_select(0, sort_order)

        target = None
        target_lengths = None
        prev_output_tokens = None
        if samples[0].get("target", None) is not None:
            ntokens = sum(len(s["target"]) for s in samples)
            target = fairseq_data_utils.collate_tokens(
                [s["target"] for s in samples],
                self.pad_index,
                self.eos_index,
                left_pad=False,
                move_eos_to_beginning=False,
            )
            target = target.index_select(0, sort_order)
            target_lengths = torch.LongTensor(
                [s["target"].size(0) for s in samples]
            ).index_select(0, sort_order)
            prev_output_tokens = fairseq_data_utils.collate_tokens(
                [s["target"] for s in samples],
                self.pad_index,
                self.eos_index,
                left_pad=False,
                move_eos_to_beginning=self.move_eos_to_beginning,
            )
            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
        else:
            # Without targets the token count falls back to the frame count.
            ntokens = sum(len(s["source"]) for s in samples)

        batch = {
            "id": ids,
            "ntokens": ntokens,
            "net_input": {"src_tokens": frames, "src_lengths": frames_lengths},
            "target": target,
            "target_lengths": target_lengths,
            "nsentences": len(samples),
        }
        if prev_output_tokens is not None:
            batch["net_input"]["prev_output_tokens"] = prev_output_tokens
        return batch
def calc_mean_invstddev(feature):
    """Return the per-column mean and inverse standard deviation of ``feature``.

    Args:
        feature: a 2-D tensor; statistics are taken over dim 0.

    Raises:
        ValueError: if ``feature`` is not 2-D.
    """
    if feature.dim() != 2:
        raise ValueError("We expect the input feature to be 2-D tensor")
    mean = feature.mean(0)
    var = feature.var(0)
    stddev = torch.sqrt(var)
    # avoid division by ~zero
    eps = 1e-8
    if (var < eps).any():
        stddev = stddev + eps
    return mean, 1.0 / stddev


def apply_mv_norm(features):
    """Normalise ``features`` to zero mean and unit variance along dim 0."""
    # If there is less than 2 spectrograms, the variance cannot be computed (is NaN)
    # and normalization is not possible, so return the item as it is
    if features.size(0) < 2:
        return features
    mean, invstddev = calc_mean_invstddev(features)
    return (features - mean) * invstddev


def lengths_to_encoder_padding_mask(lengths, batch_first=False):
    """
    convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor

    Args:
        lengths: a (B, )-shaped tensor
        batch_first: when True the mask is returned as (B, max_length)
            instead of (max_length, B)

    Return:
        encoder_padding_mask: a (max_length, B) binary mask, where
        [t, b] = 0 for t < lengths[b] and 1 otherwise
        max_length: maximum length of B sequences

    TODO:
        kernelize this function if benchmarking shows this function is slow
    """
    max_lengths = torch.max(lengths).item()
    bsz = lengths.size(0)

    # a (T, ) tensor with [0, ..., T-1], moved to the right device
    positions = torch.arange(max_lengths).to(lengths.device)
    # reshape to (1, T) and expand to a (B, T)-shaped tensor
    position_grid = positions.view(1, max_lengths).expand(bsz, -1)
    # each row repeats that sequence's length T times
    length_grid = lengths.view(bsz, 1).expand(-1, max_lengths)

    encoder_padding_mask = position_grid >= length_grid
    if batch_first:
        return encoder_padding_mask, max_lengths
    return encoder_padding_mask.t(), max_lengths
def encoder_padding_mask_to_lengths(
    encoder_padding_mask, max_lengths, batch_size, device
):
    """
    convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor

    Conventionally, encoder output contains a encoder_padding_mask, which is
    a 2-D mask in a shape (T, B), whose (t, b) element indicate whether
    encoder_out[t, b] is a valid output (=0) or not (=1). Occasionally, we need
    to convert this mask tensor to a 1-D tensor in shape (B, ), where
    [b] denotes the valid length of b-th sequence

    Args:
        encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None,
        indicating all are valid
        max_lengths: maximum length of all sequence, if encoder_padding_mask is
        not None, max_lengths must equal to encoder_padding_mask.size(0)
        batch_size: batch size; if encoder_padding_mask is
        not None, batch_size must equal to encoder_padding_mask.size(1)
        device: which device to put the result on (only used when
        encoder_padding_mask is None)

    Return:
        seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the
        number of valid elements of b-th sequence
    """
    if encoder_padding_mask is None:
        # Build the int32 tensor directly: going through a float tensor first
        # silently rounds lengths above 2**24.
        return torch.full(
            (batch_size,), max_lengths, dtype=torch.int32, device=device
        )

    assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match"
    assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match"

    # Padded positions are 1, so the valid length is T minus the column sum.
    return max_lengths - torch.sum(encoder_padding_mask, dim=0)
def replabel_symbol(i):
    """
    Replabel symbols used in flashlight, currently just "1", "2", ...
    This prevents training with numeral tokens, so this might change in the future
    """
    return str(i)


def pack_replabels(tokens, dictionary, max_reps):
    """
    Pack a token sequence so that repeated symbols are replaced by replabels

    A run of a token is emitted as the token followed by the replabel that
    counts its extra repetitions; one replabel covers at most ``max_reps``
    repetitions, after which the token itself is emitted again.
    """
    if len(tokens) == 0 or max_reps <= 0:
        return tokens

    # Index 0 is a placeholder so a repetition count maps directly to its
    # replabel id.
    rep_ids = [0] + [
        dictionary.index(replabel_symbol(count)) for count in range(1, max_reps + 1)
    ]

    packed = []
    last = -1
    pending = 0
    for tok in tokens:
        if tok == last and pending < max_reps:
            pending += 1
            continue
        if pending > 0:
            packed.append(rep_ids[pending])
            pending = 0
        packed.append(tok)
        last = tok
    if pending > 0:
        packed.append(rep_ids[pending])
    return packed


def unpack_replabels(tokens, dictionary, max_reps):
    """
    Unpack a token sequence so that replabels are replaced by repeated symbols
    """
    if len(tokens) == 0 or max_reps <= 0:
        return tokens

    reps_by_id = {
        dictionary.index(replabel_symbol(count)): count
        for count in range(1, max_reps + 1)
    }

    unpacked = []
    last = -1
    for tok in tokens:
        if tok in reps_by_id:
            unpacked.extend([last] * reps_by_id[tok])
            # A replabel consumes the token it refers to.
            last = -1
        else:
            unpacked.append(tok)
            last = tok
    return unpacked
MILLISECONDS_TO_SECONDS = 0.001


def process_sample(aud_path, lable, utt_id, sp, tgt_dict):
    """Build the JSON entry for one utterance.

    Args:
        aud_path (str): path of the audio file.
        lable (str): transcription text of the utterance.
        utt_id (str): utterance ID, used as the key of the returned dict.
        sp: sentencepiece processor used to split the text into pieces.
        tgt_dict: dictionary used to map the pieces to indices.

    Returns:
        dict: ``{utt_id: {"input": {...}, "output": {...}}}`` where the input
        part holds the audio path and its length in milliseconds and the
        output part holds the text, its pieces and their indices.
    """
    # Imported lazily so argument parsing works without torchaudio.
    import torchaudio

    audio_entry = {}
    text_entry = {}
    si, ei = torchaudio.info(aud_path)
    audio_entry["length_ms"] = int(
        si.length / si.channels / si.rate / MILLISECONDS_TO_SECONDS
    )
    audio_entry["path"] = aud_path

    token = " ".join(sp.EncodeAsPieces(lable))
    ids = tgt_dict.encode_line(token, append_eos=False)
    text_entry["text"] = lable
    text_entry["token"] = token
    text_entry["tokenid"] = ", ".join(map(str, [t.tolist() for t in ids]))
    return {utt_id: {"input": audio_entry, "output": text_entry}}


def main():
    """Create the JSON manifest for a set of audio directories.

    Audio files are discovered recursively under ``--audio-dirs``; files whose
    base name has no entry in the ``--labels`` file are ignored. The result is
    written to ``--output`` as ``{"utts": {utt_id: entry}}``.

    Raises:
        Exception: if the labels file contains no labels.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--audio-dirs",
        nargs="+",
        default=["-"],
        required=True,
        help="input directories with audio files",
    )
    parser.add_argument(
        "--labels",
        required=True,
        help="aggregated input labels with format <ID LABEL> per line",
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "--spm-model",
        required=True,
        help="sentencepiece model to use for encoding",
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "--dictionary",
        required=True,
        help="file to load fairseq dictionary from",
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument("--audio-format", choices=["flac", "wav"], default="wav")
    parser.add_argument(
        "--output",
        required=True,
        type=argparse.FileType("w"),
        help="path to save json output",
    )
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.spm_model.name)

    tgt_dict = Dictionary.load(args.dictionary)

    labels = {}
    for line in args.labels:
        (utt_id, label) = line.split(" ", 1)
        labels[utt_id] = label
    if len(labels) == 0:
        # The option is --labels, so the opened file lives in args.labels.
        raise Exception("No labels found in ", args.labels.name)

    Sample = namedtuple("Sample", "aud_path utt_id")
    samples = []
    for path, _, files in chain.from_iterable(
        os.walk(path) for path in args.audio_dirs
    ):
        for f in files:
            if f.endswith(args.audio_format):
                utt_id = os.path.splitext(f)[0]
                if utt_id not in labels:
                    continue
                samples.append(Sample(os.path.join(path, f), utt_id))

    utts = {}
    num_cpu = multiprocessing.cpu_count()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpu) as executor:
        future_to_sample = {
            executor.submit(
                process_sample, s.aud_path, labels[s.utt_id], s.utt_id, sp, tgt_dict
            ): s
            for s in samples
        }
        for future in concurrent.futures.as_completed(future_to_sample):
            # Best effort: a failing utterance is reported and left out of
            # the manifest instead of aborting the whole run.
            try:
                data = future.result()
            except Exception as exc:
                print("generated an exception: ", exc)
            else:
                utts.update(data)
    json.dump({"utts": utts}, args.output, indent=4)
# Prepare librispeech dataset
#
# Downloads the LibriSpeech archives, merges the three training sets, trains
# a sentencepiece model on the training transcripts and writes the json
# manifests, dictionary and sentencepiece model into <out_dir>.

base_url=www.openslr.org/resources/12
train_dir=train_960

# Exactly two positional arguments are required.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <download_dir> <out_dir>"
  echo "e.g.: $0 /tmp/librispeech_raw/ ~/data/librispeech_final"
  exit 1
fi

# Strip one trailing slash from each directory argument.
download_dir=${1%/}
out_dir=${2%/}

# Location of the fairseq checkout; edit this to match the local setup.
fairseq_root=~/fairseq-py/
mkdir -p ${out_dir}
# Every relative path below (data/lang_char, *.json, dict.txt, spm.model)
# is resolved inside out_dir.
cd ${out_dir} || exit

# sentencepiece vocabulary size and model type
nbpe=5000
bpemode=unigram

if [ ! -d "$fairseq_root" ]; then
    echo "$0: Please set correct fairseq_root"
    exit 1
fi

echo "Data Download"
# Fetch and unpack each archive; stop at the first failure.
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
    url=$base_url/$part.tar.gz
    if ! wget -P $download_dir $url; then
        echo "$0: wget failed for $url"
        exit 1
    fi
    if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then
        echo "$0: error un-tarring archive $download_dir/$part.tar.gz"
        exit 1
    fi
done

echo "Merge all train packs into one"
mkdir -p ${download_dir}/LibriSpeech/${train_dir}/
for part in train-clean-100 train-clean-360 train-other-500; do
    mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/
done
echo "Merge train text"
# Concatenate all transcript files of a split into one label file.
# NOTE: ">>" appends, so running the script twice duplicates the labels.
find ${download_dir}/LibriSpeech/${train_dir}/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/${train_dir}/text

# Use combined dev-clean and dev-other as validation set
find ${download_dir}/LibriSpeech/dev-clean/ ${download_dir}/LibriSpeech/dev-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/valid_text
find ${download_dir}/LibriSpeech/test-clean/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-clean/text
find ${download_dir}/LibriSpeech/test-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-other/text


dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_units.txt
encoded=data/lang_char/${train_dir}_${bpemode}${nbpe}_encoded.txt
fairseq_dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_fairseq_dict.txt
bpemodel=data/lang_char/${train_dir}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
echo "Dictionary preparation"
mkdir -p data/lang_char/
# The units file starts with the special symbols; their ids match the
# --unk_id/--eos_id/--pad_id values passed to spm_train below.
echo "<unk> 3" > ${dict}
echo "</s> 2" >> ${dict}
echo "<pad> 1" >> ${dict}
# Drop the first space-separated field (the utterance id) to get plain text.
cut -f 2- -d" " ${download_dir}/LibriSpeech/${train_dir}/text > data/lang_char/input.txt
spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --unk_id=3 --eos_id=2 --pad_id=1 --bos_id=-1 --character_coverage=1
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > ${encoded}
# Units file: one distinct piece per line, numbered from 4 upwards (NR+3).
cat ${encoded} | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+3}' >> ${dict}
# fairseq dictionary: "<piece> <count>" per line.
cat ${encoded} | tr ' ' '\n' | sort | uniq -c | awk '{print $2 " " $1}' > ${fairseq_dict}
wc -l ${dict}

echo "Prepare train and test jsons"
for part in train_960 test-other test-clean; do
    python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/${part} --labels ${download_dir}/LibriSpeech/${part}/text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output ${part}.json
done
# fairseq expects to find train.json and valid.json during training
mv train_960.json train.json

echo "Prepare valid json"
python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/dev-clean ${download_dir}/LibriSpeech/dev-other --labels ${download_dir}/LibriSpeech/valid_text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output valid.json

# Copy the dictionary and the sentencepiece model next to the manifests.
cp ${fairseq_dict} ./dict.txt
cp ${bpemodel}.model ./spm.model
""" import ast import logging import math import os import sys import editdistance import numpy as np import torch from fairseq import checkpoint_utils, options, progress_bar, tasks, utils from fairseq.data.data_utils import post_process from fairseq.logging.meters import StopwatchMeter, TimeMeter logging.basicConfig() logging.root.setLevel(logging.INFO) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) def add_asr_eval_argument(parser): parser.add_argument("--kspmodel", default=None, help="sentence piece model") parser.add_argument( "--wfstlm", default=None, help="wfstlm on dictonary output units" ) parser.add_argument( "--rnnt_decoding_type", default="greedy", help="wfstlm on dictonary\ output units", ) try: parser.add_argument( "--lm-weight", "--lm_weight", type=float, default=0.2, help="weight for lm while interpolating with neural score", ) except: pass parser.add_argument( "--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level" ) parser.add_argument( "--w2l-decoder", choices=["viterbi", "kenlm", "fairseqlm"], help="use a w2l decoder", ) parser.add_argument("--lexicon", help="lexicon for w2l decoder") parser.add_argument("--unit-lm", action="store_true", help="if using a unit lm") parser.add_argument("--kenlm-model", "--lm-model", help="lm model for w2l decoder") parser.add_argument("--beam-threshold", type=float, default=25.0) parser.add_argument("--beam-size-token", type=float, default=100) parser.add_argument("--word-score", type=float, default=1.0) parser.add_argument("--unk-weight", type=float, default=-math.inf) parser.add_argument("--sil-weight", type=float, default=0.0) parser.add_argument( "--dump-emissions", type=str, default=None, help="if present, dumps emissions into this file and exits", ) parser.add_argument( "--dump-features", type=str, default=None, help="if present, dumps features into this file and exits", ) parser.add_argument( "--load-emissions", type=str, default=None, help="if present, 
def check_args(args):
    """Validate option combinations that generation does not support.

    Raises:
        AssertionError: if ``--sampling`` is used with ``--nbest`` different
            from ``--beam``, or ``--replace-unk`` is used without
            ``--raw-text``.
    """
    sampling_ok = not args.sampling or args.nbest == args.beam
    assert sampling_ok, "--sampling requires --nbest to be equal to --beam"

    replace_unk_ok = args.replace_unk is None or args.raw_text
    assert (
        replace_unk_ok
    ), "--replace-unk requires a raw text dataset (--raw-text)"


def get_dataset_itr(args, task, models):
    """Return a non-shuffled epoch iterator over ``args.gen_subset``.

    ``models`` is accepted for interface compatibility and is not used.
    """
    batch_iterator = task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=(sys.maxsize, sys.maxsize),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        data_buffer_size=args.data_buffer_size,
    )
    return batch_iterator.next_epoch_itr(shuffle=False)


def process_predictions(
    args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id
):
    """Write out a hypothesis with its reference and score it.

    Args:
        args: parsed options; ``nbest``, ``post_process`` and ``quiet`` are read.
        hypos (list): hypotheses for one utterance, each a dict with
            ``"tokens"`` and optionally ``"words"``.
        sp: unused.
        tgt_dict: dictionary used to turn token indices into strings.
        target_tokens: reference token indices.
        res_files (dict or None): open result files keyed by
            ``hypo.units``, ``hypo.words``, ``ref.units`` and ``ref.words``;
            nothing is written when ``None``.
        speaker, id: used to tag every written line.

    Returns:
        tuple: ``(word edit distance, number of reference words)``.
    """

    def _write(key, text):
        # Result files are optional.
        if res_files is not None:
            print("{} ({}-{})".format(text, speaker, id), file=res_files[key])

    for hypo in hypos[: min(len(hypos), args.nbest)]:
        hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu())

        # Decoders that already produce words take precedence over
        # post-processing the piece string.
        if "words" in hypo:
            hyp_words = " ".join(hypo["words"])
        else:
            hyp_words = post_process(hyp_pieces, args.post_process)

        _write("hypo.units", hyp_pieces)
        _write("hypo.words", hyp_words)

        tgt_pieces = tgt_dict.string(target_tokens)
        tgt_words = post_process(tgt_pieces, args.post_process)

        _write("ref.units", tgt_pieces)
        _write("ref.words", tgt_words)

        if not args.quiet:
            logger.info("HYPO:" + hyp_words)
            logger.info("TARGET:" + tgt_words)
            logger.info("___________________")

        hyp_word_list = hyp_words.split()
        tgt_word_list = tgt_words.split()
        return editdistance.eval(hyp_word_list, tgt_word_list), len(tgt_word_list)
def prepare_result_files(args):
    """Open the four result files, or return ``None`` without a results path.

    File names have the form ``<prefix>-<checkpoint basename>-<subset>.txt``
    and are additionally prefixed with the shard id when more than one shard
    is used.

    Returns:
        dict or None: line-buffered, writable files keyed by ``hypo.words``,
        ``hypo.units``, ``ref.words`` and ``ref.units``. The caller owns the
        files.
    """

    def get_res_file(file_prefix):
        if args.num_shards > 1:
            file_prefix = f"{args.shard_id}_{file_prefix}"
        path = os.path.join(
            args.results_path,
            "{}-{}-{}.txt".format(
                file_prefix, os.path.basename(args.path), args.gen_subset
            ),
        )
        return open(path, "w", buffering=1)

    if not args.results_path:
        return None

    return {
        "hypo.words": get_res_file("hypo.word"),
        "hypo.units": get_res_file("hypo.units"),
        "ref.words": get_res_file("ref.word"),
        "ref.units": get_res_file("ref.units"),
    }


def optimize_models(args, use_cuda, models):
    """Optimize ensemble for generation"""
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()


def apply_half(t):
    """Return ``t`` converted to half precision if it is float32, else ``t``."""
    if t.dtype is torch.float32:
        return t.to(dtype=torch.half)
    return t


class ExistingEmissionsDecoder(object):
    """Decoder wrapper that decodes stored emissions instead of model output.

    Args:
        decoder: object with a ``decode(emissions)`` method.
        emissions: array of per-utterance emissions, indexed by sample id.
    """

    def __init__(self, decoder, emissions):
        self.decoder = decoder
        self.emissions = emissions

    def generate(self, models, sample, **unused):
        """Decode the stored emissions of the utterances in ``sample``.

        ``models`` is ignored; only ``sample["id"]`` is used.

        Raises:
            Exception: ``"invalid sizes"`` if the selected emissions cannot be
                stacked into one array; the original error is chained.
        """
        ids = sample["id"].cpu().numpy()
        try:
            emissions = np.stack(self.emissions[ids])
        except Exception as err:
            # Show the offending shapes before failing. Only regular errors
            # are handled here, so KeyboardInterrupt still propagates.
            print([x.shape for x in self.emissions[ids]])
            raise Exception("invalid sizes") from err
        emissions = torch.from_numpy(emissions)
        return self.decoder.decode(emissions)
def main(args, task=None, model_state=None):
    """Run decoding over ``args.gen_subset`` and report the word error rate.

    Depending on the options the function either decodes and scores every
    batch, or dumps the model emissions (``--dump-emissions``) or encoder
    features (``--dump-features``) to a file instead of decoding.

    Args:
        args: parsed options from ``make_parser``.
        task: overwritten by ``tasks.setup_task(args)`` before first use.
        model_state: passed as ``state`` when loading the model ensemble.

    Returns:
        tuple: ``(task, wer)``; ``wer`` is ``None`` when dumping emissions or
        features, or when no reference words were seen.
    """
    check_args(args)

    use_fp16 = args.fp16
    # Fall back to a token budget when no batch limit was given at all.
    if args.max_tokens is None and args.batch_size is None:
        args.max_tokens = 4000000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    logger.info("| decoding with criterion {}".format(args.criterion))

    task = tasks.setup_task(args)

    # Load ensemble
    if args.load_emissions:
        # Emissions come from a file, so no model has to be loaded.
        models, criterions = [], []
        task.load_dataset(args.gen_subset)
    else:
        logger.info("| loading model(s) from {}".format(args.path))
        models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
            utils.split_paths(args.path, separator="\\"),
            arg_overrides=ast.literal_eval(args.model_overrides),
            task=task,
            suffix=args.checkpoint_suffix,
            strict=(args.checkpoint_shard_count == 1),
            num_shards=args.checkpoint_shard_count,
            state=model_state,
        )
        optimize_models(args, use_cuda, models)
        task.load_dataset(args.gen_subset, task_cfg=saved_cfg.task)

    # Set dictionary
    tgt_dict = task.target_dictionary

    logger.info(
        "| {} {} {} examples".format(
            args.data, args.gen_subset, len(task.dataset(args.gen_subset))
        )
    )

    # hack to pass transitions to W2lDecoder
    if args.criterion == "asg_loss":
        raise NotImplementedError("asg_loss is currently not supported")
        # trans = criterions[0].asg.trans.data
        # args.asg_transitions = torch.flatten(trans).tolist()

    # Load dataset (possibly sharded)
    itr = get_dataset_itr(args, task, models)

    # Initialize generator
    gen_timer = StopwatchMeter()

    def build_generator(args):
        # Decoder classes are imported lazily, only for the selected type.
        w2l_decoder = getattr(args, "w2l_decoder", None)
        if w2l_decoder == "viterbi":
            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder

            return W2lViterbiDecoder(args, task.target_dictionary)
        elif w2l_decoder == "kenlm":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            return W2lKenLMDecoder(args, task.target_dictionary)
        elif w2l_decoder == "fairseqlm":
            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder

            return W2lFairseqLMDecoder(args, task.target_dictionary)
        else:
            # NOTE(review): this branch only prints and falls through, so the
            # generator is None for any other --w2l-decoder value.
            print(
                "only flashlight decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment"
            )

    # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task
    generator = build_generator(args)

    if args.load_emissions:
        # NOTE(review): allow_pickle=True unpickles the file content; only
        # load emission files from a trusted source.
        generator = ExistingEmissionsDecoder(
            generator, np.load(args.load_emissions, allow_pickle=True)
        )
        logger.info("loaded emissions from " + args.load_emissions)

    num_sentences = 0

    if args.results_path is not None and not os.path.exists(args.results_path):
        os.makedirs(args.results_path)

    # NOTE(review): max_source_pos is computed here but not read again in
    # this function.
    max_source_pos = (
        utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
    )

    if max_source_pos is not None:
        max_source_pos = max_source_pos[0]
        if max_source_pos is not None:
            max_source_pos = max_source_pos[0] - 1

    if args.dump_emissions:
        emissions = {}
    if args.dump_features:
        features = {}
        models[0].bert.proj = None
    else:
        res_files = prepare_result_files(args)
    # Running totals for the word error rate.
    errs_t = 0
    lengths_t = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if use_fp16:
                sample = utils.apply_to_sample(apply_half, sample)
            if "net_input" not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample["target"][:, : args.prefix_size]

            gen_timer.start()
            if args.dump_emissions:
                # Store the log-probabilities of the first model per sample
                # id and skip decoding.
                with torch.no_grad():
                    encoder_out = models[0](**sample["net_input"])
                    emm = models[0].get_normalized_probs(encoder_out, log_probs=True)
                    emm = emm.transpose(0, 1).cpu().numpy()
                    for i, id in enumerate(sample["id"]):
                        emissions[id.item()] = emm[i]
                    continue
            elif args.dump_features:
                # Store the encoder output and its padding mask per sample
                # id and skip decoding.
                with torch.no_grad():
                    encoder_out = models[0](**sample["net_input"])
                    feat = encoder_out["encoder_out"].transpose(0, 1).cpu().numpy()
                    for i, id in enumerate(sample["id"]):
                        padding = (
                            encoder_out["encoder_padding_mask"][i].cpu().numpy()
                            if encoder_out["encoder_padding_mask"] is not None
                            else None
                        )
                        features[id.item()] = (feat[i], padding)
                    continue
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample["id"].tolist()):
                speaker = None
                # id = task.dataset(args.gen_subset).ids[int(sample_id)]
                id = sample_id
                toks = (
                    sample["target"][i, :]
                    if "target_label" not in sample
                    else sample["target_label"][i, :]
                )
                target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu()
                # Process top predictions
                errs, length = process_predictions(
                    args,
                    hypos[i],
                    None,
                    tgt_dict,
                    target_tokens,
                    res_files,
                    speaker,
                    id,
                )
                errs_t += errs
                lengths_t += length

            wps_meter.update(num_generated_tokens)
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += (
                sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
            )

    wer = None
    if args.dump_emissions:
        # The stored ids are read back as 0..N-1 to restore dataset order.
        emm_arr = []
        for i in range(len(emissions)):
            emm_arr.append(emissions[i])
        np.save(args.dump_emissions, emm_arr)
        logger.info(f"saved {len(emissions)} emissions to {args.dump_emissions}")
    elif args.dump_features:
        feat_arr = []
        for i in range(len(features)):
            feat_arr.append(features[i])
        np.save(args.dump_features, feat_arr)
        logger.info(f"saved {len(features)} emissions to {args.dump_features}")
    else:
        if lengths_t > 0:
            # Word error rate in percent over all processed utterances.
            wer = errs_t * 100.0 / lengths_t
            logger.info(f"WER: {wer}")

        logger.info(
            "| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}"
            "sentences/s, {:.2f} tokens/s)".format(
                num_sentences,
                gen_timer.n,
                gen_timer.sum,
                num_sentences / gen_timer.sum,
                1.0 / gen_timer.avg,
            )
        )
        logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam))
    return task, wer


def make_parser():
    """Return the generation parser extended with the ASR evaluation options."""
    parser = options.get_generation_parser()
    parser = add_asr_eval_argument(parser)
    return parser


def cli_main():
    """Parse the command line and run ``main``."""
    parser = make_parser()
    args = options.parse_args_and_arch(parser)
    main(args)


if __name__ == "__main__":
    cli_main()
* * This is to make sure the resultant FST can do deduplication for repeated * symbols, which is very common in acoustic model * */ namespace { int32 AddSelfLoopsSimple(fst::StdVectorFst* fst) { typedef fst::MutableArcIterator<fst::StdVectorFst> IterType; int32 num_states_before = fst->NumStates(); fst::MakePrecedingInputSymbolsSame(false, fst); int32 num_states_after = fst->NumStates(); KALDI_LOG << "There are " << num_states_before << " states in the original FST; " << " after MakePrecedingInputSymbolsSame, there are " << num_states_after << " states " << std::endl; auto weight_one = fst::StdArc::Weight::One(); int32 num_arc_added = 0; fst::StdArc self_loop_arc; self_loop_arc.weight = weight_one; int32 num_states = fst->NumStates(); std::vector<std::set<int32>> incoming_non_eps_label_per_state(num_states); for (int32 state = 0; state < num_states; state++) { for (IterType aiter(fst, state); !aiter.Done(); aiter.Next()) { fst::StdArc arc(aiter.Value()); if (arc.ilabel != 0) { incoming_non_eps_label_per_state[arc.nextstate].insert(arc.ilabel); } } } for (int32 state = 0; state < num_states; state++) { if (!incoming_non_eps_label_per_state[state].empty()) { auto& ilabel_set = incoming_non_eps_label_per_state[state]; for (auto it = ilabel_set.begin(); it != ilabel_set.end(); it++) { self_loop_arc.ilabel = *it; self_loop_arc.olabel = 0; self_loop_arc.nextstate = state; fst->AddArc(state, self_loop_arc); num_arc_added++; } } } return num_arc_added; } void print_usage() { std::cout << "add-self-loop-simple usage:\n" "\tadd-self-loop-simple <in-fst> <out-fst> \n"; } } // namespace int main(int argc, char** argv) { if (argc != 3) { print_usage(); exit(1); } auto input = argv[1]; auto output = argv[2]; auto fst = fst::ReadFstKaldi(input); auto num_states = fst->NumStates(); KALDI_LOG << "Loading FST from " << input << " with " << num_states << " states." 
class KaldiDecoder(object):
    """Decode model emissions with a Kaldi HLG graph through pykaldi.

    ``generate`` runs the model(s) on a batch and ``decode`` submits one
    ``decode_one`` job per utterance to a thread pool, returning the futures.
    """

    def __init__(
        self,
        cfg: "KaldiDecoderConfig",
        beam: int,
        nbest: int = 1,
    ):
        """Load the decoding graph and configure the pykaldi decoder.

        Args:
            cfg: decoder configuration. If ``cfg.hlg_graph_path`` is None the
                graph is built with ``initalize_kaldi`` from
                ``cfg.kaldi_initializer_config`` and the path is stored back
                on ``cfg``.
            beam: value assigned to the decoder options' ``beam``.
            nbest: number of hypotheses per utterance; values above 1 require
                ``cfg.is_lattice``.

        Raises:
            ImportError: if pykaldi cannot be imported.
        """
        try:
            from kaldi.asr import FasterRecognizer, LatticeFasterRecognizer
            from kaldi.decoder import (
                FasterDecoder,
                FasterDecoderOptions,
                LatticeFasterDecoder,
                LatticeFasterDecoderOptions,
            )
            from kaldi.lat.functions import DeterminizeLatticePhonePrunedOptions
            from kaldi.fstext import read_fst_kaldi, SymbolTable
        except ImportError as e:
            # Fail here with an actionable message. Merely warning would let
            # construction continue and crash later on the unbound names.
            raise ImportError(
                "pykaldi is required for this functionality. Please install from https://github.com/pykaldi/pykaldi"
            ) from e

        self.acoustic_scale = cfg.acoustic_scale
        self.nbest = nbest

        if cfg.hlg_graph_path is None:
            assert (
                cfg.kaldi_initializer_config is not None
            ), "Must provide hlg graph path or kaldi initializer config"
            cfg.hlg_graph_path = initalize_kaldi(cfg.kaldi_initializer_config)

        assert os.path.exists(cfg.hlg_graph_path), cfg.hlg_graph_path

        if cfg.is_lattice:
            self.dec_cls = LatticeFasterDecoder
            opt_cls = LatticeFasterDecoderOptions
            self.rec_cls = LatticeFasterRecognizer
        else:
            assert self.nbest == 1, "nbest > 1 requires lattice decoder"
            self.dec_cls = FasterDecoder
            opt_cls = FasterDecoderOptions
            self.rec_cls = FasterRecognizer

        self.decoder_options = opt_cls()
        self.decoder_options.beam = beam
        self.decoder_options.max_active = cfg.max_active
        self.decoder_options.beam_delta = cfg.beam_delta
        self.decoder_options.hash_ratio = cfg.hash_ratio

        if cfg.is_lattice:
            self.decoder_options.lattice_beam = cfg.lattice_beam
            self.decoder_options.prune_interval = cfg.prune_interval
            self.decoder_options.determinize_lattice = cfg.determinize_lattice
            self.decoder_options.prune_scale = cfg.prune_scale
            det_opts = DeterminizeLatticePhonePrunedOptions()
            det_opts.max_mem = cfg.max_mem
            det_opts.phone_determinize = cfg.phone_determinize
            det_opts.word_determinize = cfg.word_determinize
            det_opts.minimize = cfg.minimize
            self.decoder_options.det_opts = det_opts

        # Maps integer id -> symbol; each line of the file is "<symbol> <id>".
        self.output_symbols = {}
        with open(cfg.output_dict, "r") as f:
            for line in f:
                items = line.rstrip().split()
                assert len(items) == 2
                self.output_symbols[int(items[1])] = items[0]

        logger.info(f"Loading FST from {cfg.hlg_graph_path}")
        self.fst = read_fst_kaldi(cfg.hlg_graph_path)
        self.symbol_table = SymbolTable.read_text(cfg.output_dict)

        self.executor = ThreadPoolExecutor(max_workers=cfg.num_threads)

    def generate(self, models, sample, **unused):
        """Generate a batch of inferences."""
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
        }
        emissions, padding = self.get_emissions(models, encoder_input)
        return self.decode(emissions, padding)

    def get_emissions(self, models, encoder_input):
        """Run encoder and normalize emissions

        With several models, their ``encoder_out`` (or ``logits``) tensors are
        averaged and the padding mask of the first model is used. Returns a
        tuple ``(emissions, padding)``: emissions are moved to CPU, cast to
        float and have their first two dimensions swapped; padding is None
        when the mask is missing or contains no padded position.
        """
        model = models[0]

        all_encoder_out = [m(**encoder_input) for m in models]

        if len(all_encoder_out) > 1:

            if "encoder_out" in all_encoder_out[0]:
                encoder_out = {
                    "encoder_out": sum(e["encoder_out"] for e in all_encoder_out)
                    / len(all_encoder_out),
                    "encoder_padding_mask": all_encoder_out[0]["encoder_padding_mask"],
                }
                padding = encoder_out["encoder_padding_mask"]
            else:
                encoder_out = {
                    "logits": sum(e["logits"] for e in all_encoder_out)
                    / len(all_encoder_out),
                    "padding_mask": all_encoder_out[0]["padding_mask"],
                }
                padding = encoder_out["padding_mask"]
        else:
            encoder_out = all_encoder_out[0]
            padding = (
                encoder_out["padding_mask"]
                if "padding_mask" in encoder_out
                else encoder_out["encoder_padding_mask"]
            )

        # Normalisation is always done by the first model.
        if hasattr(model, "get_logits"):
            emissions = model.get_logits(encoder_out, normalize=True)
        else:
            emissions = model.get_normalized_probs(encoder_out, log_probs=True)

        return (
            emissions.cpu().float().transpose(0, 1),
            padding.cpu() if padding is not None and padding.any() else None,
        )

    def decode_one(self, logits, padding):
        """Decode a single utterance.

        Args:
            logits: emissions of one utterance; rows selected by ``padding``
                are dropped before decoding.
            padding: boolean mask marking padded rows, or None.

        Returns:
            A list of hypothesis dicts with keys ``tokens``, ``words``,
            ``score`` and ``emissions``: up to ``self.nbest`` entries taken
            from the lattice when ``nbest > 1``, otherwise exactly one.
        """
        from kaldi.matrix import Matrix

        decoder = self.dec_cls(self.fst, self.decoder_options)
        asr = self.rec_cls(
            decoder, self.symbol_table, acoustic_scale=self.acoustic_scale
        )

        if padding is not None:
            logits = logits[~padding]

        mat = Matrix(logits.numpy())

        out = asr.decode(mat)

        if self.nbest > 1:
            from kaldi.fstext import shortestpath
            from kaldi.fstext.utils import (
                convert_compact_lattice_to_lattice,
                convert_lattice_to_std,
                convert_nbest_to_list,
                get_linear_symbol_sequence,
            )

            lat = out["lattice"]

            sp = shortestpath(lat, nshortest=self.nbest)

            sp = convert_compact_lattice_to_lattice(sp)
            sp = convert_lattice_to_std(sp)

            seq = convert_nbest_to_list(sp)

            results = []
            for s in seq:
                _, o, w = get_linear_symbol_sequence(s)
                words = [self.output_symbols[z] for z in o]
                results.append(
                    {
                        "tokens": words,
                        "words": words,
                        "score": w.value,
                        "emissions": logits,
                    }
                )
            return results
        else:
            words = out["text"].split()
            return [
                {
                    "tokens": words,
                    "words": words,
                    "score": out["likelihood"],
                    "emissions": logits,
                }
            ]

    def decode(self, emissions, padding):
        """Submit one ``decode_one`` job per utterance to the thread pool.

        Args:
            emissions: iterable of per-utterance emissions.
            padding: matching iterable of padding masks, or None when no
                utterance is padded.

        Returns:
            The list of objects returned by ``self.executor.submit``, in
            input order.
        """
        if padding is None:
            padding = [None] * len(emissions)

        return [
            self.executor.submit(self.decode_one, e, p)
            for e, p in zip(emissions, padding)
        ]
def create_units(fst_dir: Path, in_labels: str, vocab: Dictionary) -> Path:
    """Write the Kaldi input-unit symbol table unless it already exists.

    The file ``kaldi_dict.{in_labels}.txt`` starts with ``<eps> 0`` followed
    by every symbol of ``vocab`` after its first ``nspecial`` entries whose
    name does not start with ``madeupword``, numbered consecutively from 1.

    Returns:
        Path of the units file (freshly written or pre-existing).
    """
    in_units_file = fst_dir / f"kaldi_dict.{in_labels}.txt"
    if in_units_file.exists():
        return in_units_file

    logger.info(f"Creating {in_units_file}")
    with open(in_units_file, "w") as out:
        out.write("<eps> 0\n")
        kept = (
            s for s in vocab.symbols[vocab.nspecial :] if not s.startswith("madeupword")
        )
        # Ids stay dense because skipped symbols never reach enumerate().
        for idx, symb in enumerate(kept, start=1):
            out.write(f"{symb} {idx}\n")
    return in_units_file
def create_G(
    kaldi_root: Path, fst_dir: Path, lm_arpa: Path, arpa_base: str
) -> (Path, Path):
    """Run Kaldi's ``arpa2fst`` on an ARPA file unless both outputs exist.

    Args:
        kaldi_root: Kaldi checkout; ``src/lmbin/arpa2fst`` is taken from it.
        fst_dir: directory that receives the generated files.
        lm_arpa: path of the ARPA language model.
        arpa_base: label used in the output file names.

    Returns:
        ``(grammar_graph, out_words_file)``: the ``G_{arpa_base}.fst`` path
        and the ``kaldi_dict.{arpa_base}.txt`` symbol-table path.

    Raises:
        subprocess.CalledProcessError: if ``arpa2fst`` exits non-zero.
    """
    out_words_file = fst_dir / f"kaldi_dict.{arpa_base}.txt"
    grammar_graph = fst_dir / f"G_{arpa_base}.fst"

    # Both artefacts already present: nothing to rebuild.
    if grammar_graph.exists() and out_words_file.exists():
        return grammar_graph, out_words_file

    logger.info(f"Creating {grammar_graph}")
    command = [
        kaldi_root / "src/lmbin/arpa2fst",
        "--disambig-symbol=#0",
        f"--write-symbol-table={out_words_file}",
        lm_arpa,
        grammar_graph,
    ]
    subprocess.run(command, check=True)
    return grammar_graph, out_words_file
def create_LG(
    kaldi_root: Path,
    fst_dir: Path,
    unique_label: str,
    lexicon_graph: Path,
    grammar_graph: Path,
) -> Path:
    """Compose the lexicon and grammar graphs into ``LG.{unique_label}.fst``.

    Nothing is done when the output file already exists. Otherwise the Kaldi
    and OpenFst binaries under ``kaldi_root`` are chained, each stage reading
    the previous stage's stdout, and the final bytes are written to the file.

    Returns:
        Path of the LG graph.

    Raises:
        subprocess.CalledProcessError: if a checked stage exits non-zero; the
            output file is removed before re-raising.
    """
    lg_graph = fst_dir / f"LG.{unique_label}.fst"
    if lg_graph.exists():
        return lg_graph

    logger.info(f"Creating {lg_graph}")

    fsttablecompose = kaldi_root / "src/fstbin/fsttablecompose"
    fstdeterminizestar = kaldi_root / "src/fstbin/fstdeterminizestar"
    fstminimizeencoded = kaldi_root / "src/fstbin/fstminimizeencoded"
    fstpushspecial = kaldi_root / "src/fstbin/fstpushspecial"
    fstarcsort = kaldi_root / "tools/openfst-1.6.7/bin/fstarcsort"

    # (command, check) pairs, executed in order.
    # NOTE(review): the determinize stage is the only one run without
    # check=True -- confirm whether that is intentional.
    stages = [
        ([fsttablecompose, lexicon_graph, grammar_graph], True),
        ([fstdeterminizestar, "--use-log=true"], False),
        ([fstminimizeencoded], True),
        ([fstpushspecial], True),
        ([fstarcsort, "--sort_type=ilabel"], True),
    ]

    try:
        with open(lg_graph, "wb") as out_f:
            piped = None  # the first stage reads its inputs from the paths
            for command, must_succeed in stages:
                piped = subprocess.run(
                    command,
                    input=piped,
                    capture_output=True,
                    check=must_succeed,
                ).stdout
            out_f.write(piped)
    except subprocess.CalledProcessError as e:
        logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}")
        os.remove(lg_graph)
        raise
    return lg_graph
i_idx)) imap[s] = i_idx fst_str = [] node_idx = 0 root_node = node_idx special_symbols = [blk_sym] if silence_symbol is not None: special_symbols.append(silence_symbol) for ss in special_symbols: fst_str.append("{} {} {} {}".format(root_node, root_node, ss, eps_sym)) for symbol, _ in osymbols: if symbol == eps_sym or symbol.startswith("#"): continue node_idx += 1 # 1. from root to emitting state fst_str.append("{} {} {} {}".format(root_node, node_idx, symbol, symbol)) # 2. from emitting state back to root fst_str.append("{} {} {} {}".format(node_idx, root_node, eps_sym, eps_sym)) # 3. from emitting state to optional blank state pre_node = node_idx node_idx += 1 for ss in special_symbols: fst_str.append("{} {} {} {}".format(pre_node, node_idx, ss, eps_sym)) # 4. from blank state back to root fst_str.append("{} {} {} {}".format(node_idx, root_node, eps_sym, eps_sym)) fst_str.append("{}".format(root_node)) fst_str = "\n".join(fst_str) h_str = str(h_graph) isym_file = h_str + ".isym" with open(isym_file, "w") as f: for sym, id in isymbols: f.write("{} {}\n".format(sym, id)) with open(h_out_units_file, "w") as f: for sym, id in osymbols: f.write("{} {}\n".format(sym, id)) with open(disambig_in_units_file_int, "w") as f: disam_sym_id = len(isymbols) for _ in range(num_disambig): f.write("{}\n".format(disam_sym_id)) disam_sym_id += 1 fstcompile = kaldi_root / "tools/openfst-1.6.7/bin/fstcompile" fstaddselfloops = kaldi_root / "src/fstbin/fstaddselfloops" fstarcsort = kaldi_root / "tools/openfst-1.6.7/bin/fstarcsort" try: with open(h_graph, "wb") as out_f: res = subprocess.run( [ fstcompile, f"--isymbols={isym_file}", f"--osymbols={h_out_units_file}", "--keep_isymbols=false", "--keep_osymbols=false", ], input=str.encode(fst_str), capture_output=True, check=True, ) res = subprocess.run( [ fstaddselfloops, disambig_in_units_file_int, disambig_out_units_file_int, ], input=res.stdout, capture_output=True, check=True, ) res = subprocess.run( [fstarcsort, "--sort_type=olabel"], 
def _compose_with_h(
    kaldi_root: Path,
    h_graph: Path,
    other_graph: Path,
    disambig_in_words_file_int: Path,
    out_graph: Path,
) -> None:
    """Compose ``h_graph`` with ``other_graph``, post-process, write ``out_graph``.

    Stages, each reading the previous stage's stdout: fsttablecompose,
    fstdeterminizestar (``--use-log=true``), fstrmsymbols (with the given
    disambiguation-id file), fstrmepslocal and fstminimizeencoded. On a
    failing stage the error is logged, ``out_graph`` is removed and the
    ``CalledProcessError`` is re-raised.
    """
    fstbin = kaldi_root / "src" / "fstbin"
    stages = [
        [fstbin / "fsttablecompose", h_graph, other_graph],
        [fstbin / "fstdeterminizestar", "--use-log=true"],
        [fstbin / "fstrmsymbols", disambig_in_words_file_int],
        [fstbin / "fstrmepslocal"],
        [fstbin / "fstminimizeencoded"],
    ]

    try:
        with open(out_graph, "wb") as out_f:
            piped = None  # the first stage reads its inputs from the paths
            for command in stages:
                piped = subprocess.run(
                    command, input=piped, capture_output=True, check=True
                ).stdout
            out_f.write(piped)
    except subprocess.CalledProcessError as e:
        logger.error(f"cmd: {e.cmd}, err: {e.stderr.decode('utf-8')}")
        os.remove(out_graph)
        raise


def create_HLGa(
    kaldi_root: Path,
    fst_dir: Path,
    unique_label: str,
    h_graph: Path,
    lg_graph: Path,
    disambig_in_words_file_int: Path,
) -> Path:
    """Build ``HLGa.{unique_label}.fst`` from ``h_graph`` and ``lg_graph``.

    The file is only created when it does not exist yet.

    Returns:
        Path of the HLGa graph.

    Raises:
        subprocess.CalledProcessError: if any pipeline stage fails.
    """
    hlga_graph = fst_dir / f"HLGa.{unique_label}.fst"

    if not hlga_graph.exists():
        logger.info(f"Creating {hlga_graph}")
        _compose_with_h(
            kaldi_root, h_graph, lg_graph, disambig_in_words_file_int, hlga_graph
        )

    return hlga_graph


def create_HLa(
    kaldi_root: Path,
    fst_dir: Path,
    unique_label: str,
    h_graph: Path,
    l_graph: Path,
    disambig_in_words_file_int: Path,
) -> Path:
    """Build ``HLa.{unique_label}.fst`` from ``h_graph`` and ``l_graph``.

    Same pipeline as ``create_HLGa`` but composed with the lexicon graph
    only. The file is only created when it does not exist yet.

    Returns:
        Path of the HLa graph.

    Raises:
        subprocess.CalledProcessError: if any pipeline stage fails.
    """
    hla_graph = fst_dir / f"HLa.{unique_label}.fst"

    if not hla_graph.exists():
        logger.info(f"Creating {hla_graph}")
        _compose_with_h(
            kaldi_root, h_graph, l_graph, disambig_in_words_file_int, hla_graph
        )

    return hla_graph


def create_HLG(
    kaldi_root: Path,
    fst_dir: Path,
    unique_label: str,
    hlga_graph: Path,
    prefix: str = "HLG",
) -> Path:
    """Run ``add-self-loop-simple`` on ``hlga_graph`` to produce the final graph.

    The helper binary is expected next to this script; when it is missing it
    is compiled from ``add-self-loop-simple.cc`` with ``c++`` against the
    Kaldi sources and libraries under ``kaldi_root``. Nothing is done when
    ``{prefix}.{unique_label}.fst`` already exists.

    Returns:
        Path of the output graph.

    Raises:
        subprocess.CalledProcessError: if compilation or the tool itself fails.
    """
    hlg_graph = fst_dir / f"{prefix}.{unique_label}.fst"

    if hlg_graph.exists():
        return hlg_graph

    logger.info(f"Creating {hlg_graph}")

    add_self_loop = script_dir / "add-self-loop-simple"
    kaldi_src = kaldi_root / "src"
    kaldi_lib = kaldi_src / "lib"

    try:
        if not add_self_loop.exists():
            fst_include = kaldi_root / "tools/openfst-1.6.7/include"
            add_self_loop_src = script_dir / "add-self-loop-simple.cc"

            subprocess.run(
                [
                    "c++",
                    f"-I{kaldi_src}",
                    f"-I{fst_include}",
                    f"-L{kaldi_lib}",
                    add_self_loop_src,
                    "-lkaldi-base",
                    "-lkaldi-fstext",
                    "-o",
                    add_self_loop,
                ],
                check=True,
            )

        my_env = os.environ.copy()
        # LD_LIBRARY_PATH is frequently unset; indexing it directly would
        # raise KeyError. Prepend the Kaldi lib dir to whatever is there.
        inherited = my_env.get("LD_LIBRARY_PATH")
        my_env["LD_LIBRARY_PATH"] = (
            f"{kaldi_lib}:{inherited}" if inherited else str(kaldi_lib)
        )

        subprocess.run(
            [
                add_self_loop,
                hlga_graph,
                hlg_graph,
            ],
            check=True,
            capture_output=True,
            env=my_env,
        )
    except subprocess.CalledProcessError as e:
        # The compile step does not capture output, so stderr may be None.
        err = e.stderr.decode("utf-8") if e.stderr is not None else "<not captured>"
        logger.error(f"cmd: {e.cmd}, err: {err}")
        raise

    return hlg_graph
@hydra.main(config_path=config_path, config_name="kaldi_initializer")
def cli_main(cfg: KaldiInitializerConfig) -> None:
    """Hydra entry point: resolve the config, lock its structure, build the graphs."""
    # Round-trip through a plain container so interpolations are resolved
    # and enums become strings before the config is used.
    resolved = OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)
    cfg = OmegaConf.create(resolved)
    OmegaConf.set_struct(cfg, True)
    initalize_kaldi(cfg)
# Import every non-underscore ``.py`` file of this directory as a submodule of
# ``examples.speech_recognition.models`` (presumably so that the models they
# define get registered on import -- confirm against the modules themselves).
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if not file.endswith(".py") or file.startswith("_"):
        continue
    model_name = file[: file.find(".py")]
    importlib.import_module("examples.speech_recognition.models." + model_name)
@register_model("asr_vggtransformer")
class VGGTransformerModel(FairseqEncoderDecoderModel):
    """
    Transformers with convolutional context for ASR
    https://arxiv.org/abs/1904.11660
    """

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--input-feat-per-channel",
            type=int,
            metavar="N",
            help="encoder input dimension per input channel",
        )
        parser.add_argument(
            "--vggblock-enc-config",
            type=str,
            metavar="EXPR",
            help="""
    an array of tuples each containing the configuration of one vggblock:
    [(out_channels,
      conv_kernel_size,
      pooling_kernel_size,
      num_conv_layers,
      use_layer_norm), ...]
            """,
        )
        parser.add_argument(
            "--transformer-enc-config",
            type=str,
            metavar="EXPR",
            help="""
    a tuple containing the configuration of the encoder transformer layers
    configurations:
    [(input_dim,
      num_heads,
      ffn_dim,
      normalize_before,
      dropout,
      attention_dropout,
      relu_dropout), ...]
            """,
        )
        parser.add_argument(
            "--enc-output-dim",
            type=int,
            metavar="N",
            help="""
    encoder output dimension, can be None. If specified, projecting the
    transformer output to the specified dimension""",
        )
        parser.add_argument(
            "--in-channels",
            type=int,
            metavar="N",
            help="number of encoder input channels",
        )
        parser.add_argument(
            "--tgt-embed-dim",
            type=int,
            metavar="N",
            help="embedding dimension of the decoder target tokens",
        )
        parser.add_argument(
            "--transformer-dec-config",
            type=str,
            metavar="EXPR",
            help="""
    a tuple containing the configuration of the decoder transformer layers
    configurations:
    [(input_dim,
      num_heads,
      ffn_dim,
      normalize_before,
      dropout,
      attention_dropout,
      relu_dropout), ...]
            """,
        )
        parser.add_argument(
            "--conv-dec-config",
            type=str,
            metavar="EXPR",
            help="""
    an array of tuples for the decoder 1-D convolution config
        [(out_channels, conv_kernel_size, use_layer_norm), ...]""",
        )

    @classmethod
    def build_encoder(cls, args, task):
        """Build the VGG + transformer encoder from the parsed arguments."""
        # SECURITY NOTE: the *-config options are evaluated with eval(), so
        # they execute arbitrary Python; only pass trusted strings.
        # ast.literal_eval is not a drop-in replacement because configs may
        # be expressions such as "((32, 3, 2, 2, True),) * 2".
        return VGGTransformerEncoder(
            input_feat_per_channel=args.input_feat_per_channel,
            vggblock_config=eval(args.vggblock_enc_config),
            transformer_config=eval(args.transformer_enc_config),
            encoder_output_dim=args.enc_output_dim,
            in_channels=args.in_channels,
        )

    @classmethod
    def build_decoder(cls, args, task):
        """Build the transformer decoder over the task's target dictionary."""
        # SECURITY NOTE: eval() on command-line strings; trusted input only.
        return TransformerDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.tgt_embed_dim,
            transformer_config=eval(args.transformer_dec_config),
            conv_config=eval(args.conv_dec_config),
            encoder_output_dim=args.enc_output_dim,
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted
        # (in case there are any new ones)
        base_architecture(args)

        encoder = cls.build_encoder(args, task)
        decoder = cls.build_decoder(args, task)
        return cls(encoder, decoder)

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        """Return the parent's normalized probabilities, tagged ``batch_first``."""
        # net_output['encoder_out'] is a (B, T, D) tensor
        lprobs = super().get_normalized_probs(net_output, log_probs, sample)
        lprobs.batch_first = True
        return lprobs
def prepare_transformer_encoder_params(
    input_dim,
    num_heads,
    ffn_dim,
    normalize_before,
    dropout,
    attention_dropout,
    relu_dropout,
):
    """Pack one encoder-layer config tuple into an ``argparse.Namespace``.

    The namespace carries the ``encoder_*`` / dropout attribute names that
    are set below; ``relu_dropout`` is stored as ``activation_dropout``.
    """
    return argparse.Namespace(
        encoder_embed_dim=input_dim,
        encoder_attention_heads=num_heads,
        attention_dropout=attention_dropout,
        dropout=dropout,
        activation_dropout=relu_dropout,
        encoder_normalize_before=normalize_before,
        encoder_ffn_embed_dim=ffn_dim,
    )


def prepare_transformer_decoder_params(
    input_dim,
    num_heads,
    ffn_dim,
    normalize_before,
    dropout,
    attention_dropout,
    relu_dropout,
):
    """Pack one decoder-layer config tuple into an ``argparse.Namespace``.

    ``encoder_embed_dim`` is always set to None; ``relu_dropout`` is stored
    as ``activation_dropout``.
    """
    return argparse.Namespace(
        encoder_embed_dim=None,
        decoder_embed_dim=input_dim,
        decoder_attention_heads=num_heads,
        attention_dropout=attention_dropout,
        dropout=dropout,
        activation_dropout=relu_dropout,
        decoder_normalize_before=normalize_before,
        decoder_ffn_embed_dim=ffn_dim,
    )
vggblock_config: configuration of vggblock, see comments on DEFAULT_ENC_VGGBLOCK_CONFIG - transformer_config: configuration of transformer layer, see comments on DEFAULT_ENC_TRANSFORMER_CONFIG - encoder_output_dim: final transformer output embedding dimension - transformer_context: (left, right) if set, self-attention will be focused on (t-left, t+right) - transformer_sampling: an iterable of int, must match with len(transformer_config), transformer_sampling[i] indicates sampling factor for i-th transformer layer, after multihead att and feedfoward part """ super().__init__(None) self.num_vggblocks = 0 if vggblock_config is not None: if not isinstance(vggblock_config, Iterable): raise ValueError("vggblock_config is not iterable") self.num_vggblocks = len(vggblock_config) self.conv_layers = nn.ModuleList() self.in_channels = in_channels self.input_dim = input_feat_per_channel self.pooling_kernel_sizes = [] if vggblock_config is not None: for _, config in enumerate(vggblock_config): ( out_channels, conv_kernel_size, pooling_kernel_size, num_conv_layers, layer_norm, ) = config self.conv_layers.append( VGGBlock( in_channels, out_channels, conv_kernel_size, pooling_kernel_size, num_conv_layers, input_dim=input_feat_per_channel, layer_norm=layer_norm, ) ) self.pooling_kernel_sizes.append(pooling_kernel_size) in_channels = out_channels input_feat_per_channel = self.conv_layers[-1].output_dim transformer_input_dim = self.infer_conv_output_dim( self.in_channels, self.input_dim ) # transformer_input_dim is the output dimension of VGG part self.validate_transformer_config(transformer_config) self.transformer_context = self.parse_transformer_context(transformer_context) self.transformer_sampling = self.parse_transformer_sampling( transformer_sampling, len(transformer_config) ) self.transformer_layers = nn.ModuleList() if transformer_input_dim != transformer_config[0][0]: self.transformer_layers.append( Linear(transformer_input_dim, transformer_config[0][0]) ) 
self.transformer_layers.append( TransformerEncoderLayer( prepare_transformer_encoder_params(*transformer_config[0]) ) ) for i in range(1, len(transformer_config)): if transformer_config[i - 1][0] != transformer_config[i][0]: self.transformer_layers.append( Linear(transformer_config[i - 1][0], transformer_config[i][0]) ) self.transformer_layers.append( TransformerEncoderLayer( prepare_transformer_encoder_params(*transformer_config[i]) ) ) self.encoder_output_dim = encoder_output_dim self.transformer_layers.extend( [ Linear(transformer_config[-1][0], encoder_output_dim), LayerNorm(encoder_output_dim), ] ) def forward(self, src_tokens, src_lengths, **kwargs): """ src_tokens: padded tensor (B, T, C * feat) src_lengths: tensor of original lengths of input utterances (B,) """ bsz, max_seq_len, _ = src_tokens.size() x = src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) x = x.transpose(1, 2).contiguous() # (B, C, T, feat) for layer_idx in range(len(self.conv_layers)): x = self.conv_layers[layer_idx](x) bsz, _, output_seq_len, _ = x.size() # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> (T, B, C * feat) x = x.transpose(1, 2).transpose(0, 1) x = x.contiguous().view(output_seq_len, bsz, -1) input_lengths = src_lengths.clone() for s in self.pooling_kernel_sizes: input_lengths = (input_lengths.float() / s).ceil().long() encoder_padding_mask, _ = lengths_to_encoder_padding_mask( input_lengths, batch_first=True ) if not encoder_padding_mask.any(): encoder_padding_mask = None subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5) attn_mask = self.lengths_to_attn_mask(input_lengths, subsampling_factor) transformer_layer_idx = 0 for layer_idx in range(len(self.transformer_layers)): if isinstance(self.transformer_layers[layer_idx], TransformerEncoderLayer): x = self.transformer_layers[layer_idx]( x, encoder_padding_mask, attn_mask ) if self.transformer_sampling[transformer_layer_idx] != 1: sampling_factor = 
def parse_transformer_context(self, transformer_context):
    """
    Normalize the attention-context specification.

    Args:
        transformer_context: either ``None`` (full context) or a two-element
            sequence ``(left, right)``. A negative entry means unlimited
            context on that side, e.g. ``(5, 6)`` lets the query at ``t`` see
            ``[t-5, t+6]`` and ``(-1, 6)`` lets it see ``[0, t+6]``.

    Returns:
        ``None`` when no restriction applies (input is ``None`` or both sides
        are unlimited); otherwise a ``(left, right)`` tuple in which an
        unlimited side is represented by ``None``.

    Raises:
        ValueError: if the argument is not iterable or does not have exactly
            two elements.
    """
    if transformer_context is None:
        return None

    if not isinstance(transformer_context, Iterable):
        raise ValueError("transformer context must be Iterable if it is not None")

    if len(transformer_context) != 2:
        raise ValueError("transformer context must have length 2")

    # Negative values are the "infinite context" marker; map them to None.
    left_raw = transformer_context[0]
    left = None if left_raw < 0 else left_raw

    right_raw = transformer_context[1]
    right = None if right_raw < 0 else right_raw

    unrestricted = left is None and right is None
    return None if unrestricted else (left, right)
def lengths_to_attn_mask(self, input_lengths, subsampling_factor=1):
    """
    Build the self-attention mask implied by ``self.transformer_context``.

    Args:
        input_lengths: (B,)-shaped Int/Long tensor; ``input_lengths[b]`` is
            the length of the b-th sequence. Only its maximum and its device
            are used here.
        subsampling_factor: int. The left/right context is specified at the
            input-frame level, while the transformer input may already have
            been subsampled (e.g. by pooling in the VGG blocks), so both
            context sizes are divided by this factor and rounded up.

    Returns:
        ``None`` if ``self.transformer_context`` is ``None``; otherwise a
        ``(T, T)`` tensor with ``T = max(input_lengths)`` in which

        * ``attn_mask[t, :t - left_context] = 1`` when a left context is set,
        * ``attn_mask[t, t + right_context + 1:] = 1`` when a right context
          is set,
        * every other entry is 0,

        i.e. 1 marks key positions outside the allowed window of query ``t``.
    """
    if self.transformer_context is None:
        return None

    maxT = torch.max(input_lengths).item()
    attn_mask = torch.zeros(maxT, maxT)

    left_context = self.transformer_context[0]
    right_context = self.transformer_context[1]
    if left_context is not None:
        left_context = math.ceil(left_context / subsampling_factor)
    if right_context is not None:
        right_context = math.ceil(right_context / subsampling_factor)

    for t in range(maxT):
        if left_context is not None:
            # Everything strictly before t - left_context is out of reach.
            attn_mask[t, : max(0, t - left_context)] = 1
        if right_context is not None:
            # Do NOT clamp the start index to maxT - 1: that would mask the
            # last column even when it lies inside the window (for the final
            # frame it would mask the frame itself). A slice that starts past
            # the end is simply empty, which is the intended no-op.
            attn_mask[t, t + right_context + 1 :] = 1

    return attn_mask.to(input_lengths.device)
def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
    """
    Run the convolutional front-end, the decoder layers and the output
    projection.

    Args:
        prev_output_tokens (LongTensor): previous decoder outputs of shape
            `(batch, tgt_len)`, for input feeding/teacher forcing
        encoder_out (dict, optional): encoder output; the keys
            ``"encoder_out"`` and ``"encoder_padding_mask"`` are read. When
            ``None``, the decoder layers are called without encoder states.
        incremental_state (dict, optional): dictionary used for storing
            state during incremental decoding. When given, only the last
            target position is processed.

    Returns:
        tuple:
            - the output of the final projection, of shape
              `(batch, tgt_len, vocab)`
            - ``None`` (attention weights are not returned by this decoder)
    """
    incremental = incremental_state is not None

    # The target padding mask is only needed for full-sequence processing.
    target_padding_mask = (
        None
        if incremental
        else (prev_output_tokens == self.padding_idx).to(prev_output_tokens.device)
    )

    if incremental:
        prev_output_tokens = prev_output_tokens[:, -1:]

    # embed tokens
    x = self.embed_tokens(prev_output_tokens)

    # B x T x C -> T x B x C (full-sequence mode only)
    x = self._transpose_if_training(x, incremental_state)

    for layer in self.conv_layers:
        if isinstance(layer, LinearizedConvolution):
            x = layer(x, incremental_state)
        else:
            x = layer(x)

    # B x T x C -> T x B x C (incremental mode only)
    x = self._transpose_if_inference(x, incremental_state)

    # Resolve the encoder-side inputs once. The original code indexed
    # encoder_out["encoder_padding_mask"] even when encoder_out was None,
    # which raised TypeError despite the documented default of None.
    encoder_states = None
    encoder_padding_mask = None
    if encoder_out is not None:
        encoder_states = encoder_out["encoder_out"]
        if encoder_out["encoder_padding_mask"] is not None:
            encoder_padding_mask = encoder_out["encoder_padding_mask"].t()

    # decoder layers
    for layer in self.layers:
        if isinstance(layer, TransformerDecoderLayer):
            x, *_ = layer(
                x,
                encoder_states,
                encoder_padding_mask,
                incremental_state,
                self_attn_mask=(
                    None if incremental else self.buffered_future_mask(x)
                ),
                # Already None in incremental mode (see above).
                self_attn_padding_mask=target_padding_mask,
            )
        else:
            x = layer(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    x = self.fc_out(x)

    return x, None
@classmethod
def build_model(cls, args, task):
    """Construct the encoder-only model from parsed arguments.

    Missing options are first filled in by ``base_architecture_enconly``;
    the vocabulary size is taken from ``task.target_dictionary``.
    """
    base_architecture_enconly(args)

    # NOTE(security): the *_config / transformer_* options are Python
    # expressions given as strings and are evaluated with eval(); they must
    # come from a trusted command line or config only.
    encoder_kwargs = {
        "vocab_size": len(task.target_dictionary),
        "input_feat_per_channel": args.input_feat_per_channel,
        "vggblock_config": eval(args.vggblock_enc_config),
        "transformer_config": eval(args.transformer_enc_config),
        "encoder_output_dim": args.enc_output_dim,
        "in_channels": args.in_channels,
        "transformer_context": eval(args.transformer_context),
        "transformer_sampling": eval(args.transformer_sampling),
    }
    return cls(VGGTransformerEncoderOnly(**encoder_kwargs))
def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Create a weight-normalized ``LinearizedConvolution`` for decoding.

    The weight is drawn from a zero-mean normal distribution whose standard
    deviation is ``sqrt(4 * (1 - dropout) / (kernel_width * in_channels))``;
    the bias is set to zero. Extra keyword arguments are forwarded to
    ``LinearizedConvolution``.
    """
    conv = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)

    numerator = 4 * (1.0 - dropout)
    fan_in = conv.kernel_size[0] * in_channels
    init_std = math.sqrt(numerator / fan_in)

    nn.init.normal_(conv.weight, mean=0, std=init_std)
    nn.init.constant_(conv.bias, 0)

    return nn.utils.weight_norm(conv, dim=2)
@register_model_architecture("asr_vggtransformer", "vggtransformer_1")
def vggtransformer_1(args):
    """Fill in the ``vggtransformer_1`` defaults for options absent from *args*.

    Values already present on *args* are left untouched.
    """
    defaults = (
        ("input_feat_per_channel", 80),
        ("vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"),
        (
            "transformer_enc_config",
            "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 14",
        ),
        ("enc_output_dim", 1024),
        ("tgt_embed_dim", 128),
        ("conv_dec_config", "((256, 3, True),) * 4"),
        (
            "transformer_dec_config",
            "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 4",
        ),
    )
    for option, fallback in defaults:
        setattr(args, option, getattr(args, option, fallback))
@register_model_architecture("asr_vggtransformer", "vggtransformer_base")
def vggtransformer_base(args):
    """Fill in the ``vggtransformer_base`` defaults for options absent from *args*.

    Values already present on *args* are left untouched.
    """
    defaults = (
        ("input_feat_per_channel", 80),
        ("vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"),
        ("transformer_enc_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 12"),
        ("enc_output_dim", 512),
        ("tgt_embed_dim", 512),
        ("conv_dec_config", "((256, 3, True),) * 4"),
        ("transformer_dec_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 6"),
    )
    for option, fallback in defaults:
        setattr(args, option, getattr(args, option, fallback))
    # Size estimations:
    # Encoder:
    #   - vggblock param:
    #       64*1*3*3 + 64*64*3*3 + 128*64*3*3 + 128*128*3*3 = 258K
    #   Transformer:
    #   - input dimension adapter: 2560 x 512 -> 1.31M
    #   - transformer_layers (x12) --> 37.74M
    #       * MultiheadAttention: 512*512*3 (in_proj) + 512*512 (out_proj) = 1.048M
    #       * FFN weight: 512*2048*2 = 2.097M
    #   - output dimension adapter: 512 x 512 -> 0.26M
    # Decoder:
    #   - LinearizedConv1d: 512 * 256 * 3 + 256 * 256 * 3 * 3
    #   - transformer_layer: (x6) --> 25.16M
    #       * MultiheadAttention (self-attention): 512*512*3 + 512*512 = 1.048M
    #       * MultiheadAttention (encoder-attention): 512*512*3 + 512*512 = 1.048M
    #       * FFN: 512*2048*2 = 2.097M
    # Final FC:
    #   - FC: 512*5000 = 2.56M (assuming vocab size 5K)
    # In total (rough estimate):
    #       ~65 M
@register_model_architecture("asr_vggtransformer_encoder", "vggtransformer_enc_1")
def vggtransformer_enc_1(args):
    """Fill in the ``vggtransformer_enc_1`` defaults for options absent from *args*.

    Values already present on *args* are left untouched.
    """
    # vggtransformer_1 is the same as vggtransformer_enc_big, except the number
    # of layers is increased to 16.
    # Kept here for backward compatibility.
    defaults = (
        ("input_feat_per_channel", 80),
        ("vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"),
        (
            "transformer_enc_config",
            "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16",
        ),
        ("enc_output_dim", 1024),
    )
    for option, fallback in defaults:
        setattr(args, option, getattr(args, option, fallback))
import math import torch import torch.nn as nn import torch.nn.functional as F from fairseq.models import ( FairseqEncoder, FairseqEncoderModel, register_model, register_model_architecture, ) from fairseq.modules.fairseq_dropout import FairseqDropout default_conv_enc_config = """[ (400, 13, 170, 0.2), (440, 14, 0, 0.214), (484, 15, 0, 0.22898), (532, 16, 0, 0.2450086), (584, 17, 0, 0.262159202), (642, 18, 0, 0.28051034614), (706, 19, 0, 0.30014607037), (776, 20, 0, 0.321156295296), (852, 21, 0, 0.343637235966), (936, 22, 0, 0.367691842484), (1028, 23, 0, 0.393430271458), (1130, 24, 0, 0.42097039046), (1242, 25, 0, 0.450438317792), (1366, 26, 0, 0.481969000038), (1502, 27, 0, 0.51570683004), (1652, 28, 0, 0.551806308143), (1816, 29, 0, 0.590432749713), ]""" @register_model("asr_w2l_conv_glu_encoder") class W2lConvGluEncoderModel(FairseqEncoderModel): def __init__(self, encoder): super().__init__(encoder) @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--input-feat-per-channel", type=int, metavar="N", help="encoder input dimension per input channel", ) parser.add_argument( "--in-channels", type=int, metavar="N", help="number of encoder input channels", ) parser.add_argument( "--conv-enc-config", type=str, metavar="EXPR", help=""" an array of tuples each containing the configuration of one conv layer [(out_channels, kernel_size, padding, dropout), ...] 
class W2lConvGluEncoder(FairseqEncoder):
    """Stack of weight-normalized 1-D convolutions with GLU activations,
    followed by two weight-normalized linear layers that project to the
    vocabulary.

    Args:
        vocab_size: size of the output layer.
        input_feat_per_channel: number of input features per frame.
        in_channels: number of input channels; only ``1`` is supported.
        conv_enc_config: iterable of
            ``(out_channels, kernel_size, padding, dropout)`` tuples, one per
            convolution. ``out_channels`` must be even because the GLU halves
            the channel dimension.

    Raises:
        ValueError: if ``in_channels != 1`` or an ``out_channels`` is odd.
    """

    def __init__(
        self, vocab_size, input_feat_per_channel, in_channels, conv_enc_config
    ):
        super().__init__(None)

        self.input_dim = input_feat_per_channel
        if in_channels != 1:
            raise ValueError("only 1 input channel is currently supported")

        self.conv_layers = nn.ModuleList()
        self.linear_layers = nn.ModuleList()
        # Must be a ModuleList, not a plain list: otherwise the dropout
        # modules are not registered as submodules and train()/eval() never
        # reaches them, so their `training` flag would not follow the
        # model's. The dropout modules hold no parameters, so state-dict
        # keys are unaffected.
        self.dropouts = nn.ModuleList()
        cur_channels = input_feat_per_channel

        for out_channels, kernel_size, padding, dropout in conv_enc_config:
            layer = nn.Conv1d(cur_channels, out_channels, kernel_size, padding=padding)
            layer.weight.data.mul_(math.sqrt(3))  # match wav2letter init
            self.conv_layers.append(nn.utils.weight_norm(layer))
            self.dropouts.append(
                FairseqDropout(dropout, module_name=self.__class__.__name__)
            )
            if out_channels % 2 != 0:
                raise ValueError("odd # of out_channels is incompatible with GLU")
            cur_channels = out_channels // 2  # halved by GLU

        for out_channels in [2 * cur_channels, vocab_size]:
            layer = nn.Linear(cur_channels, out_channels)
            layer.weight.data.mul_(math.sqrt(3))
            self.linear_layers.append(nn.utils.weight_norm(layer))
            cur_channels = out_channels // 2

    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (B,)

        Returns a dict with
            "encoder_out": (T, B, vocab_size) tensor
            "encoder_padding_mask": (T, B) bool tensor, True at padded frames
        """
        B, T, _ = src_tokens.size()
        x = src_tokens.transpose(1, 2).contiguous()  # (B, feat, T) assuming C == 1

        for conv, dropout in zip(self.conv_layers, self.dropouts):
            x = conv(x)
            x = F.glu(x, dim=1)
            x = dropout(x)

        x = x.transpose(1, 2).contiguous()  # (B, T, channels)
        x = self.linear_layers[0](x)
        x = F.glu(x, dim=2)
        # The dropout of the last conv layer is reused after the first linear.
        x = self.dropouts[-1](x)
        x = self.linear_layers[1](x)

        # The convolutions are expected to preserve batch and time sizes.
        assert x.size(0) == B
        assert x.size(1) == T

        encoder_out = x.transpose(0, 1)  # (T, B, vocab_size)

        # Frame t of utterance b is padding iff t >= src_lengths[b].
        frame_idx = torch.arange(T, device=x.device).view(1, T)
        encoder_padding_mask = (
            frame_idx >= src_lengths.view(B, 1)
        ).t()  # (B x T) -> (T x B)

        return {
            "encoder_out": encoder_out,  # (T, B, vocab_size)
            "encoder_padding_mask": encoder_padding_mask,  # (T, B)
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder the batch dimension (dim 1) of both outputs by *new_order*."""
        encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
            1, new_order
        )
        encoder_out["encoder_padding_mask"] = encoder_out[
            "encoder_padding_mask"
        ].index_select(1, new_order)
        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return (1e6, 1e6)  # an arbitrary large number
## Usage Assuming a few variables: ```bash checkpoint=<path-to-checkpoint> data=<path-to-data-directory> lm_model=<path-to-language-model> lexicon=<path-to-lexicon> ``` Example usage for decoding a fine-tuned Wav2Vec model: ```bash python $FAIRSEQ_ROOT/examples/speech_recognition/new/infer.py --multirun \ task=audio_pretraining \ task.data=$data \ task.labels=ltr \ common_eval.path=$checkpoint \ decoding.type=kenlm \ decoding.lexicon=$lexicon \ decoding.lmpath=$lm_model \ dataset.gen_subset=dev_clean,dev_other,test_clean,test_other ``` Example usage for using Ax to sweep WER parameters (requires `pip install hydra-ax-sweeper`): ```bash python $FAIRSEQ_ROOT/examples/speech_recognition/new/infer.py --multirun \ hydra/sweeper=ax \ task=audio_pretraining \ task.data=$data \ task.labels=ltr \ common_eval.path=$checkpoint \ decoding.type=kenlm \ decoding.lexicon=$lexicon \ decoding.lmpath=$lm_model \ dataset.gen_subset=dev_other ``` ================================================ FILE: examples/speech_recognition/new/__init__.py ================================================ ================================================ FILE: examples/speech_recognition/new/conf/hydra/sweeper/ax.yaml ================================================ # @package hydra.sweeper _target_: hydra_plugins.hydra_ax_sweeper.ax_sweeper.AxSweeper max_batch_size: null ax_config: max_trials: 128 early_stop: minimize: true max_epochs_without_improvement: 10 epsilon: 0.025 experiment: name: ${dataset.gen_subset} objective_name: wer minimize: true parameter_constraints: null outcome_constraints: null status_quo: null client: verbose_logging: false random_seed: null params: decoding.lmweight: type: range bounds: [0.0, 5.0] decoding.wordscore: type: range bounds: [-5.0, 5.0] decoding.silweight: type: range bounds: [ -8.0, 0.0 ] ================================================ FILE: examples/speech_recognition/new/conf/hydra/sweeper/ax_sil.yaml ================================================ # 
@package hydra.sweeper _target_: hydra_plugins.hydra_ax_sweeper.ax_sweeper.AxSweeper max_batch_size: null ax_config: max_trials: 64 early_stop: minimize: true max_epochs_without_improvement: 10 epsilon: 0.025 experiment: name: ${dataset.gen_subset} objective_name: wer minimize: true parameter_constraints: null outcome_constraints: null status_quo: null client: verbose_logging: false random_seed: null params: decoding.lmweight: type: range bounds: [0.0, 10.0] decoding.wordscore: type: range bounds: [-10.0, 10.0] decoding.silweight: type: range bounds: [ -10.0, 0.0 ] ================================================ FILE: examples/speech_recognition/new/conf/infer.yaml ================================================ # @package _group_ defaults: - task: null - model: null hydra: run: dir: ${common_eval.results_path}/${dataset.gen_subset} sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${common_eval.results_path} subdir: ${dataset.gen_subset} common: user_dir: /private/home/abaevski/fairseq-py/examples/data2vec common_eval: results_path: null path: null post_process: letter quiet: true dataset: max_tokens: 3000000 gen_subset: test distributed_training: distributed_world_size: 1 decoding: beam: 5 type: viterbi ================================================ FILE: examples/speech_recognition/new/conf/run_config/fb_slurm_1.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - common_eval.path sweep: dir: /checkpoint/abaevski/asr/d2v2/decoding/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} # subdir: ${hydra.job.override_dirname} launcher: cpus_per_task: 16 gpus_per_node: 1 tasks_per_node: 1 nodes: 1 partition: devlab,learnlab mem_gb: 100 timeout_min: 2000 max_num_timeout: 10 name: ${env:PREFIX}_${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir}/%j 
constraint: volta32gb exclude: learnfair7598 ================================================ FILE: examples/speech_recognition/new/conf/run_config/fb_slurm_2g.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - common_eval.path sweep: dir: /checkpoint/abaevski/asr/d2v2/decoding/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} # subdir: ${hydra.job.override_dirname} launcher: cpus_per_task: 16 gpus_per_node: 2 tasks_per_node: 2 nodes: 1 partition: devlab,learnlab mem_gb: 100 timeout_min: 2000 max_num_timeout: 10 name: ${env:PREFIX}_${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir}/%j constraint: volta32gb ================================================ FILE: examples/speech_recognition/new/decoders/__init__.py ================================================ ================================================ FILE: examples/speech_recognition/new/decoders/base_decoder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class BaseDecoder:
    """Common scaffolding for decoders that turn model emissions into
    hypotheses; subclasses implement :meth:`decode`.

    The blank index is that of ``<ctc_blank>`` when the dictionary has it,
    otherwise the dictionary's BOS index. The silence index is that of
    ``<sep>``, else ``|``, else the dictionary's EOS index.
    """

    def __init__(self, tgt_dict: Dictionary) -> None:
        self.tgt_dict = tgt_dict
        self.vocab_size = len(tgt_dict)

        if "<ctc_blank>" in tgt_dict.indices:
            self.blank = tgt_dict.index("<ctc_blank>")
        else:
            self.blank = tgt_dict.bos()

        # First matching candidate wins; fall back to EOS if none is present.
        for candidate in ("<sep>", "|"):
            if candidate in tgt_dict.indices:
                self.silence = tgt_dict.index(candidate)
                break
        else:
            self.silence = tgt_dict.eos()

    def generate(
        self, models: List[FairseqModel], sample: Dict[str, Any], **unused
    ) -> List[List[Dict[str, torch.LongTensor]]]:
        """Compute emissions for ``sample["net_input"]`` and decode them.

        ``prev_output_tokens`` is dropped from the network input before the
        model is called.
        """
        encoder_input = dict(sample["net_input"].items())
        encoder_input.pop("prev_output_tokens", None)
        return self.decode(self.get_emissions(models, encoder_input))

    def get_emissions(
        self,
        models: List[FairseqModel],
        encoder_input: Dict[str, Any],
    ) -> torch.FloatTensor:
        """Run the first model and return its emissions as a contiguous
        float CPU tensor with dimensions 0 and 1 swapped.

        ``get_logits`` is used when the model provides it, otherwise
        ``get_normalized_probs(..., log_probs=True)``.
        """
        first_model = models[0]
        net_output = first_model(**encoder_input)
        if not hasattr(first_model, "get_logits"):
            scores = first_model.get_normalized_probs(net_output, log_probs=True)
        else:
            scores = first_model.get_logits(net_output)
        swapped = scores.transpose(0, 1)
        return swapped.float().cpu().contiguous()

    def get_tokens(self, idxs: torch.IntTensor) -> torch.LongTensor:
        """Collapse consecutive repeats in *idxs*, then drop blank symbols."""
        kept = [
            token for token, _ in it.groupby(idxs) if token != self.blank
        ]
        return torch.LongTensor(kept)

    def decode(
        self,
        emissions: torch.FloatTensor,
    ) -> List[List[Dict[str, torch.LongTensor]]]:
        """Decode emissions into hypotheses; must be provided by subclasses."""
        raise NotImplementedError
def Decoder(
    cfg: Union[DecoderConfig, FlashlightDecoderConfig], tgt_dict: Dictionary
) -> BaseDecoder:
    """Instantiate the decoder selected by ``cfg.type``.

    Args:
        cfg: decoding configuration; ``cfg.type`` is one of ``"viterbi"``,
            ``"kenlm"`` or ``"fairseqlm"``.
        tgt_dict: target dictionary handed to the decoder.

    Returns:
        A ``ViterbiDecoder``, ``KenLMDecoder`` or ``FairseqLMDecoder``.

    Raises:
        NotImplementedError: if ``cfg.type`` is not a supported decoder.

    The decoder modules are imported lazily so that only the selected
    decoder's module is imported.
    """
    if cfg.type == "viterbi":
        from .viterbi_decoder import ViterbiDecoder

        return ViterbiDecoder(tgt_dict)

    if cfg.type == "kenlm":
        from .flashlight_decoder import KenLMDecoder

        return KenLMDecoder(cfg, tgt_dict)

    if cfg.type == "fairseqlm":
        from .flashlight_decoder import FairseqLMDecoder

        return FairseqLMDecoder(cfg, tgt_dict)

    # The selector field is `type`; the config classes define no `name`
    # attribute, so formatting `cfg.name` here would fail before the
    # intended error could be raised.
    raise NotImplementedError(f"Invalid decoder name: {cfg.type}")
import math
from dataclasses import dataclass, field
from typing import Optional

from fairseq.dataclass.configs import FairseqDataclass
from fairseq.dataclass.constants import ChoiceEnum
from omegaconf import MISSING


# Closed set of decoder types that can be selected through `DecoderConfig.type`.
DECODER_CHOICES = ChoiceEnum(["viterbi", "kenlm", "fairseqlm"])


@dataclass
class DecoderConfig(FairseqDataclass):
    """Selects which decoder implementation to use (defaults to "viterbi")."""

    type: DECODER_CHOICES = field(
        default="viterbi",
        metadata={"help": "The type of decoder to use"},
    )


@dataclass
class FlashlightDecoderConfig(FairseqDataclass):
    """Options for the language-model based (flashlight) decoders."""

    nbest: int = field(
        default=1,
        metadata={"help": "Number of decodings to return"},
    )
    unitlm: bool = field(
        default=False,
        metadata={"help": "If set, use unit language model"},
    )
    # No usable default: MISSING means the path must be supplied by the user.
    lmpath: str = field(
        default=MISSING,
        metadata={"help": "Language model for KenLM decoder"},
    )
    lexicon: Optional[str] = field(
        default=None,
        metadata={"help": "Lexicon for Flashlight decoder"},
    )
    beam: int = field(
        default=50,
        metadata={"help": "Number of beams to use for decoding"},
    )
    beamthreshold: float = field(
        default=50.0,
        metadata={"help": "Threshold for beam search decoding"},
    )
    beamsizetoken: Optional[int] = field(
        default=None, metadata={"help": "Beam size to use"}
    )
    wordscore: float = field(
        default=-1,
        metadata={"help": "Word score for KenLM decoder"},
    )
    # Defaults to negative infinity.
    unkweight: float = field(
        default=-math.inf,
        metadata={"help": "Unknown weight for KenLM decoder"},
    )
    silweight: float = field(
        default=0,
        metadata={"help": "Silence weight for KenLM decoder"},
    )
    lmweight: float = field(
        default=2,
        metadata={"help": "Weight for LM while interpolating score"},
    )


================================================
FILE: examples/speech_recognition/new/decoders/flashlight_decoder.py
================================================
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import gc import os.path as osp import warnings from collections import deque, namedtuple from typing import Any, Dict, Tuple import numpy as np import torch from fairseq import tasks from fairseq.data.dictionary import Dictionary from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.models.fairseq_model import FairseqModel from fairseq.utils import apply_to_sample from omegaconf import open_dict, OmegaConf from typing import List from .decoder_config import FlashlightDecoderConfig from .base_decoder import BaseDecoder try: from flashlight.lib.text.decoder import ( LM, CriterionType, DecodeResult, KenLM, LexiconDecoder, LexiconDecoderOptions, LexiconFreeDecoder, LexiconFreeDecoderOptions, LMState, SmearingMode, Trie, ) from flashlight.lib.text.dictionary import create_word_dict, load_words from flashlight.lib.text.dictionary import Dictionary as flDictionary except ImportError: warnings.warn( "flashlight python bindings are required to use this functionality. " "Please install from " "https://github.com/facebookresearch/flashlight/tree/master/bindings/python" ) LM = object LMState = object class KenLMDecoder(BaseDecoder): def __init__(self, cfg: FlashlightDecoderConfig, tgt_dict: Dictionary) -> None: super().__init__(tgt_dict) self.nbest = cfg.nbest self.unitlm = cfg.unitlm if cfg.lexicon: self.lexicon = load_words(cfg.lexicon) self.word_dict = create_word_dict(self.lexicon) self.unk_word = self.word_dict.get_index("<unk>") self.lm = KenLM(cfg.lmpath, self.word_dict) self.trie = Trie(self.vocab_size, self.silence) start_state = self.lm.start(False) for word, spellings in self.lexicon.items(): word_idx = self.word_dict.get_index(word) _, score = self.lm.score(start_state, word_idx) for spelling in spellings: spelling_idxs = [tgt_dict.index(token) for token in spelling] assert ( tgt_dict.unk() not in spelling_idxs ), f"{word} {spelling} {spelling_idxs}" self.trie.insert(spelling_idxs, word_idx, score) self.trie.smear(SmearingMode.MAX) 
self.decoder_opts = LexiconDecoderOptions( beam_size=cfg.beam, beam_size_token=cfg.beamsizetoken or len(tgt_dict), beam_threshold=cfg.beamthreshold, lm_weight=cfg.lmweight, word_score=cfg.wordscore, unk_score=cfg.unkweight, sil_score=cfg.silweight, log_add=False, criterion_type=CriterionType.CTC, ) self.decoder = LexiconDecoder( self.decoder_opts, self.trie, self.lm, self.silence, self.blank, self.unk_word, [], self.unitlm, ) else: assert self.unitlm, "Lexicon-free decoding requires unit LM" self.word_dict = flDictionary() for sym in tgt_dict.symbols: self.word_dict.add_entry(sym, tgt_dict.index(sym)) self.lm = KenLM(cfg.lmpath, self.word_dict) self.decoder_opts = LexiconFreeDecoderOptions( beam_size=cfg.beam, beam_size_token=cfg.beamsizetoken or len(tgt_dict), beam_threshold=cfg.beamthreshold, lm_weight=cfg.lmweight, sil_score=cfg.silweight, log_add=False, criterion_type=CriterionType.CTC, ) self.decoder = LexiconFreeDecoder( self.decoder_opts, self.lm, self.silence, self.blank, [] ) def get_timesteps(self, token_idxs: List[int]) -> List[int]: """Returns frame numbers corresponding to every non-blank token. Parameters ---------- token_idxs : List[int] IDs of decoded tokens. Returns ------- List[int] Frame numbers corresponding to every non-blank token. 
""" timesteps = [] for i, token_idx in enumerate(token_idxs): if token_idx == self.blank: continue if i == 0 or token_idx != token_idxs[i-1]: timesteps.append(i) return timesteps def decode( self, emissions: torch.FloatTensor, ) -> List[List[Dict[str, torch.LongTensor]]]: B, T, N = emissions.size() hypos = [] for b in range(B): emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) results = self.decoder.decode(emissions_ptr, T, N) nbest_results = results[: self.nbest] hypos.append( [ { "tokens": self.get_tokens(result.tokens), "score": result.score, "timesteps": self.get_timesteps(result.tokens), "words": [ self.word_dict.get_entry(x) for x in result.words if x >= 0 ], } for result in nbest_results ] ) return hypos FairseqLMState = namedtuple( "FairseqLMState", [ "prefix", "incremental_state", "probs", ], ) class FairseqLM(LM): def __init__(self, dictionary: Dictionary, model: FairseqModel) -> None: super().__init__() self.dictionary = dictionary self.model = model self.unk = self.dictionary.unk() self.save_incremental = False # this currently does not work properly self.max_cache = 20_000 if torch.cuda.is_available(): model.cuda() model.eval() model.make_generation_fast_() self.states = {} self.stateq = deque() def start(self, start_with_nothing: bool) -> LMState: state = LMState() prefix = torch.LongTensor([[self.dictionary.eos()]]) incremental_state = {} if self.save_incremental else None with torch.no_grad(): res = self.model(prefix.cuda(), incremental_state=incremental_state) probs = self.model.get_normalized_probs(res, log_probs=True, sample=None) if incremental_state is not None: incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state) self.states[state] = FairseqLMState( prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy() ) self.stateq.append(state) return state def score( self, state: LMState, token_index: int, no_cache: bool = False, ) -> Tuple[LMState, int]: """ Evaluate language model based on the current lm state 
and new word Parameters: ----------- state: current lm state token_index: index of the word (can be lexicon index then you should store inside LM the mapping between indices of lexicon and lm, or lm index of a word) Returns: -------- (LMState, float): pair of (new state, score for the current word) """ curr_state = self.states[state] def trim_cache(targ_size: int) -> None: while len(self.stateq) > targ_size: rem_k = self.stateq.popleft() rem_st = self.states[rem_k] rem_st = FairseqLMState(rem_st.prefix, None, None) self.states[rem_k] = rem_st if curr_state.probs is None: new_incremental_state = ( curr_state.incremental_state.copy() if curr_state.incremental_state is not None else None ) with torch.no_grad(): if new_incremental_state is not None: new_incremental_state = apply_to_sample( lambda x: x.cuda(), new_incremental_state ) elif self.save_incremental: new_incremental_state = {} res = self.model( torch.from_numpy(curr_state.prefix).cuda(), incremental_state=new_incremental_state, ) probs = self.model.get_normalized_probs( res, log_probs=True, sample=None ) if new_incremental_state is not None: new_incremental_state = apply_to_sample( lambda x: x.cpu(), new_incremental_state ) curr_state = FairseqLMState( curr_state.prefix, new_incremental_state, probs[0, -1].cpu().numpy() ) if not no_cache: self.states[state] = curr_state self.stateq.append(state) score = curr_state.probs[token_index].item() trim_cache(self.max_cache) outstate = state.child(token_index) if outstate not in self.states and not no_cache: prefix = np.concatenate( [curr_state.prefix, torch.LongTensor([[token_index]])], -1 ) incr_state = curr_state.incremental_state self.states[outstate] = FairseqLMState(prefix, incr_state, None) if token_index == self.unk: score = float("-inf") return outstate, score def finish(self, state: LMState) -> Tuple[LMState, int]: """ Evaluate eos for language model based on the current lm state Returns: -------- (LMState, float): pair of (new state, score for the current 
word) """ return self.score(state, self.dictionary.eos()) def empty_cache(self) -> None: self.states = {} self.stateq = deque() gc.collect() class FairseqLMDecoder(BaseDecoder): def __init__(self, cfg: FlashlightDecoderConfig, tgt_dict: Dictionary) -> None: super().__init__(tgt_dict) self.nbest = cfg.nbest self.unitlm = cfg.unitlm self.lexicon = load_words(cfg.lexicon) if cfg.lexicon else None self.idx_to_wrd = {} checkpoint = torch.load(cfg.lmpath, map_location="cpu") if "cfg" in checkpoint and checkpoint["cfg"] is not None: lm_args = checkpoint["cfg"] else: lm_args = convert_namespace_to_omegaconf(checkpoint["args"]) if not OmegaConf.is_dict(lm_args): lm_args = OmegaConf.create(lm_args) with open_dict(lm_args.task): lm_args.task.data = osp.dirname(cfg.lmpath) task = tasks.setup_task(lm_args.task) model = task.build_model(lm_args.model) model.load_state_dict(checkpoint["model"], strict=False) self.trie = Trie(self.vocab_size, self.silence) self.word_dict = task.dictionary self.unk_word = self.word_dict.unk() self.lm = FairseqLM(self.word_dict, model) if self.lexicon: start_state = self.lm.start(False) for i, (word, spellings) in enumerate(self.lexicon.items()): if self.unitlm: word_idx = i self.idx_to_wrd[i] = word score = 0 else: word_idx = self.word_dict.index(word) _, score = self.lm.score(start_state, word_idx, no_cache=True) for spelling in spellings: spelling_idxs = [tgt_dict.index(token) for token in spelling] assert ( tgt_dict.unk() not in spelling_idxs ), f"{spelling} {spelling_idxs}" self.trie.insert(spelling_idxs, word_idx, score) self.trie.smear(SmearingMode.MAX) self.decoder_opts = LexiconDecoderOptions( beam_size=cfg.beam, beam_size_token=cfg.beamsizetoken or len(tgt_dict), beam_threshold=cfg.beamthreshold, lm_weight=cfg.lmweight, word_score=cfg.wordscore, unk_score=cfg.unkweight, sil_score=cfg.silweight, log_add=False, criterion_type=CriterionType.CTC, ) self.decoder = LexiconDecoder( self.decoder_opts, self.trie, self.lm, self.silence, self.blank, 
self.unk_word, [], self.unitlm, ) else: assert self.unitlm, "Lexicon-free decoding requires unit LM" d = {w: [[w]] for w in tgt_dict.symbols} self.word_dict = create_word_dict(d) self.lm = KenLM(cfg.lmpath, self.word_dict) self.decoder_opts = LexiconFreeDecoderOptions( beam_size=cfg.beam, beam_size_token=cfg.beamsizetoken or len(tgt_dict), beam_threshold=cfg.beamthreshold, lm_weight=cfg.lmweight, sil_score=cfg.silweight, log_add=False, criterion_type=CriterionType.CTC, ) self.decoder = LexiconFreeDecoder( self.decoder_opts, self.lm, self.silence, self.blank, [] ) def decode( self, emissions: torch.FloatTensor, ) -> List[List[Dict[str, torch.LongTensor]]]: B, T, N = emissions.size() hypos = [] def make_hypo(result: DecodeResult) -> Dict[str, Any]: hypo = { "tokens": self.get_tokens(result.tokens), "score": result.score, } if self.lexicon: hypo["words"] = [ self.idx_to_wrd[x] if self.unitlm else self.word_dict[x] for x in result.words if x >= 0 ] return hypo for b in range(B): emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) results = self.decoder.decode(emissions_ptr, T, N) nbest_results = results[: self.nbest] hypos.append([make_hypo(result) for result in nbest_results]) self.lm.empty_cache() return hypos ================================================ FILE: examples/speech_recognition/new/decoders/viterbi_decoder.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch from typing import List, Dict from .base_decoder import BaseDecoder class ViterbiDecoder(BaseDecoder): def decode( self, emissions: torch.FloatTensor, ) -> List[List[Dict[str, torch.LongTensor]]]: def get_pred(e): score = e.log_softmax(dim=-1).max(dim=-1)[0].sum() toks = e.argmax(dim=-1).unique_consecutive() return {"tokens":toks[toks != self.blank], "score":score} return [[get_pred(x)] for x in emissions] ================================================ FILE: examples/speech_recognition/new/infer.py ================================================ #!/usr/bin/env python -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import ast import hashlib import logging import os import shutil import sys import re from dataclasses import dataclass, field, is_dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union import editdistance import torch import torch.distributed as dist from examples.speech_recognition.new.decoders.decoder_config import ( DecoderConfig, FlashlightDecoderConfig, ) from examples.speech_recognition.new.decoders.decoder import Decoder from fairseq import checkpoint_utils, distributed_utils, progress_bar, tasks, utils from fairseq.data.data_utils import post_process from fairseq.dataclass.configs import ( CheckpointConfig, CommonConfig, CommonEvalConfig, DatasetConfig, DistributedTrainingConfig, FairseqDataclass, ) from fairseq.logging.meters import StopwatchMeter, TimeMeter from fairseq.logging.progress_bar import BaseProgressBar from fairseq.models.fairseq_model import FairseqModel from omegaconf import OmegaConf import hydra from hydra.core.config_store import ConfigStore logging.root.setLevel(logging.INFO) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) config_path = Path(__file__).resolve().parent / "conf" @dataclass class 
DecodingConfig(DecoderConfig, FlashlightDecoderConfig): unique_wer_file: bool = field( default=False, metadata={"help": "If set, use a unique file for storing WER"}, ) results_path: Optional[str] = field( default=None, metadata={ "help": "If set, write hypothesis and reference sentences into this directory" }, ) @dataclass class InferConfig(FairseqDataclass): task: Any = None decoding: DecodingConfig = DecodingConfig() common: CommonConfig = CommonConfig() common_eval: CommonEvalConfig = CommonEvalConfig() checkpoint: CheckpointConfig = CheckpointConfig() distributed_training: DistributedTrainingConfig = DistributedTrainingConfig() dataset: DatasetConfig = DatasetConfig() is_ax: bool = field( default=False, metadata={ "help": "if true, assumes we are using ax for tuning and returns a tuple for ax to consume" }, ) def reset_logging(): root = logging.getLogger() for handler in root.handlers: root.removeHandler(handler) root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper()) handler = logging.StreamHandler(sys.stdout) handler.setFormatter( logging.Formatter( fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) ) root.addHandler(handler) class InferenceProcessor: cfg: InferConfig def __init__(self, cfg: InferConfig) -> None: self.cfg = cfg self.task = tasks.setup_task(cfg.task) models, saved_cfg = self.load_model_ensemble() ### LOAD ADAPTER #### ckpt_obj = checkpoint_utils.load_checkpoint_to_cpu(self.cfg.common_eval.path) if "adapter" in ckpt_obj: target_lang = self.cfg.dataset.gen_subset.split(":")[0] assert target_lang in ckpt_obj["adapter"] logger.info(f">>> LOADING ADAPTER: {target_lang}") ft_obj = ckpt_obj["adapter"][target_lang] ft_model = ft_obj["model"] cdevice = models[0].w2v_encoder.proj.weight.device cdtype = models[0].w2v_encoder.proj.weight.dtype ft_proj_out, ft_proj_in = ft_model["w2v_encoder.proj.weight"].shape ft_proj = torch.nn.Linear(ft_proj_in, ft_proj_out, bias=True) ft_proj.to(device=cdevice, dtype=cdtype) 
models[0].w2v_encoder.proj = ft_proj with torch.no_grad(): for kk, vv in models[0].named_parameters(): if kk in ft_model: vv.copy_(ft_model[kk]) self.task.load_state_dict(ft_obj["task_state"]) # overwrite gen_subset with master config self.cfg.dataset.gen_subset = re.sub('^[\w-]+:', saved_cfg['task']['multi_corpus_keys']+":", self.cfg.dataset.gen_subset) self.models = models self.saved_cfg = saved_cfg self.tgt_dict = self.task.target_dictionary self.task.load_dataset( self.cfg.dataset.gen_subset, task_cfg=saved_cfg.task, ) self.generator = Decoder(cfg.decoding, self.tgt_dict) self.gen_timer = StopwatchMeter() self.wps_meter = TimeMeter() self.num_sentences = 0 self.total_errors = 0 self.total_length = 0 self.hypo_words_file = None self.hypo_units_file = None self.ref_words_file = None self.ref_units_file = None self.score_file = None self.progress_bar = self.build_progress_bar() def __enter__(self) -> "InferenceProcessor": if self.cfg.decoding.results_path is not None: self.hypo_words_file = self.get_res_file("hypo.word") self.hypo_units_file = self.get_res_file("hypo.units") self.ref_words_file = self.get_res_file("ref.word") self.ref_units_file = self.get_res_file("ref.units") self.score_file = self.get_res_file("asr_score") return self def __exit__(self, *exc) -> bool: if self.cfg.decoding.results_path is not None: self.hypo_words_file.close() self.hypo_units_file.close() self.ref_words_file.close() self.ref_units_file.close() self.score_file.close() return False def __iter__(self) -> Any: for sample in self.progress_bar: if not self.cfg.common.cpu: sample = utils.move_to_cuda(sample) # Happens on the last batch. 
if "net_input" not in sample: continue yield sample def log(self, *args, **kwargs): self.progress_bar.log(*args, **kwargs) def print(self, *args, **kwargs): self.progress_bar.print(*args, **kwargs) def get_res_file(self, fname: str) -> None: fname = os.path.join(self.cfg.decoding.results_path, fname) if self.data_parallel_world_size > 1: fname = f"{fname}.{self.data_parallel_rank}" return open(fname, "w", buffering=1) def merge_shards(self) -> None: """Merges all shard files into shard 0, then removes shard suffix.""" shard_id = self.data_parallel_rank num_shards = self.data_parallel_world_size if self.data_parallel_world_size > 1: def merge_shards_with_root(fname: str) -> None: fname = os.path.join(self.cfg.decoding.results_path, fname) logger.info("Merging %s on shard %d", fname, shard_id) base_fpath = Path(f"{fname}.0") with open(base_fpath, "a") as out_file: for s in range(1, num_shards): shard_fpath = Path(f"{fname}.{s}") with open(shard_fpath, "r") as in_file: for line in in_file: out_file.write(line) shard_fpath.unlink() shutil.move(f"{fname}.0", fname) dist.barrier() # ensure all shards finished writing if shard_id == (0 % num_shards): merge_shards_with_root("hypo.word") if shard_id == (1 % num_shards): merge_shards_with_root("hypo.units") if shard_id == (2 % num_shards): merge_shards_with_root("ref.word") if shard_id == (3 % num_shards): merge_shards_with_root("ref.units") dist.barrier() def optimize_model(self, model: FairseqModel) -> None: model.make_generation_fast_() if self.cfg.common.fp16: model.half() if not self.cfg.common.cpu: model.cuda() def load_model_ensemble(self) -> Tuple[List[FairseqModel], FairseqDataclass]: arg_overrides = ast.literal_eval(self.cfg.common_eval.model_overrides) models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(self.cfg.common_eval.path, separator="\\"), arg_overrides=arg_overrides, task=self.task, suffix=self.cfg.checkpoint.checkpoint_suffix, strict=(self.cfg.checkpoint.checkpoint_shard_count == 
1), num_shards=self.cfg.checkpoint.checkpoint_shard_count, ) for model in models: self.optimize_model(model) return models, saved_cfg def get_dataset_itr(self, disable_iterator_cache: bool = False) -> None: return self.task.get_batch_iterator( dataset=self.task.dataset(self.cfg.dataset.gen_subset), max_tokens=self.cfg.dataset.max_tokens, max_sentences=self.cfg.dataset.batch_size, max_positions=(sys.maxsize, sys.maxsize), ignore_invalid_inputs=self.cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=self.cfg.dataset.required_batch_size_multiple, seed=self.cfg.common.seed, num_shards=self.data_parallel_world_size, shard_id=self.data_parallel_rank, num_workers=self.cfg.dataset.num_workers, data_buffer_size=self.cfg.dataset.data_buffer_size, disable_iterator_cache=disable_iterator_cache, ).next_epoch_itr(shuffle=False) def build_progress_bar( self, epoch: Optional[int] = None, prefix: Optional[str] = None, default_log_format: str = "tqdm", ) -> BaseProgressBar: return progress_bar.progress_bar( iterator=self.get_dataset_itr(), log_format=self.cfg.common.log_format, log_interval=self.cfg.common.log_interval, epoch=epoch, prefix=prefix, tensorboard_logdir=self.cfg.common.tensorboard_logdir, default_log_format=default_log_format, ) @property def data_parallel_world_size(self): if self.cfg.distributed_training.distributed_world_size == 1: return 1 return distributed_utils.get_data_parallel_world_size() @property def data_parallel_rank(self): if self.cfg.distributed_training.distributed_world_size == 1: return 0 return distributed_utils.get_data_parallel_rank() def process_sentence( self, sample: Dict[str, Any], hypo: Dict[str, Any], sid: int, batch_id: int, ) -> Tuple[int, int]: speaker = None # Speaker can't be parsed from dataset. if "target_label" in sample: toks = sample["target_label"] else: toks = sample["target"] toks = toks[batch_id, :] # Processes hypothesis. 
hyp_pieces = self.tgt_dict.string(hypo["tokens"].int().cpu()) if "words" in hypo: hyp_words = " ".join(hypo["words"]) else: hyp_words = post_process(hyp_pieces, self.cfg.common_eval.post_process) # Processes target. target_tokens = utils.strip_pad(toks, self.tgt_dict.pad()) tgt_pieces = self.tgt_dict.string(target_tokens.int().cpu()) tgt_words = post_process(tgt_pieces, self.cfg.common_eval.post_process) if self.cfg.decoding.results_path is not None: print(f"{hyp_pieces} ({speaker}-{sid})", file=self.hypo_units_file) print(f"{hyp_words} ({speaker}-{sid})", file=self.hypo_words_file) print(f"{tgt_pieces} ({speaker}-{sid})", file=self.ref_units_file) print(f"{tgt_words} ({speaker}-{sid})", file=self.ref_words_file) print(f"{hypo['score'].item()} ({speaker}-{sid})", file=self.score_file) if not self.cfg.common_eval.quiet: logger.info(f"HYPO: {hyp_words}") logger.info(f"REF: {tgt_words}") logger.info("---------------------") hyp_words, tgt_words = hyp_words.split(), tgt_words.split() return editdistance.eval(hyp_words, tgt_words), len(tgt_words) def process_sample(self, sample: Dict[str, Any]) -> None: self.gen_timer.start() hypos = self.task.inference_step( generator=self.generator, models=self.models, sample=sample, ) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) self.gen_timer.stop(num_generated_tokens) self.wps_meter.update(num_generated_tokens) for batch_id, sample_id in enumerate(sample["id"].tolist()): errs, length = self.process_sentence( sample=sample, sid=sample_id, batch_id=batch_id, hypo=hypos[batch_id][0], ) self.total_errors += errs self.total_length += length self.log({"wps": round(self.wps_meter.avg)}) if "nsentences" in sample: self.num_sentences += sample["nsentences"] else: self.num_sentences += sample["id"].numel() def log_generation_time(self) -> None: logger.info( "Processed %d sentences (%d tokens) in %.1fs %.2f " "sentences per second, %.2f tokens per second)", self.num_sentences, self.gen_timer.n, self.gen_timer.sum, 
self.num_sentences / (self.gen_timer.sum + 1e-6), 1.0 / (self.gen_timer.avg + 1e-6), ) def parse_wer(wer_file: Path) -> float: with open(wer_file, "r") as f: return float(f.readline().strip().split(" ")[1]) def get_wer_file(cfg: InferConfig) -> Path: """Hashes the decoding parameters to a unique file ID.""" base_path = "wer" if cfg.decoding.results_path is not None: base_path = os.path.join(cfg.decoding.results_path, base_path) if cfg.decoding.unique_wer_file: yaml_str = OmegaConf.to_yaml(cfg.decoding) fid = int(hashlib.md5(yaml_str.encode("utf-8")).hexdigest(), 16) return Path(f"{base_path}.{fid % 1000000}") else: return Path(base_path) def main(cfg: InferConfig) -> float: """Entry point for main processing logic. Args: cfg: The inferance configuration to use. wer: Optional shared memory pointer for returning the WER. If not None, the final WER value will be written here instead of being returned. Returns: The final WER if `wer` is None, otherwise None. """ yaml_str, wer_file = OmegaConf.to_yaml(cfg.decoding), get_wer_file(cfg) # Validates the provided configuration. 
if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: cfg.dataset.max_tokens = 4000000 if not cfg.common.cpu and not torch.cuda.is_available(): raise ValueError("CUDA not found; set `cpu=True` to run without CUDA") logger.info(cfg.common_eval.path) with InferenceProcessor(cfg) as processor: for sample in processor: processor.process_sample(sample) processor.log_generation_time() if cfg.decoding.results_path is not None: processor.merge_shards() errs_t, leng_t = processor.total_errors, processor.total_length if cfg.common.cpu: logger.warning("Merging WER requires CUDA.") elif processor.data_parallel_world_size > 1: stats = torch.LongTensor([errs_t, leng_t]).cuda() dist.all_reduce(stats, op=dist.ReduceOp.SUM) errs_t, leng_t = stats[0].item(), stats[1].item() wer = errs_t * 100.0 / leng_t if distributed_utils.is_master(cfg.distributed_training): with open(wer_file, "w") as f: f.write( ( f"WER: {wer}\n" f"err / num_ref_words = {errs_t} / {leng_t}\n\n" f"{yaml_str}" ) ) return wer @hydra.main(config_path=config_path, config_name="infer") def hydra_main(cfg: InferConfig) -> Union[float, Tuple[float, Optional[float]]]: container = OmegaConf.to_container(cfg, resolve=True, enum_to_str=True) cfg = OmegaConf.create(container) OmegaConf.set_struct(cfg, True) if cfg.common.reset_logging: reset_logging() utils.import_user_module(cfg.common) # logger.info("Config:\n%s", OmegaConf.to_yaml(cfg)) wer = float("inf") try: if cfg.common.profile: with torch.cuda.profiler.profile(): with torch.autograd.profiler.emit_nvtx(): distributed_utils.call_main(cfg, main) else: distributed_utils.call_main(cfg, main) wer = parse_wer(get_wer_file(cfg)) except BaseException as e: # pylint: disable=broad-except if not cfg.common.suppress_crashes: raise else: logger.error("Crashed! 
%s", str(e)) logger.info("Word error rate: %.4f", wer) if cfg.is_ax: return wer, None return wer def cli_main() -> None: try: from hydra._internal.utils import ( get_args, ) # pylint: disable=import-outside-toplevel cfg_name = get_args().config_name or "infer" except ImportError: logger.warning("Failed to get config name from hydra args") cfg_name = "infer" cs = ConfigStore.instance() cs.store(name=cfg_name, node=InferConfig) for k in InferConfig.__dataclass_fields__: if is_dataclass(InferConfig.__dataclass_fields__[k].type): v = InferConfig.__dataclass_fields__[k].default cs.store(name=k, node=v) hydra_main() # pylint: disable=no-value-for-parameter if __name__ == "__main__": cli_main() ================================================ FILE: examples/speech_recognition/tasks/__init__.py ================================================ import importlib import os for file in sorted(os.listdir(os.path.dirname(__file__))): if file.endswith(".py") and not file.startswith("_"): task_name = file[: file.find(".py")] importlib.import_module("examples.speech_recognition.tasks." + task_name) ================================================ FILE: examples/speech_recognition/tasks/speech_recognition.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import json import os import re import sys import torch from examples.speech_recognition.data import AsrDataset from examples.speech_recognition.data.replabels import replabel_symbol from fairseq.data import Dictionary from fairseq.tasks import LegacyFairseqTask, register_task def get_asr_dataset_from_json(data_json_path, tgt_dict): """ Parse data json and create dataset. 
See scripts/asr_prep_json.py which pack json from raw files

    Json example:
    {
    "utts": {
        "4771-29403-0025": {
            "input": {
                "length_ms": 170,
                "path": "/tmp/file1.flac"
            },
            "output": {
                "text": "HELLO \n",
                "token": "HE LLO",
                "tokenid": "4815, 861"
            }
        },
        "1564-142299-0096": {
            ...
        }
    }
    """
    if not os.path.isfile(data_json_path):
        raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
    with open(data_json_path, "rb") as f:
        data_samples = json.load(f)["utts"]
        assert len(data_samples) != 0
        # Longest utterances first (by "length_ms").
        sorted_samples = sorted(
            data_samples.items(),
            key=lambda sample: int(sample[1]["input"]["length_ms"]),
            reverse=True,
        )
        aud_paths = [s[1]["input"]["path"] for s in sorted_samples]
        ids = [s[0] for s in sorted_samples]
        speakers = []
        for s in sorted_samples:
            # The speaker label is built from the first two "-"-separated
            # parts of the utterance id.
            # NOTE(review): ids without two "-" make `m` None and fail here.
            m = re.search("(.+?)-(.+?)-(.+?)", s[0])
            speakers.append(m.group(1) + "_" + m.group(2))
        frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples]
        # "tokenid" is a ", "-separated list of integer token ids.
        tgt = [
            [int(i) for i in s[1]["output"]["tokenid"].split(", ")]
            for s in sorted_samples
        ]
        # append eos
        tgt = [[*t, tgt_dict.eos()] for t in tgt]
        return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers)


@register_task("speech_recognition")
class SpeechRecognitionTask(LegacyFairseqTask):
    """
    Task for training speech recognition model.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument("data", help="path to data directory")
        parser.add_argument(
            "--silence-token", default="\u2581", help="token for silence (used by w2l)"
        )
        parser.add_argument(
            "--max-source-positions",
            default=sys.maxsize,
            type=int,
            metavar="N",
            help="max number of frames in the source sequence",
        )
        parser.add_argument(
            "--max-target-positions",
            default=1024,
            type=int,
            metavar="N",
            help="max number of tokens in the target sequence",
        )

    def __init__(self, args, tgt_dict):
        super().__init__(args)
        self.tgt_dict = tgt_dict

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries)."""
        dict_path = os.path.join(args.data, "dict.txt")
        if not os.path.isfile(dict_path):
            raise FileNotFoundError("Dict not found: {}".format(dict_path))
        tgt_dict = Dictionary.load(dict_path)

        # Extend the dictionary with the extra symbols the criterion needs.
        if args.criterion == "ctc_loss":
            tgt_dict.add_symbol("<ctc_blank>")
        elif args.criterion == "asg_loss":
            for i in range(1, args.max_replabel + 1):
                tgt_dict.add_symbol(replabel_symbol(i))

        print("| dictionary: {} types".format(len(tgt_dict)))
        return cls(args, tgt_dict)

    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        # The split is read from "<data>/<split>.json"; `combine` is unused.
        data_json_path = os.path.join(self.args.data, "{}.json".format(split))
        self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict)

    def build_generator(self, models, args, **unused):
        """Return the w2l decoder named by ``args.w2l_decoder``.

        Falls back to the parent class generator when the attribute is
        missing or names no known decoder.
        """
        w2l_decoder = getattr(args, "w2l_decoder", None)
        if w2l_decoder == "viterbi":
            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder

            return W2lViterbiDecoder(args, self.target_dictionary)
        elif w2l_decoder == "kenlm":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            return W2lKenLMDecoder(args, self.target_dictionary)
        elif w2l_decoder == "fairseqlm":
            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder

            return W2lFairseqLMDecoder(args, self.target_dictionary)
        else:
            return super().build_generator(models, args)

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.tgt_dict

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None

    def max_positions(self):
        """Return the max speech and sentence length allowed by the task."""
        return (self.args.max_source_positions, self.args.max_target_positions)


================================================
FILE: examples/speech_recognition/utils/wer_utils.py
================================================
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import re
from collections import deque
from enum import Enum

import numpy as np


"""
Utility modules for computation of Word Error Rate,
Alignments, as well as more granular metrics like
deletion, insersion and substitutions.
class Code(Enum):
    """Edit operation assigned to one step of an alignment path."""

    match = 1
    substitution = 2
    insertion = 3
    deletion = 4


class Token(object):
    """A labelled unit with a start and an end value.

    If ``st`` is NaN (the default) the token is reset to an empty label with
    both times set to 0.0, whatever ``lbl`` and ``en`` were.
    """

    def __init__(self, lbl="", st=np.nan, en=np.nan):
        if np.isnan(st):
            self.label, self.start, self.end = "", 0.0, 0.0
        else:
            self.label, self.start, self.end = lbl, st, en


class AlignmentResult(object):
    """Outcome of one alignment: three parallel deques plus the final score."""

    def __init__(self, refs, hyps, codes, score):
        self.refs = refs  # deque of reference indices, one per path step
        self.hyps = hyps  # deque of hypothesis indices, one per path step
        self.codes = codes  # deque of Code, one per path step
        self.score = score  # float: accumulated cost of the path


def coordinate_to_offset(row, col, ncols):
    """Flatten ``(row, col)`` into a row-major offset for ``ncols`` columns."""
    return int(row * ncols + col)


def offset_to_row(offset, ncols):
    """Return the row encoded in a row-major ``offset``."""
    # Floor division keeps the arithmetic exact for integer offsets.
    return int(offset // ncols)


def offset_to_col(offset, ncols):
    """Return the column encoded in a row-major ``offset``."""
    return int(offset % ncols)


def trimWhitespace(str):
    """Strip leading/trailing spaces and collapse runs of spaces to one."""
    return re.sub(" +", " ", re.sub(" *$", "", re.sub("^ *", "", str)))


def str2toks(str):
    """Split ``str`` on single spaces into ``Token`` objects with zero times.

    Note that an empty string still yields one token with an empty label,
    because ``"".split(" ")`` returns ``[""]``.
    """
    return [Token(piece, 0.0, 0.0) for piece in trimWhitespace(str).split(" ")]


class EditDistance(object):
    """Dynamic-programming aligner between reference and hypothesis tokens.

    With ``time_mediated`` the step costs are derived from token start/end
    values; otherwise fixed costs are used (match 0, insertion/deletion 3,
    substitution 4). Substitution pairs seen while back-tracing are counted
    in ``confusion_pairs_`` and accumulate across calls to :meth:`align`.
    """

    def __init__(self, time_mediated):
        self.time_mediated_ = time_mediated
        self.scores_ = np.nan  # replaced by a (rows, cols) float matrix in align()
        self.backtraces_ = np.nan  # replaced by a (rows, cols) int matrix in align()
        self.confusion_pairs_ = {}

    def cost(self, ref, hyp, code):
        """Return the cost of applying ``code`` to the pair ``(ref, hyp)``."""
        if self.time_mediated_:
            if code == Code.match:
                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end)
            elif code == Code.insertion:
                return hyp.end - hyp.start
            elif code == Code.deletion:
                return ref.end - ref.start
            else:  # substitution
                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + 0.1
        else:
            if code == Code.match:
                return 0
            elif code == Code.insertion or code == Code.deletion:
                return 3
            else:  # substitution
                return 4

    def get_result(self, refs, hyps):
        """Back-trace the filled matrices into an :class:`AlignmentResult`."""
        res = AlignmentResult(refs=deque(), hyps=deque(), codes=deque(), score=np.nan)

        num_rows, num_cols = self.scores_.shape
        res.score = self.scores_[num_rows - 1, num_cols - 1]

        # Walk from the bottom-right cell back to the origin (offset 0).
        curr_offset = coordinate_to_offset(num_rows - 1, num_cols - 1, num_cols)
        while curr_offset != 0:
            curr_row = offset_to_row(curr_offset, num_cols)
            curr_col = offset_to_col(curr_offset, num_cols)

            prev_offset = self.backtraces_[curr_row, curr_col]
            prev_row = offset_to_row(prev_offset, num_cols)
            prev_col = offset_to_col(prev_offset, num_cols)

            res.refs.appendleft(curr_row - 1)
            res.hyps.appendleft(curr_col - 1)

            if curr_row - 1 == prev_row and curr_col == prev_col:
                res.codes.appendleft(Code.deletion)
            elif curr_row == prev_row and curr_col - 1 == prev_col:
                res.codes.appendleft(Code.insertion)
            else:
                # Diagonal step: match or substitution depending on the labels.
                ref_str = refs[res.refs[0]].label
                hyp_str = hyps[res.hyps[0]].label

                if ref_str == hyp_str:
                    res.codes.appendleft(Code.match)
                else:
                    res.codes.appendleft(Code.substitution)
                    confusion_pair = "%s -> %s" % (ref_str, hyp_str)
                    self.confusion_pairs_[confusion_pair] = (
                        self.confusion_pairs_.get(confusion_pair, 0) + 1
                    )

            curr_offset = prev_offset

        return res

    def align(self, refs, hyps):
        """Align ``refs`` against ``hyps`` and return an AlignmentResult.

        Returns ``np.nan`` (not an AlignmentResult) when both sequences are
        empty; callers must check for that before using the result.
        """
        if len(refs) == 0 and len(hyps) == 0:
            return np.nan

        num_rows, num_cols = len(refs) + 1, len(hyps) + 1
        # Fresh matrices for every call. Back-pointers are flat cell offsets,
        # so they are kept in an integer matrix.
        self.scores_ = np.zeros((num_rows, num_cols))
        self.backtraces_ = np.zeros((num_rows, num_cols), dtype=np.int64)

        for i in range(num_rows):
            for j in range(num_cols):
                if i == 0 and j == 0:
                    self.scores_[i, j] = 0.0
                    self.backtraces_[i, j] = 0
                    continue

                if i == 0:
                    # First row: only insertions are possible.
                    self.scores_[i, j] = self.scores_[i, j - 1] + self.cost(
                        None, hyps[j - 1], Code.insertion
                    )
                    self.backtraces_[i, j] = coordinate_to_offset(i, j - 1, num_cols)
                    continue

                if j == 0:
                    # First column: only deletions are possible.
                    self.scores_[i, j] = self.scores_[i - 1, j] + self.cost(
                        refs[i - 1], None, Code.deletion
                    )
                    self.backtraces_[i, j] = coordinate_to_offset(i - 1, j, num_cols)
                    continue

                # Below here both i and j are greater than 0
                ref = refs[i - 1]
                hyp = hyps[j - 1]
                diagonal_code = (
                    Code.match if ref.label == hyp.label else Code.substitution
                )
                best_score = self.scores_[i - 1, j - 1] + self.cost(
                    ref, hyp, diagonal_code
                )
                prev_row, prev_col = i - 1, j - 1

                # Strict '<' keeps the tie-break order: diagonal, insertion,
                # deletion.
                ins = self.scores_[i, j - 1] + self.cost(None, hyp, Code.insertion)
                if ins < best_score:
                    best_score = ins
                    prev_row, prev_col = i, j - 1

                delt = self.scores_[i - 1, j] + self.cost(ref, None, Code.deletion)
                if delt < best_score:
                    best_score = delt
                    prev_row, prev_col = i - 1, j

                self.scores_[i, j] = best_score
                self.backtraces_[i, j] = coordinate_to_offset(
                    prev_row, prev_col, num_cols
                )

        return self.get_result(refs, hyps)


class WERTransformer(object):
    """Accumulate word-error statistics for one hypothesis/reference pair.

    The constructor immediately processes ``hyp_str`` against ``ref_str`` and,
    when ``verbose`` is truthy, prints the pair and a summary line.
    """

    def __init__(self, hyp_str, ref_str, verbose=True):
        self.ed_ = EditDistance(False)
        self.id2oracle_errs_ = {}
        self.utts_ = 0
        self.words_ = 0
        self.insertions_ = 0
        self.deletions_ = 0
        self.substitutions_ = 0

        self.process(["dummy_str", hyp_str, ref_str])

        if verbose:
            print("'%s' vs '%s'" % (hyp_str, ref_str))
            self.report_result()

    def process(self, input):
        """Align one ``[<id>, ..., <hypo>, <ref>]`` record and tally its errors.

        The reference length and the error counts are appended (as strings) to
        ``input`` itself. Returns 0 on success, ``None`` when ``input`` has
        fewer than three fields, and ``np.nan`` when no alignment is produced.
        """
        if len(input) < 3:
            print(
                "Input must be of the form <id> ... <hypo> <ref> , got ",
                len(input),
                " inputs:",
            )
            return None

        # Align
        hyps = str2toks(input[-2])
        refs = str2toks(input[-1])

        alignment = self.ed_.align(refs, hyps)
        # align() signals "nothing to align" with np.nan rather than None, so
        # accept anything that is not a real AlignmentResult as a null result.
        if not isinstance(alignment, AlignmentResult):
            print("Alignment is null")
            return np.nan

        # Tally errors
        ins = 0
        dels = 0
        subs = 0
        for code in alignment.codes:
            if code == Code.substitution:
                subs += 1
            elif code == Code.insertion:
                ins += 1
            elif code == Code.deletion:
                dels += 1

        # Output (appended to the caller's list, as before)
        row = input
        row.append(str(len(refs)))
        row.append(str(ins))
        row.append(str(dels))
        row.append(str(subs))

        # Accumulate
        kIdIndex = 0
        kNBestSep = "/"

        pieces = input[kIdIndex].split(kNBestSep)

        if len(pieces) == 0:
            print(
                "Error splitting ",
                input[kIdIndex],
                " on '",
                kNBestSep,
                "', got empty list",
            )
            return np.nan

        utt_id = pieces[0]
        if utt_id not in self.id2oracle_errs_:
            self.utts_ += 1
            self.words_ += len(refs)
            self.insertions_ += ins
            self.deletions_ += dels
            self.substitutions_ += subs
            self.id2oracle_errs_[utt_id] = [ins, dels, subs]
        else:
            # Same utterance id seen again: keep the entry with fewer errors.
            curr_err = ins + dels + subs
            prev_err = sum(self.id2oracle_errs_[utt_id])
            if curr_err < prev_err:
                self.id2oracle_errs_[utt_id] = [ins, dels, subs]

        return 0

    def _wer_percent(self):
        """Return 100 * (ins + dels + subs) / words; requires ``words_ > 0``."""
        return (
            100.0
            * (self.insertions_ + self.deletions_ + self.substitutions_)
            / self.words_
        )

    def report_result(self):
        """Print a one-line WER summary, or a notice if no words were counted."""
        if self.words_ == 0:
            print("No words counted")
            return

        # 1-best
        print(
            "\tWER = %0.2f%% (%i utts, %i words, %0.2f%% ins, "
            "%0.2f%% dels, %0.2f%% subs)"
            % (
                self._wer_percent(),
                self.utts_,
                self.words_,
                100.0 * self.insertions_ / self.words_,
                100.0 * self.deletions_ / self.words_,
                100.0 * self.substitutions_ / self.words_,
            )
        )

    def wer(self):
        """Return the WER in percent, or ``np.nan`` if no words were counted."""
        if self.words_ == 0:
            return np.nan
        return self._wer_percent()

    def stats(self):
        """Return the accumulated counters as a dict (empty if no words)."""
        if self.words_ == 0:
            return {}
        return {
            "wer": self._wer_percent(),
            "utts": self.utts_,
            "numwords": self.words_,
            "ins": self.insertions_,
            "dels": self.deletions_,
            "subs": self.substitutions_,
            "confusion_pairs": self.ed_.confusion_pairs_,
        }


def calc_wer(hyp_str, ref_str):
    """Return the WER (percent) of ``hyp_str`` against ``ref_str``."""
    t = WERTransformer(hyp_str, ref_str, verbose=0)
    return t.wer()


def calc_wer_stats(hyp_str, ref_str):
    """Return the statistics dict of ``hyp_str`` against ``ref_str``."""
    t = WERTransformer(hyp_str, ref_str, verbose=0)
    return t.stats()


def get_wer_alignment_codes(hyp_str, ref_str):
    """
    INPUT: hypothesis string, reference string
    OUTPUT: List of alignment codes (intermediate results from WER computation)
    """
    # Align once with a fresh aligner; building a WERTransformer first would
    # run the same alignment a second time for no benefit.
    aligner = EditDistance(False)
    return aligner.align(str2toks(ref_str), str2toks(hyp_str)).codes


def merge_counts(x, y):
    """Add the counts of ``y`` into ``x`` in place and return ``x``.

    Both arguments map keys to counts, e.g. confusion-pair counts:
    ``conf_pairs = merge_counts(conf_pairs, stats['confusion_pairs'])``.
    """
    for k, v in y.items():
        x[k] = x.get(k, 0) + v
    return x
class W2lDecoder(object):
    """Shared base of the flashlight-backed decoders.

    Resolves the blank and silence token ids from the target dictionary and
    implements emission extraction and token clean-up. ``decode`` is called by
    :meth:`generate` but is not defined here; subclasses provide it.
    """

    def __init__(self, args, tgt_dict):
        self.tgt_dict = tgt_dict
        self.vocab_size = len(tgt_dict)
        self.nbest = args.nbest

        # criterion-specific init
        self.criterion_type = CriterionType.CTC

        known_symbols = tgt_dict.indices

        # Blank: the dedicated CTC symbol if the dictionary has one, else BOS.
        if "<ctc_blank>" in known_symbols:
            self.blank = tgt_dict.index("<ctc_blank>")
        else:
            self.blank = tgt_dict.bos()

        # Silence: first of "<sep>", "|" found in the dictionary, else EOS.
        for candidate in ("<sep>", "|"):
            if candidate in known_symbols:
                self.silence = tgt_dict.index(candidate)
                break
        else:
            self.silence = tgt_dict.eos()

        self.asg_transitions = None

    def generate(self, models, sample, **unused):
        """Generate a batch of inferences."""
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, so that entry is dropped before calling the model.
        encoder_input = {}
        for key, value in sample["net_input"].items():
            if key == "prev_output_tokens":
                continue
            encoder_input[key] = value

        return self.decode(self.get_emissions(models, encoder_input))

    def get_emissions(self, models, encoder_input):
        """Run the first model and return its emissions as a CPU float tensor.

        Only ``models[0]`` is used. Raw logits are taken when the model offers
        ``get_logits``; otherwise log-probabilities are requested. The first
        two dimensions are swapped before returning.
        """
        model = models[0]
        encoder_out = model(**encoder_input)

        if not hasattr(model, "get_logits"):
            emissions = model.get_normalized_probs(encoder_out, log_probs=True)
        else:
            # no need to normalize emissions
            emissions = model.get_logits(encoder_out)

        swapped = emissions.transpose(0, 1)
        return swapped.float().cpu().contiguous()

    def get_tokens(self, idxs):
        """Normalize tokens by handling CTC blank, ASG replabels, etc."""
        # Collapse consecutive repeats first, then drop the blanks.
        kept = [tok for tok, _ in it.groupby(idxs) if tok != self.blank]
        return torch.LongTensor(kept)
self.trie.smear(SmearingMode.MAX) self.decoder_opts = LexiconDecoderOptions( beam_size=args.beam, beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))), beam_threshold=args.beam_threshold, lm_weight=args.lm_weight, word_score=args.word_score, unk_score=args.unk_weight, sil_score=args.sil_weight, log_add=False, criterion_type=self.criterion_type, ) if self.asg_transitions is None: N = 768 # self.asg_transitions = torch.FloatTensor(N, N).zero_() self.asg_transitions = [] self.decoder = LexiconDecoder( self.decoder_opts, self.trie, self.lm, self.silence, self.blank, self.unk_word, self.asg_transitions, self.unit_lm, ) else: assert args.unit_lm, "lexicon free decoding can only be done with a unit language model" from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions d = {w: [[w]] for w in tgt_dict.symbols} self.word_dict = create_word_dict(d) self.lm = KenLM(args.kenlm_model, self.word_dict) self.decoder_opts = LexiconFreeDecoderOptions( beam_size=args.beam, beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))), beam_threshold=args.beam_threshold, lm_weight=args.lm_weight, sil_score=args.sil_weight, log_add=False, criterion_type=self.criterion_type, ) self.decoder = LexiconFreeDecoder( self.decoder_opts, self.lm, self.silence, self.blank, [] ) def get_timesteps(self, token_idxs: List[int]) -> List[int]: """Returns frame numbers corresponding to every non-blank token. Parameters ---------- token_idxs : List[int] IDs of decoded tokens. Returns ------- List[int] Frame numbers corresponding to every non-blank token. 
""" timesteps = [] for i, token_idx in enumerate(token_idxs): if token_idx == self.blank: continue if i == 0 or token_idx != token_idxs[i-1]: timesteps.append(i) return timesteps def decode(self, emissions): B, T, N = emissions.size() hypos = [] for b in range(B): emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0) results = self.decoder.decode(emissions_ptr, T, N) nbest_results = results[: self.nbest] hypos.append( [ { "tokens": self.get_tokens(result.tokens), "score": result.score, "timesteps": self.get_timesteps(result.tokens), "words": [ self.word_dict.get_entry(x) for x in result.words if x >= 0 ], } for result in nbest_results ] ) return hypos FairseqLMState = namedtuple("FairseqLMState", ["prefix", "incremental_state", "probs"]) class FairseqLM(LM): def __init__(self, dictionary, model): LM.__init__(self) self.dictionary = dictionary self.model = model self.unk = self.dictionary.unk() self.save_incremental = False # this currently does not work properly self.max_cache = 20_000 model.cuda() model.eval() model.make_generation_fast_() self.states = {} self.stateq = deque() def start(self, start_with_nothing): state = LMState() prefix = torch.LongTensor([[self.dictionary.eos()]]) incremental_state = {} if self.save_incremental else None with torch.no_grad(): res = self.model(prefix.cuda(), incremental_state=incremental_state) probs = self.model.get_normalized_probs(res, log_probs=True, sample=None) if incremental_state is not None: incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state) self.states[state] = FairseqLMState( prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy() ) self.stateq.append(state) return state def score(self, state: LMState, token_index: int, no_cache: bool = False): """ Evaluate language model based on the current lm state and new word Parameters: ----------- state: current lm state token_index: index of the word (can be lexicon index then you should store inside LM the mapping between indices of 
class W2lFairseqLMDecoder(W2lDecoder):
    """Flashlight beam-search decoder scored by a fairseq language model.

    The language model is restored from the checkpoint at ``args.kenlm_model``
    (despite the argument name, it is read with ``torch.load``). With a lexicon
    a ``LexiconDecoder`` is built on a trie of word spellings; without one a
    ``LexiconFreeDecoder`` is built, which requires ``args.unit_lm``.
    """

    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

        self.unit_lm = getattr(args, "unit_lm", False)

        self.lexicon = load_words(args.lexicon) if args.lexicon else None
        # Only filled in unit-LM mode: lexicon position -> word string.
        self.idx_to_wrd = {}

        # NOTE(review): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        checkpoint = torch.load(args.kenlm_model, map_location="cpu")

        # Prefer the "cfg" entry; otherwise convert the "args" namespace.
        if "cfg" in checkpoint and checkpoint["cfg"] is not None:
            lm_args = checkpoint["cfg"]
        else:
            lm_args = convert_namespace_to_omegaconf(checkpoint["args"])

        # Point the LM task at the directory that holds the checkpoint.
        with open_dict(lm_args.task):
            lm_args.task.data = osp.dirname(args.kenlm_model)

        task = tasks.setup_task(lm_args.task)
        model = task.build_model(lm_args.model)
        model.load_state_dict(checkpoint["model"], strict=False)

        self.trie = Trie(self.vocab_size, self.silence)

        self.word_dict = task.dictionary
        self.unk_word = self.word_dict.unk()
        self.lm = FairseqLM(self.word_dict, model)

        if self.lexicon:
            start_state = self.lm.start(False)
            for i, (word, spellings) in enumerate(self.lexicon.items()):
                if self.unit_lm:
                    # Unit LM: the word id is its position in the lexicon and
                    # no LM score is attached to the trie entry.
                    word_idx = i
                    self.idx_to_wrd[i] = word
                    score = 0
                else:
                    word_idx = self.word_dict.index(word)
                    _, score = self.lm.score(start_state, word_idx, no_cache=True)

                for spelling in spellings:
                    spelling_idxs = [tgt_dict.index(token) for token in spelling]
                    # Every token of a spelling must exist in the target dict.
                    assert (
                        tgt_dict.unk() not in spelling_idxs
                    ), f"{spelling} {spelling_idxs}"
                    self.trie.insert(spelling_idxs, word_idx, score)
            self.trie.smear(SmearingMode.MAX)

            self.decoder_opts = LexiconDecoderOptions(
                beam_size=args.beam,
                beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
                beam_threshold=args.beam_threshold,
                lm_weight=args.lm_weight,
                word_score=args.word_score,
                unk_score=args.unk_weight,
                sil_score=args.sil_weight,
                log_add=False,
                criterion_type=self.criterion_type,
            )

            self.decoder = LexiconDecoder(
                self.decoder_opts,
                self.trie,
                self.lm,
                self.silence,
                self.blank,
                self.unk_word,
                [],
                self.unit_lm,
            )
        else:
            assert args.unit_lm, "lexicon free decoding can only be done with a unit language model"
            from flashlight.lib.text.decoder import LexiconFreeDecoder, LexiconFreeDecoderOptions

            # Each target symbol becomes a one-token "word".
            d = {w: [[w]] for w in tgt_dict.symbols}
            self.word_dict = create_word_dict(d)
            # NOTE(review): this replaces the FairseqLM built above with a
            # KenLM read from the same path, while decode() below calls
            # self.lm.empty_cache(); whether KenLM offers that method is not
            # visible here -- confirm this branch works as intended.
            self.lm = KenLM(args.kenlm_model, self.word_dict)
            self.decoder_opts = LexiconFreeDecoderOptions(
                beam_size=args.beam,
                beam_size_token=int(getattr(args, "beam_size_token", len(tgt_dict))),
                beam_threshold=args.beam_threshold,
                lm_weight=args.lm_weight,
                sil_score=args.sil_weight,
                log_add=False,
                criterion_type=self.criterion_type,
            )
            self.decoder = LexiconFreeDecoder(
                self.decoder_opts, self.lm, self.silence, self.blank, []
            )

    def decode(self, emissions):
        """Decode each batch entry and return its n-best hypotheses.

        Returns a list with one entry per batch element; each entry is a list
        of at most ``self.nbest`` dicts with ``"tokens"`` and ``"score"``, plus
        ``"words"`` when a lexicon is in use.
        """
        B, T, N = emissions.size()
        hypos = []

        def idx_to_word(idx):
            # Unit-LM ids index the lexicon; otherwise they index the LM dict.
            if self.unit_lm:
                return self.idx_to_wrd[idx]
            else:
                return self.word_dict[idx]

        def make_hypo(result):
            hypo = {"tokens": self.get_tokens(result.tokens), "score": result.score}
            if self.lexicon:
                # Negative word ids are skipped.
                hypo["words"] = [idx_to_word(x) for x in result.words if x >= 0]
            return hypo

        for b in range(B):
            # Raw address of batch entry b; the factor 4 presumably is the
            # byte size of a float32 element (W2lDecoder.get_emissions
            # returns a .float() tensor).
            emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
            results = self.decoder.decode(emissions_ptr, T, N)

            nbest_results = results[: self.nbest]
            hypos.append([make_hypo(result) for result in nbest_results])
            # Drop the LM state cache after every utterance.
            self.lm.empty_cache()

        return hypos
for less curated data - Automatic metrics for model development - Similar data configuration as [S2T](../speech_to_text/README.md) ## Examples - [Single-speaker synthesis on LJSpeech](docs/ljspeech_example.md) - [Multi-speaker synthesis on VCTK](docs/vctk_example.md) - [Multi-speaker synthesis on Common Voice](docs/common_voice_example.md) ## Citation Please cite as: ``` @article{wang2021fairseqs2, title={fairseq S\^{} 2: A Scalable and Integrable Speech Synthesis Toolkit}, author={Wang, Changhan and Hsu, Wei-Ning and Adi, Yossi and Polyak, Adam and Lee, Ann and Chen, Peng-Jen and Gu, Jiatao and Pino, Juan}, journal={arXiv preprint arXiv:2109.06912}, year={2021} } @inproceedings{ott2019fairseq, title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, year = {2019}, } ``` ================================================ FILE: examples/speech_synthesis/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. ================================================ FILE: examples/speech_synthesis/data_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def trim_or_pad_to_target_length(
        data_1d_or_2d: np.ndarray, target_length: int
) -> np.ndarray:
    """Force the first axis of a 1-D or 2-D array to ``target_length``.

    Longer inputs are cut at ``target_length``; shorter ones are extended with
    zeros from ``np.zeros`` (so the result of padding is a concatenation with
    float64 zeros).
    """
    shape = data_1d_or_2d.shape
    assert len(shape) in {1, 2}

    missing = target_length - shape[0]
    if missing <= 0:
        # Long enough already: keep only the leading part.
        return data_1d_or_2d[: target_length]

    # Zero block with the same trailing dimension (if any) as the input.
    pad_shape = (missing,) + tuple(shape[1:])
    return np.concatenate([data_1d_or_2d, np.zeros(pad_shape)], axis=0)
def extract_pitch(
        waveform: torch.Tensor, sample_rate: int,
        output_path: Optional[Path] = None, hop_length: int = 256,
        log_scale: bool = True, phoneme_durations: Optional[List[int]] = None
):
    """Estimate the pitch contour of ``waveform`` with PyWORLD.

    Nothing is done if ``output_path`` already exists. When
    ``phoneme_durations`` is given, the contour is fitted to their total
    length, zero values are filled by interpolation between the non-zero
    ones, and the contour is averaged over each duration span. With
    ``log_scale`` the result is ``log(pitch + 1)``.

    Returns the pitch array when ``output_path`` is ``None``; otherwise the
    array is saved there and ``None`` is returned. ``None`` is also returned
    (after printing a message) when fewer than two non-zero pitch values are
    available for interpolation.

    Raises:
        ImportError: if PyWORLD (or SciPy, when durations are given) is missing.
    """
    if output_path is not None and output_path.is_file():
        return

    try:
        import pyworld
    except ImportError:
        raise ImportError("Please install PyWORLD: pip install pyworld")

    samples = waveform.squeeze(0).double().numpy()
    frame_period_ms = hop_length / sample_rate * 1000
    coarse_pitch, times = pyworld.dio(
        samples, sample_rate, frame_period=frame_period_ms
    )
    pitch = pyworld.stonemask(samples, coarse_pitch, times, sample_rate)

    if phoneme_durations is not None:
        pitch = trim_or_pad_to_target_length(pitch, sum(phoneme_durations))
        try:
            from scipy.interpolate import interp1d
        except ImportError:
            raise ImportError("Please install SciPy: pip install scipy")

        voiced = np.where(pitch != 0)[0]
        if len(voiced) == 0:
            print((f"{output_path} has all empty values in the pitch contour"))
            return
        if len(voiced) == 1:
            print((f"{output_path} has only one non-zero values in the pitch contour"))
            return

        # Outside the non-zero range, hold the first / last non-zero value.
        edge_values = (pitch[voiced[0]], pitch[voiced[-1]])
        fill_gaps = interp1d(
            voiced,
            pitch[voiced],
            fill_value=edge_values,
            bounds_error=False,
        )
        pitch = fill_gaps(np.arange(0, len(pitch)))

        # Average the contour over each duration span.
        bounds = np.cumsum(np.concatenate([np.array([0]), phoneme_durations]))
        pitch = np.array(
            [np.mean(pitch[lo: hi]) for lo, hi in zip(bounds[:-1], bounds[1:])]
        )
        assert len(pitch) == len(phoneme_durations)

    if log_scale:
        pitch = np.log(pitch + 1)

    if output_path is None:
        return pitch
    np.save(output_path.as_posix(), pitch)
def get_global_cmvn(feature_root: Path, output_path: Optional[Path] = None):
    """Compute global mean and standard deviation over ``*.npy`` feature files.

    Every ``.npy`` file directly under ``feature_root`` is loaded, squeezed and
    treated as frames along axis 0. Sums and squared sums are accumulated
    across files, and the variance is floored at 1e-10 before the square root.

    Returns ``{"mean": ..., "std": ...}`` when ``output_path`` is ``None``;
    otherwise both arrays are written to ``output_path`` with ``np.savez`` and
    ``None`` is returned.

    Raises:
        ValueError: if no feature frames are found under ``feature_root``.
    """
    mean_x, mean_x2, n_frames = None, None, 0
    feature_paths = feature_root.glob("*.npy")
    for p in tqdm(feature_paths):
        with open(p, 'rb') as f:
            frames = np.load(f).squeeze()

        n_frames += frames.shape[0]

        cur_mean_x = frames.sum(axis=0)
        if mean_x is None:
            mean_x = cur_mean_x
        else:
            mean_x += cur_mean_x

        cur_mean_x2 = (frames ** 2).sum(axis=0)
        if mean_x2 is None:
            mean_x2 = cur_mean_x2
        else:
            mean_x2 += cur_mean_x2

    # Without this guard an empty directory fails below with an opaque
    # "NoneType /= int" TypeError, and zero frames would divide by zero.
    if mean_x is None or n_frames == 0:
        raise ValueError(f"No feature frames found in {feature_root}")

    mean_x /= n_frames
    mean_x2 /= n_frames
    var_x = mean_x2 - mean_x ** 2
    std_x = np.sqrt(np.maximum(var_x, 1e-10))

    if output_path is not None:
        with open(output_path, 'wb') as f:
            np.savez(f, mean=mean_x, std=std_x)
    else:
        return {"mean": mean_x, "std": std_x}
def get_mfa_alignment_by_sample_id(
        textgrid_zip_path: str, sample_id: str, sample_rate: int,
        hop_length: int, silence_phones: List[str] = ("sil", "sp", "spn")
) -> ForceAlignmentInfo:
    """Read one sample's TextGrid from a ZIP and convert it to frame durations.

    ``{sample_id}.TextGrid`` is extracted from ``textgrid_zip_path`` and its
    ``"phones"`` tier is walked in order. Leading and trailing phones listed in
    ``silence_phones`` are dropped; silences between other phones are kept.
    Each phone's duration in frames is
    ``round(end * r) - round(start * r)`` with ``r = sample_rate / hop_length``.

    Returns:
        ForceAlignmentInfo with the kept phones, their frame durations, the
        start time of the first kept phone and the end time of the last
        non-silence phone.

    Raises:
        ImportError: if the ``tgt`` package is not installed.
    """
    try:
        import tgt
    except ImportError:
        raise ImportError("Please install TextGridTools: pip install tgt")

    filename = f"{sample_id}.TextGrid"
    # Extract into a private scratch directory: a fixed name in the shared
    # temp dir can collide between concurrent workers, and the context manager
    # removes the file even if parsing raises.
    with tempfile.TemporaryDirectory() as scratch_dir:
        with zipfile.ZipFile(textgrid_zip_path) as f_zip:
            extracted_path = f_zip.extract(filename, path=scratch_dir)
        textgrid = tgt.io.read_textgrid(Path(extracted_path).as_posix())

    frames_per_sec = sample_rate / hop_length  # loop-invariant
    phones, frame_durations = [], []
    start_sec, end_sec, end_idx = 0, 0, 0
    for t in textgrid.get_tier_by_name("phones")._objects:
        s, e, p = t.start_time, t.end_time, t.text
        # Trim leading silences
        if len(phones) == 0:
            if p in silence_phones:
                continue
            else:
                start_sec = s
        phones.append(p)
        if p not in silence_phones:
            # Remember where the last non-silence phone ends.
            end_sec = e
            end_idx = len(phones)
        frame_durations.append(
            int(np.round(e * frames_per_sec) - np.round(s * frames_per_sec))
        )
    # Trim tailing silences
    phones = phones[:end_idx]
    frame_durations = frame_durations[:end_idx]

    return ForceAlignmentInfo(
        tokens=phones, frame_durations=frame_durations, start_sec=start_sec,
        end_sec=end_sec
    )
def get_feature_value_min_max(feature_paths: List[str]):
    """Return the smallest and largest value found across stored features.

    Each entry of ``feature_paths`` is resolved with ``parse_path`` into a file
    and a two-element slice, read with ``read_from_stored_zip``, checked to be
    NPY data and loaded with ``np.load``.

    Returns:
        ``(v_min, v_max)`` as Python floats. For an empty ``feature_paths``
        this is ``(inf, -inf)``.
    """
    # Seed with +/-inf so the first array always replaces both bounds. The
    # previous seeds (1e-8, -1e-8) capped the minimum at 1e-8 and floored the
    # maximum at -1e-8, hiding the true extremes of all-positive or
    # all-negative features.
    v_min, v_max = float("inf"), float("-inf")
    for p in tqdm(feature_paths):
        _path, slice_ptr = parse_path(p)
        assert len(slice_ptr) == 2
        byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
        assert is_npy_data(byte_data)
        path_or_fp = io.BytesIO(byte_data)
        features = np.load(path_or_fp).squeeze()
        v_min = min(v_min, features.min().item())
        v_max = max(v_max, features.max().item())
    return v_min, v_max
Create splits and generate audio manifests with ```bash python -m examples.speech_synthesis.preprocessing.get_common_voice_audio_manifest \ --data-root ${DATA_ROOT} \ --lang ${LANG_ID} \ --output-manifest-root ${AUDIO_MANIFEST_ROOT} --convert-to-wav ``` To denoise audio and trim leading/trailing silence using signal processing based VAD, run ```bash for SPLIT in dev test train; do python -m examples.speech_synthesis.preprocessing.denoise_and_vad_audio \ --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ --output-dir ${PROCESSED_DATA_ROOT} \ --denoise --vad --vad-agg-level 2 done ``` which generates a new audio TSV manifest under `${PROCESSED_DATA_ROOT}` with updated paths to the processed audio and a new column for SNR. To do filtering by CER, follow the [Automatic Evaluation](../docs/ljspeech_example.md#automatic-evaluation) section to run an ASR model (add `--eval-target` to `get_eval_manifest` for evaluation on the reference audio; add `--err-unit char` to `eval_asr` to compute CER instead of WER). The example-level CER is saved to `${EVAL_OUTPUT_ROOT}/asr/uer_char.${SPLIT}.tsv`. Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with ```bash python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ --audio-manifest-root ${PROCESSED_DATA_ROOT} \ --output-root ${FEATURE_MANIFEST_ROOT} \ --ipa-vocab --lang ${LANG_ID} \ --snr-threshold 15 \ --cer-threshold 0.1 --cer-tsv-path ${EVAL_OUTPUT_ROOT}/asr/uer_char.${SPLIT}.tsv ``` where we use phoneme inputs (`--ipa-vocab`) as an example. For sample filtering, we set the SNR and CER thresholds to 15 and 10%, respectively. ## Training (Please refer to [the LJSpeech example](../docs/ljspeech_example.md#transformer).) ## Inference (Please refer to [the LJSpeech example](../docs/ljspeech_example.md#inference).) ## Automatic Evaluation (Please refer to [the LJSpeech example](../docs/ljspeech_example.md#automatic-evaluation).)
## Results | Language | Speakers | --arch | Params | Test MCD | Model | |---|---|---|---|---|---| | English | 200 | tts_transformer | 54M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/cv4_en200_transformer_phn.tar) | [[Back]](..) ================================================ FILE: examples/speech_synthesis/docs/ljspeech_example.md ================================================ [[Back]](..) # LJSpeech [LJSpeech](https://keithito.com/LJ-Speech-Dataset) is a public domain TTS corpus with around 24 hours of English speech sampled at 22.05kHz. We provide examples for building [Transformer](https://arxiv.org/abs/1809.08895) and [FastSpeech 2](https://arxiv.org/abs/2006.04558) models on this dataset. ## Data preparation Download data, create splits and generate audio manifests with ```bash python -m examples.speech_synthesis.preprocessing.get_ljspeech_audio_manifest \ --output-data-root ${AUDIO_DATA_ROOT} \ --output-manifest-root ${AUDIO_MANIFEST_ROOT} ``` Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with ```bash python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ --audio-manifest-root ${AUDIO_MANIFEST_ROOT} \ --output-root ${FEATURE_MANIFEST_ROOT} \ --ipa-vocab --use-g2p ``` where we use phoneme inputs (`--ipa-vocab --use-g2p`) as example. FastSpeech 2 additionally requires frame durations, pitch and energy as auxiliary training targets. Add `--add-fastspeech-targets` to include these fields in the feature manifests. We get frame durations either from phoneme-level force-alignment or frame-level pseudo-text unit sequence. They should be pre-computed and specified via: - `--textgrid-zip ${TEXT_GRID_ZIP_PATH}` for a ZIP file, inside which there is one [TextGrid](https://www.fon.hum.uva.nl/praat/manual/TextGrid.html) file per sample to provide force-alignment info. 
- `--id-to-units-tsv ${ID_TO_UNIT_TSV}` for a TSV file, where there are 2 columns for sample ID and space-delimited pseudo-text unit sequence, respectively. For your convenience, we provide pre-computed [force-alignment](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_mfa.zip) from [Montreal Forced Aligner](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) and [pseudo-text units](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_hubert.tsv) from [HuBERT](https://github.com/pytorch/fairseq/tree/main/examples/hubert). You can also generate them yourself using different software or a different model. ## Training #### Transformer ```bash fairseq-train ${FEATURE_MANIFEST_ROOT} --save-dir ${SAVE_DIR} \ --config-yaml config.yaml --train-subset train --valid-subset dev \ --num-workers 4 --max-tokens 30000 --max-update 200000 \ --task text_to_speech --criterion tacotron2 --arch tts_transformer \ --clip-norm 5.0 --n-frames-per-step 4 --bce-pos-weight 5.0 \ --dropout 0.1 --attention-dropout 0.1 --activation-dropout 0.1 \ --encoder-normalize-before --decoder-normalize-before \ --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ --seed 1 --update-freq 8 --eval-inference --best-checkpoint-metric mcd_loss ``` where `SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU.
#### FastSpeech2 ```bash fairseq-train ${FEATURE_MANIFEST_ROOT} --save-dir ${SAVE_DIR} \ --config-yaml config.yaml --train-subset train --valid-subset dev \ --num-workers 4 --max-sentences 6 --max-update 200000 \ --task text_to_speech --criterion fastspeech2 --arch fastspeech2 \ --clip-norm 5.0 --n-frames-per-step 1 \ --dropout 0.1 --attention-dropout 0.1 \ --optimizer adam --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ --seed 1 --update-freq 8 --eval-inference --best-checkpoint-metric mcd_loss ``` ## Inference Average the last 5 checkpoints, generate the test split spectrogram and waveform using the default Griffin-Lim vocoder: ```bash SPLIT=test CHECKPOINT_NAME=avg_last_5 CHECKPOINT_PATH=${SAVE_DIR}/checkpoint_${CHECKPOINT_NAME}.pt python scripts/average_checkpoints.py --inputs ${SAVE_DIR} \ --num-epoch-checkpoints 5 \ --output ${CHECKPOINT_PATH} python -m examples.speech_synthesis.generate_waveform ${FEATURE_MANIFEST_ROOT} \ --config-yaml config.yaml --gen-subset ${SPLIT} --task text_to_speech \ --path ${CHECKPOINT_PATH} --max-tokens 50000 --spec-bwd-max-iter 32 \ --dump-waveforms ``` which dumps files (waveform, feature, attention plot, etc.) to `${SAVE_DIR}/generate-${CHECKPOINT_NAME}-${SPLIT}`. To re-synthesize target waveforms for automatic evaluation, add `--dump-target`. ## Automatic Evaluation To start with, generate the manifest for synthetic speech, which will be taken as inputs by evaluation scripts. ```bash python -m examples.speech_synthesis.evaluation.get_eval_manifest \ --generation-root ${SAVE_DIR}/generate-${CHECKPOINT_NAME}-${SPLIT} \ --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ --output-path ${EVAL_OUTPUT_ROOT}/eval.tsv \ --vocoder griffin_lim --sample-rate 22050 --audio-format flac \ --use-resynthesized-target ``` Speech recognition (ASR) models usually operate at lower sample rates (e.g. 16kHz). 
For the WER/CER metric, you may need to resample the audios accordingly --- add `--output-sample-rate 16000` for `generate_waveform.py` and use `--sample-rate 16000` for `get_eval_manifest.py`. #### WER/CER metric We use wav2vec 2.0 ASR model as example. [Download](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) the model checkpoint and dictionary, then compute WER/CER with ```bash python -m examples.speech_synthesis.evaluation.eval_asr \ --audio-header syn --text-header text --err-unit char --split ${SPLIT} \ --w2v-ckpt ${WAV2VEC2_CHECKPOINT_PATH} --w2v-dict-dir ${WAV2VEC2_DICT_DIR} \ --raw-manifest ${EVAL_OUTPUT_ROOT}/eval_16khz.tsv --asr-dir ${EVAL_OUTPUT_ROOT}/asr ``` #### MCD/MSD metric ```bash python -m examples.speech_synthesis.evaluation.eval_sp \ ${EVAL_OUTPUT_ROOT}/eval.tsv --mcd --msd ``` #### F0 metrics ```bash python -m examples.speech_synthesis.evaluation.eval_f0 \ ${EVAL_OUTPUT_ROOT}/eval.tsv --gpe --vde --ffe ``` ## Results | --arch | Params | Test MCD | Model | |---|---|---|---| | tts_transformer | 54M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_transformer_phn.tar) | | fastspeech2 | 41M | 3.8 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2/ljspeech_fastspeech2_phn.tar) | [[Back]](..) ================================================ FILE: examples/speech_synthesis/docs/vctk_example.md ================================================ [[Back]](..) # VCTK [VCTK](https://datashare.ed.ac.uk/handle/10283/3443) is an open English speech corpus. We provide examples for building [Transformer](https://arxiv.org/abs/1809.08895) models on this dataset. 
## Data preparation Download data, create splits and generate audio manifests with ```bash python -m examples.speech_synthesis.preprocessing.get_vctk_audio_manifest \ --output-data-root ${AUDIO_DATA_ROOT} \ --output-manifest-root ${AUDIO_MANIFEST_ROOT} ``` To denoise audio and trim leading/trailing silence using signal processing based VAD, run ```bash for SPLIT in dev test train; do python -m examples.speech_synthesis.preprocessing.denoise_and_vad_audio \ --audio-manifest ${AUDIO_MANIFEST_ROOT}/${SPLIT}.audio.tsv \ --output-dir ${PROCESSED_DATA_ROOT} \ --denoise --vad --vad-agg-level 3 done ``` which generates a new audio TSV manifest under `${PROCESSED_DATA_ROOT}` with updated paths to the processed audio and a new column for SNR. To do filtering by CER, follow the [Automatic Evaluation](../docs/ljspeech_example.md#automatic-evaluation) section to run an ASR model (add `--eval-target` to `get_eval_manifest` for evaluation on the reference audio; add `--err-unit char` to `eval_asr` to compute CER instead of WER). The example-level CER is saved to `${EVAL_OUTPUT_ROOT}/asr/uer_char.${SPLIT}.tsv`. Then, extract log-Mel spectrograms, generate feature manifest and create data configuration YAML with ```bash python -m examples.speech_synthesis.preprocessing.get_feature_manifest \ --audio-manifest-root ${PROCESSED_DATA_ROOT} \ --output-root ${FEATURE_MANIFEST_ROOT} \ --ipa-vocab --use-g2p \ --snr-threshold 15 \ --cer-threshold 0.1 --cer-tsv-path ${EVAL_OUTPUT_ROOT}/asr/uer_char.${SPLIT}.tsv ``` where we use phoneme inputs (`--ipa-vocab --use-g2p`) as an example. For sample filtering, we set the SNR and CER thresholds to 15 and 10%, respectively. ## Training (Please refer to [the LJSpeech example](../docs/ljspeech_example.md#transformer).) ## Inference (Please refer to [the LJSpeech example](../docs/ljspeech_example.md#inference).) ## Automatic Evaluation (Please refer to [the LJSpeech example](../docs/ljspeech_example.md#automatic-evaluation).)
def prepare_w2v_data(
        dict_dir, sample_rate, label, audio_paths, texts, split, data_dir
):
    """Lay out the files needed to decode ``split`` with the ASR model.

    Three files are produced inside ``data_dir`` (created if missing):

    * ``dict.{label}.txt`` - copied from ``dict_dir``;
    * ``{split}.tsv`` - a first line ``/`` followed by one
      ``<audio path>\\t<number of samples>`` line per audio file;
    * ``{split}.{label}`` - one ``preprocess_text``-normalised transcript
      per line.

    Args:
        dict_dir: directory (``Path``) containing ``dict.{label}.txt``.
        sample_rate: sample rate every audio file is required to have.
        label: label type, used in the dictionary and transcript file names.
        audio_paths: audio files to list in the manifest.
        texts: transcripts to normalise and write.
        split: name of the data split, used as file stem.
        data_dir: output directory (``Path``).

    Raises:
        AssertionError: if an audio file's sample rate differs from
            ``sample_rate``.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(
        dict_dir / f"dict.{label}.txt", data_dir / f"dict.{label}.txt"
    )
    with open(data_dir / f"{split}.tsv", "w") as f:
        f.write("/\n")
        for audio_path in audio_paths:
            # Only the header is needed: sf.info avoids decoding the whole
            # waveform just to learn its length and sample rate.
            info = sf.info(audio_path)
            # The message previously printed the literal text "sample_rate".
            assert info.samplerate == sample_rate, \
                f"{info.samplerate} != {sample_rate}"
            f.write(f"{audio_path}\t{info.frames}\n")
    with open(data_dir / f"{split}.{label}", "w") as f:
        for text in texts:
            f.write(f"{preprocess_text(text)}\n")
def compute_error_rate(hyp_wrd_path, ref_wrd_path, unit="word"):
    """Compute the per-utterance error rate between two decoder outputs.

    Each line of both files is "<text> (None-<index>)". The trailing
    parenthesised tag is stripped before tokenisation; the digits found on
    the hypothesis line are used as the utterance index.

    Args:
        hyp_wrd_path: path to the hypothesis file.
        ref_wrd_path: path to the reference file (same line order).
        unit: ``"word"`` to split on whitespace, ``"char"`` to compare
            character by character.

    Returns:
        dict mapping utterance index to
        ``edit_distance(hyp, ref) / len(ref)``. An empty reference is
        normalised by 1 so it yields the hypothesis length instead of
        raising ``ZeroDivisionError``.

    Raises:
        ValueError: if ``unit`` is neither ``"word"`` nor ``"char"``.
    """
    tokenize_line = {
        "word": lambda x: re.sub(r" \(.*\)$", "", x.rstrip()).split(),
        "char": lambda x: list(re.sub(r" \(.*\)$", "", x.rstrip()))
    }.get(unit)
    if tokenize_line is None:
        raise ValueError(f"{unit} not supported")

    # Read each file once and close it deterministically (the handles were
    # previously left for the garbage collector, and the hypothesis file was
    # opened twice).
    with open(hyp_wrd_path) as f:
        hyp_lines = f.readlines()
    with open(ref_wrd_path) as f:
        ref_lines = f.readlines()

    inds = [int(re.sub(r"\D*(\d*)\D*", r"\1", line)) for line in hyp_lines]
    hyps = [tokenize_line(line) for line in hyp_lines]
    refs = [tokenize_line(line) for line in ref_lines]
    assert len(hyps) == len(refs)
    err_rates = [
        editdistance.eval(hyp, ref) / max(len(ref), 1)
        for hyp, ref in zip(hyps, refs)
    ]
    return dict(zip(inds, err_rates))
def difference_function(x, n, tau_max):
    """
    Compute the difference function of a frame, i.e. for every lag ``tau``
    the sum of squared differences between the frame and itself shifted by
    ``tau``. The cross term is obtained with a zero-padded FFT
    autocorrelation, the energy terms with a running sum of squares.

    :param x: audio data
    :param n: length of data (not used by the computation)
    :param tau_max: integration window size (clamped to the frame length)
    :return: difference function
    :rtype: list
    """
    frame = np.array(x, np.float64)
    length = frame.size
    tau_max = min(tau_max, length)

    # energy[k] = sum of the first k squared samples
    energy = np.concatenate((np.array([0.]), (frame * frame).cumsum()))

    # Pick the smallest FFT length of the form m * 2**p (m from a fixed
    # list of factors) that can hold the frame plus the largest lag.
    required = length + tau_max
    power = 2 ** (required // 32).bit_length()
    fft_size = min(
        m * power
        for m in (16, 18, 20, 24, 25, 27, 30, 32)
        if m * power >= required
    )

    spectrum = np.fft.rfft(frame, fft_size)
    autocorr = np.fft.irfft(spectrum * spectrum.conjugate())[:tau_max]

    leading_energy = energy[length:length - tau_max:-1]
    return leading_energy + energy[length] - energy[:tau_max] - 2 * autocorr
https://github.com/NVIDIA/mellotron adaption of https://github.com/patriceguyot/Yin :param sig: Audio signal (list of float) :param sr: sampling rate (int) :param w_len: size of the analysis window (samples) :param w_step: size of the lag between two consecutives windows (samples) :param f0_min: Minimum fundamental frequency that can be detected (hertz) :param f0_max: Maximum fundamental frequency that can be detected (hertz) :param harmo_thresh: Threshold of detection. The yalgorithmù return the first minimum of the CMND function below this threshold. :returns: * pitches: list of fundamental frequencies, * harmonic_rates: list of harmonic rate values for each fundamental frequency value (= confidence value) * argmins: minimums of the Cumulative Mean Normalized DifferenceFunction * times: list of time of each estimation :rtype: tuple """ tau_min = int(sr / f0_max) tau_max = int(sr / f0_min) # time values for each analysis window time_scale = range(0, len(sig) - w_len, w_step) times = [t/float(sr) for t in time_scale] frames = [sig[t:t + w_len] for t in time_scale] pitches = [0.0] * len(time_scale) harmonic_rates = [0.0] * len(time_scale) argmins = [0.0] * len(time_scale) for i, frame in enumerate(frames): # Compute YIN df = difference_function(frame, w_len, tau_max) cm_df = cumulative_mean_normalized_difference_function(df, tau_max) p = get_pitch(cm_df, tau_min, tau_max, harmo_thresh) # Get results if np.argmin(cm_df) > tau_min: argmins[i] = float(sr / np.argmin(cm_df)) if p != 0: # A pitch was found pitches[i] = float(sr / p) harmonic_rates[i] = cm_df[p] else: # No pitch, but we compute a value of the harmonic rate harmonic_rates[i] = min(cm_df) return pitches, harmonic_rates, argmins, times def extract_f0(samples): f0_samples = [] for sample in tqdm.tqdm(samples): if not op.isfile(sample["ref"]) or not op.isfile(sample["syn"]): f0_samples.append(None) continue # assume single channel yref, sr = torchaudio.load(sample["ref"]) ysyn, _sr = 
def print_results(results, show_bin):
    """Print summary statistics of per-utterance F0 errors.

    Args:
        results: one entry per sample, either ``None`` (sample skipped) or
            a tuple ``(error, n_ref_frames, n_syn_frames)``.
        show_bin: if true, additionally print one table per
            reference-length bucket (bucket edges are in frames).

    ``None`` entries are ignored. If nothing remains, a short notice is
    printed instead of failing on the empty array.
    """
    np.set_printoptions(precision=3)
    valid = [r for r in results if r is not None]
    if not valid:
        # np.array([]) is 1-D, so the column indexing below would raise
        # IndexError when every sample was skipped.
        print(">>>> ALL")
        print("no valid results to report")
        return
    results = np.array(valid)

    def _print_result(results):
        res = {
            "nutt": len(results),
            "error": results[:, 0].mean(),
            "std": results[:, 0].std(),
            "dur_ref": int(results[:, 1].sum()),
            "dur_syn": int(results[:, 2].sum()),
        }
        print(tabulate([res.values()], res.keys(), floatfmt=".4f"))

    print(">>>> ALL")
    _print_result(results)

    if show_bin:
        edges = [0, 200, 400, 600, 800, 1000, 2000, 4000]
        for lo, hi in zip(edges, edges[1:]):
            # bucket by reference length: lo <= n_ref_frames < hi
            mask = np.logical_and(results[:, 1] >= lo, results[:, 1] < hi)
            if not mask.any():
                continue
            bin_results = results[mask]
            print(f">>>> ({lo}, {hi})")
            _print_result(bin_results)
def load_eval_spec(path):
    """Read the tab-separated evaluation spec at ``path``.

    The first line is taken as the header; every following line becomes one
    mapping from column name to cell value.

    :return: list of row mappings, in file order
    """
    with open(path) as spec_file:
        return [row for row in csv.DictReader(spec_file, delimiter='\t')]
def print_results(results, show_bin):
    """Print aggregated distortion statistics.

    Args:
        results: one entry per sample, either ``None`` (sample skipped) or
            a 6-tuple ``(distortion, n_ref_frames, n_syn_frames,
            path_length, n_insertions, n_deletions)``.
        show_bin: if true, additionally print one table per
            reference-length bucket (bucket edges are in frames).

    ``None`` entries are ignored. If nothing remains, a short notice is
    printed instead of failing on the empty array.
    """
    np.set_printoptions(precision=3)
    valid = [r for r in results if r is not None]
    if not valid:
        # With no rows, results.sum(axis=0) is a scalar and the 6-way
        # unpacking below would raise.
        print(">>>> ALL")
        print("no valid results to report")
        return
    results = np.array(valid)

    def _print_result(results):
        dist, dur_ref, dur_syn, dur_ali, nins, ndel = results.sum(axis=0)
        res = {
            "nutt": len(results),
            "dist": dist,
            "dur_ref": int(dur_ref),
            "dur_syn": int(dur_syn),
            "dur_ali": int(dur_ali),
            "dist_per_ref_frm": dist/dur_ref,
            "dist_per_syn_frm": dist/dur_syn,
            "dist_per_ali_frm": dist/dur_ali,
            "ins": nins/dur_ref,
            "del": ndel/dur_ref,
        }
        print(tabulate(
            [res.values()],
            res.keys(),
            floatfmt=".4f"
        ))

    print(">>>> ALL")
    _print_result(results)

    if show_bin:
        edges = [0, 200, 400, 600, 800, 1000, 2000, 4000]
        for lo, hi in zip(edges, edges[1:]):
            # bucket by reference length: lo <= n_ref_frames < hi
            mask = np.logical_and(results[:, 1] >= lo, results[:, 1] < hi)
            if not mask.any():
                continue
            bin_results = results[mask]
            print(f">>>> ({lo}, {hi})")
            _print_result(bin_results)
def main(args):
    """
    Write the evaluation spec: a TSV with columns
    `id syn ref text speaker`, one row per row of the audio manifest.

    `syn` points at the generated audio under the generation root (or at
    the manifest audio with `eval_target`); `ref` is the manifest audio
    (or the re-synthesized target with `use_resynthesized_target`).
    """
    ext = args.audio_format
    gen_root = Path(args.generation_root).resolve()
    columns = ("id", "syn", "ref", "text", "speaker")

    with open(args.audio_manifest) as f_in, \
            open(args.output_path, "w") as f_out:
        manifest = csv.DictReader(
            f_in, delimiter="\t", quotechar=None, doublequote=False,
            lineterminator="\n", quoting=csv.QUOTE_NONE
        )
        print("\t".join(columns), file=f_out)
        for row in manifest:
            wav_dir = f"{ext}_{args.sample_rate}hz_{args.vocoder}"
            utt_id = row["id"]
            wav_name = f"{utt_id}.{ext}"
            original_audio = row["audio"]

            if args.eval_target:
                syn = original_audio
            else:
                syn = (gen_root / wav_dir / wav_name).as_posix()

            if args.use_resynthesized_target:
                ref = (gen_root / f"{wav_dir}_tgt" / wav_name).as_posix()
            else:
                ref = original_audio

            fields = (utt_id, syn, ref, row["tgt_text"], row["speaker"])
            print("\t".join(fields), file=f_out)
    print(f"wrote evaluation file to {args.output_path}")
def make_parser():
    """Return the speech-generation parser extended with this script's
    own options: switches choosing which artifacts to dump, the output
    sample rate, teacher forcing, and the audio container format."""
    parser = options.get_speech_generation_parser()

    # boolean switches, one per kind of artifact that can be written
    dump_switches = (
        "--dump-features",
        "--dump-waveforms",
        "--dump-attentions",
        "--dump-eos-probs",
        "--dump-plots",
        "--dump-target",
    )
    for switch in dump_switches:
        parser.add_argument(switch, action="store_true")

    parser.add_argument("--output-sample-rate", default=22050, type=int)
    parser.add_argument("--teacher-forcing", action="store_true")
    parser.add_argument(
        "--audio-format", type=str, default="wav", choices=["wav", "flac"]
    )
    return parser
def dump_result(
        is_na_model,
        args,
        vocoder,
        sample_id,
        text,
        attn,
        eos_prob,
        feat_pred,
        wave_pred,
        feat_targ,
        wave_targ,
):
    """Write the requested artifacts of one generated sample to disk.

    Every output goes to its own sub-directory of ``args.results_path`` and
    is named after ``sample_id``:

    * ``args.dump_features``: ``feat/`` (plus ``feat_tgt/`` with
      ``args.dump_target``), saved as ``.npy``;
    * ``args.dump_attentions``: ``attn/``, saved as ``.npy``;
    * ``args.dump_eos_probs`` (autoregressive models only): ``eos/``;
    * ``args.dump_plots``: ``plot/``, a PNG of the predicted (and, if
      available, target) features with the alignment / EOS probability;
    * ``args.dump_waveforms``:
      ``{format}_{rate}hz_{vocoder}/`` (plus a ``_tgt`` sibling with
      ``args.dump_target``), written with ``soundfile``.

    Args:
        is_na_model: whether the model is non-autoregressive.
        args: parsed command-line options (``dump_*`` flags,
            ``results_path``, ``output_sample_rate``, ``audio_format``).
        vocoder: vocoder name, used in the waveform directory name.
        sample_id: identifier used as file stem.
        text: source text, used as plot title.
        attn: attention/alignment matrix.
        eos_prob: end-of-sequence probabilities (may be ``None``).
        feat_pred: predicted features.
        wave_pred: predicted waveform (may be ``None``).
        feat_targ: target features (``None`` unless targets are dumped).
        wave_targ: target waveform (``None`` unless targets are dumped).
    """
    sample_rate = args.output_sample_rate
    out_root = Path(args.results_path)
    if args.dump_features:
        feat_dir = out_root / "feat"
        feat_dir.mkdir(exist_ok=True, parents=True)
        np.save(feat_dir / f"{sample_id}.npy", feat_pred)
        if args.dump_target:
            feat_tgt_dir = out_root / "feat_tgt"
            feat_tgt_dir.mkdir(exist_ok=True, parents=True)
            np.save(feat_tgt_dir / f"{sample_id}.npy", feat_targ)
    if args.dump_attentions:
        attn_dir = out_root / "attn"
        attn_dir.mkdir(exist_ok=True, parents=True)
        # `attn` has already been converted to a NumPy array by
        # postprocess_results; the former `attn.numpy()` call raised
        # AttributeError on it. np.save takes the array as is.
        np.save(attn_dir / f"{sample_id}.npy", attn)
    if args.dump_eos_probs and not is_na_model:
        eos_dir = out_root / "eos"
        eos_dir.mkdir(exist_ok=True, parents=True)
        np.save(eos_dir / f"{sample_id}.npy", eos_prob)

    if args.dump_plots:
        images = [feat_pred.T] if is_na_model else [feat_pred.T, attn]
        names = ["output"] if is_na_model else ["output", "alignment"]
        if feat_targ is not None:
            images = [feat_targ.T] + images
            names = [f"target (idx={sample_id})"] + names
        if is_na_model:
            plot_tts_output(images, names, attn, "alignment", suptitle=text)
        else:
            plot_tts_output(images, names, eos_prob, "eos prob", suptitle=text)
        plot_dir = out_root / "plot"
        plot_dir.mkdir(exist_ok=True, parents=True)
        plt.savefig(plot_dir / f"{sample_id}.png")
        plt.close()

    if args.dump_waveforms:
        ext = args.audio_format
        if wave_pred is not None:
            wav_dir = out_root / f"{ext}_{sample_rate}hz_{vocoder}"
            wav_dir.mkdir(exist_ok=True, parents=True)
            sf.write(wav_dir / f"{sample_id}.{ext}", wave_pred, sample_rate)
        if args.dump_target and wave_targ is not None:
            wav_tgt_dir = out_root / f"{ext}_{sample_rate}hz_{vocoder}_tgt"
            wav_tgt_dir.mkdir(exist_ok=True, parents=True)
            sf.write(wav_tgt_dir / f"{sample_id}.{ext}", wave_targ, sample_rate)
shard_id=args.shard_id, num_workers=args.num_workers, data_buffer_size=args.data_buffer_size, ).next_epoch_itr(shuffle=False) Path(args.results_path).mkdir(exist_ok=True, parents=True) is_na_model = getattr(model, "NON_AUTOREGRESSIVE", False) dataset = task.dataset(args.gen_subset) vocoder = task.args.vocoder with progress_bar.build_progress_bar(args, itr) as t: for sample in t: sample = utils.move_to_cuda(sample) if use_cuda else sample hypos = generator.generate(model, sample, has_targ=args.dump_target) for result in postprocess_results( dataset, sample, hypos, resample_fn, args.dump_target ): dump_result(is_na_model, args, vocoder, *result) def cli_main(): parser = make_parser() args = options.parse_args_and_arch(parser) main(args) if __name__ == "__main__": cli_main() ================================================ FILE: examples/speech_synthesis/preprocessing/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. ================================================ FILE: examples/speech_synthesis/preprocessing/denoise_and_vad_audio.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def generate_tmp_filename(extension="txt"):
    """Return a unique path in the system temp directory; the file is not created.

    Args:
        extension: file extension to append, without the leading dot.

    Returns:
        str: ``<tempdir>/<random hex>.<extension>``.
    """
    # Local import keeps this block self-contained; only public stdlib APIs
    # are used instead of the private ``tempfile._get_*`` helpers.
    import uuid

    return os.path.join(
        tempfile.gettempdir(), f"{uuid.uuid4().hex}.{extension}"
    )


def convert_sr(inpath, sr, output_path=None):
    """Resample an audio file to ``sr`` Hz with the ``sox`` command-line tool.

    Args:
        inpath: path of the audio file to convert.
        sr: target sample rate in Hz.
        output_path: destination path; a temporary ``.wav`` path is generated
            when it is empty or ``None``.

    Returns:
        The path the converted audio was written to.

    Raises:
        FileNotFoundError: if the ``sox`` executable cannot be found.
        subprocess.CalledProcessError: if ``sox`` exits with a non-zero status.
    """
    import subprocess

    if not output_path:
        output_path = generate_tmp_filename("wav")
    # Pass an argument list (no shell) so paths containing spaces or shell
    # metacharacters are handled literally, and fail loudly instead of
    # returning a path that was never written.
    subprocess.run(
        ["sox", str(inpath), "-r", str(sr), str(output_path)], check=True
    )
    return output_path
def process(args):
    """Denoise and/or VAD-trim every audio file of a TSV manifest.

    Reads ``args.audio_manifest`` (tab-separated; the columns ``id``,
    ``audio``, ``n_frames``, ``tgt_text``, ``speaker`` and ``src_text`` are
    read), writes processed audio under ``args.output_dir`` in the
    sub-directories named in ``PATHS``, and saves a new manifest with an extra
    ``snr`` column to ``args.output_dir`` under the input manifest's file name.
    Returns early (after logging an error) when neither ``args.denoise`` nor
    ``args.vad`` is set.
    """
    # making sure we are requested either denoise or vad
    if not args.denoise and not args.vad:
        log.error("No denoise or vad is requested.")
        return

    log.info("Creating out directories...")
    if args.denoise:
        out_denoise = Path(args.output_dir).absolute().joinpath(PATHS[0])
        out_denoise.mkdir(parents=True, exist_ok=True)
    if args.vad:
        out_vad = Path(args.output_dir).absolute().joinpath(PATHS[1])
        out_vad.mkdir(parents=True, exist_ok=True)

    # NOTE: both models are built even when only one stage is requested.
    log.info("Loading pre-trained speech enhancement model...")
    model = master64().to(args.device)

    log.info("Building the VAD model...")
    vad = webrtcvad.Vad(int(args.vad_agg_level))

    # preparing the output dict
    output_dict = defaultdict(list)

    log.info(f"Parsing input manifest: {args.audio_manifest}")
    with open(args.audio_manifest, "r") as f:
        manifest_dict = csv.DictReader(f, delimiter="\t")
        for row in tqdm(manifest_dict):
            filename = str(row["audio"])

            # final_output tracks the most recent version of this sample so
            # the VAD stage consumes the denoised file when both are enabled.
            final_output = filename
            keep_sample = True
            n_frames = row["n_frames"]
            # -1 is stored in the manifest when denoising was not applied.
            snr = -1
            if args.denoise:
                output_path_denoise = out_denoise.joinpath(Path(filename).name)
                # convert to 16khz in case we use a different sr
                tmp_path = convert_sr(final_output, 16000)

                # loading audio file and generating the enhanced version
                out, sr = torchaudio.load(tmp_path)
                out = out.to(args.device)
                estimate = model(out)
                # blend: dry_wet is the weight of the original (noisy) signal
                estimate = (1 - args.dry_wet) * estimate + args.dry_wet * out
                write(estimate[0], str(output_path_denoise), sr)

                snr = utils.cal_snr(out, estimate)
                snr = snr.cpu().detach().numpy()[0][0]
                final_output = str(output_path_denoise)

            if args.vad:
                output_path_vad = out_vad.joinpath(Path(filename).name)
                sr = torchaudio.info(final_output).sample_rate
                # Only 16/32/48 kHz inputs are passed to the VAD unchanged;
                # any other rate is first converted up to the next of those
                # rates (or to 48 kHz when it is above 32 kHz).
                if sr in [16000, 32000, 48000]:
                    tmp_path = final_output
                elif sr < 16000:
                    tmp_path = convert_sr(final_output, 16000)
                elif sr < 32000:
                    tmp_path = convert_sr(final_output, 32000)
                else:
                    tmp_path = convert_sr(final_output, 48000)
                # apply VAD
                segment, sample_rate = apply_vad(vad, tmp_path)
                # NOTE(review): apply_vad returns b''.join(...), so
                # len(segment) counts bytes rather than samples — confirm the
                # MIN_T comparison below is meant to be in bytes.
                if len(segment) < sample_rate * MIN_T:
                    keep_sample = False
                    print((
                        f"WARNING: skip {filename} because it is too short "
                        f"after VAD ({len(segment) / sample_rate} < {MIN_T})"
                    ))
                else:
                    if sample_rate != sr:
                        # VAD ran at a converted rate: write to a temp file,
                        # then convert back to the original sample rate.
                        tmp_path = generate_tmp_filename("wav")
                        write_wave(tmp_path, segment, sample_rate)
                        convert_sr(tmp_path, sr,
                                   output_path=str(output_path_vad))
                    else:
                        write_wave(str(output_path_vad), segment, sample_rate)
                    final_output = str(output_path_vad)
                    # Re-read the written file to get the new frame count.
                    segment, _ = torchaudio.load(final_output)
                    n_frames = segment.size(1)

            if keep_sample:
                output_dict["id"].append(row["id"])
                output_dict["audio"].append(final_output)
                output_dict["n_frames"].append(n_frames)
                output_dict["tgt_text"].append(row["tgt_text"])
                output_dict["speaker"].append(row["speaker"])
                output_dict["src_text"].append(row["src_text"])
                output_dict["snr"].append(snr)

        out_tsv_path = Path(args.output_dir) / Path(args.audio_manifest).name
        log.info(f"Saving manifest to {out_tsv_path.as_posix()}")
        save_df_to_tsv(pd.DataFrame.from_dict(output_dict), out_tsv_path)
class BLSTM(nn.Module):
    """LSTM stack whose output is projected back to ``dim`` when bidirectional.

    Args:
        dim (int): input and hidden size of the LSTM.
        layers (int): number of stacked LSTM layers.
        bi (bool): if true, use a bidirectional LSTM followed by a linear
            layer mapping ``2 * dim`` features back to ``dim``.
    """

    def __init__(self, dim, layers=2, bi=True):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=dim,
            hidden_size=dim,
            num_layers=layers,
            bidirectional=bi,
        )
        # The projection only exists for the bidirectional variant.
        self.linear = nn.Linear(2 * dim, dim) if bi else None

    def forward(self, x, hidden=None):
        """Run the LSTM and return ``(output, new_hidden_state)``."""
        out, state = self.lstm(x, hidden)
        if self.linear is None:
            return out, state
        return self.linear(out), state
- hidden (int): number of initial hidden channels. - depth (int): number of layers. - kernel_size (int): kernel size for each layer. - stride (int): stride for each layer. - causal (bool): if false, uses BiLSTM instead of LSTM. - resample (int): amount of resampling to apply to the input/output. Can be one of 1, 2 or 4. - growth (float): number of channels is multiplied by this for every layer. - max_hidden (int): maximum number of channels. Can be useful to control the size/speed of the model. - normalize (bool): if true, normalize the input. - glu (bool): if true uses GLU instead of ReLU in 1x1 convolutions. - rescale (float): controls custom weight initialization. See https://arxiv.org/abs/1911.13254. - floor (float): stability flooring when normalizing. """ @capture_init def __init__(self, chin=1, chout=1, hidden=48, depth=5, kernel_size=8, stride=4, causal=True, resample=4, growth=2, max_hidden=10_000, normalize=True, glu=True, rescale=0.1, floor=1e-3): super().__init__() if resample not in [1, 2, 4]: raise ValueError("Resample should be 1, 2 or 4.") self.chin = chin self.chout = chout self.hidden = hidden self.depth = depth self.kernel_size = kernel_size self.stride = stride self.causal = causal self.floor = floor self.resample = resample self.normalize = normalize self.encoder = nn.ModuleList() self.decoder = nn.ModuleList() activation = nn.GLU(1) if glu else nn.ReLU() ch_scale = 2 if glu else 1 for index in range(depth): encode = [] encode += [ nn.Conv1d(chin, hidden, kernel_size, stride), nn.ReLU(), nn.Conv1d(hidden, hidden * ch_scale, 1), activation, ] self.encoder.append(nn.Sequential(*encode)) decode = [] decode += [ nn.Conv1d(hidden, ch_scale * hidden, 1), activation, nn.ConvTranspose1d(hidden, chout, kernel_size, stride), ] if index > 0: decode.append(nn.ReLU()) self.decoder.insert(0, nn.Sequential(*decode)) chout = hidden chin = hidden hidden = min(int(growth * hidden), max_hidden) self.lstm = BLSTM(chin, bi=not causal) if rescale: 
def fast_conv(conv, x):
    """
    Evaluate a 1d convolution on a single-item batch, using one matrix
    product when the kernel size is 1 or the input is exactly one kernel long.

    The matrix-product paths only read ``conv.weight`` and ``conv.bias``;
    every other input shape falls back to calling ``conv`` itself.
    """
    n_batch, _, n_steps = x.shape
    # Channel count is taken from the weight, as in the layer definition.
    n_out, n_in, width = conv.weight.shape
    assert n_batch == 1
    if width == 1 or n_steps == width:
        # Flatten (channels, kernel) into one axis; the result has one column
        # per time step for 1x1 kernels and a single column otherwise.
        flat = n_in * width
        columns = n_steps if width == 1 else 1
        result = th.addmm(
            conv.bias.view(-1, 1),
            conv.weight.view(n_out, flat),
            x.view(flat, columns),
        )
    else:
        result = conv(x)
    return result.view(n_batch, n_out, -1)
""" def __init__(self, demucs, dry=0, num_frames=1, resample_lookahead=64, resample_buffer=256): device = next(iter(demucs.parameters())).device self.demucs = demucs self.lstm_state = None self.conv_state = None self.dry = dry self.resample_lookahead = resample_lookahead resample_buffer = min(demucs.total_stride, resample_buffer) self.resample_buffer = resample_buffer self.frame_length = demucs.valid_length(1) + \ demucs.total_stride * (num_frames - 1) self.total_length = self.frame_length + self.resample_lookahead self.stride = demucs.total_stride * num_frames self.resample_in = th.zeros(demucs.chin, resample_buffer, device=device) self.resample_out = th.zeros( demucs.chin, resample_buffer, device=device ) self.frames = 0 self.total_time = 0 self.variance = 0 self.pending = th.zeros(demucs.chin, 0, device=device) bias = demucs.decoder[0][2].bias weight = demucs.decoder[0][2].weight chin, chout, kernel = weight.shape self._bias = bias.view(-1, 1).repeat(1, kernel).view(-1, 1) self._weight = weight.permute(1, 2, 0).contiguous() def reset_time_per_frame(self): self.total_time = 0 self.frames = 0 @property def time_per_frame(self): return self.total_time / self.frames def flush(self): """ Flush remaining audio by padding it with zero. Call this when you have no more input and want to get back the last chunk of audio. """ pending_length = self.pending.shape[1] padding = th.zeros( self.demucs.chin, self.total_length, device=self.pending.device ) out = self.feed(padding) return out[:, :pending_length] def feed(self, wav): """ Apply the model to mix using true real time evaluation. Normalization is done online as is the resampling. 
""" begin = time.time() demucs = self.demucs resample_buffer = self.resample_buffer stride = self.stride resample = demucs.resample if wav.dim() != 2: raise ValueError("input wav should be two dimensional.") chin, _ = wav.shape if chin != demucs.chin: raise ValueError(f"Expected {demucs.chin} channels, got {chin}") self.pending = th.cat([self.pending, wav], dim=1) outs = [] while self.pending.shape[1] >= self.total_length: self.frames += 1 frame = self.pending[:, :self.total_length] dry_signal = frame[:, :stride] if demucs.normalize: mono = frame.mean(0) variance = (mono**2).mean() self.variance = variance / self.frames + \ (1 - 1 / self.frames) * self.variance frame = frame / (demucs.floor + math.sqrt(self.variance)) frame = th.cat([self.resample_in, frame], dim=-1) self.resample_in[:] = frame[:, stride - resample_buffer:stride] if resample == 4: frame = upsample2(upsample2(frame)) elif resample == 2: frame = upsample2(frame) # remove pre sampling buffer frame = frame[:, resample * resample_buffer:] # remove extra samples after window frame = frame[:, :resample * self.frame_length] out, extra = self._separate_frame(frame) padded_out = th.cat([self.resample_out, out, extra], 1) self.resample_out[:] = out[:, -resample_buffer:] if resample == 4: out = downsample2(downsample2(padded_out)) elif resample == 2: out = downsample2(padded_out) else: out = padded_out out = out[:, resample_buffer // resample:] out = out[:, :stride] if demucs.normalize: out *= math.sqrt(self.variance) out = self.dry * dry_signal + (1 - self.dry) * out outs.append(out) self.pending = self.pending[:, stride:] self.total_time += time.time() - begin if outs: out = th.cat(outs, 1) else: out = th.zeros(chin, 0, device=wav.device) return out def _separate_frame(self, frame): demucs = self.demucs skips = [] next_state = [] first = self.conv_state is None stride = self.stride * demucs.resample x = frame[None] for idx, encode in enumerate(demucs.encoder): stride //= demucs.stride length = x.shape[2] if 
def test():
    """Benchmark the streaming Demucs against the offline model.

    Builds a randomly initialised ``Demucs`` from the command-line options,
    runs 4 seconds of random audio through both the offline model and a
    ``DemucsStreamer``, and prints the model size, the relative difference
    between both outputs, and latency / real-time-factor figures.
    """
    import argparse
    parser = argparse.ArgumentParser(
        "denoiser.demucs",
        description="Benchmark the streaming Demucs implementation, as well as "
                    "checking the delta with the offline implementation.")
    parser.add_argument("--depth", default=5, type=int)
    parser.add_argument("--resample", default=4, type=int)
    parser.add_argument("--hidden", default=48, type=int)
    parser.add_argument("--sample_rate", default=16000, type=float)
    parser.add_argument("--device", default="cpu")
    parser.add_argument("-t", "--num_threads", type=int)
    parser.add_argument("-f", "--num_frames", type=int, default=1)
    args = parser.parse_args()
    if args.num_threads:
        th.set_num_threads(args.num_threads)
    sr = args.sample_rate
    sr_ms = sr / 1000  # samples per millisecond
    demucs = Demucs(
        depth=args.depth, hidden=args.hidden, resample=args.resample
    ).to(args.device)
    x = th.randn(1, int(sr * 4)).to(args.device)
    out = demucs(x[None])[0]
    streamer = DemucsStreamer(demucs, num_frames=args.num_frames)
    out_rt = []
    # The first call must fill a whole window; later calls feed one stride.
    frame_size = streamer.total_length
    with th.no_grad():
        while x.shape[1] > 0:
            out_rt.append(streamer.feed(x[:, :frame_size]))
            x = x[:, frame_size:]
            frame_size = streamer.demucs.total_stride

    out_rt.append(streamer.flush())
    out_rt = th.cat(out_rt, 1)
    model_size = sum(p.numel() for p in demucs.parameters()) * 4 / 2**20
    initial_lag = streamer.total_length / sr_ms
    tpf = 1000 * streamer.time_per_frame
    print(f"model size: {model_size:.1f}MB, ", end='')
    print(f"delta batch/streaming: {th.norm(out - out_rt) / th.norm(out):.2%}")
    print(f"initial lag: {initial_lag:.1f}ms, ", end='')
    # streamer.stride already includes the num_frames factor, so it must not
    # be multiplied by args.num_frames again (the RTF below uses it as is).
    print(f"stride: {streamer.stride / sr_ms:.1f}ms")
    print(f"time per frame: {tpf:.1f}ms, ", end='')
    rtf = tpf / (streamer.stride / sr_ms)
    print(f"RTF: {rtf:.2f}")
    print(f"Total lag with computation: {initial_lag + tpf:.1f}ms")
def add_model_flags(parser):
    """Register the mutually exclusive model-selection options on ``parser``.

    Adds ``-m/--model_path`` plus one boolean flag per pre-trained model
    (``--dns48``, ``--dns64``, ``--master64``); none of them is required.
    """
    choices = parser.add_mutually_exclusive_group(required=False)
    choices.add_argument(
        "-m", "--model_path", help="Path to local trained model."
    )
    pretrained = (
        ("--dns48",
         "Use pre-trained real time H=48 model trained on DNS."),
        ("--dns64",
         "Use pre-trained real time H=64 model trained on DNS."),
        ("--master64",
         "Use pre-trained real time H=64 model trained on DNS and Valentini."),
    )
    for flag, description in pretrained:
        choices.add_argument(flag, action="store_true", help=description)
def sinc(t):
    """Return ``sin(t) / t`` element-wise, with the value 1 where ``t == 0``.

    :param t: the input tensor
    """
    unit = th.tensor(1., device=t.device, dtype=t.dtype)
    ratio = th.sin(t) / t
    return th.where(t == 0, unit, ratio)


def kernel_upsample2(zeros=56):
    """Build the Hann-windowed sinc kernel used by ``upsample2``.

    Returns a tensor of shape ``(1, 1, 2 * zeros)``.
    """
    # Keep the odd-indexed samples of a symmetric Hann window.
    window = th.hann_window(4 * zeros + 1, periodic=False)[1::2]
    half_span = zeros - 0.5
    positions = th.linspace(-half_span, half_span, 2 * zeros) * math.pi
    return (sinc(positions) * window).view(1, 1, -1)


def upsample2(x, zeros=56):
    """
    Upsample the last dimension of ``x`` by 2 using sinc interpolation.

    The original samples are kept at the even output positions and the
    interpolated samples are placed between them.

    Smith, Julius, and Phil Gossett. "A flexible sampling-rate conversion method."
    ICASSP'84. IEEE International Conference on Acoustics, Speech, and Signal Processing.
    Vol. 9. IEEE, 1984.
    """
    *lead, n_steps = x.shape
    fir = kernel_upsample2(zeros).to(x)
    flat = x.view(-1, 1, n_steps)
    # The convolution yields n_steps + 1 values; drop the first one.
    between = F.conv1d(flat, fir, padding=zeros)[..., 1:].view(*lead, n_steps)
    interleaved = th.stack([x, between], dim=-1)
    return interleaved.view(*lead, -1)
def deserialize_model(package, strict=False):
    """Rebuild a model from a package produced by ``serialize_model``.

    Args:
        package (dict): mapping with the keys ``'class'``, ``'args'``,
            ``'kwargs'`` and ``'state'``.
        strict (bool): if true, pass every stored keyword argument to the
            class; otherwise keyword arguments that are not in the class
            signature are dropped with a warning.

    Returns:
        The instantiated model with ``package['state']`` loaded.

    Raises:
        TypeError: in strict mode, when a stored keyword argument is not
            accepted by the class.
    """
    klass = package['class']
    # Work on a copy so the caller's package is left untouched and can be
    # reused (e.g. deserialized again, possibly in strict mode).
    kwargs = dict(package['kwargs'])
    if not strict:
        accepted = inspect.signature(klass).parameters
        for key in list(kwargs):
            if key not in accepted:
                logger.warning("Dropping inexistant parameter %s", key)
                del kwargs[key]
    model = klass(*package['args'], **kwargs)
    model.load_state_dict(package['state'])
    return model
""" def __init__(self, logger, iterable, updates=5, total=None, name="LogProgress", level=logging.INFO): self.iterable = iterable self.total = total or len(iterable) self.updates = updates self.name = name self.logger = logger self.level = level def update(self, **infos): self._infos = infos def __iter__(self): self._iterator = iter(self.iterable) self._index = -1 self._infos = {} self._begin = time.time() return self def __next__(self): self._index += 1 try: value = next(self._iterator) except StopIteration: raise else: return value finally: log_every = max(1, self.total // self.updates) # logging is delayed by 1 it, in order to have the metrics from update if self._index >= 1 and self._index % log_every == 0: self._log() def _log(self): self._speed = (1 + self._index) / (time.time() - self._begin) infos = " | ".join(f"{k.capitalize()} {v}" for k, v in self._infos.items()) if self._speed < 1e-4: speed = "oo sec/it" elif self._speed < 0.1: speed = f"{1/self._speed:.1f} sec/it" else: speed = f"{self._speed:.1f} it/sec" out = f"{self.name} | {self._index}/{self.total} | {speed}" if infos: out += " | " + infos self.logger.log(self.level, out) def colorize(text, color): """ Display text with some ANSI color in the terminal. """ code = f"\033[{color}m" restore = "\033[0m" return "".join([code, text, restore]) def bold(text): """ Display text in bold in the terminal. """ return colorize(text, "1") def cal_snr(lbl, est): import torch y = 10.0 * torch.log10( torch.sum(lbl**2, dim=-1) / (torch.sum((est-lbl)**2, dim=-1) + EPS) + EPS ) return y ================================================ FILE: examples/speech_synthesis/preprocessing/get_common_voice_audio_manifest.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def get_top_n(
        root: Path, n_speakers: int = 10, min_n_tokens: int = 5
) -> pd.DataFrame:
    """Select the utterances of the speakers with the most audio.

    Loads ``root / "validated.tsv"``, keeps sentences with at least
    ``min_n_tokens`` whitespace-separated tokens, adds ``n_tokens``,
    ``n_frames`` (read from the clip files under ``root / "clips"``) and
    ``id`` columns, and returns the rows of the ``n_speakers`` client ids
    with the largest summed ``n_frames``.
    """
    clips_dir = root / "clips"
    df = load_df_from_tsv(root / "validated.tsv")
    df["n_tokens"] = [len(sentence.split()) for sentence in df["sentence"]]
    df = df[df["n_tokens"] >= min_n_tokens]

    frame_counts = []
    for clip in tqdm(df["path"]):
        clip_info = torchaudio.info((clips_dir / clip).as_posix())
        frame_counts.append(clip_info.num_frames)
    df["n_frames"] = frame_counts
    df["id"] = [Path(clip).stem for clip in df["path"]]

    # Rank speakers by their total number of audio frames, largest first.
    frames_per_speaker = (
        df.groupby("client_id")["n_frames"]
        .agg(["sum"])
        .sort_values("sum", ascending=False)
    )
    top_speakers = set(frames_per_speaker.head(n_speakers).index.tolist())
    return df[df["client_id"].isin(top_speakers)]
def convert_to_wav(root: Path, filenames: List[str], target_sr=16_000):
    """Convert clips under ``root / "clips"`` to mono 16-bit PCM WAV files.

    Each file is resampled to ``target_sr`` Hz, reduced to one channel and
    written to ``root / "wav"`` under the same base name with a ``.wav``
    suffix.
    """
    wav_dir = root / "wav"
    wav_dir.mkdir(exist_ok=True, parents=True)
    rate = str(target_sr)
    print("Converting to WAV...")
    for name in tqdm(filenames):
        source = (root / "clips" / name).as_posix()
        audio, source_sr = torchaudio.load(source)
        mono, mono_sr = torchaudio.sox_effects.apply_effects_tensor(
            audio, source_sr, [["rate", rate], ["channels", "1"]]
        )
        destination = wav_dir / Path(name).with_suffix(".wav").name
        torchaudio.save(
            destination.as_posix(), mono, mono_sr,
            encoding="PCM_S", bits_per_sample=16,
        )
def main():
    """Parse the command-line options and build the Common Voice manifests."""
    parser = argparse.ArgumentParser()
    required_options = (
        ("--data-root", "-d"),
        ("--output-manifest-root", "-m"),
        ("--lang", "-l"),
    )
    for flags in required_options:
        parser.add_argument(*flags, required=True, type=str)
    parser.add_argument("--convert-to-wav", action="store_true")

    process(parser.parse_args())
import argparse import logging from pathlib import Path import shutil from tempfile import NamedTemporaryFile from collections import Counter, defaultdict import pandas as pd import torchaudio from tqdm import tqdm from fairseq.data.audio.audio_utils import convert_waveform from examples.speech_to_text.data_utils import ( create_zip, gen_config_yaml, gen_vocab, get_zip_manifest, load_tsv_to_dicts, save_df_to_tsv ) from examples.speech_synthesis.data_utils import ( extract_logmel_spectrogram, extract_pitch, extract_energy, get_global_cmvn, ipa_phonemize, get_mfa_alignment, get_unit_alignment, get_feature_value_min_max ) log = logging.getLogger(__name__) def process(args): assert "train" in args.splits out_root = Path(args.output_root).absolute() out_root.mkdir(exist_ok=True) print("Fetching data...") audio_manifest_root = Path(args.audio_manifest_root).absolute() samples = [] for s in args.splits: for e in load_tsv_to_dicts(audio_manifest_root / f"{s}.audio.tsv"): e["split"] = s samples.append(e) sample_ids = [s["id"] for s in samples] # Get alignment info id_to_alignment = None if args.textgrid_zip is not None: assert args.id_to_units_tsv is None id_to_alignment = get_mfa_alignment( args.textgrid_zip, sample_ids, args.sample_rate, args.hop_length ) elif args.id_to_units_tsv is not None: # assume identical hop length on the unit sequence id_to_alignment = get_unit_alignment(args.id_to_units_tsv, sample_ids) # Extract features and pack features into ZIP feature_name = "logmelspec80" zip_path = out_root / f"{feature_name}.zip" pitch_zip_path = out_root / "pitch.zip" energy_zip_path = out_root / "energy.zip" gcmvn_npz_path = out_root / "gcmvn_stats.npz" if zip_path.exists() and gcmvn_npz_path.exists(): print(f"{zip_path} and {gcmvn_npz_path} exist.") else: feature_root = out_root / feature_name feature_root.mkdir(exist_ok=True) pitch_root = out_root / "pitch" energy_root = out_root / "energy" if args.add_fastspeech_targets: pitch_root.mkdir(exist_ok=True) 
energy_root.mkdir(exist_ok=True) print("Extracting Mel spectrogram features...") for sample in tqdm(samples): waveform, sample_rate = torchaudio.load(sample["audio"]) waveform, sample_rate = convert_waveform( waveform, sample_rate, normalize_volume=args.normalize_volume, to_sample_rate=args.sample_rate ) sample_id = sample["id"] target_length = None if id_to_alignment is not None: a = id_to_alignment[sample_id] target_length = sum(a.frame_durations) if a.start_sec is not None and a.end_sec is not None: start_frame = int(a.start_sec * sample_rate) end_frame = int(a.end_sec * sample_rate) waveform = waveform[:, start_frame: end_frame] extract_logmel_spectrogram( waveform, sample_rate, feature_root / f"{sample_id}.npy", win_length=args.win_length, hop_length=args.hop_length, n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min, f_max=args.f_max, target_length=target_length ) if args.add_fastspeech_targets: assert id_to_alignment is not None extract_pitch( waveform, sample_rate, pitch_root / f"{sample_id}.npy", hop_length=args.hop_length, log_scale=True, phoneme_durations=id_to_alignment[sample_id].frame_durations ) extract_energy( waveform, energy_root / f"{sample_id}.npy", hop_length=args.hop_length, n_fft=args.n_fft, log_scale=True, phoneme_durations=id_to_alignment[sample_id].frame_durations ) print("ZIPing features...") create_zip(feature_root, zip_path) get_global_cmvn(feature_root, gcmvn_npz_path) shutil.rmtree(feature_root) if args.add_fastspeech_targets: create_zip(pitch_root, pitch_zip_path) shutil.rmtree(pitch_root) create_zip(energy_root, energy_zip_path) shutil.rmtree(energy_root) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) pitch_paths, pitch_lengths, energy_paths, energy_lengths = [None] * 4 if args.add_fastspeech_targets: pitch_paths, pitch_lengths = get_zip_manifest(pitch_zip_path) energy_paths, energy_lengths = get_zip_manifest(energy_zip_path) # Generate TSV manifest print("Generating manifest...") 
id_to_cer = None if args.cer_threshold is not None: assert Path(args.cer_tsv_path).is_file() id_to_cer = { x["id"]: x["uer"] for x in load_tsv_to_dicts(args.cer_tsv_path) } manifest_by_split = {split: defaultdict(list) for split in args.splits} for sample in tqdm(samples): sample_id, split = sample["id"], sample["split"] if args.snr_threshold is not None and "snr" in sample \ and sample["snr"] < args.snr_threshold: continue if args.cer_threshold is not None \ and id_to_cer[sample_id] > args.cer_threhold: continue normalized_utt = sample["tgt_text"] if id_to_alignment is not None: normalized_utt = " ".join(id_to_alignment[sample_id].tokens) elif args.ipa_vocab: normalized_utt = ipa_phonemize( normalized_utt, lang=args.lang, use_g2p=args.use_g2p ) manifest_by_split[split]["id"].append(sample_id) manifest_by_split[split]["audio"].append(audio_paths[sample_id]) manifest_by_split[split]["n_frames"].append(audio_lengths[sample_id]) manifest_by_split[split]["tgt_text"].append(normalized_utt) manifest_by_split[split]["speaker"].append(sample["speaker"]) manifest_by_split[split]["src_text"].append(sample["src_text"]) if args.add_fastspeech_targets: assert id_to_alignment is not None duration = " ".join( str(d) for d in id_to_alignment[sample_id].frame_durations ) manifest_by_split[split]["duration"].append(duration) manifest_by_split[split]["pitch"].append(pitch_paths[sample_id]) manifest_by_split[split]["energy"].append(energy_paths[sample_id]) for split in args.splits: save_df_to_tsv( pd.DataFrame.from_dict(manifest_by_split[split]), out_root / f"{split}.tsv" ) # Generate vocab vocab_name, spm_filename = None, None if id_to_alignment is not None or args.ipa_vocab: vocab = Counter() for t in manifest_by_split["train"]["tgt_text"]: vocab.update(t.split(" ")) vocab_name = "vocab.txt" with open(out_root / vocab_name, "w") as f: for s, c in vocab.most_common(): f.write(f"{s} {c}\n") else: spm_filename_prefix = "spm_char" spm_filename = f"{spm_filename_prefix}.model" with 
NamedTemporaryFile(mode="w") as f: for t in manifest_by_split["train"]["tgt_text"]: f.write(t + "\n") f.flush() # needed to ensure gen_vocab sees dumped text gen_vocab(Path(f.name), out_root / spm_filename_prefix, "char") # Generate speaker list speakers = sorted({sample["speaker"] for sample in samples}) speakers_path = out_root / "speakers.txt" with open(speakers_path, "w") as f: for speaker in speakers: f.write(f"{speaker}\n") # Generate config YAML win_len_t = args.win_length / args.sample_rate hop_len_t = args.hop_length / args.sample_rate extra = { "sample_rate": args.sample_rate, "features": { "type": "spectrogram+melscale+log", "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft, "window_fn": "hann", "win_length": args.win_length, "hop_length": args.hop_length, "sample_rate": args.sample_rate, "win_len_t": win_len_t, "hop_len_t": hop_len_t, "f_min": args.f_min, "f_max": args.f_max, "n_stft": args.n_fft // 2 + 1 } } if len(speakers) > 1: extra["speaker_set_filename"] = "speakers.txt" if args.add_fastspeech_targets: pitch_min, pitch_max = get_feature_value_min_max( [(out_root / n).as_posix() for n in pitch_paths.values()] ) energy_min, energy_max = get_feature_value_min_max( [(out_root / n).as_posix() for n in energy_paths.values()] ) extra["features"]["pitch_min"] = pitch_min extra["features"]["pitch_max"] = pitch_max extra["features"]["energy_min"] = energy_min extra["features"]["energy_max"] = energy_max gen_config_yaml( out_root, spm_filename=spm_filename, vocab_name=vocab_name, audio_root=out_root.as_posix(), input_channels=None, input_feat_per_channel=None, specaugment_policy=None, cmvn_type="global", gcmvn_path=gcmvn_npz_path, extra=extra ) def main(): parser = argparse.ArgumentParser() parser.add_argument("--audio-manifest-root", "-m", required=True, type=str) parser.add_argument("--output-root", "-o", required=True, type=str) parser.add_argument("--splits", "-s", type=str, nargs="+", default=["train", "dev", "test"]) 
parser.add_argument("--ipa-vocab", action="store_true") parser.add_argument("--use-g2p", action="store_true") parser.add_argument("--lang", type=str, default="en-us") parser.add_argument("--win-length", type=int, default=1024) parser.add_argument("--hop-length", type=int, default=256) parser.add_argument("--n-fft", type=int, default=1024) parser.add_argument("--n-mels", type=int, default=80) parser.add_argument("--f-min", type=int, default=20) parser.add_argument("--f-max", type=int, default=8000) parser.add_argument("--sample-rate", type=int, default=22050) parser.add_argument("--normalize-volume", "-n", action="store_true") parser.add_argument("--textgrid-zip", type=str, default=None) parser.add_argument("--id-to-units-tsv", type=str, default=None) parser.add_argument("--add-fastspeech-targets", action="store_true") parser.add_argument("--snr-threshold", type=float, default=None) parser.add_argument("--cer-threshold", type=float, default=None) parser.add_argument("--cer-tsv-path", type=str, default="") args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_synthesis/preprocessing/get_ljspeech_audio_manifest.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
SPLITS = ["train", "dev", "test"]


def process(args):
    """Download LJSpeech and write one ``<split>.audio.tsv`` manifest per
    split, using FastSpeech's split (LJ001/LJ002 -> test, LJ003 -> dev,
    everything else -> train).
    """
    data_root = Path(args.output_data_root).absolute()
    data_root.mkdir(parents=True, exist_ok=True)

    # Generate TSV manifest
    print("Generating manifest...")
    # following FastSpeech's splits
    dataset = LJSPEECH(data_root.as_posix(), download=True)
    heldout_split = {"LJ001": "test", "LJ002": "test", "LJ003": "dev"}
    id_to_split = {}
    for entry in dataset._flist:
        utt_id = entry[0]
        chapter = utt_id.split("-")[0]
        id_to_split[utt_id] = heldout_split.get(chapter, "train")

    manifest_by_split = {name: defaultdict(list) for name in SPLITS}
    items = tqdm(enumerate(dataset), total=len(dataset))
    for idx, (waveform, _, utt, normalized_utt) in items:
        sample_id = dataset._flist[idx][0]
        columns = manifest_by_split[id_to_split[sample_id]]
        columns["id"].append(sample_id)
        columns["audio"].append(f"{dataset._path}/{sample_id}.wav")
        columns["n_frames"].append(len(waveform[0]))
        columns["tgt_text"].append(normalized_utt)
        columns["speaker"].append("ljspeech")
        columns["src_text"].append(utt)

    manifest_root = Path(args.output_manifest_root).absolute()
    manifest_root.mkdir(parents=True, exist_ok=True)
    for name in SPLITS:
        frame = pd.DataFrame.from_dict(manifest_by_split[name])
        save_df_to_tsv(frame, manifest_root / f"{name}.audio.tsv")


def main():
    """Parse command-line options and run :func:`process`."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-data-root", "-d", required=True, type=str)
    parser.add_argument("--output-manifest-root", "-m", required=True, type=str)
    process(parser.parse_args())


if __name__ == "__main__":
    main()
================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse from collections import defaultdict from itertools import chain from pathlib import Path import numpy as np import torchaudio import torchaudio.sox_effects as ta_sox import yaml from tqdm import tqdm from examples.speech_to_text.data_utils import load_tsv_to_dicts from examples.speech_synthesis.preprocessing.speaker_embedder import SpkrEmbedder def extract_embedding(audio_path, embedder): wav, sr = torchaudio.load(audio_path) # 2D if sr != embedder.RATE: wav, sr = ta_sox.apply_effects_tensor( wav, sr, [["rate", str(embedder.RATE)]] ) try: emb = embedder([wav[0].cuda().float()]).cpu().numpy() except RuntimeError: emb = None return emb def process(args): print("Fetching data...") raw_manifest_root = Path(args.raw_manifest_root).absolute() samples = [load_tsv_to_dicts(raw_manifest_root / (s + ".tsv")) for s in args.splits] samples = list(chain(*samples)) with open(args.config, "r") as f: config = yaml.load(f, Loader=yaml.FullLoader) with open(f"{config['audio_root']}/{config['speaker_set_filename']}") as f: speaker_to_id = {r.strip(): i for i, r in enumerate(f)} embedder = SpkrEmbedder(args.ckpt).cuda() speaker_to_cnt = defaultdict(float) speaker_to_emb = defaultdict(float) for sample in tqdm(samples, desc="extract emb"): emb = extract_embedding(sample["audio"], embedder) if emb is not None: speaker_to_cnt[sample["speaker"]] += 1 speaker_to_emb[sample["speaker"]] += emb if len(speaker_to_emb) != len(speaker_to_id): missed = set(speaker_to_id) - set(speaker_to_emb.keys()) print( f"WARNING: missing embeddings for {len(missed)} speaker:\n{missed}" ) speaker_emb_mat = np.zeros((len(speaker_to_id), len(emb)), float) for speaker in speaker_to_emb: idx = speaker_to_id[speaker] emb = speaker_to_emb[speaker] cnt = speaker_to_cnt[speaker] 
speaker_emb_mat[idx, :] = emb / cnt speaker_emb_name = "speaker_emb.npy" speaker_emb_path = f"{config['audio_root']}/{speaker_emb_name}" np.save(speaker_emb_path, speaker_emb_mat) config["speaker_emb_filename"] = speaker_emb_name with open(args.new_config, "w") as f: yaml.dump(config, f) def main(): parser = argparse.ArgumentParser() parser.add_argument("--raw-manifest-root", "-m", required=True, type=str) parser.add_argument("--splits", "-s", type=str, nargs="+", default=["train"]) parser.add_argument("--config", "-c", required=True, type=str) parser.add_argument("--new-config", "-n", required=True, type=str) parser.add_argument("--ckpt", required=True, type=str, help="speaker embedder checkpoint") args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_synthesis/preprocessing/get_vctk_audio_manifest.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
SPLITS = ["train", "dev", "test"]


def normalize_text(text):
    """Drop every character that is not an ASCII letter, space or one of
    ``. ? ! , ' -``."""
    return re.sub(r"[^a-zA-Z.?!,'\- ]", '', text)


def process(args):
    """Write ``<split>.audio.tsv`` manifests for a local copy of VCTK.

    Utterance ids are shuffled with ``args.seed``; the last
    ``args.n_dev + args.n_test`` ids form the dev and test splits.

    Raises:
        ValueError: if the requested dev/test sizes are negative or exceed the
            number of utterances.
    """
    out_root = Path(args.output_data_root).absolute()
    out_root.mkdir(parents=True, exist_ok=True)

    # Generate TSV manifest
    print("Generating manifest...")
    dataset = VCTK(out_root.as_posix(), download=False)
    ids = list(dataset._walker)
    # Validate up front: a negative n_train would make zip() below silently
    # drop ids and fail later with an opaque KeyError.
    if args.n_dev < 0 or args.n_test < 0 \
            or args.n_dev + args.n_test > len(ids):
        raise ValueError(
            f"n_dev ({args.n_dev}) + n_test ({args.n_test}) must be "
            f"non-negative and at most the dataset size ({len(ids)})"
        )
    np.random.seed(args.seed)
    np.random.shuffle(ids)
    n_train = len(ids) - args.n_dev - args.n_test
    _split = ["train"] * n_train + ["dev"] * args.n_dev + ["test"] * args.n_test
    id_to_split = dict(zip(ids, _split))
    manifest_by_split = {split: defaultdict(list) for split in SPLITS}
    progress = tqdm(enumerate(dataset), total=len(dataset))
    for i, (waveform, _, text, speaker_id, _) in progress:
        sample_id = dataset._walker[i]
        _split = id_to_split[sample_id]
        audio_dir = Path(dataset._path) / dataset._folder_audio / speaker_id
        audio_path = audio_dir / f"{sample_id}.wav"
        text = normalize_text(text)
        columns = manifest_by_split[_split]
        columns["id"].append(sample_id)
        columns["audio"].append(audio_path.as_posix())
        columns["n_frames"].append(len(waveform[0]))
        columns["tgt_text"].append(text)
        columns["speaker"].append(speaker_id)
        columns["src_text"].append(text)

    manifest_root = Path(args.output_manifest_root).absolute()
    manifest_root.mkdir(parents=True, exist_ok=True)
    for _split in SPLITS:
        save_df_to_tsv(
            pd.DataFrame.from_dict(manifest_by_split[_split]),
            manifest_root / f"{_split}.audio.tsv"
        )
EMBEDDER_PARAMS = {
    'num_mels': 40,
    'n_fft': 512,
    'emb_dim': 256,
    'lstm_hidden': 768,
    'lstm_layers': 3,
    'window': 80,
    'stride': 40,
}


def set_requires_grad(nets, requires_grad=False):
    """Set ``requires_grad`` on every parameter of the given networks.

    With the default ``requires_grad=False`` this freezes the networks and
    avoids unnecessary gradient computation.

    Parameters:
        nets (network or list of networks) -- ``None`` entries are skipped
        requires_grad (bool) -- whether the networks require gradients or not
    """
    if not isinstance(nets, list):
        nets = [nets]
    for net in nets:
        if net is not None:
            for param in net.parameters():
                param.requires_grad = requires_grad


class LinearNorm(nn.Module):
    """Linear projection from ``lstm_hidden`` to ``emb_dim``."""

    def __init__(self, hp):
        super().__init__()
        self.linear_layer = nn.Linear(hp["lstm_hidden"], hp["emb_dim"])

    def forward(self, x):
        return self.linear_layer(x)


class SpeechEmbedder(nn.Module):
    """LSTM encoder mapping a mel spectrogram to one unit-norm embedding."""

    def __init__(self, hp):
        super().__init__()
        self.lstm = nn.LSTM(hp["num_mels"],
                            hp["lstm_hidden"],
                            num_layers=hp["lstm_layers"],
                            batch_first=True)
        self.proj = LinearNorm(hp)
        self.hp = hp

    def forward(self, mel):
        # (num_mels, T) -> (num_mels, T', window)
        mels = mel.unfold(1, self.hp["window"], self.hp["stride"])
        mels = mels.permute(1, 2, 0)  # (T', window, num_mels)
        x, _ = self.lstm(mels)  # (T', window, lstm_hidden)
        x = x[:, -1, :]  # (T', lstm_hidden), use last frame only
        x = self.proj(x)  # (T', emb_dim)
        x = x / torch.norm(x, p=2, dim=1, keepdim=True)  # (T', emb_dim)

        x = x.mean(dim=0)
        if x.norm(p=2) != 0:
            x = x / x.norm(p=2)
        return x


class SpkrEmbedder(nn.Module):
    """Frozen speaker encoder: waveform(s) -> one averaged, unit-norm d-vector.

    ``forward`` takes an iterable of waveforms and returns the normalized mean
    of their embeddings.
    """

    RATE = 16000

    def __init__(
        self,
        embedder_path,
        embedder_params=EMBEDDER_PARAMS,
        rate=16000,
        hop_length=160,
        win_length=400,
        pad=False,
    ):
        super().__init__()
        embedder_pt = torch.load(embedder_path, map_location="cpu")
        self.embedder = SpeechEmbedder(embedder_params)
        self.embedder.load_state_dict(embedder_pt)
        self.embedder.eval()
        set_requires_grad(self.embedder, requires_grad=False)
        self.embedder_params = embedder_params

        self.register_buffer('mel_basis', torch.from_numpy(
            librosa.filters.mel(
                sr=self.RATE,
                n_fft=self.embedder_params["n_fft"],
                n_mels=self.embedder_params["num_mels"])
            )
        )

        # NOTE(review): ``self.resample`` is created but never applied in
        # ``get_mel``/``forward``; callers currently resample beforehand.
        self.resample = None
        if rate != self.RATE:
            self.resample = torchaudio.transforms.Resample(rate, self.RATE)
        self.hop_length = hop_length
        self.win_length = win_length
        self.pad = pad

    def get_mel(self, y):
        """Return the log10 mel power spectrogram of waveform ``y``."""
        if self.pad and y.shape[-1] < 14000:
            y = F.pad(y, (0, 14000 - y.shape[-1]))

        window = torch.hann_window(self.win_length).to(y)
        # return_complex=True is required by recent PyTorch for real input;
        # |z|^2 equals the squared L2 norm of the old (real, imag) pair.
        spec = torch.stft(y, n_fft=self.embedder_params["n_fft"],
                          hop_length=self.hop_length,
                          win_length=self.win_length,
                          window=window,
                          return_complex=True)
        magnitudes = spec.abs() ** 2
        mel = torch.log10(self.mel_basis @ magnitudes + 1e-6)
        return mel

    def forward(self, inputs):
        dvecs = []
        for wav in inputs:
            mel = self.get_mel(wav)
            if mel.dim() == 3:
                mel = mel.squeeze(0)
            dvecs += [self.embedder(mel)]
        dvecs = torch.stack(dvecs)

        dvec = torch.mean(dvecs, dim=0)
        dvec = dvec / torch.norm(dvec)

        return dvec
import collections import contextlib import wave try: import webrtcvad except ImportError: raise ImportError("Please install py-webrtcvad: pip install webrtcvad") import argparse import os import logging from tqdm import tqdm AUDIO_SUFFIX = '.wav' FS_MS = 30 SCALE = 6e-5 THRESHOLD = 0.3 def read_wave(path): """Reads a .wav file. Takes the path, and returns (PCM audio data, sample rate). """ with contextlib.closing(wave.open(path, 'rb')) as wf: num_channels = wf.getnchannels() assert num_channels == 1 sample_width = wf.getsampwidth() assert sample_width == 2 sample_rate = wf.getframerate() assert sample_rate in (8000, 16000, 32000, 48000) pcm_data = wf.readframes(wf.getnframes()) return pcm_data, sample_rate def write_wave(path, audio, sample_rate): """Writes a .wav file. Takes path, PCM audio data, and sample rate. """ with contextlib.closing(wave.open(path, 'wb')) as wf: wf.setnchannels(1) wf.setsampwidth(2) wf.setframerate(sample_rate) wf.writeframes(audio) class Frame(object): """Represents a "frame" of audio data.""" def __init__(self, bytes, timestamp, duration): self.bytes = bytes self.timestamp = timestamp self.duration = duration def frame_generator(frame_duration_ms, audio, sample_rate): """Generates audio frames from PCM audio data. Takes the desired frame duration in milliseconds, the PCM data, and the sample rate. Yields Frames of the requested duration. """ n = int(sample_rate * (frame_duration_ms / 1000.0) * 2) offset = 0 timestamp = 0.0 duration = (float(n) / sample_rate) / 2.0 while offset + n < len(audio): yield Frame(audio[offset:offset + n], timestamp, duration) timestamp += duration offset += n def vad_collector(sample_rate, frame_duration_ms, padding_duration_ms, vad, frames): """Filters out non-voiced audio frames. Given a webrtcvad.Vad and a source of audio frames, yields only the voiced audio. Uses a padded, sliding window algorithm over the audio frames. 
When more than 90% of the frames in the window are voiced (as reported by the VAD), the collector triggers and begins yielding audio frames. Then the collector waits until 90% of the frames in the window are unvoiced to detrigger. The window is padded at the front and back to provide a small amount of silence or the beginnings/endings of speech around the voiced frames. Arguments: sample_rate - The audio sample rate, in Hz. frame_duration_ms - The frame duration in milliseconds. padding_duration_ms - The amount to pad the window, in milliseconds. vad - An instance of webrtcvad.Vad. frames - a source of audio frames (sequence or generator). Returns: A generator that yields PCM audio data. """ num_padding_frames = int(padding_duration_ms / frame_duration_ms) # We use a deque for our sliding window/ring buffer. ring_buffer = collections.deque(maxlen=num_padding_frames) # We have two states: TRIGGERED and NOTTRIGGERED. We start in the # NOTTRIGGERED state. triggered = False voiced_frames = [] for frame in frames: is_speech = vad.is_speech(frame.bytes, sample_rate) # sys.stdout.write('1' if is_speech else '0') if not triggered: ring_buffer.append((frame, is_speech)) num_voiced = len([f for f, speech in ring_buffer if speech]) # If we're NOTTRIGGERED and more than 90% of the frames in # the ring buffer are voiced frames, then enter the # TRIGGERED state. if num_voiced > 0.9 * ring_buffer.maxlen: triggered = True # We want to yield all the audio we see from now until # we are NOTTRIGGERED, but we have to start with the # audio that's already in the ring buffer. for f, _ in ring_buffer: voiced_frames.append(f) ring_buffer.clear() else: # We're in the TRIGGERED state, so collect the audio data # and add it to the ring buffer. 
voiced_frames.append(frame) ring_buffer.append((frame, is_speech)) num_unvoiced = len([f for f, speech in ring_buffer if not speech]) # If more than 90% of the frames in the ring buffer are # unvoiced, then enter NOTTRIGGERED and yield whatever # audio we've collected. if num_unvoiced > 0.9 * ring_buffer.maxlen: triggered = False yield [b''.join([f.bytes for f in voiced_frames]), voiced_frames[0].timestamp, voiced_frames[-1].timestamp] ring_buffer.clear() voiced_frames = [] # If we have any leftover voiced audio when we run out of input, # yield it. if voiced_frames: yield [b''.join([f.bytes for f in voiced_frames]), voiced_frames[0].timestamp, voiced_frames[-1].timestamp] def main(args): # create output folder try: cmd = f"mkdir -p {args.out_path}" os.system(cmd) except Exception: logging.error("Can not create output folder") exit(-1) # build vad object vad = webrtcvad.Vad(int(args.agg)) # iterating over wavs in dir for file in tqdm(os.listdir(args.in_path)): if file.endswith(AUDIO_SUFFIX): audio_inpath = os.path.join(args.in_path, file) audio_outpath = os.path.join(args.out_path, file) audio, sample_rate = read_wave(audio_inpath) frames = frame_generator(FS_MS, audio, sample_rate) frames = list(frames) segments = vad_collector(sample_rate, FS_MS, 300, vad, frames) merge_segments = list() timestamp_start = 0.0 timestamp_end = 0.0 # removing start, end, and long sequences of sils for i, segment in enumerate(segments): merge_segments.append(segment[0]) if i and timestamp_start: sil_duration = segment[1] - timestamp_end if sil_duration > THRESHOLD: merge_segments.append(int(THRESHOLD / SCALE)*(b'\x00')) else: merge_segments.append(int((sil_duration / SCALE))*(b'\x00')) timestamp_start = segment[1] timestamp_end = segment[2] segment = b''.join(merge_segments) write_wave(audio_outpath, segment, sample_rate) if __name__ == '__main__': parser = argparse.ArgumentParser(description='Apply vad to a file of fils.') parser.add_argument('in_path', type=str, help='Path to the 
def batch_mel_spectral_distortion(
        y1, y2, sr, normalize_type="path", mel_fn=None
):
    """
    https://arxiv.org/pdf/2011.03568.pdf

    Same as Mel Cepstral Distortion, but computed on log-mel spectrograms.
    A default 80-bin mel transform is built when ``mel_fn`` is None or its
    ``sample_rate`` differs from ``sr``.
    """
    if mel_fn is None or mel_fn.sample_rate != sr:
        mel_fn = torchaudio.transforms.MelSpectrogram(
            sr, n_fft=int(0.05 * sr), win_length=int(0.05 * sr),
            hop_length=int(0.0125 * sr), f_min=20, n_mels=80,
            window_fn=torch.hann_window
        ).to(y1[0].device)
    offset = 1e-6  # keeps log() finite on silent bins
    return batch_compute_distortion(
        y1, y2, sr, lambda y: torch.log(mel_fn(y) + offset).transpose(-1, -2),
        compute_rms_dist, normalize_type
    )


# This code is based on
# "https://github.com/bastibe/MAPS-Scripts/blob/master/helper.py"
def _same_t_in_true_and_est(func):
    """Decorator: resample ``est_f`` onto ``true_t`` (nearest neighbour, 0
    outside the estimated range) before calling ``func``, so both pitch tracks
    share the same time axis."""
    import functools  # local import: keeps this block self-contained

    @functools.wraps(func)  # preserve the metric's name and docstring
    def new_func(true_t, true_f, est_t, est_f):
        assert isinstance(true_t, np.ndarray)
        assert isinstance(true_f, np.ndarray)
        assert isinstance(est_t, np.ndarray)
        assert isinstance(est_f, np.ndarray)

        interpolated_f = interp1d(
            est_t, est_f, bounds_error=False, kind='nearest', fill_value=0
        )(true_t)
        return func(true_t, true_f, true_t, interpolated_f)

    return new_func


@_same_t_in_true_and_est
def gross_pitch_error(true_t, true_f, est_t, est_f):
    """The relative frequency in percent of pitch estimates that are
    outside a threshold around the true pitch. Only frames that are
    considered pitched by both the ground truth and the estimator (if
    applicable) are considered.
    """

    correct_frames = _true_voiced_frames(true_t, true_f, est_t, est_f)
    gross_pitch_error_frames = _gross_pitch_error_frames(
        true_t, true_f, est_t, est_f
    )
    return np.sum(gross_pitch_error_frames) / np.sum(correct_frames)


def _gross_pitch_error_frames(true_t, true_f, est_t, est_f, eps=1e-8):
    """Mask of frames voiced in both tracks whose estimate deviates from the
    true pitch by more than 20%."""
    voiced_frames = _true_voiced_frames(true_t, true_f, est_t, est_f)
    # eps avoids dividing by zero on unvoiced (0 Hz) reference frames.
    pitch_error_frames = np.abs(est_f / (true_f + eps) - 1) > 0.2
    return voiced_frames & pitch_error_frames


def _true_voiced_frames(true_t, true_f, est_t, est_f):
    """Mask of frames that are voiced (non-zero pitch) in both tracks."""
    return (est_f != 0) & (true_f != 0)


def _voicing_decision_error_frames(true_t, true_f, est_t, est_f):
    """Mask of frames where the two tracks disagree on voiced/unvoiced."""
    return (est_f != 0) != (true_f != 0)


@_same_t_in_true_and_est
def f0_frame_error(true_t, true_f, est_t, est_f):
    """Fraction of frames with either a gross pitch error or a voicing
    decision error."""
    gross_pitch_error_frames = _gross_pitch_error_frames(
        true_t, true_f, est_t, est_f
    )
    voicing_decision_error_frames = _voicing_decision_error_frames(
        true_t, true_f, est_t, est_f
    )
    return (np.sum(gross_pitch_error_frames) +
            np.sum(voicing_decision_error_frames)) / (len(true_t))


@_same_t_in_true_and_est
def voicing_decision_error(true_t, true_f, est_t, est_f):
    """Fraction of frames where the voiced/unvoiced decision is wrong."""
    voicing_decision_error_frames = _voicing_decision_error_frames(
        true_t, true_f, est_t, est_f
    )
    return np.sum(voicing_decision_error_frames) / (len(true_t))
More details about Fairseq s2t can be found [here](../speech_to_text/README.md) ## Examples Examples of speech text joint training in fairseq - [English-to-German MuST-C model](docs/ende-mustc.md) - [IWSLT 2021 Multilingual Speech Translation](docs/iwslt2021.md) - [Speech Text Joint Pre-training ](docs/pre-training.md) ## Citation Please cite as: ``` @inproceedings{Tang2022UnifiedSP, title={Unified Speech-Text Pre-training for Speech Translation and Recognition}, author={Yun Tang and Hongyu Gong and Ning Dong and Changhan Wang and Wei-Ning Hsu and Jiatao Gu and Alexei Baevski and Xian Li and Abdelrahman Mohamed and Michael Auli and Juan Miguel Pino}, booktitle={ACL}, year={2022} } @inproceedings{Tang2021IST, title = {Improving Speech Translation by Understanding and Learning from the Auxiliary Text Translation Task}, author = {Yun Tang and Juan Pino and Xian Li and Changhan Wang and Dmitriy Genzel}, booktitle = {ACL}, year = {2021}, } @inproceedings{Tang2021FST, title = {FST: the FAIR Speech Translation System for the IWSLT21 Multilingual Shared Task}, author = {Yun Tang and Hongyu Gong and Xian Li and Changhan Wang and Juan Pino and Holger Schwenk and Naman Goyal}, booktitle = {IWSLT}, year = {2021}, } @inproceedings{Tang2021AGM, title={A General Multi-Task Learning Framework to Leverage Text Data for Speech to Text Tasks}, author={Yun Tang and J. 
Pino and Changhan Wang and Xutai Ma and Dmitriy Genzel}, booktitle={ICASSP}, year={2021} } @inproceedings{wang2020fairseqs2t, title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq}, author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino}, booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations}, year = {2020}, } @inproceedings{ott2019fairseq, title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, year = {2019}, } ``` ================================================ FILE: examples/speech_text_joint_to_text/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import tasks, criterions, models # noqa ================================================ FILE: examples/speech_text_joint_to_text/configs/mustc_noise.list ================================================ "(Applause) NOISE "(Laughter) VOICE "(Laughter)" VOICE (Applause) NOISE (Applause). NOISE (Audience) VOICE (Audio) NOISE (Beat) NOISE (Beatboxing) VOICE (Beep) NOISE (Beeps) NOISE (Cheering) VOICE (Cheers) VOICE (Claps) NOISE (Clicking) NOISE (Clunk) NOISE (Coughs) NOISE (Drums) NOISE (Explosion) NOISE (Gasps) VOICE (Guitar) NOISE (Honk) NOISE (Laugher) VOICE (Laughing) VOICE (Laughs) VOICE (Laughter) VOICE (Laughter). VOICE (Laughter)... 
VOICE (Mumbling) VOICE (Music) NOISE (Noise) NOISE (Recording) VOICE (Ringing) NOISE (Shouts) VOICE (Sigh) VOICE (Sighs) VOICE (Silence) NOISE (Singing) VOICE (Sings) VOICE (Spanish) VOICE (Static) NOISE (Tones) NOISE (Trumpet) NOISE (Video) NOISE (Video): NOISE (Voice-over) NOISE (Whistle) NOISE (Whistling) NOISE (video): NOISE ================================================ FILE: examples/speech_text_joint_to_text/criterions/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import importlib import os for file in os.listdir(os.path.dirname(__file__)): if file.endswith(".py") and not file.startswith("_"): criterion_name = file[: file.find(".py")] importlib.import_module( "examples.speech_text_joint_to_text.criterions." + criterion_name ) ================================================ FILE: examples/speech_text_joint_to_text/criterions/multi_modality_compound.py ================================================ # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging import math from dataclasses import dataclass, field from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.criterions.ctc import CtcCriterion, CtcCriterionConfig from fairseq.criterions.label_smoothed_cross_entropy import ( LabelSmoothedCrossEntropyCriterionConfig, ) from fairseq.logging.meters import safe_round from .multi_modality_cross_entropy import SpeechTextPreTrainCrossEntCriterion logger = logging.getLogger(__name__) @dataclass class SpeechTextPreTrainCompoundCriterionConfig( LabelSmoothedCrossEntropyCriterionConfig ): zero_infinity: bool = field( default=False, metadata={"help": "zero inf loss when source length <= target length"}, ) post_process: str = field( default="none", metadata={ "help": "how to post process predictions into words. can be letter, " "wordpiece, BPE symbols, etc. " "See fairseq.data.data_utils.post_process() for full list of options" }, ) @register_criterion( "speech_text_pretrain_compound", dataclass=SpeechTextPreTrainCompoundCriterionConfig ) class SpeechTextPreTrainCompoundCriterion(FairseqCriterion): def __init__( self, task, sentence_avg, label_smoothing, report_accuracy=False, zero_infinity=False, post_process=None, ): super().__init__(task) self.xent = SpeechTextPreTrainCrossEntCriterion( task, sentence_avg, label_smoothing, report_accuracy ) cfg_dict = { "zero_infinity": zero_infinity, "sentence_avg": sentence_avg, "post_process": post_process, } cfg_ctc = CtcCriterionConfig(**cfg_dict) self.ctc = CtcCriterion(cfg_ctc, task) def forward(self, model, sample, reduce=True): mode = sample["net_input"]["mode"] if mode == "sup_speech_ctc": # CTC sample["net_input"][ "src_lengths" ] = None # get downsampled src_lengths from padding_mask loss, sample_size, logging_output = self.ctc(model, sample, reduce) logging_output["mode"] = SpeechTextPreTrainCompoundCriterion.mode2value( "CTC" ) else: loss, sample_size, logging_output = 
self.xent(model, sample, reduce) logging_output["mode"] = SpeechTextPreTrainCompoundCriterion.mode2value( "xent" ) return loss, sample_size, logging_output @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True @staticmethod def mode2value(mode): # make the logging_outputs_can_be_summed = True if mode == "CTC": return 907 # prime number if mode == "xent": return 887 # prime number return 0 @staticmethod def value2mode(value): if value % 907 == 0: return "CTC" if value % 887 == 0: return "xent" raise ValueError("Unknow mode") @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" def _get_mode(logging_outputs): mds = [ SpeechTextPreTrainCompoundCriterion.value2mode(log["mode"]) for log in logging_outputs ] if sum([1 if l != mds[0] else 0 for l in mds]) > 0: raise ValueError("mode in one mini-batch is expected to be the same!") return mds[0] log_mode = _get_mode(logging_outputs) if log_mode == "xent": return SpeechTextPreTrainCrossEntCriterion.reduce_metrics(logging_outputs) # ctc loss loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs)) nsentences = utils.item( sum(log.get("nsentences", 0) for log in logging_outputs) ) sample_size = utils.item( sum(log.get("sample_size", 0) for log in logging_outputs) ) metrics.log_scalar( "ctc_loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) metrics.log_scalar("ctc_ntokens", ntokens) metrics.log_scalar("ctc_nsentences", nsentences) if sample_size != ntokens: metrics.log_scalar( "ctc_nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 ) c_errors = sum(log.get("c_errors", 0) for log in logging_outputs) metrics.log_scalar("_c_errors", c_errors) 
c_total = sum(log.get("c_total", 0) for log in logging_outputs) metrics.log_scalar("_c_total", c_total) w_errors = sum(log.get("w_errors", 0) for log in logging_outputs) metrics.log_scalar("_w_errors", w_errors) wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs) metrics.log_scalar("_wv_errors", wv_errors) w_total = sum(log.get("w_total", 0) for log in logging_outputs) metrics.log_scalar("_w_total", w_total) if c_total > 0: metrics.log_derived( "uer", lambda meters: safe_round( meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3 ) if meters["_c_total"].sum > 0 else float("nan"), ) if w_total > 0: metrics.log_derived( "wer", lambda meters: safe_round( meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3 ) if meters["_w_total"].sum > 0 else float("nan"), ) metrics.log_derived( "raw_wer", lambda meters: safe_round( meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3 ) if meters["_w_total"].sum > 0 else float("nan"), ) ================================================ FILE: examples/speech_text_joint_to_text/criterions/multi_modality_cross_entropy.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch from fairseq import utils from fairseq.criterions import register_criterion from fairseq.criterions.label_smoothed_cross_entropy import ( LabelSmoothedCrossEntropyCriterion, LabelSmoothedCrossEntropyCriterionConfig, label_smoothed_nll_loss, ) @register_criterion( "speech_text_pretrain_cross_entropy", dataclass=LabelSmoothedCrossEntropyCriterionConfig, ) class SpeechTextPreTrainCrossEntCriterion(LabelSmoothedCrossEntropyCriterion): def __init__(self, task, sentence_avg, label_smoothing, report_accuracy=False): super().__init__( task, sentence_avg, label_smoothing, report_accuracy=report_accuracy ) def forward(self, model, sample, reduce=True): net_output = model(**sample["net_input"]) loss, nll_loss, nsentences, ntokens, n_correct = self.compute_loss( model, net_output, sample, reduce=reduce ) sample_size = nsentences if self.sentence_avg else ntokens logging_output = { "loss": loss.data, "nll_loss": nll_loss.data, "ntokens": ntokens, "nsentences": nsentences, "sample_size": sample_size, } if self.report_accuracy: logging_output["n_correct"] = utils.item(n_correct) logging_output["total"] = utils.item(ntokens) return loss, sample_size, logging_output def get_lprobs_and_target(self, model, net_output, sample): lprobs = model.get_normalized_probs(net_output, log_probs=True) target = model.get_targets(sample, net_output) assert self.ignore_prefix_size == 0 if self.ignore_prefix_size > 0: if getattr(lprobs, "batch_first", False): lprobs = lprobs[:, self.ignore_prefix_size :, :].contiguous() target = target[:, self.ignore_prefix_size :].contiguous() else: lprobs = lprobs[self.ignore_prefix_size :, :, :].contiguous() target = target[self.ignore_prefix_size :, :].contiguous() return lprobs, target def compute_loss(self, model, net_output, sample, reduce=True): lprobs, target = self.get_lprobs_and_target(model, net_output, sample) n_correct = 0 if isinstance(target, dict): t_lprobs = target["target_logprobs"] if not lprobs.batch_first: lprobs = 
lprobs.transpose(0, 1) t_lprobs = t_lprobs.transpose(0, 1) nsentences, seq_len = lprobs.size()[:2] ntokens = nsentences * seq_len t_probs = t_lprobs.exp() mask_indices = ( net_output[1]["mask_indices"][0] if len(net_output[1]["mask_indices"]) > 0 else None ) # mask_indices is True for those masking frames if mask_indices is not None: # B X T t_probs = t_probs.masked_fill(mask_indices.eq(False).unsqueeze(-1), 0) ntokens = mask_indices.int().sum() t_probs = t_probs.detach() t_lprobs = t_lprobs.detach() loss = ( -(t_probs * (lprobs - t_lprobs)).sum() if reduce else -(t_probs * (lprobs - t_lprobs)).sum(-1, keepdim=True) ) nll_loss = loss else: nsentences = target.size(0) mask = target.ne(self.padding_idx) loss, nll_loss = label_smoothed_nll_loss( lprobs.view(-1, lprobs.size(-1)), target.view(-1), self.eps, ignore_index=self.padding_idx, reduce=reduce, ) n_correct = torch.sum( lprobs.argmax(-1).masked_select(mask).eq(target.masked_select(mask)) ) ntokens = torch.sum(mask) return loss, nll_loss, nsentences, ntokens, n_correct ================================================ FILE: examples/speech_text_joint_to_text/criterions/text_guide_cross_entropy_acc.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import math import torch import torch.nn.functional as F from fairseq import utils from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.criterions.label_smoothed_cross_entropy import label_smoothed_nll_loss from fairseq.logging import metrics @register_criterion("guided_label_smoothed_cross_entropy_with_accuracy") class GuidedCrossEntAccCriterion(FairseqCriterion): def __init__( self, task, sentence_avg, guide_alpha, text_input_cost_ratio, label_smoothing, disable_text_guide_update_num=0, attentive_cost_regularization=0, ): """ guide_alpha: alpha to inteplate nll and kd loss text_input_cost_ratio: loss ratio for text only input data label_smoothing: label smoothing ratio disable_text_guide_update_num: only use nll loss for the first N updates attentive_cost_regularization: ratio fo attentive cost """ super().__init__(task) self.alpha = guide_alpha self.attn_beta = attentive_cost_regularization self.sentence_avg = sentence_avg self.eps = label_smoothing self.text_input_cost_ratio = text_input_cost_ratio self.disable_update_num = disable_text_guide_update_num assert self.alpha >= 0 and self.alpha <= 1.0 @staticmethod def add_args(parser): """Add criterion-specific arguments to the parser.""" # fmt: off parser.add_argument('--label-smoothing', default=0., type=float, metavar='D', help='epsilon for label smoothing, 0 means no label smoothing') # fmt: off parser.add_argument('--guide-alpha', default=0., type=float, metavar='D', help='alpha to merge kd cost from text to speech input with ce loss') # fmt: off parser.add_argument('--disable-text-guide-update-num', default=0, type=int, metavar='D', help='disable guided target from text for the first N updates.') parser.add_argument("--attentive-cost-regularization", default=0.0, type=float, metavar='D', help="use encoder attentive loss regularization with cost ratio D") parser.add_argument("--attentive-cost-without-normalize", action='store_true', help="Don't do normalization during attentive 
cost computation") def forward(self, model, sample, reduce=True): reduction = 'sum' if reduce else 'none' net_input = sample["net_input"] net_output = model(**net_input) attn_cost = None lprobs = model.get_normalized_probs(net_output, log_probs=True) is_dual_input = True if net_input['src_tokens'] is not None and net_input.get('src_txt_tokens') is not None else False target = model.get_targets(sample, net_output) src_token_num = 0 if is_dual_input: # lprobs_spch from speech encoder and lprobs_text from text encoder lprobs_spch, lprobs_text = torch.chunk(lprobs, 2) lprobs_spch.batch_first = lprobs.batch_first lprobs_text.batch_first = lprobs.batch_first speech_loss, speech_nll_loss, speech_correct, speech_total = \ self.guide_loss_and_acc(model, lprobs_spch, lprobs_text, target, reduce=(reduction == 'sum')) text_loss, text_nll_loss, text_correct, text_total = self.compute_loss_and_acc(model, lprobs_text, target, reduction=reduction) loss = (speech_loss + text_loss) nll_loss = (speech_nll_loss + text_nll_loss) correct = speech_correct + text_correct total = speech_total + text_total attn_cost = net_output[1].get('attn_cost') if attn_cost is not None: # attn_cost is batch_first and padding tokens have been masked already src_token_num = attn_cost.ne(0).sum() attn_cost = attn_cost.sum() loss = loss + attn_cost * self.attn_beta else: attn_cost = 0 else: loss, nll_loss, correct, total = self.compute_loss_and_acc(model, lprobs, target, reduction=reduction) if sample["net_input"]['src_tokens'] is None: # text input only loss = loss * self.text_input_cost_ratio speech_loss = None speech_nll_loss = None sample_size, logging_output = self.get_logging_output( sample, loss, nll_loss, correct, total, src_token_num, speech_loss, speech_nll_loss, attn_cost, is_dual_input ) return loss, sample_size, logging_output def compute_loss_and_acc(self, model, lprobs, target, reduction='sum'): if not lprobs.batch_first: lprobs = lprobs.transpose(0, 1) lprobs = lprobs.view(-1, 
lprobs.size(-1)) # -> (B x T) x C target = target.view(-1) loss, nll_loss = label_smoothed_nll_loss( lprobs, target, self.eps, ignore_index=self.padding_idx, reduce=(reduction == 'sum'), ) mask = target.ne(self.padding_idx) correct = torch.sum(lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))) total = torch.sum(mask) return loss, nll_loss, correct, total def guide_loss_and_acc(self, model, lprobs, lprobs_teacher, target, reduce=True): """ lprobs_teacher is used as guide for lprobs """ if self.alpha == 0.0 or model.num_updates < self.disable_update_num: return self.compute_loss_and_acc(model, lprobs, target, reduction=('sum' if reduce else 'none')) if not lprobs.batch_first: lprobs = lprobs.transpose(0, 1) lprobs_teacher = lprobs_teacher.transpose(0, 1) lprobs = lprobs.view(-1, lprobs.size(-1)).float() # -> (B x T) x C lprobs_teacher = lprobs_teacher.view(-1, lprobs_teacher.size(-1)).float() # -> (B x T) x C target = target.view(-1) loss = F.nll_loss(lprobs, target, ignore_index=self.padding_idx, reduction='sum' if reduce else 'none') nll_loss = loss probs_teacher = lprobs_teacher.exp().masked_fill_(target.unsqueeze(-1).eq(self.padding_idx), 0) probs_teacher = probs_teacher.detach() guide_loss = -(probs_teacher*lprobs).sum() if reduce else -(probs_teacher*lprobs).sum(-1, keepdim=True) loss = self.alpha*guide_loss + (1.0 - self.alpha)*loss mask = target.ne(self.padding_idx) correct = torch.sum(lprobs.argmax(1).masked_select(mask).eq(target.masked_select(mask))) total = torch.sum(mask) return loss, nll_loss, correct, total def get_logging_output( self, sample, loss, nll_loss, correct, total, src_token_num=0, speech_loss=None, speech_nll_loss=None, attn_cost=None, is_dual_input=False, ): sample_size = ( sample["target"].size(0) if self.sentence_avg else sample["ntokens"] ) mul_size = 2 if is_dual_input else 1 logging_output = { "loss": utils.item(loss.data), # * sample['ntokens'], "nll_loss": utils.item(nll_loss.data), # * sample['ntokens'], 
"ntokens": sample["ntokens"]*mul_size, "nsentences": sample["target"].size(0)*mul_size, "sample_size": sample_size*mul_size, "correct": utils.item(correct.data), "total": utils.item(total.data), "src_token_num": utils.item(src_token_num.data) if src_token_num > 0 else 0, "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(), } if speech_loss is not None: logging_output["speech_loss"] = utils.item(speech_loss.data) logging_output["speech_nll_loss"] = utils.item(speech_nll_loss.data) logging_output["sample_size_speech_cost"] = sample_size logging_output["speech_attn_loss"] = attn_cost return sample_size*mul_size, logging_output @staticmethod def aggregate_logging_outputs(logging_outputs): """Aggregate logging outputs from data parallel training.""" correct_sum = sum(log.get("correct", 0) for log in logging_outputs) total_sum = sum(log.get("total", 0) for log in logging_outputs) src_token_sum = sum(log.get("src_token_num", 0) for log in logging_outputs) loss_sum = sum(log.get("loss", 0) for log in logging_outputs) nll_loss_sum = sum(log.get("nll_loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) nframes = sum(log.get("nframes", 0) for log in logging_outputs) speech_loss_sum = sum(log.get("speech_loss", 0) for log in logging_outputs) speech_nll_loss_sum = sum(log.get("speech_nll_loss", 0) for log in logging_outputs) speech_attn_loss_sum = sum(log.get("speech_attn_loss", 0) for log in logging_outputs) sample_size_speech = sum(log.get("sample_size_speech_cost", 0) for log in logging_outputs) agg_output = { "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0, "nll_loss": nll_loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0, # if args.sentence_avg, then sample_size is nsentences, and loss # is per-sentence loss; else sample_size is 
ntokens, and the loss # becomes per-output token loss "speech_loss": speech_loss_sum / sample_size_speech / math.log(2) if sample_size_speech > 0 else 0.0, "speech_nll_loss": speech_nll_loss_sum / sample_size_speech / math.log(2) if sample_size_speech > 0 else 0.0, "speech_attn_loss": speech_attn_loss_sum / src_token_sum / math.log(2) if src_token_sum > 0 else 0.0, "ntokens": ntokens, "nsentences": nsentences, "nframes": nframes, "sample_size": sample_size, "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0, "correct": correct_sum, "total": total_sum, "src_token_num": src_token_sum, # total is the number of validate tokens } return agg_output @classmethod def reduce_metrics(cls, logging_outputs): """Aggregate logging outputs from data parallel training.""" agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs) for k, v in agg_logging_outputs.items(): if k in {'nsentences', 'ntokens', 'sample_size'}: continue metrics.log_scalar(k, v, round=3) ================================================ FILE: examples/speech_text_joint_to_text/data/pair_denoising_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import copy import math import re import torch from fairseq.data import data_utils from fairseq.data.language_pair_dataset import LanguagePairDataset # Part of the code is modified from DenoisingDataset # compared with DenoisingDataset, no permute_sentences or documents (rotate_ratio, permute_sentence_ratio) class LanguagePairDenoisingDataset(LanguagePairDataset): def __init__( self, src, src_sizes, src_dict, tgt, tgt_sizes, tgt_dict, mask_idx, mask_whole_words, seed, args, left_pad_source=True, left_pad_target=False, shuffle=True, input_feeding=True, remove_eos_from_source=False, append_eos_to_target=False, align_dataset=None, constraints=None, append_bos=False, eos=None, num_buckets=0, src_lang_id=None, tgt_lang_id=None, pad_to_multiple=1, ): super().__init__( src, src_sizes, src_dict, tgt, tgt_sizes, tgt_dict, left_pad_source, left_pad_target, shuffle, input_feeding, remove_eos_from_source, append_eos_to_target, align_dataset, constraints, append_bos, eos, num_buckets, src_lang_id, tgt_lang_id, pad_to_multiple, ) self.mask_idx = mask_idx self.mask_whole_word = mask_whole_words self.mask_ratio = args.mask self.random_ratio = args.mask_random self.insert_ratio = args.insert self.replace_length = args.replace_length if self.replace_length not in [-1, 0, 1]: raise ValueError(f"invalid arg: replace_length={self.replace_length}") if args.mask_length not in ["subword", "word", "span-poisson"]: raise ValueError(f"invalid arg: mask-length={args.mask_length}") if args.mask_length == "subword" and args.replace_length not in [0, 1]: raise ValueError("if using subwords, use replace-length=1 or 0") self.mask_span_distribution = None if args.mask_length == "span-poisson": # Text infilling: "A number of text spans are sampled, with span lengths drawn from a Poisson distribution (λ = 3). Each span is replaced with a single [MASK] token. 0-length spans correspond to the insertion of [MASK] tokens." 
_lambda = args.poisson_lambda lambda_to_the_k = 1 e_to_the_minus_lambda = math.exp(-_lambda) k_factorial = 1 ps = [] for k in range(0, 128): ps.append(e_to_the_minus_lambda * lambda_to_the_k / k_factorial) lambda_to_the_k *= _lambda k_factorial *= k + 1 if ps[-1] < 0.0000001: break ps = torch.FloatTensor(ps) self.mask_span_distribution = torch.distributions.Categorical(ps) self.epoch = 0 self.seed = seed def _is_phoneme(x): if re.search("<lang:", x) or x in ( "<mask>", "<sil>", "<pad>", "<s>", "</s>", "<unk>", ): return False return True self.voc_valid_ids = torch.LongTensor( [i for i, x in enumerate(self.src_dict.symbols) if _is_phoneme(x)] ) self.voc_valid_size = self.voc_valid_ids.size(0) @property def can_reuse_epoch_itr_across_epochs(self): return False def set_epoch(self, epoch, **unused): self.epoch = epoch def __getitem__(self, index): tgt_item = self.tgt[index] if self.tgt is not None else None src_item = copy.deepcopy(self.src[index]) with data_utils.numpy_seed(self.seed, self.epoch, index): source = src_item assert source[-1] == self.eos if self.mask_ratio > 0: source = self.add_whole_word_mask(source, self.mask_ratio) if self.insert_ratio > 0: source = self.add_insertion_noise(source, self.insert_ratio) src_item = source if self.append_eos_to_target: eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos() if self.tgt and self.tgt[index][-1] != eos: tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])]) if self.append_bos: bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos() if self.tgt and self.tgt[index][0] != bos: tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]]) bos = self.src_dict.bos() if src_item[0] != bos: src_item = torch.cat([torch.LongTensor([bos]), src_item]) if self.remove_eos_from_source: eos = self.src_dict.eos() if src_item[-1] == eos: src_item = src_item[:-1] example = { "id": index, "source": src_item, "target": tgt_item, } if self.align_dataset is not None: example["alignment"] = 
self.align_dataset[index] if self.constraints is not None: example["constraints"] = self.constraints[index] if self.src_lang_id is not None: example["src_lang_id"] = self.src_lang_id if self.tgt_lang_id is not None: example["tgt_lang_id"] = self.tgt_lang_id return example # following functions are borrowed from denoising_dataset def word_starts(self, source): if self.mask_whole_word is not None: is_word_start = self.mask_whole_word.gather(0, source) else: is_word_start = torch.ones(source.size()) is_word_start[0] = 0 is_word_start[-1] = 0 return is_word_start def add_whole_word_mask(self, source, p): is_word_start = self.word_starts(source) num_to_mask = int(math.ceil(is_word_start.float().sum() * p)) num_inserts = 0 if num_to_mask == 0: return source if self.mask_span_distribution is not None: lengths = self.mask_span_distribution.sample(sample_shape=(num_to_mask,)) # Make sure we have enough to mask cum_length = torch.cumsum(lengths, 0) while cum_length[-1] < num_to_mask: lengths = torch.cat( [ lengths, self.mask_span_distribution.sample(sample_shape=(num_to_mask,)), ], dim=0, ) cum_length = torch.cumsum(lengths, 0) # Trim to masking budget i = 0 while cum_length[i] < num_to_mask: i += 1 lengths[i] = num_to_mask - (0 if i == 0 else cum_length[i - 1]) num_to_mask = i + 1 lengths = lengths[:num_to_mask] # Handle 0-length mask (inserts) separately lengths = lengths[lengths > 0] num_inserts = num_to_mask - lengths.size(0) num_to_mask -= num_inserts if num_to_mask == 0: return self.add_insertion_noise(source, num_inserts / source.size(0)) assert (lengths > 0).all() else: lengths = torch.ones((num_to_mask,)).long() assert is_word_start[-1] == 0 word_starts = is_word_start.nonzero(as_tuple=False) indices = word_starts[ torch.randperm(word_starts.size(0))[:num_to_mask] ].squeeze(1) mask_random = torch.FloatTensor(num_to_mask).uniform_() < self.random_ratio source_length = source.size(0) assert source_length - 1 not in indices to_keep = torch.ones(source_length, 
dtype=torch.bool) is_word_start[ -1 ] = 255 # acts as a long length, so spans don't go over the end of doc if self.replace_length == 0: to_keep[indices] = 0 else: # keep index, but replace it with [MASK] source[indices] = self.mask_idx source[indices[mask_random]] = self.voc_valid_ids[ torch.randint(0, self.voc_valid_size - 1, size=(mask_random.sum(),)) ] if self.mask_span_distribution is not None: assert len(lengths.size()) == 1 assert lengths.size() == indices.size() lengths -= 1 while indices.size(0) > 0: assert lengths.size() == indices.size() lengths -= is_word_start[indices + 1].long() uncompleted = lengths >= 0 indices = indices[uncompleted] + 1 mask_random = mask_random[uncompleted] lengths = lengths[uncompleted] if self.replace_length != -1: # delete token to_keep[indices] = 0 else: # keep index, but replace it with [MASK] source[indices] = self.mask_idx source[indices[mask_random]] = self.voc_valid_ids[ torch.randint( 0, self.voc_valid_size - 1, size=(mask_random.sum(),) ) ] else: # A bit faster when all lengths are 1 while indices.size(0) > 0: uncompleted = is_word_start[indices + 1] == 0 indices = indices[uncompleted] + 1 mask_random = mask_random[uncompleted] if self.replace_length != -1: # delete token to_keep[indices] = 0 else: # keep index, but replace it with [MASK] source[indices] = self.mask_idx source[indices[mask_random]] = self.voc_valid_ids[ torch.randint( 0, self.voc_valid_size - 1, size=(mask_random.sum(),) ) ] assert source_length - 1 not in indices source = source[to_keep] if num_inserts > 0: source = self.add_insertion_noise(source, num_inserts / source.size(0)) return source def add_insertion_noise(self, tokens, p): if p == 0.0: return tokens num_tokens = len(tokens) n = int(math.ceil(num_tokens * p)) noise_indices = torch.randperm(num_tokens + n - 2)[:n] + 1 noise_mask = torch.zeros(size=(num_tokens + n,), dtype=torch.bool) noise_mask[noise_indices] = 1 result = torch.LongTensor(n + len(tokens)).fill_(-1) num_random = int(math.ceil(n * 
self.random_ratio)) result[noise_indices[num_random:]] = self.mask_idx result[noise_indices[:num_random]] = self.voc_valid_ids[ torch.randint(0, self.voc_valid_size - 1, size=(num_random,)) ] result[~noise_mask] = tokens assert (result >= 0).all() return result ================================================ FILE: examples/speech_text_joint_to_text/docs/ende-mustc.md ================================================ [[Back]](..) # Joint Speech Text Training for the MuST-C English to German Speech Translation task Joint Training Baseline: it is based on paper ["A general multi-task learning framework to leverage text data for speech to text tasks"](https://arxiv.org/pdf/2010.11338.pdf) Enhanced Joint Training: the joint training is enhanced with pre-trained models, cross attentive regularization and online knowledge distillation based on paper ["Improving Speech Translation by Understanding and Learning from the Auxiliary Text Translation Task"](https://research.fb.com/publications/improving-speech-translation-by-understanding-and-learning-from-the-auxiliary-text-translation-task) ## Prepare Data #### Download files - Sentence piece model [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/spm.model) - Dictionary [dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/dict.txt) - config [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/config.yaml) #### Prepare MuST-C data set - Please follow the data preparation in the [S2T example](https://github.com/pytorch/fairseq/blob/main/examples/speech_to_text/docs/mustc_example.md) - Convert source text under the "src_text" column in the tsv file into phoneme representation. 
```bash python examples/speech_text_joint_to_text/scripts/g2p_encode.py \ --lower-case --do-filter --use-word-start --no-punc \ --reserve-word examples/speech_text_joint_to_text/configs/mustc_noise.list \ --data-path ${must_c_en_de_src_text} \ --out-path ${must_c_en_de_src_text_pho} ``` - Replace the source text under the "src_text" column in the tsv file with the corresponding phoneme representation generated in the step above. Below is a snapshot of the MuST-C en-de dev tsv ``` id audio n_frames tgt_text src_text speaker ted_767_0 en-de/flac.zip:10071514743:48445 56160 Heute spreche ich zu Ihnen über Energie und Klima. ▁AY1 M ▁G OW1 IH0 NG ▁T UW1 ▁T AO1 K ▁T AH0 D EY1 ▁AH0 B AW1 T ▁EH1 N ER0 JH IY0 ▁AH0 N D ▁K L AY1 M AH0 T spk.767_ ted_767_1 en-de/flac.zip:1214217978:205678 226080 Und das überrascht vielleicht etwas, weil sich meine Vollzeitbeschäftigung bei der Stiftung hauptsächlich um Impfstoffe und Saatgut dreht, um die Dinge, die wir erfinden und liefern müssen um den ärmsten 2 Milliarden ein besseres Leben zu ermöglichen. ▁AH0 N D ▁DH AE1 T ▁M AY1 T ▁S IY1 M ▁AH0 ▁B IH1 T ▁S ER0 P R AY1 Z IH0 NG ▁B IH0 K AO1 Z ▁M AY1 ▁F UH1 L ▁T AY1 M ▁W ER1 K ▁AE1 T ▁DH AH0 ▁F AW0 N D EY1 SH AH0 N ▁IH1 Z ▁M OW1 S T L IY0 ▁AH0 B AW1 T ▁V AE2 K S IY1 N Z ▁AH0 N D ▁S IY1 D Z ▁AH0 B AW1 T ▁DH AH0 ▁TH IH1 NG Z ▁DH AE1 T ▁W IY1 ▁N IY1 D ▁T UW1 ▁IH0 N V EH1 N T ▁AH0 N D ▁D IH0 L IH1 V ER0 ▁T UW1 ▁HH EH1 L P ▁DH AH0 ▁P UH1 R IH0 S T ▁T UW1 ▁B IH1 L Y AH0 N ▁L AY1 V ▁B EH1 T ER0 ▁L IH1 V Z spk.767_ ``` - Prepare phoneme dictionary and save to $MANIFEST_ROOT as [src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/src_dict.txt) #### Prepare WMT text data - [Download wmt data](https://github.com/pytorch/fairseq/blob/main/examples/translation/prepare-wmt14en2de.sh) - Convert source text (English) into phoneme representation as above - Generate binary parallel files with "fairseq-preprocess" from fairseq for training and validation.
The source input is the English phoneme representation and the target input is German sentencepiece tokens. The output is saved under $parallel_text_data ## Training The model is trained with 8 V100 GPUs. #### Download pretrained models - [pretrain_encoder](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_transformer_m.pt) - [pretrain_nmt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/checkpoint_mt.pt) #### Training scripts - Jointly trained model from scratch ```bash python train.py ${MANIFEST_ROOT} \ --save-dir ${save_dir} \ --num-workers 8 \ --task speech_text_joint_to_text \ --arch dualinputs2ttransformer_s \ --user-dir examples/speech_text_joint_to_text \ --max-epoch 100 --update-mix-data \ --optimizer adam --lr-scheduler inverse_sqrt \ --lr 0.001 --update-freq 4 --clip-norm 10.0 \ --criterion guided_label_smoothed_cross_entropy_with_accuracy \ --label-smoothing 0.1 --max-tokens 10000 --max-tokens-text 10000 \ --max-positions-text 400 --seed 2 --speech-encoder-layers 12 \ --text-encoder-layers 6 --encoder-shared-layers 6 --decoder-layers 6 \ --dropout 0.1 --warmup-updates 20000 \ --text-sample-ratio 0.25 --parallel-text-data ${parallel_text_data} \ --text-input-cost-ratio 0.5 --enc-grad-mult 2.0 --add-speech-eos \ --log-format json --langpairs en-de --noise-token '"'"'▁NOISE'"'"' \ --mask-text-ratio 0.0 --max-tokens-valid 20000 --ddp-backend no_c10d \ --log-interval 100 --data-buffer-size 50 --config-yaml config.yaml \ --keep-last-epochs 10 ``` - Jointly trained model with good initialization, cross attentive loss and online knowledge distillation ```bash python train.py ${MANIFEST_ROOT} \ --save-dir ${save_dir} \ --num-workers 8 \ --task speech_text_joint_to_text \ --arch dualinputs2ttransformer_m \ --user-dir examples/speech_text_joint_to_text \ --max-epoch 100 --update-mix-data \ --optimizer adam --lr-scheduler inverse_sqrt \ --lr 0.002 --update-freq 4 --clip-norm 10.0 \ --criterion
guided_label_smoothed_cross_entropy_with_accuracy \ --guide-alpha 0.8 --disable-text-guide-update-num 5000 \ --label-smoothing 0.1 --max-tokens 10000 --max-tokens-text 10000 \ --max-positions-text 400 --seed 2 --speech-encoder-layers 12 \ --text-encoder-layers 6 --encoder-shared-layers 6 --decoder-layers 6 \ --dropout 0.1 --warmup-updates 20000 --attentive-cost-regularization 0.02 \ --text-sample-ratio 0.25 --parallel-text-data ${parallel_text_data} \ --text-input-cost-ratio 0.5 --enc-grad-mult 2.0 --add-speech-eos \ --log-format json --langpairs en-de --noise-token '"'"'▁NOISE'"'"' \ --mask-text-ratio 0.0 --max-tokens-valid 20000 --ddp-backend no_c10d \ --log-interval 100 --data-buffer-size 50 --config-yaml config.yaml \ --load-pretrain-speech-encoder ${pretrain_encoder} \ --load-pretrain-decoder ${pretrain_nmt} \ --load-pretrain-text-encoder-last ${pretrain_nmt} \ --keep-last-epochs 10 ``` ## Evaluation ```bash python ./fairseq_cli/generate.py \ ${MANIFEST_ROOT} \ --task speech_text_joint_to_text \ --max-tokens 25000 \ --nbest 1 \ --results-path ${infer_results} \ --batch-size 512 \ --path ${model} \ --gen-subset tst-COMMON_st \ --config-yaml config.yaml \ --scoring sacrebleu \ --beam 5 --lenpen 1.0 \ --user-dir examples/speech_text_joint_to_text \ --load-speech-only ``` ## Results (Joint training with initialization + CAR + online KD) |Direction|En-De | En-Es | En-Fr | |---|---|---|---| |BLEU|27.4| 31.2 | 37.6 | |checkpoint | [link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de/checkpoint_ave_10.pt) |[link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_es/checkpoint_ave_10.pt)|[link](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_fr/checkpoint_ave_10.pt)| ================================================ FILE: examples/speech_text_joint_to_text/docs/iwslt2021.md ================================================ [[Back]](..) 
# Joint Speech Text Training for the 2021 IWSLT multilingual speech translation This directory contains the code from paper ["FST: the FAIR Speech Translation System for the IWSLT21 Multilingual Shared Task"](https://arxiv.org/pdf/2107.06959.pdf). ## Prepare Data #### Download files - Sentence piece model [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/spm.model) - Dictionary [tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/dict.txt) - Config [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/config.yaml) #### Prepare - Please follow the data preparation in [speech-to-text](https://github.com/pytorch/fairseq/blob/main/examples/speech_to_text/docs/mtedx_example.md) with option "--use-audio-input" for raw audio tsv files. - Prepare tsv files with phoneme based source text (under column 'src_text') as [MuST-C](ende-mustc.md) example. ## Training #### Download pretrained models - [Pretrained mbart model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/mbart.pt) - [Pretrained w2v model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/xlsr_53_56k.pt) #### Training scripts ```bash python train.py ${MANIFEST_ROOT} \ --save-dir ${save_dir} \ --user-dir examples/speech_text_joint_to_text \ --train-subset train_es_en_tedx,train_es_es_tedx,train_fr_en_tedx,train_fr_es_tedx,train_fr_fr_tedx,train_it_it_tedx,train_pt_en_tedx,train_pt_pt_tedx \ --valid-subset valid_es_en_tedx,valid_es_es_tedx,valid_es_fr_tedx,valid_es_it_tedx,valid_es_pt_tedx,valid_fr_en_tedx,valid_fr_es_tedx,valid_fr_fr_tedx,valid_fr_pt_tedx,valid_it_en_tedx,valid_it_es_tedx,valid_it_it_tedx,valid_pt_en_tedx,valid_pt_es_tedx,valid_pt_pt_tedx \ --config-yaml config.yaml --ddp-backend no_c10d \ --num-workers 2 --task speech_text_joint_to_text \ --criterion guided_label_smoothed_cross_entropy_with_accuracy \ --label-smoothing 0.3 --guide-alpha 0.8 \ 
--disable-text-guide-update-num 5000 --arch dualinputxmtransformer_base \ --max-tokens 500000 --max-sentences 3 --max-tokens-valid 800000 \ --max-source-positions 800000 --enc-grad-mult 2.0 \ --attentive-cost-regularization 0.02 --optimizer adam \ --clip-norm 1.0 --log-format simple --log-interval 200 \ --keep-last-epochs 5 --seed 1 \ --w2v-path ${w2v_path} \ --load-pretrained-mbart-from ${mbart_path} \ --max-update 1000000 --update-freq 4 \ --skip-invalid-size-inputs-valid-test \ --skip-encoder-projection --save-interval 1 \ --attention-dropout 0.3 --mbart-dropout 0.3 \ --finetune-w2v-params all --finetune-mbart-decoder-params all \ --finetune-mbart-encoder-params all --stack-w2v-mbart-encoder \ --drop-w2v-layers 12 --normalize \ --lr 5e-05 --lr-scheduler inverse_sqrt --warmup-updates 5000 ``` ## Evaluation ```bash python ./fairseq_cli/generate.py ${MANIFEST_ROOT} \ --task speech_text_joint_to_text \ --user-dir ./examples/speech_text_joint_to_text \ --load-speech-only --gen-subset test_es_en_tedx \ --path ${model} \ --max-source-positions 800000 \ --skip-invalid-size-inputs-valid-test \ --config-yaml config.yaml \ --infer-target-lang en \ --max-tokens 800000 \ --beam 5 \ --results-path ${RESULTS_DIR} \ --scoring sacrebleu ``` The trained model can be downloaded [here](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/iwslt/iwslt_data/checkpoint17.pt) |direction|es_en|fr_en|pt_en|it_en|fr_es|pt_es|it_es|es_es|fr_fr|pt_pt|it_it| |---|---|---|---|---|---|---|---|---|---|---|---| |BLEU|31.62|36.93|35.07|27.12|38.87|35.57|34.13|74.59|74.64|70.84|69.76| ================================================ FILE: examples/speech_text_joint_to_text/docs/pre-training.md ================================================ [[Back]](..) # Unified Speech-Text Pre-training for Speech Translation and Recognition This directory contains the pre-training recipes from paper ["Unified Speech-Text Pre-training for Speech Translation and Recognition"](https://arxiv.org/abs/2204.05409). 
## Librispeech ASR Pre-training ### Prepare Data #### Download files #### Prepare pre-training data - Text to text task (T2T): prepare the binary data following steps similar to those in [EN_DE Joint training](./ende-mustc.md). The source data is represented as a phoneme token sequence and the target data is encoded as subword tokens via SentencePiece. The text data is downloaded from [openslr](https://www.openslr.org/12) - Self-supervised speech learning task (SSL): The data is prepared as in [wav2vec 2.0](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec/README.md) - Speech to phoneme classification task (S2P): The tsv file contains 5 fields: "id", "audio", "n_frames", "tgt_text", and "align". The "tgt_text" field corresponds to the phoneme-based representation of the speech data. The "align" field contains the alignment information. The phoneme-level forced alignment for the labelled speech data (i.e. Librispeech) can be obtained via [kaldi](http://kaldi-asr.org) or [MFA](https://montrealcorpustools.github.io/Montreal-Forced-Aligner/). The segmentation information is normalized to the range [0, 1] over the whole utterance.
The snapshot of the tsv file is below: ``` id audio n_frames tgt_text align 116-288045-0000 /librispeech/dev-other/116/288045/116-288045-0000.flac 170400 <sil> ▁AE1 Z AY1 ▁AH0 P R OW1 CH T ▁DH AH1 ▁S IH1 T IY0 <sil> AY1 ▁HH ER1 D ▁B EH1 L Z ▁R IH1 NG IH0 NG <sil> ▁AE1 N D AH0 ▁L IH1 T AH0 L ▁L EY1 T ER0 AY1 ▁F AW1 N D ▁DH AH0 ▁S T R IY1 T S ▁AH0 S T IH1 R ▁W IH0 TH ▁TH R AO1 NG Z ▁AH0 V ▁W EH1 L ▁D R EH1 S T ▁P IY1 P AH0 L ▁IH1 N ▁F AE1 M L IY0 ▁G R UW1 P S <sil> ▁W EH1 N D IH0 NG ▁DH EH1 R ▁W EY1 <sil> ▁HH IH1 DH ER0 ▁AH0 N D ▁TH IH1 DH ER0 <sil> 0.047977 0.056444 0.064911 0.075259 0.081844 0.089370 0.095014 0.104421 0.109125 0.111947 0.115710 0.120414 0.134525 0.141110 0.143932 0.174036 0.176858 0.190028 0.199436 0.207902 0.218250 0.224835 0.231421 0.242709 0.251176 0.257761 0.263405 0.268109 0.270931 0.290687 0.342427 0.349953 0.353716 0.356538 0.360301 0.363123 0.365945 0.368768 0.371590 0.376294 0.384760 0.394167 0.401693 0.409219 0.419567 0.430856 0.441204 0.444026 0.446849 0.449671 0.456256 0.463782 0.471308 0.477893 0.486359 0.491063 0.494826 0.501411 0.512700 0.517404 0.520226 0.534337 0.540922 0.545626 0.550329 0.559737 0.568203 0.583255 0.592662 0.600188 0.603951 0.611477 0.619003 0.624647 0.634055 0.639699 0.646284 0.653810 0.659454 0.664158 0.670743 0.682032 0.687676 0.692380 0.708373 0.713076 0.719661 0.729069 0.740357 0.744120 0.748824 0.752587 0.761994 0.770461 0.781750 0.790216 0.805268 0.808090 0.823142 0.832549 0.836312 0.840075 0.843838 0.851364 0.854186 0.857008 0.862653 0.878645 0.898401 0.901223 0.906867 0.913452 0.920038 0.926623 0.934149 0.939793 0.942615 0.945437 0.952023 0.957667 0.977422 1.000000 ``` - Speech to text task (S2T): The data preparation follow the steps in [EN_DE Joint training](./ende-mustc.md). #### Prepare fine-tuning data: We re-use the data from T2T and S2T tasks in the fine-tuning stage. 
### Model Build #### Pre-training ``` python train.py $T2T_DATA \ --save-dir $SAVE_PRE_PATH --user-dir examples/speech_text_joint_to_text --task speech_text_joint_denoising \ --criterion speech_text_pretrain_cross_entropy --optimizer adam --weight-decay 0.01 --config-yaml config_s2p.yaml --config-s2s-yaml config.yaml --ddp-backend no_c10d \ --lang-pairs pho-wrd --num-workers 4 --log-interval 500 --save-interval-updates 5000 --keep-interval-updates 1 --no-emb-update-unsup --report-accuracy --lr 0.001 --end-learning-rate 1e-06 \ --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 800000 --update-freq 6 --validate-interval-updates 10000 --train-subset train \ --valid-subset valid,valid_sup_speech,valid_sup_speech_s2s,valid_unsup_speech --dataset-impl mmap \ --sup-speech-data $S2P_DATA_PATH --sup-speech-train-subset train_960.ali --sup-speech-valid-subset dev-clean.ali --sup-speech-s2s-data $S2T_DATA_PATH \ --sup-speech-s2s-train-subset train --sup-speech-s2s-valid-subset dev-clean --unsup-speech-train-data $SSL_DATA_PATH/train.tsv --unsup-speech-valid-data $SSL_DATA_PATH/valid.tsv \ --batch-size 200 --batch-size-valid 150 --max-source-positions 1024 --max-target-positions 1024 --max-text-tokens 3072 --max-speech-positions 600000 \ --max-sample-size 750000 --min-sample-size 64000 --max-speech-tokens 750000 --max-tokens-valid 750000 --skip-invalid-size-inputs-valid-test \ --unsupervised-speech-sample-ratio 3.0 --supervised-speech-sample-ratio 5 --supervised-speech-s2s-sample-ratio 5 --text-sample-ratio 1.0 --mask 0.3 --mask-random 0.1 \ --mask-length span-poisson --speech-sup-mask-prob 0.3 --speech-unsup-mask-prob 0.7 --use-mask-whole-words --arch speech_text_pretrain_bart_base_stack \ --no-scale-feature --activation-fn gelu --speech-extractor-mode default --stacked-encoder all --encoder-normalize-before --decoder-normalize-before \ --encoder-learned-pos --decoder-learned-pos --dropout 0.1 --load-pretrained-mbart-encoder-from $BART 
--load-pretrained-mbart-decoder-from $BART ``` The current implementation also supports model pre-training without the forced alignment supervised data. In this case, CTC is used to optimize the S2P task. We need to do following changes for the setting: 1. options to be added ``` --use-sup-speech-ctc --criterion speech_text_pretrain_compound ``` 2. options to be deleted ``` --same-data-update --criterion speech_text_pretrain_cross_entropy ``` However, we find the CTC based pre-training is still worse than the forced alignment based setting. It could be partially due to the inferior pre-training setting that we re-use the forced alignment based pre-training setting for the CTC based pre-training. #### Fine-tuning ``` python train.py $S2T_DATA_PATH \ --save-dir $SAVE_FT_PATH --num-workers 8 --task speech_text_joint_to_text --arch dualinputs2twavtransformer_base_stack \ --user-dir examples/speech_text_joint_to_text --max-update 100000 --optimizer adam --lr-scheduler inverse_sqrt --lr 0.0003 --update-freq 3 --clip-norm 10.0 \ --criterion guided_label_smoothed_cross_entropy_with_accuracy --guide-alpha 0.8 --label-smoothing 0.1 --warmup-updates 20000 --attentive-cost-regularization 0.02 \ --enc-grad-mult 2.0 --max-tokens 800000 --max-source-positions 800000 --max-tokens-text 10000 --max-positions-text 1024 --max-target-positions 1024 --no-scale-feature \ --activation-fn gelu --load-pretrained-speech-text-encoder $SAVE_PRE_PATH/checkpoint_last.pt --load-pretrained-speech-text-decoder $SAVE_PRE_PATH/checkpoint_last.pt \ --encoder-normalize-before --decoder-normalize-before --speech-extractor-mode default --speech-mask-channel-length 64 --speech-mask-channel-prob 0.5 \ --speech-mask-length 10 --speech-mask-prob 0.65 --text-sample-ratio 0.25 --mask-text-ratio 0.3 --mask-text-type random --parallel-text-data text_bin \ --text-input-cost-ratio 0.5 --langpairs pho-wrd --update-mix-data --log-format json --max-tokens-valid 800000 --ddp-backend no_c10d --log-interval 500 \ 
--config-yaml config.yaml --skip-invalid-size-inputs-valid-test --keep-last-epochs 50 --layernorm-embedding --encoder-learned-pos --decoder-learned-pos ``` ### Evaluation The last 10 epoch models from fine-tuning is conducted model average to get $FINAL_MODEL ``` python ./fairseq_cli/generate.py \ $S2T_DATA_PATH \ --task speech_text_joint_to_text \ --max-tokens 800000 \ --max-source-positions 800000 \ --nbest 1 \ --results-path $RESULTS_LOG \ --batch-size 512 \ --path $FINAL_MODEL \ --gen-subset $SUBSET \ --config-yaml config.yaml \ --scoring wer \ --beam 10 --lenpen 1.0 examples/speech_text_joint_to_text \ --user-dir examples/speech_text_joint_to_text --load-speech-only \ --model-overrides {'load_pretrained_speech_text_decoder':'','load_pretrained_speech_text_encoder':''} ``` ### Results and models | | dev-clean | dev-other | test-clean | test-other | |---|---|---|---|---| | WER| 2.0 | 4.4 | 2.1 |4.6 | **Model Links**: - [config_s2p.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/config_s2p.yaml): Config for S2P - [spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/spm.model): Sentence Piece model - [src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/src_dict.txt): Source Phoneme Dictionary - [tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/tgt_dict.txt): Target Sentence Piece Dictionary - [config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/config.yaml): Config for S2T - [BART](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/bart.pt): trained from Librispeech text data - [Joint Pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/pretrain/checkpoint6.pt): model pre-trained with 960 hours Librispeech data (S2P, S2T) Librispeech text training data (T2T) and 
Librilight data (SSL) - [Fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned/checkpoint_ave_10.pt): the pre-trained model is fine-tuned on 960 hours of Librispeech speech and text data. (S2T + T2T) ## MuST-C ### Prepare Data Compared with the Librispeech ASR recipe, the differences are below: - Replace the speech data with the corresponding MuST-C data - Parallel text data from WMT replaces the Librispeech text data ### Model Build #### Pre-training EN-FR is used as an example ``` python train.py $TXT_DATA \ --save-dir $SAVE_PRE_PATH --user-dir examples/speech_text_joint_to_text --task speech_text_joint_denoising --criterion speech_text_pretrain_cross_entropy --optimizer adam --weight-decay 0.01 \ --config-yaml config_s2p.yaml --config-s2s-yaml config.yaml --ddp-backend no_c10d --lang-pairs-bitext en-fr --num-workers 4 --log-interval 500 --save-interval-updates 5000 --keep-interval-updates 1 \ --no-emb-update-unsup --use-decoder-output-proj --report-accuracy --lr 0.001 --end-learning-rate 1e-06 --lr-scheduler polynomial_decay --warmup-updates 10000 --total-num-update 800000 \ --update-freq 8 --validate-interval-updates 10000 --train-subset train --valid-subset valid_sup_speech,valid_sup_speech_s2s,valid_unsup_speech --dataset-impl mmap \ --sup-speech-data $S2P_DATA_PATH --sup-speech-train-subset train --sup-speech-valid-subset dev --sup-speech-s2s-data $S2T_DATA_PATH --sup-speech-s2s-train-subset train \ --sup-speech-s2s-valid-subset dev --unsup-speech-train-data $SSL_DATA_PATH/train.tsv --unsup-speech-valid-data $SSL_DATA_PATH/valid.tsv --batch-size 200 --batch-size-valid 100 \ --max-source-positions 1024 --max-target-positions 1024 --max-text-tokens 2048 --max-speech-positions 600000 --max-sample-size 600000 --min-sample-size 64000 \ --max-speech-tokens 600000 --max-tokens-valid 600000 --skip-invalid-size-inputs-valid-test --unsupervised-speech-sample-ratio 1.2 --supervised-speech-sample-ratio 10 \
--supervised-speech-s2s-sample-ratio 10 --bitext-sample-ratio 0.5 --mask 0.3 --mask-random 0.1 --mask-length span-poisson --speech-sup-mask-prob 0.3 \ --speech-unsup-mask-prob 0.7 --use-mask-whole-words --arch speech_text_pretrain_bart_base_stack --no-scale-feature --activation-fn gelu --speech-extractor-mode default \ --stacked-encoder s2s --encoder-normalize-before --decoder-normalize-before --encoder-learned-pos --decoder-learned-pos --dropout 0.1 \ --load-pretrained-mbart-encoder-from $EN_FR_NMT --load-pretrained-mbart-decoder-from $EN_FR_NMT ``` #### Fine-tuning ``` python train.py $S2T_DATA_PATH \ --save-dir $SAVE_FT_PATH --num-workers 8 --task speech_text_joint_to_text --arch dualinputs2twavtransformer_base_stack --user-dir examples/speech_text_joint_to_text \ --max-epoch 25 --update-mix-data --optimizer adam --lr-scheduler inverse_sqrt --lr 0.0003 --update-freq 4 --clip-norm 10.0 --warmup-updates 20000 \ --criterion guided_label_smoothed_cross_entropy_with_accuracy --guide-alpha 0.8 --attentive-cost-regularization 0.02 --enc-grad-mult 2.0 --label-smoothing 0.1 \ --max-tokens 800000 --max-source-positions 800000 --max-tokens-text 10000 --max-positions-text 1024 --load-pretrained-speech-text-encoder $SAVE_PRE_PATH/checkpoint_last.pt \ --load-pretrained-speech-text-decoder $SAVE_PRE_PATH/checkpoint_last.pt --speech-mask-channel-length 64 --speech-mask-channel-prob 0.5 --speech-mask-length 10 \ --speech-mask-prob 0.65 --text-sample-ratio 0.05 --mask-text-ratio 0.3 --mask-text-type random --parallel-text-data data-bin-wt --text-input-cost-ratio 0.5 \ --langpairs en-fr --log-format json --max-tokens-valid 800000 --ddp-backend no_c10d --log-interval 100 --config-yaml config.yaml --skip-invalid-size-inputs-valid-test \ --noise-token '▁NOISE' --keep-last-epochs 40 --layernorm-embedding --encoder-learned-pos --decoder-learned-pos --activation-fn gelu \ --speech-extractor-mode default --max-target-positions 1024 --encoder-normalize-before --decoder-normalize-before 
``` ### Evaluation The last 10 epoch models from fine-tuning is conducted model average to get $FINAL_MODEL ``` python fairseq_cli/generate.py \ $S2T_DATA_PATH \ --task speech_text_joint_to_text \ --nbest 1 \ --max-tokens 800000 \ --max-source-positions 800000 \ --results-path $RESULTS_LOG \ --batch-size 512 \ --path $FINAL_MODEL \ --gen-subset $SUBSET \ --config-yaml config.yaml \ --scoring sacrebleu \ --beam 10 --lenpen 1.0 examples/speech_text_joint_to_text \ --user-dir examples/speech_text_joint_to_text --load-speech-only \ --model-overrides {'load_pretrained_speech_text_decoder':'','load_pretrained_speech_text_encoder':''} ``` ### Results and models | | en-fr | en-es | en-de | |---|---|---|---| | BLEU| 39.7 | 33.2 |29.2 | **Model Links**: 1. DE - [de config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/config.yaml) - [de src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/src_dict.txt) - [de tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/tgt_dict.txt) - [de spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/spm.model) - [de pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/nmt.pt) - [de pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/checkpoint_pretraing.pt) - [de fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/de/checkpoint_finetune_ave10.pt) 2. 
ES - [es config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/config.yaml) - [es src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/src_dict.txt) - [es tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/tgt_dict.txt) - [es spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/spm.model) - [es pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/nmt.pt) - [es pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/checkpoint_pretraing.pt) - [es fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/es/checkpoint_finetune_ave10.pt) 3. FR - [fr config.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/config.yaml) - [fr src_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/src_dict.txt) - [fr tgt_dict.txt](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/tgt_dict.txt) - [fr spm.model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/spm.model) - [fr pre-trained nmt model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/nmt.pt) - [fr pre-trained model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/checkpoint_pretraing.pt) - [fr fine-tuned model](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/fr/checkpoint_finetune_ave10.pt) 4. [config_s2p.yaml](https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/must_c/config_s2p.yaml) ================================================ FILE: examples/speech_text_joint_to_text/models/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
class SpeechTextPreTrainEncoder(MultiModalityEncoder):
    """Container for the encoders used in joint speech/text pre-training.

    One encoder is kept per training stream (supervised speech, supervised
    speech-to-sequence, unsupervised speech, text); ``select_encoder`` picks
    the one matching the ``mode`` string of the current batch.
    """

    def __init__(
        self,
        dictionary,
        sup_speech_encoder,
        sup_s2s_speech_encoder,
        unsup_speech_encoder,
        text_encoder,
    ):
        super().__init__(dictionary)
        self.sup_speech_encoder = sup_speech_encoder
        self.sup_s2s_speech_encoder = sup_s2s_speech_encoder
        self.unsup_speech_encoder = unsup_speech_encoder
        self.text_encoder = text_encoder

    @classmethod
    def update_transformer_encoder_cfg(cls, args, update_dict):
        """Return a copy of ``args`` (as a namedtuple) with ``update_dict`` applied."""
        cfg = dict(args._get_kwargs())
        cfg.update(update_dict)
        # namedtuple field names may not start with "_", so drop "_name".
        cfg.pop("_name", None)
        model_args = namedtuple("args", cfg.keys())(*cfg.values())
        return model_args

    @classmethod
    def build_text_encoder(cls, args, src_dictionary):
        """Build the text ``TransformerEncoder`` with ``args.text_encoder_layers`` layers."""
        enc_emb = nn.Embedding(
            len(src_dictionary), args.encoder_embed_dim, src_dictionary.pad()
        )
        model_args = cls.update_transformer_encoder_cfg(
            args, {"encoder_layers": args.text_encoder_layers}
        )
        text_encoder = TransformerEncoder(model_args, src_dictionary, enc_emb)
        return text_encoder

    @classmethod
    def build_speech_encoder(cls, args):
        """Build the supervised speech encoder (uses ``args.speech_sup_mask_prob``)."""
        model_args = cls.update_transformer_encoder_cfg(
            args,
            {
                "encoder_layers": args.speech_encoder_layers,
                "speech_mask_prob": args.speech_sup_mask_prob,
            },
        )
        speech_encoder = SpeechWavTransformerEncoder(model_args)
        return speech_encoder

    @classmethod
    def share_layers(cls, src_layers, tgt_layers):
        """Point the parameterised sub-modules of ``tgt_layers`` at ``src_layers``.

        Only the attributes assigned below are shared; anything else on the
        target layer (e.g. its dropout modules) is left untouched.
        """
        assert len(src_layers) == len(tgt_layers)
        for ly, tly in zip(src_layers, tgt_layers):
            tly.self_attn = ly.self_attn
            tly.self_attn_layer_norm = ly.self_attn_layer_norm
            tly.activation_fn = ly.activation_fn
            tly.normalize_before = ly.normalize_before
            tly.fc1 = ly.fc1
            tly.fc2 = ly.fc2
            tly.final_layer_norm = ly.final_layer_norm
            if hasattr(tly, "encoder_attn"):
                tly.encoder_attn = ly.encoder_attn
                tly.encoder_attn_layer_norm = ly.encoder_attn_layer_norm
        return tgt_layers

    @classmethod
    def build_unsup_speech_encoder(cls, args, sup_speech_encoder):
        """Build the unsupervised speech encoder, reusing modules of the supervised one.

        The new encoder gets its own masking/dropout configuration but its
        layers, layer norms, mask embedding, positional embedding, feature
        projection and subsampler are taken from ``sup_speech_encoder``.
        """
        model_args = cls.update_transformer_encoder_cfg(
            args,
            {
                "encoder_layers": args.speech_encoder_layers,
                "speech_mask_prob": args.speech_unsup_mask_prob,
                "encoder_layerdrop": 0.0,
                "decoder_layerdrop": 0.0,
                "dropout": args.speech_unsup_dropout,
                "activation_dropout": args.speech_unsup_dropout,
                "attention_dropout": 0.0,
                "dropout_features": args.speech_unsup_feature_dropout,
                "dropout_input": args.speech_unsup_feature_dropout,
            },
        )

        unsup_speech_encoder = SpeechWavTransformerEncoder(model_args, alway_mask=True)
        unsup_speech_encoder.layer_norm = sup_speech_encoder.layer_norm
        unsup_speech_encoder.layers = cls.share_layers(
            sup_speech_encoder.layers, unsup_speech_encoder.layers
        )
        unsup_speech_encoder.mask_emb = sup_speech_encoder.mask_emb
        unsup_speech_encoder.embed_positions = sup_speech_encoder.embed_positions
        unsup_speech_encoder.feat_layer_norm = sup_speech_encoder.feat_layer_norm
        unsup_speech_encoder.feat_proj = sup_speech_encoder.feat_proj
        unsup_speech_encoder.subsample = sup_speech_encoder.subsample
        return unsup_speech_encoder

    @classmethod
    def build_encoder(cls, args, dictionary):
        """Build all sub-encoders and wrap them in a ``SpeechTextPreTrainEncoder``.

        Raises:
            IOError: if ``args.load_pretrained_feature_extractor_from`` is set
                but the file does not exist.
            ValueError: if stacking is requested together with a positive
                ``args.encoder_shared_text_layers_from_begin``.
        """
        text_encoder = cls.build_text_encoder(args, dictionary)
        if getattr(args, "load_pretrained_mbart_encoder_from", None):
            text_encoder = checkpoint_utils.load_pretrained_component_from_model(
                component=text_encoder,
                checkpoint=args.load_pretrained_mbart_encoder_from,
            )
        speech_encoder = cls.build_speech_encoder(args)
        if getattr(args, "load_pretrained_feature_extractor_from", None):

            def load_feature_extractor(component, checkpoint):
                if not PathManager.exists(checkpoint):
                    raise IOError("Model file not found: {}".format(checkpoint))
                state = checkpoint_utils.load_checkpoint_to_cpu(checkpoint)
                component_state_dict = OrderedDict()
                component_prefix = "feature_extractor"
                for key in state["model"].keys():
                    if key.startswith(component_prefix):
                        # "+ 1" also strips the separator following the prefix
                        component_subkey = key[len(component_prefix) + 1 :]
                        component_state_dict[component_subkey] = state["model"][key]
                component.load_state_dict(component_state_dict, strict=True)
                return component

            speech_encoder.subsample = load_feature_extractor(
                speech_encoder.subsample, args.load_pretrained_feature_extractor_from
            )
        speech_s2s_encoder = speech_encoder
        unsup_speech_encoder = cls.build_unsup_speech_encoder(args, speech_encoder)
        if getattr(args, "stacked_encoder", "none") != "none":
            if args.encoder_shared_text_layers_from_begin > 0:
                raise ValueError(
                    "We can not stack encoders and share encoders at the same time!"
                )
            speech_s2s_encoder = StackedSpeechWavTransformerEncoder(
                speech_encoder, text_encoder.layers, text_encoder.layer_norm
            )
            if args.stacked_encoder == "all":
                speech_encoder = speech_s2s_encoder
                unsup_speech_encoder = StackedSpeechWavTransformerEncoder(
                    unsup_speech_encoder, text_encoder.layers, text_encoder.layer_norm
                )
        else:
            cls.share_speech_text_encoder(
                speech_encoder, text_encoder, args.encoder_shared_text_layers_from_begin
            )
        return SpeechTextPreTrainEncoder(
            dictionary,
            speech_encoder,
            speech_s2s_encoder,
            unsup_speech_encoder,
            text_encoder,
        )

    @classmethod
    def share_speech_text_encoder(
        cls, speech_encoder, text_encoder, shared_layers_from_begin
    ):
        """Replace the first text-encoder layers with speech-encoder layers.

        The text encoder is aligned with the *last* ``len(text_encoder.layers)``
        layers of the speech encoder; the first ``shared_layers_from_begin`` of
        those are installed into ``text_encoder.layers``. Nothing happens when
        ``shared_layers_from_begin`` is not positive.
        """
        if shared_layers_from_begin > 0:
            num_speech_encoder_layers = len(speech_encoder.layers)
            num_text_encoder_layers = len(text_encoder.layers)
            assert num_speech_encoder_layers >= shared_layers_from_begin
            assert num_text_encoder_layers >= shared_layers_from_begin
            assert num_speech_encoder_layers >= num_text_encoder_layers
            # Use absolute indices: a negative-offset slice such as
            # layers[-n : -n + shared] collapses to an empty slice when
            # shared == n (its stop index becomes 0), sharing nothing.
            first_shared = num_speech_encoder_layers - num_text_encoder_layers
            for i in range(shared_layers_from_begin):
                ly = speech_encoder.layers[first_shared + i]
                assert isinstance(text_encoder.layers[i], type(ly))
                text_encoder.layers[i] = ly

    def select_encoder(self, mode, **kwargs):
        """Return ``(encoder, kwargs)`` for ``mode``.

        ``features_only`` is set to True for the supervised speech modes and
        to False for ``unsup_speech``; text modes pass ``kwargs`` through.

        Raises:
            NotImplementedError: for an unknown ``mode``.
        """
        if mode in ("speech", "sup_speech_ctc", "sup_speech_ali", "sup_speech_s2s"):
            kwargs["features_only"] = True
            if mode == "sup_speech_s2s":
                return self.sup_s2s_speech_encoder, kwargs
            return self.sup_speech_encoder, kwargs
        elif mode == "unsup_speech":
            kwargs["features_only"] = False
            return self.unsup_speech_encoder, kwargs
        elif mode in ("text", "bitext"):
            return self.text_encoder, kwargs
        raise NotImplementedError(f"{mode} is not supported")

    def forward(self, src_tokens, src_lengths=None, mode="", alignment=None, **kwargs):
        # ``alignment`` is accepted but not forwarded to the parent forward.
        return super().forward(src_tokens, src_lengths, mode, **kwargs)
def extend_alignment(self, alignment, src_lengths, prev_output_tokens):
    """Expand per-token alignment boundaries into one label per source frame.

    For every sequence, token ``i`` is written to the frame span that ends at
    ``int(alignment[b][i] * src_lengths[b])`` and starts where the previous
    token's span ended. Frames not covered keep the padding index.

    Args:
        alignment: indexed as ``alignment[b][i]``; entries equal to 1 are
            counted as "not a boundary before the last token" by the sanity
            check below.
        src_lengths: indexed as ``src_lengths[b]`` and reduced with ``max()``
            (assumes a 1-D per-sequence length tensor; TODO confirm).
        prev_output_tokens: ``B x (N + 1)``; the leading column is dropped.

    Returns:
        LongTensor of shape ``(len(src_lengths), src_lengths.max())``.
    """
    pad_idx = self.dictionary.pad()
    # drop the leading start-of-sentence column
    targets = prev_output_tokens[:, 1:]
    frame_labels = torch.full(
        (len(src_lengths), int(src_lengths.max())),
        pad_idx,
        dtype=torch.long,
        device=src_lengths.device,
    )
    for row in range(src_lengths.size(0)):
        row_targets = targets[row]
        row_alignment = alignment[row]
        num_targets = row_targets.ne(pad_idx).sum().item()
        assert num_targets == sum(row_alignment.ne(1)) + 1
        span_begin = 0
        for pos in range(num_targets):
            span_end = (row_alignment[pos] * src_lengths[row]).int().item()
            frame_labels[row][span_begin:span_end].fill_(row_targets[pos])
            span_begin = span_end
    return frame_labels
Returns: sup_speech_ctc: dictionary{"logits": logits, "padding_mask": padding_mask} sup_speech_ali and unsup_speech: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ emb_weight = self.output_embedding.weight if ( mode == "unsup_speech" and self.no_emb_update_unsup ): # no gradient for embedding here emb_weight = emb_weight.detach() enc_out = ( encoder_out["encoder_out"][0] if self.out_proj is None else self.out_proj(encoder_out["encoder_out"][0]) ) logits = F.linear(enc_out, emb_weight, None).transpose(0, 1) # B X T X C others = None if mode in ( "speech", "sup_speech_ctc", ): # speech data with label, do forcealignment if len(encoder_out["encoder_padding_mask"]) > 0: padding_mask = encoder_out["encoder_padding_mask"][0] logits = logits.masked_fill(padding_mask, float("-inf")) else: seq_len, bsz = encoder_out["encoder_out"][0].size()[:2] padding_mask = torch.zeros( bsz, seq_len, device=encoder_out["encoder_out"][0].device ).bool() return {"x": logits, "padding_mask": padding_mask} elif mode == "sup_speech_ali": src_lengths = None if len(encoder_out["encoder_padding_mask"]) > 0: src_lengths = (1 - encoder_out["encoder_padding_mask"][0].long()).sum( -1 ) else: seq_len, bsz = encoder_out["encoder_out"][0].size()[:2] src_lengths = ( torch.ones(bsz, device=encoder_out["encoder_out"][0].device).long() * seq_len ) assert alignment is not None alignment = self.extend_alignment( alignment, src_lengths, prev_output_tokens ) others = {"pseudo_target_tokens": alignment} elif mode == "unsup_speech": enc_out_ori = ( encoder_out["encoder_unmasked_out"][0] if self.out_proj is None else self.out_proj(encoder_out["encoder_unmasked_out"][0]) ) logits_ori = F.linear(enc_out_ori, emb_weight, None).transpose(0, 1) if len(encoder_out["encoder_padding_mask"]) > 0: encoder_padding_mask = encoder_out["encoder_padding_mask"][0] logits_ori = logits_ori.masked_fill(encoder_padding_mask, float("-inf")) pseudo_labels = 
class SpeechTextPreTrainDecoder(MultiInputDecoder):
    """Pairs the dummy speech decoder with the text decoder and routes by mode."""

    def __init__(self, dictionary, speech_decoder, text_decoder):
        super().__init__(dictionary)
        self.speech_decoder = speech_decoder
        self.text_decoder = text_decoder

    def select_decoder(self, mode, **kwargs):
        """Return ``(decoder, kwargs)`` for ``mode``.

        Speech-decoder modes get ``mode`` injected into ``kwargs``;
        ``sup_speech_s2s`` goes to the text decoder with any ``alignment``
        entry removed.

        Raises:
            NotImplementedError: for an unknown ``mode``.
        """
        if mode in ("unsup_speech", "speech", "sup_speech_ctc", "sup_speech_ali"):
            kwargs["mode"] = mode
            return self.speech_decoder, kwargs
        if mode in ("text", "bitext"):
            return self.text_decoder, kwargs
        # "speech" is already handled by the speech-decoder branch above, so
        # only "sup_speech_s2s" can reach the text decoder here.
        if mode == "sup_speech_s2s":
            kwargs.pop("alignment", None)
            return self.text_decoder, kwargs
        raise NotImplementedError(f"{mode} is not supported")

    def get_normalized_probs(
        self,
        net_output,
        log_probs,
        sample=None,
    ):
        """Get normalized probabilities (or log probs) from a net's output.

        Dict outputs are delegated to the speech decoder, everything else to
        the text decoder.
        """
        if isinstance(net_output, dict):
            return self.speech_decoder.get_normalized_probs(
                net_output, log_probs, sample
            )
        return self.text_decoder.get_normalized_probs(net_output, log_probs, sample)

    @classmethod
    def build_text_decoder(cls, args, tgt_dictionary, dec_emb_share=None):
        """Build the ``TransformerDecoder``; a new embedding is created unless one is shared."""
        dec_emb = (
            nn.Embedding(
                len(tgt_dictionary), args.decoder_embed_dim, tgt_dictionary.pad()
            )
            if dec_emb_share is None
            else dec_emb_share
        )
        text_decoder = TransformerDecoder(args, tgt_dictionary, dec_emb)
        return text_decoder

    @classmethod
    def build_dummy_speech_decoder(cls, args, dictionary, dec_emb_share=None):
        """Build the ``SpeechDummyDecoder``; a new embedding is created unless one is shared."""
        dec_emb = (
            nn.Embedding(len(dictionary), args.decoder_embed_dim, dictionary.pad())
            if dec_emb_share is None
            else dec_emb_share
        )
        speech_decoder = SpeechDummyDecoder(
            dictionary,
            dec_emb,
            no_emb_update_unsup=getattr(args, "no_emb_update_unsup", False),
            use_output_proj=getattr(args, "use_decoder_output_proj", False),
        )
        return speech_decoder

    @classmethod
    def build_decoder(
        cls, args, text_dictionary, speech_dictionary, speech_output_embedding
    ):
        """Build both decoders and optionally load pretrained text-decoder weights."""
        text_decoder = cls.build_text_decoder(args, text_dictionary)
        speech_decoder = cls.build_dummy_speech_decoder(
            args, speech_dictionary, speech_output_embedding
        )
        if getattr(args, "load_pretrained_mbart_decoder_from", None):
            text_decoder = checkpoint_utils.load_pretrained_component_from_model(
                component=text_decoder,
                checkpoint=args.load_pretrained_mbart_decoder_from,
            )
        return SpeechTextPreTrainDecoder(text_dictionary, speech_decoder, text_decoder)
@classmethod
def build_model(cls, args, task):
    """Create the encoder/decoder pair and wrap them in the model.

    The encoder is built from the task's source dictionary. The decoder is
    built from the target and source dictionaries and reuses the text
    encoder's token embedding as the speech output embedding.
    """
    encoder = SpeechTextPreTrainEncoder.build_encoder(args, task.src_dict)
    text_dictionary = task.tgt_dict
    speech_dictionary = task.src_dict
    shared_embedding = encoder.text_encoder.embed_tokens
    decoder = SpeechTextPreTrainDecoder.build_decoder(
        args,
        text_dictionary,
        speech_dictionary,
        shared_embedding,
    )
    return SpeechTextPreTrainModel(encoder, decoder)
@register_model_architecture(
    "speech_text_pretrain_bart", "speech_text_pretrain_bart_base"
)
def speech_text_pretrain_bart_base(args):
    """Fill in the defaults of the base architecture on ``args`` in place.

    Every option is assigned through ``getattr(args, name, default)``, so a
    value already present on ``args`` is kept and only missing attributes get
    the default. Some defaults derive from options set earlier in this
    function (e.g. the decoder dimensions follow the encoder ones), so the
    statement order matters.
    """
    # speech masking
    args.dropout_input = getattr(args, "dropout_input", 0)
    args.dropout_features = getattr(args, "dropout_features", 0)
    args.speech_mask_length = getattr(args, "speech_mask_length", 10)
    args.speech_mask_prob = getattr(args, "speech_mask_prob", 0.65)
    args.speech_sup_mask_prob = getattr(args, "speech_sup_mask_prob", 0.3)
    args.speech_unsup_mask_prob = getattr(
        args, "speech_unsup_mask_prob", args.speech_mask_prob
    )
    args.speech_mask_selection = getattr(args, "speech_mask_selection", "static")
    args.speech_mask_other = getattr(args, "speech_mask_other", 0)
    args.speech_mask_min_space = getattr(args, "speech_mask_min_space", 1)
    args.speech_no_mask_overlap = getattr(args, "speech_no_mask_overlap", False)
    args.speech_mask_channel_length = getattr(args, "speech_mask_channel_length", 10)
    args.speech_mask_channel_prob = getattr(args, "speech_mask_channel_prob", 0.0)
    args.speech_mask_channel_selection = getattr(
        args, "speech_mask_channel_selection", "static"
    )
    args.speech_mask_channel_other = getattr(args, "speech_mask_channel_other", 0)
    args.speech_mask_channel_min_space = getattr(
        args, "speech_mask_channel_min_space", 1
    )
    args.speech_no_mask_channel_overlap = getattr(
        args, "speech_no_mask_channel_overlap", False
    )
    # The lookup key used to be the empty string, which never matches an
    # attribute and therefore always reset this option to False.
    args.no_scale_feature = getattr(args, "no_scale_feature", False)
    args.feature_grad_mult = getattr(args, "feature_grad_mult", 1.0)

    # Transformer
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
    args.encoder_ffn_embed_dim = getattr(
        args, "encoder_ffn_embed_dim", args.encoder_embed_dim * 4
    )
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
    args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
    args.speech_conv_bias = getattr(args, "speech_conv_bias", False)

    # Decoder sizes default to the corresponding encoder sizes.
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
    args.decoder_ffn_embed_dim = getattr(
        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
    )
    args.decoder_attention_heads = getattr(
        args, "decoder_attention_heads", args.encoder_attention_heads
    )
    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
    args.activation_fn = getattr(args, "activation_fn", "relu")
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)

    args.speech_unsup_dropout = getattr(args, "speech_unsup_dropout", 0)
    args.speech_unsup_feature_dropout = getattr(args, "speech_unsup_feature_dropout", 0)

    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.share_decoder_input_output_embed = getattr(
        args, "share_decoder_input_output_embed", False
    )
    args.no_token_positional_embeddings = getattr(
        args, "no_token_positional_embeddings", False
    )
    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)

    # Layer counts
    args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12)
    args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
    args.encoder_shared_text_layers_from_begin = getattr(
        args, "encoder_shared_text_layers_from_begin", 6
    )
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.no_emb_update_unsup = getattr(args, "no_emb_update_unsup", False)
"decoder_layers", 12) args.stacked_encoder = getattr(args, "stacked_encoder", "s2s") args.layernorm_embedding = getattr(args, "layernorm_embedding", True) speech_text_pretrain_bart_base(args) ================================================ FILE: examples/speech_text_joint_to_text/models/s2t_dualinputtransformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging from collections import namedtuple import torch import torch.nn as nn from fairseq import checkpoint_utils from fairseq import utils from fairseq.models import ( FairseqEncoder, FairseqDecoder, FairseqEncoderDecoderModel, register_model, register_model_architecture, ) from fairseq.models.fairseq_encoder import EncoderOut from fairseq.models.speech_to_text import ( TransformerDecoder, S2TTransformerEncoder, ) from fairseq.models.transformer import TransformerEncoder from fairseq.modules import ( TransformerEncoderLayer, GradMultiply, LayerNorm, ) logger = logging.getLogger(__name__) class SpeechEoSEncoder(FairseqEncoder): def __init__(self, encoder, eos_num, feat_dim, adapter_type="None", adapter_dim=0): super().__init__(None) self.encoder = encoder self.eos_num = eos_num # downsampling rate for speech input feature self.eos_emb = ( nn.Parameter(torch.zeros(1, feat_dim), requires_grad=True) if eos_num > 0 else None ) self.adapter = self.add_adapter(adapter_type, adapter_dim) def add_adapter(self, adapter_type, adapter_dim): def _make_identity(linear, eps=1e-5): assert isinstance(linear, nn.Linear) linear.weight.data.mul_(eps) linear.weight.data.fill_diagonal_(1.0) if linear.bias is not None: linear.bias.data.mul_(eps) adapter = None if adapter_type == "Linear": assert adapter_dim > 0 adapter = nn.Sequential( nn.Linear(adapter_dim, adapter_dim), LayerNorm(adapter_dim) ) # initialize the adapter as identity matrix first 
_make_identity(adapter[0]) elif adapter_type == "MLP": assert adapter_dim > 0 # assume the model is pre-norm model adapter = nn.Sequential( nn.Linear(adapter_dim, 2 * adapter_dim), nn.ReLU(), nn.Linear(2 * adapter_dim, adapter_dim), LayerNorm(adapter_dim), ) _make_identity(adapter[0]) _make_identity(adapter[2]) return adapter def add_eos(self, src_tokens, src_lengths): bsz, max_seq_len, fdim = src_tokens.size() if self.eos_num > 0: src_token_eos = torch.zeros( [bsz, max_seq_len + self.eos_num, fdim], dtype=src_tokens.dtype, device=src_tokens.device, ) src_token_eos[:, :max_seq_len] = src_tokens for bi in range(bsz): src_token_eos[bi][ src_lengths[bi] : src_lengths[bi] + self.eos_num ] = self.eos_emb.expand(self.eos_num, fdim) src_lengths = src_lengths + self.eos_num src_tokens = src_token_eos return src_tokens, src_lengths def apply_adapter(self, enc_out): if self.adapter is None: return enc_out rst = self.adapter(enc_out.encoder_out) if enc_out.encoder_padding_mask is not None: rst.masked_fill_( enc_out.encoder_padding_mask.transpose(0, 1).unsqueeze(-1), 0 ) return EncoderOut( encoder_out=rst, encoder_padding_mask=enc_out.encoder_padding_mask, encoder_embedding=enc_out.encoder_embedding, encoder_states=enc_out.encoder_states, src_tokens=enc_out.src_tokens, src_lengths=enc_out.src_lengths, ) def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs): """ src_tokens: padded tensor (B, T, C * feat) src_lengths: tensor of original lengths of input utterances (B,) """ src_tokens, src_lengths = self.add_eos(src_tokens, src_lengths) enc_out = self.encoder(src_tokens, src_lengths, return_all_hiddens) enc_out = self.apply_adapter(enc_out) return enc_out def reorder_encoder_out(self, encoder_out, new_order): return self.encoder.reorder_encoder_out(encoder_out, new_order) class DualInputEncoder(FairseqEncoder): def __init__( self, args, spch_encoder, text_encoder, dictionary, cross_attentive_loss_before_last_layer=-1, ): super().__init__(dictionary) 
@classmethod
def set_shared_layer(cls, share_level, src_layer, tgt_layer):
    """
    Make ``src_layer`` share parameters with ``tgt_layer`` and return the
    layer to use in place of ``src_layer``.

    share_level:
        0: share everything (``tgt_layer`` itself is returned)
        1: share everything but different model
        2: share weight but not bias, layernorm
    """
    # Full sharing, or a plain linear layer: simply reuse the target layer.
    if share_level == 0 or isinstance(src_layer, nn.Linear):
        return tgt_layer

    if not isinstance(src_layer, TransformerEncoderLayer):
        # Any other module type is swapped at level 1 and left alone at level 2.
        if share_level == 1:
            return tgt_layer
        return src_layer

    assert src_layer.embed_dim == tgt_layer.embed_dim
    assert src_layer.normalize_before == tgt_layer.normalize_before
    if share_level == 1:
        # NOTE(review): "layernorm_embedding" is copied like the other
        # sub-modules; confirm the layer class really exposes that attribute.
        for module_name in (
            "fc1",
            "fc2",
            "self_attn",
            "final_layer_norm",
            "self_attn_layer_norm",
            "layernorm_embedding",
        ):
            setattr(src_layer, module_name, getattr(tgt_layer, module_name))
    else:
        # Tie only the weight matrices; biases and layer norms stay separate.
        for ffn_name in ("fc1", "fc2"):
            getattr(src_layer, ffn_name).weight = getattr(tgt_layer, ffn_name).weight
        src_attn = src_layer.self_attn
        tgt_attn = tgt_layer.self_attn
        for proj_name in ("k_proj", "v_proj", "q_proj", "out_proj"):
            getattr(src_attn, proj_name).weight = getattr(tgt_attn, proj_name).weight
    return src_layer
args.encoder_layerdrop, "encoder_attention_heads": args.encoder_attention_heads, "max_source_positions": args.max_source_positions, "dropout": args.dropout, "encoder_normalize_before": args.encoder_normalize_before, "activation_dropout": args.activation_dropout, "attention_dropout": args.attention_dropout, "activation_fn": args.activation_fn, "layernorm_embedding": args.layernorm_embedding, "no_token_positional_embeddings": args.no_token_positional_embeddings, "no_scale_embedding": args.no_scale_embedding, "quant_noise_pq": args.quant_noise_pq, "encoder_freezing_updates": 0, } model_args = namedtuple("args", cfg.keys())(*cfg.values()) spch_encoder = S2TTransformerEncoder(model_args) if args.add_speech_eos: spch_encoder = SpeechEoSEncoder( spch_encoder, 2 * len(args.conv_kernel_sizes.split(",")), args.input_feat_per_channel, adapter_type=getattr(args, "speech_encoder_adapter_type", "None"), adapter_dim=args.encoder_embed_dim, ) return spch_encoder @classmethod def build_text_encoder(cls, args, src_dictionary, spch_encoder): if args.encoder_shared_layers > 0: mx_shared_layers = ( args.speech_encoder_layers if args.speech_encoder_layers < args.text_encoder_layers else args.text_encoder_layers ) args.encoder_shared_layers = ( args.encoder_shared_layers if args.encoder_shared_layers <= mx_shared_layers else mx_shared_layers ) cfg = { "encoder_embed_dim": args.encoder_text_embed_dim, "encoder_ffn_embed_dim": args.encoder_ffn_embed_dim, "encoder_layers": args.text_encoder_layers, "encoder_layerdrop": args.encoder_layerdrop, "encoder_attention_heads": args.encoder_attention_heads, "encoder_learned_pos": args.encoder_learned_pos, "max_source_positions": args.max_source_positions, "dropout": args.dropout, "encoder_normalize_before": args.encoder_normalize_before, "activation_dropout": args.activation_dropout, "attention_dropout": args.attention_dropout, "activation_fn": args.activation_fn, "adaptive_input": args.adaptive_input, "no_token_positional_embeddings": 
def forward(
    self,
    src_tokens,
    src_lengths=None,
    src_txt_tokens=None,
    src_txt_lengths=None,
    **kwargs
):
    """
    Encode the speech input, the text input, or both.

    Args:
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (speech) (B,)
        src_txt_tokens: padded tensor (B, T)
        src_txt_lengths: tensor of original lengths of input utterances (text) (B,)

    Returns:
        The speech encoder output when only speech is given, the text
        encoder output when only text is given, or the tuple
        ``(speech_out, text_out)`` when both are given.

    Raises:
        ValueError: if both ``src_tokens`` and ``src_txt_tokens`` are None.
    """
    # src_tokens only: inference
    # src_tokens, src_lengths: speech only training
    # src_txt_tokens, src_txt_lengths: text only training
    # all valid: speech + text training

    if src_tokens is None and src_txt_tokens is None:
        raise ValueError(
            "src_tokens and src_txt_tokens cannot be None at the same time"
        )
    ret1 = None
    ret2 = None
    return_all_hiddens = False
    if src_tokens is not None:
        if self.use_cross_attentive_loss and src_txt_tokens is not None:
            # Not conditioned on self.training, so the intermediate states
            # are also available during the validation step.
            return_all_hiddens = True
        ret1 = self.spch_encoder(
            src_tokens, src_lengths, return_all_hiddens=return_all_hiddens
        )

        if return_all_hiddens:
            assert self.cross_attentive_loss_before_last_layer < len(
                ret1["encoder_states"]
            )
            # Keep only the state used by the cross-attentive loss.
            ret1 = self.process_attentive_loss_states(
                ret1,
                ret1["encoder_states"][
                    -self.cross_attentive_loss_before_last_layer - 1
                ],
            )

    if src_txt_tokens is not None:
        ret2 = self.text_encoder(
            src_txt_tokens, src_txt_lengths, return_all_hiddens=return_all_hiddens
        )
        if return_all_hiddens:
            if self.cross_attentive_loss_before_last_layer == len(
                self.text_encoder.layers
            ):
                # Counting back past every layer selects the embedding output.
                text_embedding, _ = self.text_encoder.forward_embedding(
                    src_txt_tokens
                )
                text_embedding = text_embedding.transpose(0, 1)
                ret2 = self.process_attentive_loss_states(ret2, text_embedding)
            else:
                assert self.cross_attentive_loss_before_last_layer < len(
                    self.text_encoder.layers
                )
                ret2 = self.process_attentive_loss_states(
                    ret2,
                    ret2["encoder_states"][
                        -self.cross_attentive_loss_before_last_layer - 1
                    ],
                )

    if ret1 is None:
        # Text-only batch. The multiplier is a gradient scale, so it is
        # applied while training, exactly like the two-input case below.
        # (The previous condition applied it only when NOT training.)
        if self.enc2_along_grad_mult != 1.0 and self.training:
            ret2 = self.mult_rst_grad(ret2, self.enc2_along_grad_mult)
        return ret2
    if ret2 is None:
        return ret1
    if self.enc_grad_mult != 1.0 and self.training:
        ret1 = self.mult_rst_grad(ret1, self.enc_grad_mult)
        ret2 = self.mult_rst_grad(ret2, self.enc_grad_mult)
    return (ret1, ret2)
class TransformerMultiInputDecoder(FairseqDecoder):
    """Decoder that consumes one or two encoder outputs.

    With a tuple of two encoder outputs the speech decoder handles the first
    and the text decoder the second; a single encoder output is sent to one
    of the two decoders depending on ``has_txt_input``.
    """

    def __init__(
        self,
        dictionary,
        spch_decoder,
        text_decoder,
        compute_cross_attentive_loss=False,
        cross_attentive_loss_with_norm=True,
        cross_attentive_loss_reverse=False,
    ):
        super().__init__(dictionary)
        self.spch_decoder = spch_decoder
        self.text_decoder = text_decoder
        self.compute_cross_attentive_loss = compute_cross_attentive_loss
        self.cross_attentive_loss_with_norm = cross_attentive_loss_with_norm
        self.cross_attentive_loss_reverse = cross_attentive_loss_reverse

    @classmethod
    def share_spchdecoder(cls, task_args, text_decoder, spch_decoder):
        """Tie ``spch_decoder`` to ``text_decoder`` per the shared-layer level.

        Level 0 returns ``text_decoder`` itself. Level 1 reuses every
        sub-module of the text decoder. Any other level shares self-attention
        and the weight matrices of encoder attention, the feed-forward layers
        and the output projection, keeping the rest separate.
        """
        level = task_args.decoder_shared_layer_level
        if level == 0:
            return text_decoder

        assert text_decoder.embed_tokens == spch_decoder.embed_tokens
        for shared_name in (
            "project_in_dim",
            "embed_positions",
            "layernorm_embedding",
            "project_out_dim",
            "adaptive_softmax",
        ):
            setattr(spch_decoder, shared_name, getattr(text_decoder, shared_name))

        share_modules = level == 1
        if share_modules:
            spch_decoder.output_projection = text_decoder.output_projection
            spch_decoder.layer_norm = text_decoder.layer_norm
        else:  # level 2
            spch_decoder.output_projection.weight = (
                text_decoder.output_projection.weight
            )

        for idx, text_layer in enumerate(text_decoder.layers):
            spch_layer = spch_decoder.layers[idx]
            spch_layer.self_attn = text_layer.self_attn
            spch_layer.self_attn_layer_norm = text_layer.self_attn_layer_norm
            if share_modules:
                # share everything, but under different models
                spch_layer.encoder_attn = text_layer.encoder_attn
                spch_layer.encoder_attn_layer_norm = text_layer.encoder_attn_layer_norm
                spch_layer.fc1 = text_layer.fc1
                spch_layer.fc2 = text_layer.fc2
                spch_layer.final_layer_norm = text_layer.final_layer_norm
            else:
                # separate encoder_attn_layer_norm and biases; tie weights only
                for proj_name in ("k_proj", "v_proj", "q_proj", "out_proj"):
                    getattr(spch_layer.encoder_attn, proj_name).weight = getattr(
                        text_layer.encoder_attn, proj_name
                    ).weight
                spch_layer.fc1.weight = text_layer.fc1.weight
                spch_layer.fc2.weight = text_layer.fc2.weight
        return spch_decoder

    def cross_attentive_loss(
        self, teacher_states, student_states, teacher_masking, student_masking, eps=1e-6
    ):
        """Distance between teacher states rebuilt from themselves and from
        the student states.

        Both state tensors are transposed from T X B X D to B X T X D. Each
        masking argument is a list that is either empty or holds a padding
        mask at index 0 (assumed B X T -- TODO confirm against the encoders).
        The teacher-side reconstruction is detached, so no gradient reaches
        it. Returns the per-position cost, zeroed at masked teacher positions.
        """
        teacher = teacher_states.transpose(0, 1)  # from T X B X D to B X T X D
        student = student_states.transpose(0, 1)
        if self.cross_attentive_loss_with_norm:
            teacher = teacher / (teacher.norm(dim=2, keepdim=True) + eps)
            student = student / (student.norm(dim=2, keepdim=True) + eps)
        dim = teacher.size(-1)

        # batch X len_teacher X len_student
        cross_scores = torch.bmm(teacher, student.transpose(1, 2))
        if student.dtype == torch.float16:
            cross_scores = cross_scores.float()
            student = student.float()
            teacher = teacher.float()

        has_teacher_mask = teacher_masking != []
        teacher_pad = None
        if has_teacher_mask:
            assert len(teacher_masking) == 1
            teacher_pad = teacher_masking[0]
            cross_scores = cross_scores.masked_fill(
                teacher_pad.unsqueeze(-1), float("-inf")
            )
        if student_masking != []:
            cross_scores = cross_scores.masked_fill(
                student_masking[0].unsqueeze(1), float("-inf")
            )

        student_weights = utils.softmax(cross_scores, dim=-1)
        if has_teacher_mask:
            student_weights = student_weights.masked_fill(teacher_pad.unsqueeze(-1), 0)
        rebuilt_from_student = torch.bmm(student_weights, student)

        # batch X len_teacher X len_teacher
        self_scores = torch.bmm(teacher, teacher.transpose(1, 2))
        teacher_weights = utils.softmax(self_scores, dim=-1)
        if has_teacher_mask:
            teacher_weights = teacher_weights.masked_fill(teacher_pad.unsqueeze(-1), 0)
        # no gradient for teacher state
        rebuilt_from_teacher = torch.bmm(teacher_weights, teacher).detach()

        cost = (rebuilt_from_teacher - rebuilt_from_student).norm(dim=2)
        if has_teacher_mask:
            cost = cost.masked_fill(teacher_pad, 0)
        if not self.cross_attentive_loss_with_norm:
            cost = cost / dim
        return cost

    def forward(
        self,
        prev_output_tokens,
        encoder_out,
        incremental_state=None,
        has_txt_input=False,
        **kwargs
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing. If there
                are two inputs during training, they share the same
                prev_output_tokens
            encoder_out: output from the encoder, used for encoder-side
                attention. A tuple of two outputs (speech, text) when both
                inputs are present, otherwise a single output
            incremental_state ([dict]): dictionary used for storing state
                during :ref:`Incremental decoding`. Only valid for inference
                from a single input
            has_txt_input (bool): with a single input, use the text decoder
                instead of the speech decoder

        Returns:
            With two inputs: ``(dec_out, {"attn_cost": attn_cost})`` where
            ``dec_out`` concatenates both decoder outputs along dim 0 and
            ``attn_cost`` is None unless the cross-attentive loss is enabled.
            With one input: whatever the selected decoder returns.
        """
        assert not isinstance(encoder_out, EncoderOut)

        if not isinstance(encoder_out, tuple):
            # inference or training with one input
            decoder = self.text_decoder if has_txt_input else self.spch_decoder
            return decoder(prev_output_tokens, encoder_out, incremental_state)

        # training with multiple inputs
        assert len(encoder_out) == 2
        assert incremental_state is None
        spch_enc, text_enc = encoder_out
        spch_rst = self.spch_decoder(prev_output_tokens, spch_enc, incremental_state)
        text_rst = self.text_decoder(prev_output_tokens, text_enc, incremental_state)
        dec_out = torch.cat([spch_rst[0], text_rst[0]], dim=0)

        attn_cost = None
        if self.compute_cross_attentive_loss:
            assert isinstance(encoder_out[0], dict)
            if self.cross_attentive_loss_reverse:
                teacher, student = text_enc, spch_enc
            else:
                teacher, student = spch_enc, text_enc
            attn_cost = self.cross_attentive_loss(
                teacher_states=teacher["encoder_states"],
                student_states=student["encoder_states"],
                teacher_masking=teacher["encoder_padding_mask"],
                student_masking=student["encoder_padding_mask"],
            )
        return (dec_out, {"attn_cost": attn_cost})
attention heads", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument( "--decoder-layers", type=int, metavar="N", help="num decoder layers" ) parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--no-scale-embedding", action="store_true", help="if True, dont scale embeddings", ) # non-standard transformer parameters parser.add_argument( "--speech-encoder-layers", type=int, metavar="N", help="num speech encoder layers", ) parser.add_argument( "--text-encoder-layers", type=int, metavar="N", help="num text encoder layers", ) parser.add_argument( "--encoder-shared-layers", type=int, metavar="N", help="num shared encoder layers", ) parser.add_argument( "--encoder-shared-layer-level", type=int, metavar="N", default=0, choices=[0, 1, 2], help="share layer level 0: all share 1: all share with separate model 2: share weight but not bias and layernorm", ) parser.add_argument( "--decoder-shared-layer-level", default=0, choices=[0, 1, 2], type=int, metavar="N", help="0: share everything; 1: share everything with different model 2: no share layer_norm and bias", ) ### parser.add_argument( "--text-input-cost-ratio", type=float, default=1.0, metavar="V", help="text input cost ratio relative to speech input cost", ) parser.add_argument( "--init-scale", type=float, default=1.0, metavar="V", help="scale the initial weight by given factor", ) parser.add_argument( "--enc-grad-mult", type=float, metavar="V", default=1.0, help="multiply enc1 and enc2 gradient by V", ) parser.add_argument( "--enc2-along-grad-mult", type=float, metavar="V", default=1.0, help="multiply enc2 gradient by V if only enc2 is used", ) 
@classmethod
def build_encoder(cls, args, task):
    """Build the dual-input encoder and load any requested checkpoints.

    Checkpoints are loaded in this order: text encoder, speech encoder, the
    "last" text encoder checkpoint, then the whole encoder. Later loads can
    therefore overwrite parameters set by earlier ones.
    """
    speech_enc = DualInputEncoder.build_spch_encoder(args)
    text_enc = DualInputEncoder.build_text_encoder(args, task.src_dict, speech_enc)

    # 0 turns the cross-attentive loss on (last layer); -1 turns it off.
    if getattr(args, "attentive_cost_regularization", 0.0) > 0.0:
        attn_loss_layer = 0
    else:
        attn_loss_layer = -1
    encoder = DualInputEncoder(
        args,
        speech_enc,
        text_enc,
        task.src_dict,
        attn_loss_layer,
    )

    if args.init_scale != 1.0:
        with torch.no_grad():
            for weight in encoder.parameters():
                weight.data.mul_(args.init_scale)

    load_component = checkpoint_utils.load_pretrained_component_from_model
    if args.load_pretrain_text_encoder != "":
        load_component(text_enc, args.load_pretrain_text_encoder)
    if args.load_pretrain_speech_encoder != "":
        # An EOS wrapper keeps the real speech encoder under ``.encoder``.
        if hasattr(speech_enc, "encoder"):
            load_component(speech_enc.encoder, args.load_pretrain_speech_encoder)
        else:
            load_component(speech_enc, args.load_pretrain_speech_encoder)
    if args.load_pretrain_text_encoder_last != "":
        # if share encoder, speech encoder parameters will be used.
        # It provides a chance to use pre-trained mt encoder instead
        load_component(text_enc, args.load_pretrain_text_encoder_last)
    if args.load_pretrain_encoder != "":
        load_component(encoder, args.load_pretrain_encoder)
    return encoder
@classmethod
def build_model(cls, args, task):
    """Create a model instance from ``args`` and ``task``.

    ``dualinputs2ttransformer_base`` is applied first so that every option
    has a value (in case new options were added), then the encoder and the
    decoder are built, in that order, and handed to the class constructor.
    """
    dualinputs2ttransformer_base(args)
    return cls(cls.build_encoder(args, task), cls.build_decoder(args, task))
@register_model_architecture(
    "dual_input_s2t_transformer", "dualinputs2ttransformer_base"
)
def dualinputs2ttransformer_base(args):
    """Fill in the base-architecture defaults on ``args`` in place.

    An attribute that already exists on ``args`` keeps its value; a missing
    one receives the fallback listed below. Several fallbacks are derived
    from options set earlier in the function, so the order of the calls
    matters.
    """

    def fill(option, fallback):
        # Keep an existing value, otherwise install the fallback.
        setattr(args, option, getattr(args, option, fallback))

    fill("encoder_freezing_updates", 0)

    # Convolutional subsampler
    fill("input_feat_per_channel", 80)
    fill("conv_kernel_sizes", "5,5")
    fill("conv_channels", 1024)

    # Transformer
    fill("encoder_embed_dim", 512)
    fill("encoder_text_embed_dim", args.encoder_embed_dim)
    fill("encoder_ffn_embed_dim", 2048)
    fill("encoder_attention_heads", 8)
    fill("encoder_normalize_before", True)
    fill("encoder_layerdrop", 0)
    fill("encoder_learned_pos", False)
    fill("decoder_embed_dim", args.encoder_embed_dim)
    fill("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
    fill("decoder_attention_heads", 8)
    fill("decoder_normalize_before", True)
    fill("decoder_learned_pos", False)
    fill("dropout", 0.1)
    fill("attention_dropout", args.dropout)
    fill("activation_dropout", args.dropout)
    fill("activation_fn", "relu")
    fill("adaptive_softmax_cutoff", None)
    fill("adaptive_softmax_dropout", 0)
    fill("tie_adaptive_weights", False)
    fill("share_decoder_input_output_embed", False)
    fill("no_token_positional_embeddings", False)
    fill("adaptive_input", False)
    fill("decoder_layerdrop", 0.0)
    fill("decoder_output_dim", args.decoder_embed_dim)
    fill("layernorm_embedding", False)
    fill("no_scale_embedding", False)
    fill("quant_noise_pq", 0)

    # Layer counts and speech input options
    fill("speech_encoder_layers", 10)
    fill("text_encoder_layers", 6)
    fill("encoder_shared_layers", 0)
    fill("decoder_layers", 6)
    fill("add_speech_eos", False)
@register_model_architecture("dual_input_s2t_transformer", "dualinputs2ttransformer_l")
def dualinputs2ttransformer_l(args):
    """Apply the "l" architecture defaults to ``args`` in place.

    Options already present on ``args`` are left alone. Everything not
    listed here is filled in afterwards by ``dualinputs2ttransformer_base``.
    """
    size_defaults = (
        ("encoder_embed_dim", 1024),
        ("encoder_ffn_embed_dim", 1024 * 4),
        ("encoder_attention_heads", 16),
        ("decoder_attention_heads", 16),
        ("dropout", 0.2),
        ("speech_encoder_layers", 12),
        ("text_encoder_layers", 6),
        ("decoder_layers", 6),
    )
    for option, fallback in size_defaults:
        setattr(args, option, getattr(args, option, fallback))
    dualinputs2ttransformer_base(args)
import logging from collections import OrderedDict, namedtuple import torch.nn as nn from fairseq import checkpoint_utils, utils from fairseq.checkpoint_utils import load_checkpoint_to_cpu from fairseq.file_io import PathManager from fairseq.models import register_model, register_model_architecture from fairseq.models.speech_to_text import ( SpeechWavTransformerEncoder, StackedSpeechWavTransformerEncoder, TransformerDecoder, ) from fairseq.models.transformer import TransformerEncoder from .s2t_dualinputtransformer import ( DualInputEncoder, DualInputS2TTransformerModel, TransformerMultiInputDecoder, ) logger = logging.getLogger(__name__) @register_model("dual_input_wav_transformer") class DualInputWavTransformerModel(DualInputS2TTransformerModel): def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @staticmethod def add_args(parser): def add_transformer_args(parser): # We can't use TransformerModel.add_args(parser), since it defines max-source-positions which is duplicated with tasks/speech_to_text.py # Transformer parser.add_argument( "--activation-fn", type=str, default="relu", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", "--relu-dropout", type=float, metavar="D", help="dropout probability after activation in FFN.", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-layers", type=int, metavar="N", help="num encoder layers" ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) parser.add_argument( 
"--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument( "--decoder-layers", type=int, metavar="N", help="num decoder layers" ) parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads", ) parser.add_argument( "--decoder-normalize-before", action="store_true", help="apply layernorm before each decoder block", ) parser.add_argument( "--share-decoder-input-output-embed", action="store_true", help="share decoder input and output embeddings", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--no-scale-embedding", action="store_true", help="if True, dont scale embeddings", ) parser.add_argument( "--encoder-learned-pos", action="store_true", help="use learned positional embeddings", ) parser.add_argument( "--decoder-learned-pos", action="store_true", help="use learned positional embeddings", ) add_transformer_args(parser) SpeechWavTransformerEncoder.add_args(parser) parser.add_argument( "--load-pretrained-speech-text-encoder", type=str, default="", metavar="EXPR", help=""" path to the pretrained speech text encoder from SpeechTextPreTrainModel """, ) parser.add_argument( "--load-pretrained-wav2vec-encoder", type=str, default="", metavar="EXPR", help=""" path to the pretrained speech text encoder from wav2vec """, ) parser.add_argument( "--load-pretrained-speech-text-decoder", type=str, default="", metavar="EXPR", help=""" path to the pretrained speech text decoder from SpeechTextPreTrainModel """, ) parser.add_argument( "--load-pretrained-text-decoder", type=str, default="", metavar="EXPR", help=""" path to the pretrained text decoder """, ) 
@classmethod
def check_args(cls, condition, is_strict, msg):
    """Report a failed consistency check either strictly or leniently.

    Args:
        condition: result of the check; a truthy value means it passed and
            nothing happens.
        is_strict: when truthy, a failed check raises instead of logging.
        msg: text used for the exception or the log record.

    Raises:
        ValueError: if ``condition`` is falsy and ``is_strict`` is truthy.
    """
    if condition:
        return
    if is_strict:
        raise ValueError(msg)
    # ``Logger.warn`` is a deprecated alias; ``warning`` is the supported call.
    logger.warning(msg)
@classmethod
def build_encoder(cls, args, task):
    """Build the dual-input encoder on top of a wav speech encoder.

    Steps, in order:
      1. build the text and speech encoders;
      2. optionally copy wav2vec weights into the speech encoder and check
         that two options agree with the pretrained model configuration;
      3. either stack the text encoder layers on the speech encoder or share
         layers between the two;
      4. wrap both in a ``DualInputEncoder`` and optionally load further
         pretrained / seed weights.

    Args:
        args: option namespace.
        task: task object; only ``task.src_dict`` is used here.

    Returns:
        The constructed ``DualInputEncoder``.

    Raises:
        ValueError: if stacking is requested together with a positive
            ``encoder_shared_text_layers_from_begin``. ``check_args`` may
            also raise it in strict mode.
    """
    txt_enc = cls.build_text_encoder(args, task.src_dict)
    spch_enc = cls.build_speech_encoder(args)

    w2v_checkpoint = args.load_pretrained_wav2vec_encoder
    if w2v_checkpoint:
        # (checkpoint key prefix, destination component) pairs
        w2v_state = cls.load_pretrained_speech_text_components(
            w2v_checkpoint,
            (
                ("feature_extractor", spch_enc.subsample),
                ("post_extract_proj", spch_enc.feat_proj),
                ("layer_norm", spch_enc.feat_layer_norm),
                ("encoder.pos_conv", spch_enc.embed_positions),
                ("encoder.layers", spch_enc.layers),
                ("encoder.layer_norm", spch_enc.layer_norm),
                ("mask_emb", spch_enc.mask_emb),
            ),
        )
        pretrained_cfg = w2v_state["cfg"]["model"]
        cls.check_args(
            args.encoder_normalize_before == pretrained_cfg["layer_norm_first"],
            not args.no_strict_check_pretrain_model,
            f"encoder_normalize_before {args.encoder_normalize_before} doesn't match with the pretrained model",
        )
        cls.check_args(
            args.activation_fn == pretrained_cfg["activation_fn"],
            not args.no_strict_check_pretrain_model,
            f"activation_fn {args.activation_fn} doesn't match with the pretrained model",
        )

    if not getattr(args, "stacked_encoder", False):
        cls.share_speech_text_encoder(
            spch_enc, txt_enc, args.encoder_shared_text_layers_from_begin
        )
    elif args.encoder_shared_text_layers_from_begin > 0:
        raise ValueError(
            "We can not stack encoders and share encoders at the same time!"
        )
    else:
        spch_enc = StackedSpeechWavTransformerEncoder(
            spch_enc, txt_enc.layers, txt_enc.layer_norm
        )

    # 0 when the attentive cost regularization is switched on, -1 otherwise.
    uses_attentive_cost = getattr(args, "attentive_cost_regularization", 0.0) > 0.0
    loss_layer = 0 if uses_attentive_cost else -1

    encoder = DualInputEncoder(args, spch_enc, txt_enc, task.src_dict, loss_layer)

    joint_checkpoint = args.load_pretrained_speech_text_encoder
    if joint_checkpoint:
        cls.load_pretrained_speech_text_components(
            joint_checkpoint,
            (
                ("encoder.sup_s2s_speech_encoder", encoder.spch_encoder),
                ("encoder.text_encoder", encoder.text_encoder),
            ),
        )

    seed_checkpoint = getattr(args, "load_init_encoder", "")
    if seed_checkpoint != "":
        checkpoint_utils.load_pretrained_component_from_model(
            encoder, seed_checkpoint
        )
    return encoder
@register_model_architecture(
    "dual_input_wav_transformer", "dualinputs2twavtransformer_base"
)
def dualinputs2twavtransformer_base(args):
    """Fill in the base wav-transformer defaults on ``args`` in place.

    An attribute that already exists on ``args`` keeps its value; a missing
    one receives the default below. Several defaults are derived from
    options set earlier in the function, so the statement order matters.
    """
    # speech masking
    args.dropout_input = getattr(args, "dropout_input", 0)
    args.dropout_features = getattr(args, "dropout_features", 0)
    args.speech_mask_length = getattr(args, "speech_mask_length", 10)
    args.speech_mask_prob = getattr(args, "speech_mask_prob", 0.65)
    args.speech_mask_selection = getattr(args, "speech_mask_selection", "static")
    args.speech_mask_other = getattr(args, "speech_mask_other", 0)
    args.speech_mask_min_space = getattr(args, "speech_mask_min_space", 1)
    args.speech_no_mask_overlap = getattr(args, "speech_no_mask_overlap", False)
    args.speech_conv_bias = getattr(args, "speech_conv_bias", False)
    args.speech_extractor_mode = getattr(args, "speech_extractor_mode", "default")
    args.no_strict_check_pretrain_model = getattr(
        args, "no_strict_check_pretrain_model", False
    )

    args.speech_mask_channel_length = getattr(args, "speech_mask_channel_length", 10)
    args.speech_mask_channel_prob = getattr(args, "speech_mask_channel_prob", 0.0)
    args.speech_mask_channel_selection = getattr(
        args, "speech_mask_channel_selection", "static"
    )
    args.speech_mask_channel_other = getattr(args, "speech_mask_channel_other", 0)
    args.speech_mask_channel_min_space = getattr(
        args, "speech_mask_channel_min_space", 1
    )
    args.speech_no_mask_channel_overlap = getattr(
        args, "speech_no_mask_channel_overlap", False
    )
    # Look the option up under its own name. The previous lookup used an
    # empty attribute name, which never matches, so a value supplied by the
    # caller was always replaced with False.
    args.no_scale_feature = getattr(args, "no_scale_feature", False)
    args.feature_grad_mult = getattr(args, "feature_grad_mult", 0.0)  # 0.1

    # Transformer
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 768)
    args.encoder_ffn_embed_dim = getattr(
        args, "encoder_ffn_embed_dim", args.encoder_embed_dim * 4
    )
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 12)
    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
    args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0.1)
    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)

    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
    args.decoder_ffn_embed_dim = getattr(
        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
    )
    args.decoder_attention_heads = getattr(
        args, "decoder_attention_heads", args.encoder_attention_heads
    )
    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", 0)
    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
    args.activation_fn = getattr(args, "activation_fn", "relu")  # gelu?
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.share_decoder_input_output_embed = getattr(
        args, "share_decoder_input_output_embed", False
    )
    args.no_token_positional_embeddings = getattr(
        args, "no_token_positional_embeddings", False
    )
    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)

    # Layer counts
    args.speech_encoder_layers = getattr(args, "speech_encoder_layers", 12)
    args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
    args.encoder_shared_text_layers_from_begin = getattr(
        args, "encoder_shared_text_layers_from_begin", 6
    )
    args.decoder_layers = getattr(args, "decoder_layers", 6)
@register_model_architecture(
    "dual_input_wav_transformer", "dualinputs2twavtransformer_large"
)
def dualinputs2twavtransformer_large(args):
    """Apply the "large" wav-transformer defaults to ``args`` in place.

    Options already present on ``args`` are left alone. Everything not
    listed here is filled in afterwards by
    ``dualinputs2twavtransformer_base``.
    """
    large_defaults = (
        ("encoder_embed_dim", 1024),
        ("encoder_attention_heads", 16),
        ("speech_encoder_layers", 24),
        ("text_encoder_layers", 12),
        ("encoder_shared_text_layers_from_begin", 12),
        ("decoder_layers", 12),
    )
    for option, fallback in large_defaults:
        setattr(args, option, getattr(args, option, fallback))
    dualinputs2twavtransformer_base(args)
class TransformerSentenceEncoderLayerStd(TransformerSentenceEncoderLayer):
    """View of an existing sentence encoder layer whose ``forward`` returns
    only the hidden states.

    The wrapper does not create sub-modules of its own: every attribute is
    taken by reference from the layer passed to the constructor.
    """

    def __init__(self, sent_enc_layer):
        # Deliberately start the initializer chain *after*
        # TransformerSentenceEncoderLayer, so that class's own __init__ is
        # skipped and the attributes below come from ``sent_enc_layer``.
        super(TransformerSentenceEncoderLayer, self).__init__()

        borrowed_attributes = (
            "embedding_dim",
            "dropout",
            "activation_dropout",
            # blocks
            "activation_fn",
            "self_attn",
            "dropout1",
            "dropout2",
            "dropout3",
            "layer_norm_first",
            # layer norm associated with the self attention layer
            "self_attn_layer_norm",
            "fc1",
            "fc2",
            # layer norm associated with the position wise feed-forward NN
            "final_layer_norm",
        )
        for attribute in borrowed_attributes:
            setattr(self, attribute, getattr(sent_enc_layer, attribute))

    def forward(
        self,
        x,
        self_attn_mask=None,
        self_attn_padding_mask=None,
        need_weights=None,
        att_args=None,
    ):
        """Run the parent forward pass and return only its first output."""
        hidden, _attn = super().forward(
            x, self_attn_mask, self_attn_padding_mask, need_weights, att_args
        )
        return hidden
class StackedWav2VecEncoderWithAdaptor(FairseqEncoder):
    """Speech encoder that chains a wav2vec encoder, an adaptor and a stack
    of mbart encoder layers, followed by an optional final layer norm."""

    def __init__(
        self,
        wav2vec_enc,
        mbart_enc_layers,
        mbart_layer_norm,
        adaptor,
        drop_w2v_layers=0,
    ):
        """Store the components and optionally trim the wav2vec layer stack.

        Args:
            wav2vec_enc: wav2vec encoder wrapper exposing
                ``w2v_model.encoder.layers``.
            mbart_enc_layers: iterable of layers applied after the adaptor.
            mbart_layer_norm: final normalisation module, or ``None``.
            adaptor: callable mapping ``(x, padding_mask)`` to a new pair.
            drop_w2v_layers: number of trailing wav2vec encoder layers to
                remove; nothing is removed when it is not positive.
        """
        super().__init__(None)
        self.w2v_encoder = wav2vec_enc
        self.adaptor = adaptor
        self.mbart_encoder_layers = mbart_enc_layers
        self.final_layer_norm = mbart_layer_norm
        if drop_w2v_layers > 0:
            w2v_stack = self.w2v_encoder.w2v_model.encoder
            w2v_stack.layers = w2v_stack.layers[:-drop_w2v_layers]

    def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs):
        """Encode speech input and return a fairseq-style encoder-out dict.

        When ``return_all_hiddens`` is truthy, the output of every mbart
        layer is collected in ``encoder_states``.
        """
        input_mask = lengths_to_padding_mask(src_lengths)
        # Pass no mask at all when nothing is padded.
        if not input_mask.any():
            input_mask = None

        w2v_out = self.w2v_encoder.forward(src_tokens, input_mask, tbc=True)
        hidden = w2v_out["encoder_out"]
        out_mask = w2v_out["padding_mask"]  # B X T (or None)
        hidden, out_mask = self.adaptor(hidden, out_mask)

        hidden_states = []
        for mbart_layer in self.mbart_encoder_layers:
            hidden = mbart_layer(hidden, out_mask)
            if return_all_hiddens:
                hidden_states.append(hidden)

        if self.final_layer_norm is not None:
            hidden = self.final_layer_norm(hidden)

        if out_mask is not None:
            mask_entry = [out_mask]
        else:
            mask_entry = []

        return {
            "encoder_out": [hidden],  # T x B x C
            "encoder_padding_mask": mask_entry,  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": hidden_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        """Re-index the batch dimension of ``encoder_out`` by ``new_order``.

        ``encoder_out`` and ``encoder_states`` are selected along dimension
        1; the padding mask and the embedding along dimension 0. The
        ``encoder_states`` list of the input dict is updated in place and
        reused in the result.
        """

        def select(tensors, dim):
            # An empty input list simply yields an empty list.
            return [item.index_select(dim, new_order) for item in tensors]

        reordered_out = select(encoder_out["encoder_out"], 1)
        reordered_mask = select(encoder_out["encoder_padding_mask"], 0)
        reordered_embedding = select(encoder_out["encoder_embedding"], 0)

        states = encoder_out["encoder_states"]
        if len(states) > 0:
            # Slice assignment keeps the same list object, as before.
            states[:] = select(states, 1)

        return {
            "encoder_out": reordered_out,  # T x B x C
            "encoder_padding_mask": reordered_mask,  # B x T
            "encoder_embedding": reordered_embedding,  # B x T x C
            "encoder_states": states,  # List[T x B x C]
            "src_tokens": [],  # B x T
            "src_lengths": [],  # B x 1
        }
@register_model("dual_input_xm_transformer")
class DualInputXMTransformerModel(DualInputS2TTransformerModel):
    """Dual-input model: wav2vec (speech) + mBART encoder (text), mBART decoder.

    ``build_encoder`` and ``build_decoder`` freeze every parameter unless its
    name is selected by the matching ``finetune_*_params`` argument.
    """

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # wav2vec encoder
        Wav2VecEncoderWithAdaptor.add_args(parser)
        # mbart Transformer
        # NOTE(review): the explicit "relu" default is always present on the
        # parsed args, so the "gelu" fallback applied in
        # dualinputxmtransformer_base presumably never takes effect -- confirm
        # which one is intended.
        parser.add_argument(
            "--activation-fn",
            type=str,
            default="relu",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )
        parser.add_argument(
            "--mbart-dropout", type=float, metavar="D", help="dropout probability"
        )
        parser.add_argument(
            "--mbart-attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--mbart-activation-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN.",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--layernorm-embedding",
            action="store_true",
            help="add layernorm to embedding",
        )
        parser.add_argument(
            "--no-scale-embedding",
            action="store_true",
            help="if True, dont scale embeddings",
        )
        parser.add_argument(
            "--load-pretrained-mbart-from",
            type=str,
            metavar="STR",
            help="model to take text encoder decoder weights from (for initialization)",
        )
        # --finetune-w2v-params is read in build_encoder but not registered
        # here; presumably Wav2VecEncoderWithAdaptor.add_args provides it --
        # TODO confirm.
        parser.add_argument(
            "--finetune-mbart-decoder-params",
            type=str,
            metavar="STR",
            help="comma-separated param strings to finetune.",
        )
        parser.add_argument(
            "--finetune-mbart-encoder-params",
            type=str,
            metavar="STR",
            help="comma-separated param strings to finetune.",
        )
        parser.add_argument(
            "--skip-encoder-projection",
            action="store_true",
            help="skip the projection layer in encoder",
        )
        parser.add_argument(
            "--enc-grad-mult",
            type=float,
            metavar="V",
            default=1.0,
            help="multiply enc1 and enc2 gradient by V",
        )
        parser.add_argument(
            "--enc2-along-grad-mult",
            type=float,
            metavar="V",
            default=1.0,
            help="multiply enc2 gradient by V if only enc2 is used",
        )
        parser.add_argument(
            "--text-input-cost-ratio",
            type=float,
            default=1.0,
            metavar="V",
            help="text input cost ratio relative to speech input cost",
        )
        parser.add_argument(
            "--stack-w2v-mbart-encoder",
            action="store_true",
            help="stack w2v and mbart encoder",
        )
        parser.add_argument(
            "--stack-w2v-mbart-nonorm-encoder",
            action="store_true",
            help="stack w2v and mbart encoder without the final mbart layer norm",
        )
        parser.add_argument(
            "--no-final-norm-decoder", action="store_true", help="no layer norm"
        )
        parser.add_argument(
            "--drop-w2v-layers",
            type=int,
            default=0,
            metavar="N",
            help="drop w2v encoder layers",
        )
        parser.add_argument(
            "--share-w2v-text-encoder",
            action="store_true",
            help="share w2v encoder layers with text encoder",
        )
        parser.add_argument(
            "--shared-w2v-layers",
            type=int,
            default=0,
            metavar="N",
            help="shared encoder layers from w2v encoder",
        )

    @classmethod
    def build_encoder(cls, args, task):
        """Build the dual (speech + text) encoder.

        The text encoder is a ``TransformerEncoder`` configured with the
        ``mbart_*`` dropout values; the speech encoder is a
        ``Wav2VecEncoderWithAdaptor`` that may be replaced by a stacked or
        shared variant depending on ``args``.
        """
        # Work on a copy so the mBART-specific overrides do not leak into the
        # args used for the wav2vec encoder.
        _args = copy.deepcopy(args)
        _args.dropout = args.mbart_dropout
        _args.attention_dropout = args.mbart_attention_dropout
        _args.activation_dropout = args.mbart_activation_dropout
        _args.max_source_positions = 1024
        enc_emb = nn.Embedding(
            len(task.src_dict), _args.encoder_embed_dim, task.src_dict.pad()
        )
        text_encoder = TransformerEncoder(_args, task.src_dict, enc_emb)
        spch_encoder = Wav2VecEncoderWithAdaptor(args)
        if getattr(args, "load_pretrained_mbart_from", None):
            text_encoder = checkpoint_utils.load_pretrained_component_from_model(
                component=text_encoder, checkpoint=args.load_pretrained_mbart_from
            )
        if getattr(args, "stack_w2v_mbart_encoder", False):
            assert getattr(args, "share_w2v_text_encoder", False) is False
            spch_encoder = StackedWav2VecEncoderWithAdaptor(
                spch_encoder.w2v_encoder,
                text_encoder.layers,
                text_encoder.layer_norm,
                spch_encoder.adaptor,
                args.drop_w2v_layers,
            )
        elif getattr(args, "stack_w2v_mbart_nonorm_encoder", False):
            # Same stacking, but the text encoder's final layer norm is removed
            # for both the text path and the stacked speech path.
            text_encoder.layer_norm = None
            spch_encoder = StackedWav2VecEncoderWithAdaptor(
                spch_encoder.w2v_encoder,
                text_encoder.layers,
                text_encoder.layer_norm,
                spch_encoder.adaptor,
                args.drop_w2v_layers,
            )
        elif getattr(args, "share_w2v_text_encoder", False):
            spch_encoder = SharedEncoder(
                spch_encoder.w2v_encoder,
                text_encoder,
                spch_encoder.adaptor,
                args.shared_w2v_layers,
            )

        # The speech loop runs first on purpose: parameters that are also
        # reachable from text_encoder get their final setting from the second
        # loop.
        for k, p in spch_encoder.named_parameters():
            # Freeze pretrained models by default
            if safe_hasattr(args, "finetune_w2v_params") and need_finetuning(
                args.finetune_w2v_params, k
            ):
                p.requires_grad = True
            else:
                p.requires_grad = False
        for k, p in text_encoder.named_parameters():
            # Freeze pretrained models by default
            if safe_hasattr(args, "finetune_mbart_encoder_params") and need_finetuning(
                args.finetune_mbart_encoder_params, k
            ):
                p.requires_grad = True
            else:
                p.requires_grad = False

        cross_attentive_loss_before_last_layer = (
            0 if getattr(args, "attentive_cost_regularization", 0.0) > 0.0 else -1
        )
        encoder = DualInputEncoder(
            args,
            spch_encoder,
            text_encoder,
            task.src_dict,
            cross_attentive_loss_before_last_layer,
        )
        return encoder

    @classmethod
    def build_decoder(cls, args, task):
        """Build the decoder; one ``TransformerDecoder`` serves both inputs."""
        _args = copy.deepcopy(args)
        _args.dropout = args.mbart_dropout
        _args.attention_dropout = args.mbart_attention_dropout
        _args.activation_dropout = args.mbart_activation_dropout
        _args.max_target_positions = 1024
        # The decoder embedding table must use the decoder width; the previous
        # code used encoder_embed_dim, which only worked when both were equal.
        dec_emb = nn.Embedding(
            len(task.tgt_dict), _args.decoder_embed_dim, task.tgt_dict.pad()
        )
        decoder = TransformerDecoder(_args, task.tgt_dict, dec_emb)
        if getattr(args, "load_pretrained_mbart_from", None):
            decoder = checkpoint_utils.load_pretrained_component_from_model(
                component=decoder, checkpoint=args.load_pretrained_mbart_from
            )
        if getattr(args, "no_final_norm_decoder", False):
            decoder.layer_norm = None
        for k, p in decoder.named_parameters():
            # Freeze pretrained models by default
            if safe_hasattr(args, "finetune_mbart_decoder_params") and need_finetuning(
                args.finetune_mbart_decoder_params, k
            ):
                p.requires_grad = True
            else:
                p.requires_grad = False

        compute_cross_attentive_loss = (
            getattr(args, "attentive_cost_regularization", 0.0) > 0.0
        )
        cross_attentive_loss_without_norm = getattr(
            args, "attentive_cost_without_normalize", False
        )
        # Reverse cross-attentive loss is disabled; "attentive_cost_reverse" is
        # intentionally not read from args.
        cross_attentive_loss_reverse = False
        decoder = TransformerMultiInputDecoder(
            dictionary=task.target_dictionary,
            spch_decoder=decoder,
            text_decoder=decoder,
            compute_cross_attentive_loss=compute_cross_attentive_loss,
            cross_attentive_loss_with_norm=not cross_attentive_loss_without_norm,
            cross_attentive_loss_reverse=cross_attentive_loss_reverse,
        )
        return decoder

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # make sure that all args are properly defaulted
        # (in case there are any new ones)
        dualinputxmtransformer_base(args)

        encoder = cls.build_encoder(args, task)
        decoder = cls.build_decoder(args, task)
        return cls(encoder, decoder)
@register_model_architecture("dual_input_xm_transformer", "dualinputxmtransformer_base")
def dualinputxmtransformer_base(args):
    """Fill in every architecture argument that is not already set on ``args``.

    Values already present on ``args`` are kept; only missing attributes get
    the defaults below. Defaults are applied in a fixed order because some of
    them are derived from attributes set just before.
    """

    def _default(name, value):
        # Keep an existing attribute, otherwise install the fallback value.
        setattr(args, name, getattr(args, name, value))

    # wav2vec encoder
    set_default_w2v_encoder_args(args)
    set_default_adaptor_args(args)

    # mbart model
    _default("encoder_embed_dim", 1024)
    _default("encoder_ffn_embed_dim", 4 * args.encoder_embed_dim)

    for name, value in (
        ("encoder_layers", 12),
        ("encoder_attention_heads", 16),
        ("encoder_normalize_before", True),
        ("encoder_layerdrop", 0),
        ("encoder_learned_pos", True),
        ("decoder_embed_path", None),
        ("decoder_embed_dim", 1024),
        ("decoder_ffn_embed_dim", 4 * 1024),
        ("decoder_layers", 12),
        ("decoder_attention_heads", 16),
        ("decoder_normalize_before", True),
        ("decoder_learned_pos", True),
        ("decoder_layerdrop", 0.0),
        ("adaptive_input", False),
        ("mbart_attention_dropout", 0.0),
        ("mbart_activation_dropout", 0.0),
        ("mbart_dropout", 0.1),
        ("adaptive_softmax_cutoff", None),
        ("adaptive_softmax_dropout", 0),
        ("share_decoder_input_output_embed", True),
        ("no_token_positional_embeddings", False),
    ):
        _default(name, value)

    # These two fall back to the (now guaranteed) decoder embedding width.
    _default("decoder_output_dim", args.decoder_embed_dim)
    _default("decoder_input_dim", args.decoder_embed_dim)

    for name, value in (
        ("no_scale_embedding", False),
        ("quant_noise_pq", 0),
        ("layernorm_embedding", True),
        ("activation_fn", "gelu"),
        ("pooler_activation_fn", "tanh"),
        ("pooler_dropout", 0.0),
    ):
        _default(name, value)
logger = logging.getLogger(__name__)

# Marker emitted in place of a phoneme sequence when conversion failed.
FAIL_SENT = "FAILED_SENTENCE"


def parse():
    """Parse and return the command-line arguments of the script."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-path", type=str, required=True)
    parser.add_argument("--out-path", type=str, required=True)
    parser.add_argument("--lower-case", action="store_true")
    parser.add_argument("--do-filter", action="store_true")
    parser.add_argument("--use-word-start", action="store_true")
    parser.add_argument("--dup-vowel", default=1, type=int)
    parser.add_argument("--dup-consonant", default=1, type=int)
    parser.add_argument("--no-punc", action="store_true")
    parser.add_argument("--reserve-word", type=str, default="")
    parser.add_argument(
        "--reserve-first-column",
        action="store_true",
        help="first column is sentence id",
    )
    ###
    parser.add_argument("--parallel-process-num", default=1, type=int)
    parser.add_argument("--logdir", default="")
    args = parser.parse_args()
    return args


def process_sent(sent, g2p, res_wrds, args):
    """Convert one sentence into a space-joined phoneme string.

    Args:
        sent: input sentence.
        g2p: callable mapping a text span to a list of phoneme tokens.
        res_wrds: mapping of reserved words to their replacement token.
        args: namespace providing ``do_filter``, ``lower_case``, ``no_punc``,
            ``dup_vowel``, ``dup_consonant`` and ``use_word_start``.
    """
    sents = pre_process_sent(sent, args.do_filter, args.lower_case, res_wrds)
    pho_seqs = [do_g2p(g2p, s, res_wrds, i == 0) for i, s in enumerate(sents)]
    # A single failed span invalidates the whole sentence.
    pho_seq = (
        [FAIL_SENT]
        if [FAIL_SENT] in pho_seqs
        else list(itertools.chain.from_iterable(pho_seqs))
    )
    if args.no_punc:
        pho_seq = remove_punc(pho_seq)
    if args.dup_vowel > 1 or args.dup_consonant > 1:
        pho_seq = dup_pho(pho_seq, args.dup_vowel, args.dup_consonant)
    if args.use_word_start:
        pho_seq = add_word_start(pho_seq)
    return " ".join(pho_seq)


def remove_punc(sent):
    """Drop tokens containing non-alphanumeric characters and collapse spaces.

    ``FAIL_SENT`` is always kept. A space token is dropped when it would be the
    first token or would directly follow another space.
    """
    ns = []
    regex = re.compile("[^a-zA-Z0-9 ]")
    for p in sent:
        if (not regex.search(p)) or p == FAIL_SENT:
            if p == " " and (len(ns) == 0 or ns[-1] == " "):
                continue
            ns.append(p)
    return ns


def do_g2p(g2p, sent, res_wrds, is_first_sent):
    """Return the phoneme tokens for one span, honoring reserved words.

    Every span except the first is prefixed with a space token so that spans
    stay separated once they are concatenated.
    """
    if sent in res_wrds:
        pho_seq = [res_wrds[sent]]
    else:
        pho_seq = g2p(sent)
    if not is_first_sent:
        pho_seq = [" "] + pho_seq  # add space to separate
    return pho_seq


def pre_process_sent(sent, do_filter, lower_case, res_wrds):
    """Split a sentence into spans so that reserved words stand alone.

    Args:
        sent: input sentence.
        do_filter: replace hyphens and em dashes with spaces first.
        lower_case: lower-case every span that is not a reserved word.
        res_wrds: container of reserved words; when empty the sentence is
            returned as a single span.
    """
    if do_filter:
        sent = re.sub("-", " ", sent)
        sent = re.sub("—", " ", sent)

    if len(res_wrds) > 0:
        wrds = sent.split()
        # Surround reserved words with a sentinel, then cut on the sentinel.
        wrds = ["SPLIT_ME " + w + " SPLIT_ME" if w in res_wrds else w for w in wrds]
        sents = [x.strip() for x in " ".join(wrds).split("SPLIT_ME") if x.strip() != ""]
    else:
        sents = [sent]
    if lower_case:
        sents = [s.lower() if s not in res_wrds else s for s in sents]
    return sents


def dup_pho(sent, dup_v_num, dup_c_num):
    """
    duplicate phoneme defined as cmudict
    http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Tokens ending in a digit are followed by ``dup_v_num - 1`` copies, other
    tokens containing a word character by ``dup_c_num - 1`` copies; each copy
    is named ``{token}-{i}P``. With both counts equal to 1 the input is
    returned unchanged.
    """
    if dup_v_num == 1 and dup_c_num == 1:
        return sent
    ns = []
    for p in sent:
        ns.append(p)
        if re.search(r"\d$", p):
            for i in range(1, dup_v_num):
                ns.append(f"{p}-{i}P")
        elif re.search(r"\w", p):
            for i in range(1, dup_c_num):
                ns.append(f"{p}-{i}P")
    return ns


def add_word_start(sent):
    """Prefix the first token of every word with "▁" and drop space tokens.

    Space tokens only act as word separators: leading or repeated spaces never
    end up in the output.
    """
    ns = []
    do_add = True
    ws = "▁"
    for p in sent:
        if p == " ":
            # Check the separator first so that a pending marker is never
            # glued onto the space itself.
            do_add = True
            continue
        if do_add:
            p = ws + p
            do_add = False
        ns.append(p)
    return ns


def load_reserve_word(reserve_word):
    """Load the reserved-word table from a two-column, whitespace-separated file.

    Args:
        reserve_word: path of the table, or "" for no reserved words.

    Returns:
        dict mapping the first column to the second; empty when no path is given.
    """
    if reserve_word == "":
        return {}
    with open(reserve_word, "r", encoding="utf-8") as fp:
        res_wrds = [x.strip().split() for x in fp.readlines() if x.strip() != ""]
    assert all(len(x) == 2 for x in res_wrds)
    return dict(res_wrds)


def process_sents(sents, args):
    """Convert a list of input lines into phoneme lines.

    When ``args.reserve_first_column`` is set, the first whitespace-separated
    field of each line is treated as a sentence id and copied to the output.
    A line that holds only an id is converted as an empty sentence.
    """
    g2p = G2p()
    out_sents = []
    res_wrds = load_reserve_word(args.reserve_word)
    for sent in sents:
        col1 = ""
        if args.reserve_first_column:
            parts = sent.split(None, 1)
            col1 = parts[0] if parts else ""
            sent = parts[1] if len(parts) > 1 else ""
        sent = process_sent(sent, g2p, res_wrds, args)
        if args.reserve_first_column and col1 != "":
            sent = f"{col1} {sent}"
        out_sents.append(sent)
    return out_sents


def main():
    """Read the input file, convert every line and write the result."""
    args = parse()
    out_sents = []
    with open(args.data_path, "r", encoding="utf-8") as fp:
        sent_list = [x.strip() for x in fp.readlines()]

    # Bound unconditionally so the check below is safe for any process count.
    submitit = None
    if args.parallel_process_num > 1:
        try:
            import submitit
        except ImportError:
            logger.warning(
                "submitit is not found and only one job is used to process the data"
            )
            submitit = None

    if args.parallel_process_num == 1 or submitit is None:
        out_sents = process_sents(sent_list, args)
    else:
        # process sentences with parallel computation
        lsize = len(sent_list) // args.parallel_process_num + 1
        executor = submitit.AutoExecutor(folder=args.logdir)
        executor.update_parameters(timeout_min=1000, cpus_per_task=4)
        jobs = []
        for i in range(args.parallel_process_num):
            job = executor.submit(
                process_sents, sent_list[lsize * i : lsize * (i + 1)], args
            )
            jobs.append(job)
        is_running = True
        while is_running:
            time.sleep(5)
            is_running = not all(job.done() for job in jobs)
        out_sents = list(itertools.chain.from_iterable(job.result() for job in jobs))
    with open(args.out_path, "w", encoding="utf-8") as fp:
        fp.write("\n".join(out_sents) + "\n")
def gen_whole_word_mask(args, dictionary):
    """Return the whole-word mask used for masking source tokens.

    When ``args.use_mask_whole_words`` is set, a byte tensor with one entry per
    dictionary index is built; an entry is true when the token is a special
    symbol, a ``madeupword`` filler, or starts with the "\u2581" word-start
    marker. Otherwise the result of ``get_whole_word_mask`` is returned.
    """
    if not args.use_mask_whole_words:
        # it will mask every token as word leading token, since no bpe model is loaded for phoneme tokens
        return get_whole_word_mask(args, dictionary)

    def is_beginning_of_word(i):
        if i < dictionary.nspecial:
            # special elements are always considered beginnings
            return True
        tok = dictionary[i]
        if tok.startswith("madeupword"):
            return True
        if tok in ("<unk>", "<s>", "</s>", "<pad>"):
            return True
        return tok.startswith("\u2581")

    return torch.ByteTensor([is_beginning_of_word(i) for i in range(len(dictionary))])


@register_task("paired_denoising")
class PairedDenoisingTask(TranslationTask):
    """Translation-style task over language pairs with optional source masking.

    Pairs listed in ``--lang-pairs`` are loaded with masking; pairs in
    ``--lang-pairs-bitext`` are declared as unmasked. ``load_dataset`` in this
    class only loads the ``--lang-pairs`` list.
    """

    LANG_TAG_TEMPLATE = "<lang:{}>"  # Tag for language (target)

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments on top of the translation task's."""
        TranslationTask.add_args(parser)
        # bart setting
        parser.add_argument(
            "--mask",
            default=0.0,
            type=float,
            help="fraction of words/subwords that will be masked",
        )
        parser.add_argument(
            "--mask-random",
            default=0.0,
            type=float,
            help="instead of using [MASK], use random token this often",
        )
        parser.add_argument(
            "--insert",
            default=0.0,
            type=float,
            help="insert this percentage of additional random tokens",
        )
        parser.add_argument(
            "--poisson-lambda",
            default=3.0,
            type=float,
            help="lambda of the Poisson distribution for span lengths "
            "(used with --mask-length span-poisson)",
        )
        parser.add_argument(
            "--mask-length",
            default="span-poisson",
            type=str,
            choices=["subword", "word", "span-poisson"],
            help="mask length to choose",
        )
        parser.add_argument(
            "--replace-length",
            default=1,
            type=int,
            help="when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)",
        )

        # multi-lingual
        parser.add_argument(
            "--multilang-sampling-alpha",
            type=float,
            default=1.0,
            help="smoothing alpha for sample ratios across multiple datasets",
        )
        parser.add_argument(
            "--lang-pairs",
            default="",
            metavar="PAIRS",
            help="comma-separated list of language pairs (in training order): phnen-en,phnfr-fr,phnit-it. Do masking",
        )
        parser.add_argument(
            "--lang-pairs-bitext",
            default="",
            metavar="PAIRS",
            help="comma-separated list of language pairs (in training order): en-de,en-fr,de-fr. No masking",
        )
        parser.add_argument("--add-src-lang-token", default=False, action="store_true")
        parser.add_argument("--add-tgt-lang-token", default=False, action="store_true")
        parser.add_argument(
            "--no-whole-word-mask-langs",
            type=str,
            default="",
            metavar="N",
            help="languages without spacing between words dont support whole word masking",
        )
        parser.add_argument(
            "--use-mask-whole-words", default=False, action="store_true"
        )

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task."""
        paths = args.data.split(":")
        assert len(paths) > 0
        src_dict = Dictionary.load(
            os.path.join(paths[0], "src_dict.txt")
        )  # assume all languages share a source dictionary
        tgt_dict = Dictionary.load(
            os.path.join(paths[0], "tgt_dict.txt")
        )  # assume all languages share a target dictionary

        # Either list may be empty; skip empty entries so a stray comma (or two
        # empty lists) does not turn into a bogus "" language pair.
        lang_pairs = [
            lp
            for lp in (args.lang_pairs + "," + args.lang_pairs_bitext).split(",")
            if lp != ""
        ]
        src_langs = [lp.split("-")[0] for lp in lang_pairs]
        tgt_langs = [lp.split("-")[1] for lp in lang_pairs]

        # Every requested language tag must exist in the matching dictionary.
        if args.add_src_lang_token:
            for lang in src_langs:
                assert (
                    src_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
                    != src_dict.unk()
                )
        if args.add_tgt_lang_token:
            for lang in tgt_langs:
                assert (
                    tgt_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
                    != tgt_dict.unk()
                )

        logger.info("source dictionary: {} types".format(len(src_dict)))
        logger.info("target dictionary: {} types".format(len(tgt_dict)))
        if not hasattr(args, "shuffle_instance"):
            args.shuffle_instance = False
        return cls(args, src_dict, tgt_dict)

    def __init__(self, args, src_dict, tgt_dict):
        super().__init__(args, src_dict, tgt_dict)
        # check mask token
        self.mask_idx = self.src_dict.index("<mask>")
        assert self.mask_idx != self.src_dict.unk()
        self.lang_pairs = args.lang_pairs
        self.lang_pairs_bitext = args.lang_pairs_bitext
        self.args = args

    @classmethod
    def language_pair_denoising_dataset(
        cls,
        data_path,
        do_mask,
        split,
        src,
        src_dict,
        tgt,
        tgt_dict,
        mask_idx,
        mask_whole_words,
        seed,
        args,
        dataset_impl,
        combine=False,
        left_pad_source=True,
        left_pad_target=False,
        max_source_positions=1024,
        max_target_positions=1024,
        shuffle=True,
        src_lang_id=None,
        tgt_lang_id=None,
    ):
        """Load one language pair as a (denoising) language-pair dataset.

        Shards named ``split``, ``split1``, ``split2``, ... are read while they
        exist; only the first one is read unless ``combine`` is true. Returns a
        ``LanguagePairDenoisingDataset`` when ``do_mask`` is true and a plain
        ``LanguagePairDataset`` otherwise.

        ``max_source_positions`` and ``max_target_positions`` are accepted but
        not used by this method.

        Raises:
            FileNotFoundError: if the first shard exists for neither
                ``src-tgt`` nor ``tgt-src``.
        """

        def split_exists(split, src, tgt, lang, data_path):
            filename = os.path.join(
                data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)
            )
            return indexed_dataset.dataset_exists(filename, impl=dataset_impl)

        src_datasets = []
        tgt_datasets = []

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else "")

            # infer langcode
            if split_exists(split_k, src, tgt, src, data_path):
                prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt))
            elif split_exists(split_k, tgt, src, src, data_path):
                prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src))
            else:
                if k > 0:
                    break
                else:
                    raise FileNotFoundError(
                        "Dataset not found: {} ({})".format(split, data_path)
                    )

            src_dataset = data_utils.load_indexed_dataset(
                prefix + src, src_dict, dataset_impl
            )
            src_datasets.append(src_dataset)

            tgt_dataset = data_utils.load_indexed_dataset(
                prefix + tgt, tgt_dict, dataset_impl
            )
            if tgt_dataset is not None:
                tgt_datasets.append(tgt_dataset)

            logger.info(
                "{} {} {}-{} {} examples".format(
                    data_path, split_k, src, tgt, len(src_datasets[-1])
                )
            )

            if not combine:
                break

        # Target data is either present for every shard or absent altogether.
        assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0

        if len(src_datasets) == 1:
            src_dataset = src_datasets[0]
            tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
        else:
            sample_ratios = [1] * len(src_datasets)
            src_dataset = ConcatDataset(src_datasets, sample_ratios)
            if len(tgt_datasets) > 0:
                tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
            else:
                tgt_dataset = None

        eos = None

        tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
        if not do_mask:
            return LanguagePairDataset(
                src_dataset,
                src_dataset.sizes,
                src_dict,
                tgt_dataset,
                tgt_dataset_sizes,
                tgt_dict,
                left_pad_source=left_pad_source,
                left_pad_target=left_pad_target,
                eos=eos,
                shuffle=shuffle,
                src_lang_id=src_lang_id,
                tgt_lang_id=tgt_lang_id,
            )

        return LanguagePairDenoisingDataset(
            src_dataset,
            src_dataset.sizes,
            src_dict,
            tgt_dataset,
            tgt_dataset_sizes,
            tgt_dict,
            mask_idx,
            mask_whole_words,
            seed,
            args,
            left_pad_source=left_pad_source,
            left_pad_target=left_pad_target,
            eos=eos,
            shuffle=shuffle,
            src_lang_id=src_lang_id,
            tgt_lang_id=tgt_lang_id,
        )

    def _get_sample_prob(self, dataset_lens):
        """
        Get smoothed sampling probability by languages. This helps low resource
        languages by upsampling them.

        ``dataset_lens`` is normalized to probabilities, raised to the power
        ``multilang_sampling_alpha`` and normalized again.
        """
        prob = dataset_lens / dataset_lens.sum()
        smoothed_prob = prob ** self.args.multilang_sampling_alpha
        smoothed_prob = smoothed_prob / smoothed_prob.sum()
        return smoothed_prob

    def resample_datasets(self, lang_datasets, lang_pairs_all, epoch):
        """Up- or down-sample each dataset according to the smoothed ratios.

        Returns ``lang_datasets`` unchanged when ``multilang_sampling_alpha`` is
        1.0; otherwise every dataset is wrapped in a ``ResamplingDataset``.
        """
        # For train subset, additionally up or down sample languages.
        if self.args.multilang_sampling_alpha == 1.0:
            return lang_datasets

        dataset_lengths = np.array(
            [len(d) for d in lang_datasets],
            dtype=float,
        )
        sample_probs = self._get_sample_prob(dataset_lengths)
        logger.info(
            "Sample probability by language pair: {}".format(
                {
                    lp: "{0:.4f}".format(sample_probs[idx])
                    for idx, lp in enumerate(lang_pairs_all)
                }
            )
        )
        size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
        logger.info(
            "Up/Down Sampling ratio by language: {}".format(
                {
                    lp: "{0:.2f}".format(size_ratio[idx])
                    for idx, lp in enumerate(lang_pairs_all)
                }
            )
        )

        resampled_lang_datasets = [
            ResamplingDataset(
                dataset,
                size_ratio=ratio,
                seed=self.args.seed,
                epoch=epoch,
                # Sampling with replacement is only needed when up-sampling.
                replace=ratio >= 1.0,
            )
            for dataset, ratio in zip(lang_datasets, size_ratio)
        ]
        return resampled_lang_datasets

    def load_dataset_only(
        self, split, lang_pairs, do_mask=True, epoch=1, combine=False
    ):
        """Build (without registering) the dataset for ``split``.

        Args:
            split: name of the split to load.
            lang_pairs: comma-separated ``src-tgt`` pairs; must not be empty.
            do_mask: whether the source side is masked.
            epoch: selects the data path in round-robin fashion and seeds the
                resampling.
            combine: whether all shards of the split are concatenated.
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]

        # TODO unk token will be considered as first word too, though it might be an unknown phoneme within a word
        # get_whole_word_mask returns a tensor (size V by 1 ) to indicate if a token is a word start token
        mask_whole_src_words = gen_whole_word_mask(self.args, self.src_dict)
        language_without_segmentations = self.args.no_whole_word_mask_langs.split(",")
        lang_datasets = []
        eos_bos = []
        lang_pairs = lang_pairs.split(",") if lang_pairs != "" else []
        assert len(lang_pairs) > 0
        add_lang_token = self.args.add_src_lang_token or self.args.add_tgt_lang_token
        for lp in lang_pairs:
            src, tgt = lp.split("-")
            lang_mask_whole_src_words = (
                mask_whole_src_words
                if src not in language_without_segmentations
                else None
            )

            end_token = (
                self.source_dictionary.index(
                    PairedDenoisingTask.LANG_TAG_TEMPLATE.format(src)
                )
                if self.args.add_src_lang_token
                else None
            )
            bos_token = (
                self.target_dictionary.index(
                    PairedDenoisingTask.LANG_TAG_TEMPLATE.format(tgt)
                )
                if self.args.add_tgt_lang_token
                else None
            )

            if add_lang_token:
                eos_bos.append((end_token, bos_token))

            dataset = PairedDenoisingTask.language_pair_denoising_dataset(
                data_path,
                do_mask,
                split,
                src,
                self.source_dictionary,
                tgt,
                self.target_dictionary,
                self.mask_idx,
                lang_mask_whole_src_words,
                self.args.seed,
                self.args,
                self.args.dataset_impl,
                combine=combine,
                left_pad_source=utils.eval_bool(self.args.left_pad_source),
                left_pad_target=utils.eval_bool(self.args.left_pad_target),
                max_source_positions=self.args.max_source_positions,
                max_target_positions=self.args.max_target_positions,
                src_lang_id=None,
            )

            lang_datasets.append(dataset)

        # lang_pairs is non-empty (asserted above), so there is at least one
        # dataset at this point.
        if len(lang_datasets) == 1:
            dataset = lang_datasets[0]
            if add_lang_token:
                end_token, bos_token = eos_bos[0]
                dataset = TransformEosLangPairDataset(
                    dataset,
                    src_eos=self.source_dictionary.eos(),
                    new_src_eos=end_token,
                    tgt_bos=self.target_dictionary.eos(),
                    new_tgt_bos=bos_token,
                )
        else:
            end_tokens = [item[0] for item in eos_bos if item[0] is not None]
            bos_tokens = [item[1] for item in eos_bos if item[1] is not None]
            lang_datasets = self.resample_datasets(lang_datasets, lang_pairs, epoch)
            dataset = TransformEosConcatLangPairDataset(
                lang_datasets,
                self.source_dictionary.eos(),
                self.target_dictionary.eos(),
                new_src_eos=end_tokens,
                new_tgt_bos=bos_tokens,
            )
        return dataset

    # split in (train, valid, test, ...)
    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load ``split`` for the masked ``lang_pairs`` and register it."""
        self.datasets[split] = self.load_dataset_only(
            split, self.lang_pairs, epoch=epoch, combine=combine
        )
@register_task("speech_text_joint_denoising")
class SpeechTextJointDenoisingPreTask(PairedDenoisingTask):
    """
    Joint denoising training task for speech and text.

    A training split is a ``MultiModalityDataset`` that bundles every
    configured sub-dataset (masked text, bitext, supervised speech,
    supervised s2s speech, unsupervised speech). A validation split holds
    exactly one of them, selected by the suffix of the split name
    (see :meth:`resolve_data_type`).
    """

    SIL_TOKEN = "sil"

    @classmethod
    def add_args(cls, parser):
        """Add task-specific arguments on top of ``PairedDenoisingTask``'s."""
        PairedDenoisingTask.add_args(parser)
        # set max tokens and position
        parser.add_argument(
            "--max-text-tokens",
            type=int,
            metavar="N",
            default=1024,
            help="maximum samples for encoder text input ",
        )
        parser.add_argument(
            "--max-speech-tokens",
            type=int,
            metavar="N",
            default=50000,
            help="maximum samples for encoder speech input ",
        )
        parser.add_argument(
            "--max-speech-positions",
            type=int,
            metavar="N",
            default=400,
            help="maximum tokens for per encoder speech input ",
        )
        parser.add_argument(
            "--max-sample-size",
            type=int,
            metavar="N",
            default=32000,
            help="max sample size to crop to for batching (unsupervised speech) ",
        )
        parser.add_argument(
            "--min-sample-size",
            type=int,
            metavar="N",
            default=4000,
            help="min sample size to crop to for batching (unsupervised speech) ",
        )

        # set mini-batch ratio for different modalities/subtasks
        # Each ratio is either a single number or a comma separated list of
        # "epoch:ratio" items (see parse_data_ratio in __init__).
        # s2p
        parser.add_argument(
            "--supervised-speech-sample-ratio",
            default="1",
            type=str,
            metavar="N",
            help="Multiple Ratio for speech dataset with transcripts ",
        )
        # s2t
        parser.add_argument(
            "--supervised-speech-s2s-sample-ratio",
            default="1",
            type=str,
            metavar="N",
            help="Multiple Ratio for sequence-to-sequence speech dataset with transcripts ",
        )
        # ssl
        parser.add_argument(
            "--unsupervised-speech-sample-ratio",
            default="1",
            type=str,
            metavar="N",
            help="Multiple Ratio for speech dataset without transcripts ",
        )
        # t2t with monolingual data (masking)
        parser.add_argument(
            "--text-sample-ratio",
            default="1",
            type=str,
            metavar="N",
            help="Multiple Ratio for text set ",
        )
        # t2t with parallel data (no masking)
        parser.add_argument(
            "--bitext-sample-ratio",
            default="1",
            type=str,
            metavar="N",
            help="Multiple Ratio for text set (bitext) ",
        )
        # train_subset = "train", 'valid' or so
        # parallel data is loaded according to string lang_pairs and lang_pairs_no_mask from args.data
        # (un)supervised speech is loaded from args.(un)sup_speech_{train,valid}_subset
        parser.add_argument(
            "--sup-speech-data", default="", help="path to supervised speech data"
        )
        parser.add_argument(
            "--sup-speech-train-subset",
            default="",
            help="supervised speech training subsets",
        )
        parser.add_argument(
            "--sup-speech-valid-subset",
            default="",
            help="supervised speech validation subsets",
        )
        parser.add_argument(
            "--config-yaml",
            default="config.yaml",
            help="supervised speech configuration yaml file",
        )
        parser.add_argument(
            "--sup-speech-s2s-data",
            default="",
            help="path to supervised sequence-to-sequence speech data",
        )
        parser.add_argument(
            "--sup-speech-s2s-train-subset",
            default="",
            help="supervised sequence-to-sequence speech training subsets",
        )
        parser.add_argument(
            "--sup-speech-s2s-valid-subset",
            default="",
            help="supervised sequence-to-sequence speech validation subsets",
        )
        parser.add_argument(
            "--config-s2s-yaml",
            default="config.yaml",
            help="supervised sequence-to-sequence speech configuration yaml file",
        )
        parser.add_argument(
            "--unsup-speech-train-data",
            default="",
            help="path to unsupervised speech training data (tsv)",
        )
        parser.add_argument(
            "--unsup-speech-valid-data",
            default="",
            help="path to unsupervised speech valid data (tsv)",
        )
        parser.add_argument(
            "--sample-rate",
            type=int,
            metavar="N",
            default=16000,
            help="input audio sampling rate",
        )
        parser.add_argument(
            "--no-emb-update-unsup",
            default=False,
            action="store_true",
            help="no update for output embedding during unsupervised_speech mode",
        )
        parser.add_argument("--same-data-update", default=False, action="store_true")

        # used for sup_speech_ali
        parser.add_argument(
            "--use-sup-speech-ctc",
            default=False,
            action="store_true",
            help="use speech_sup_ctc instead of speech_sup_ali",
        )

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task.

        Loads ``src_dict.txt`` / ``tgt_dict.txt`` from the first path in
        ``args.data`` and, when language tokens are requested, checks that
        every language tag is present in the matching dictionary.
        """
        paths = args.data.split(":")
        assert len(paths) > 0
        src_dict = Dictionary.load(
            os.path.join(paths[0], "src_dict.txt")
        )  # assume all languages share a source dictionary
        tgt_dict = Dictionary.load(
            os.path.join(paths[0], "tgt_dict.txt")
        )  # assume all languages share a target dictionary

        # Join both pair lists and drop the comma left by an empty side.
        lang_pairs = args.lang_pairs + "," + args.lang_pairs_bitext
        lang_pairs = re.sub(",$", "", re.sub("^,", "", lang_pairs))
        if lang_pairs != "":
            src_langs = [lp.split("-")[0] for lp in lang_pairs.split(",")]
            tgt_langs = [lp.split("-")[1] for lp in lang_pairs.split(",")]
        else:
            src_langs = []
            tgt_langs = []

        if args.add_src_lang_token:
            for lang in src_langs:
                assert (
                    src_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
                    != src_dict.unk()
                )
        if args.add_tgt_lang_token:
            for lang in tgt_langs:
                assert (
                    tgt_dict.index(PairedDenoisingTask.LANG_TAG_TEMPLATE.format(lang))
                    != tgt_dict.unk()
                )

        logger.info("source dictionary: {} types".format(len(src_dict)))
        logger.info("target dictionary: {} types".format(len(tgt_dict)))
        if not hasattr(args, "shuffle_instance"):
            args.shuffle_instance = False
        return cls(args, src_dict, tgt_dict)

    def __init__(self, args, src_dict, tgt_dict):
        super().__init__(args, src_dict, tgt_dict)
        sup_cfg_path = Path(args.sup_speech_data) / args.config_yaml
        self.data_cfg = S2TJointDataConfig(sup_cfg_path)
        logger.info(f"load supervised speech data configure from {sup_cfg_path}")

        # The s2s config is only read when an s2s training subset is given.
        self.data_s2s_cfg = None
        if args.sup_speech_s2s_train_subset != "":
            s2s_cfg_path = Path(args.sup_speech_s2s_data) / args.config_s2s_yaml
            self.data_s2s_cfg = S2TJointDataConfig(s2s_cfg_path)
            # Report the file that was actually loaded (--config-s2s-yaml).
            logger.info(
                f"load supervised sequence to sequence speech data configure from {s2s_cfg_path}"
            )

        def parse_data_ratio(sample_ratio):
            """Parse ``"r"`` or ``"ep1:r1,ep2:r2,..."`` into a per-epoch list.

            The result is indexed by the 1-based epoch; gaps between the
            listed epochs repeat the previous ratio.
            """
            ratios = sample_ratio.split(",")
            if len(ratios) == 1:
                return [float(ratios[0])]
            epoch_ratios = []
            for item in ratios:
                ep, r = item.split(":")
                ep = int(ep)
                r = float(r)
                assert ep > 0  # epoch is 1 based
                assert ep >= len(epoch_ratios)

                if len(epoch_ratios) == 0:
                    epoch_ratios.append(
                        r
                    )  # epoch_ratios[0] is not used, but we still set it to the first value to make thing simple.
                while len(epoch_ratios) < ep:
                    epoch_ratios.append(epoch_ratios[-1])
                epoch_ratios.append(r)
            return epoch_ratios

        self.sup_ratio = parse_data_ratio(args.supervised_speech_sample_ratio)
        self.sup_s2s_ratio = parse_data_ratio(args.supervised_speech_s2s_sample_ratio)
        self.text_ratio = parse_data_ratio(args.text_sample_ratio)
        self.bitext_ratio = parse_data_ratio(args.bitext_sample_ratio)
        self.unsup_ratio = parse_data_ratio(args.unsupervised_speech_sample_ratio)
        self.sample_mode = None

    def build_model(self, args):
        """Copy the speech feature geometry from the data config, then build."""
        args.input_feat_per_channel = self.data_cfg.input_feat_per_channel
        args.input_channels = self.data_cfg.input_channels
        return super().build_model(args)

    def build_tokenizer(self, data_cfg, msg=""):
        """Build the pre-tokenizer described by ``data_cfg.pre_tokenizer``."""
        logger.info(f"pre-tokenizer {msg}: {data_cfg.pre_tokenizer}")
        return encoders.build_tokenizer(Namespace(**data_cfg.pre_tokenizer))

    def build_bpe(self, data_cfg, msg=""):
        """Build the BPE tokenizer described by ``data_cfg.bpe_tokenizer``."""
        logger.info(f"tokenizer {msg}: {data_cfg.bpe_tokenizer}")
        return encoders.build_bpe(Namespace(**data_cfg.bpe_tokenizer))

    @classmethod
    def resolve_data_type(cls, split, use_sup_speech_ctc):
        """Split a split name into ``(is_train, dtype)``.

        ``"train"`` / ``"valid"`` (no underscore) map to the ``"text"`` data
        type; otherwise the part after the first underscore is the data
        type, with ``"sup_speech"`` resolved to the CTC or alignment variant.

        Returns:
            ``(is_train, dtype)`` where ``is_train`` is a bool that is True
            only when the prefix is exactly ``"train"``.
        """
        if len(split.split("_")) == 1:
            # default case, train or valid
            # Compare instead of returning the string itself: any non-empty
            # name (e.g. "valid") is truthy and would be treated as training.
            is_train = split == "train"
            dtype = "text"
        else:
            prefix, dtype = split.split("_", 1)
            is_train = prefix == "train"
        if dtype == "sup_speech":
            dtype = "sup_speech_ctc" if use_sup_speech_ctc else "sup_speech_ali"
        assert dtype in (
            "text",
            "bitext",
            "sup_speech_ali",
            "sup_speech_s2s",
            "unsup_speech",
            "sup_speech_ctc",
        ), f"failed resolving {split} (it resulted into: {dtype} ; is_train={is_train})"
        return is_train, dtype

    def create_modalitydatasetitem(self, dtype, dataset):
        """Wrap ``dataset`` in a ``ModalityDatasetItem`` with per-modality limits.

        Raises:
            ValueError: if ``dtype`` is not a supported data type.
        """
        dsitem = None
        if dtype in ("text", "bitext"):
            dsitem = ModalityDatasetItem(
                dtype,
                dataset,
                (self.args.max_source_positions, self.args.max_target_positions),
                self.args.max_text_tokens,
                self.args.batch_size,
            )
        elif dtype in ("sup_speech_ctc", "sup_speech_ali", "sup_speech_s2s"):
            dsitem = ModalityDatasetItem(
                dtype,
                dataset,
                (self.args.max_speech_positions, self.args.max_target_positions),
                self.args.max_speech_tokens,
                self.args.batch_size,
            )
        elif dtype == "unsup_speech":
            dsitem = ModalityDatasetItem(
                dtype, dataset, 1e8, self.args.max_speech_tokens, self.args.batch_size
            )
        else:
            raise ValueError(f"{dtype} is not supported")
        return dsitem

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load ``split`` into ``self.datasets`` as a ``MultiModalityDataset``.

        For a training split every configured sub-dataset is loaded; for any
        other split only the data type encoded in the split name is loaded.

        Raises:
            ValueError: if the resolved data type is not handled.
        """

        def _get_sup_src_tgt_dict(src_dict, tgt_dict, use_s2s_sup_decoder):
            if use_s2s_sup_decoder:
                return None, tgt_dict
            # use src_dict as tgt_dict here, since we use source dictionary as target for forcealignment
            return None, src_dict

        is_train, dtype = self.resolve_data_type(split, self.args.use_sup_speech_ctc)

        # Note we use --add-tgt-lang-token instead of data_cfg.prepend_tgt_lang_tag_no_change to set target language tag in the text dataset
        # Verify add_tgt_lang_token and prepend_tgt_lang_tag_no_change are same

        # Note we use --multilang-sampling-alpha instead of data_cfg.sampling_text_alpha to set text data sampling
        if is_train:
            msets = []
            # train split, load everything into one
            if self.lang_pairs != "":
                text_dataset = self.load_dataset_only(
                    "train", self.lang_pairs, epoch=epoch, combine=combine
                )
                dsitem = self.create_modalitydatasetitem("text", text_dataset)
                msets.append(dsitem)
            if self.lang_pairs_bitext != "":  # load bitext
                bitext_dataset = self.load_dataset_only(
                    "train_bitext",
                    self.lang_pairs_bitext,
                    do_mask=False,
                    epoch=epoch,
                    combine=combine,
                )
                dsitem = self.create_modalitydatasetitem("bitext", bitext_dataset)
                msets.append(dsitem)
            if self.args.sup_speech_train_subset != "":
                pre_tokenizer = self.build_tokenizer(self.data_cfg)
                bpe_tokenizer = self.build_bpe(self.data_cfg)

                append_eos = True
                sup_speech_type = "sup_speech_ali"
                if self.args.use_sup_speech_ctc:  # CTC mode
                    sup_speech_type = "sup_speech_ctc"
                    append_eos = False  # CTC doesn't need eos in the target

                src_dict, tgt_dict = _get_sup_src_tgt_dict(
                    self.src_dict, self.tgt_dict, False
                )
                sup_speech_dataset = SpeechToTextJointDatasetCreator.from_tsv(
                    self.args.sup_speech_data,
                    self.data_cfg,
                    self.args.sup_speech_train_subset,
                    tgt_dict=tgt_dict,
                    src_dict=src_dict,
                    pre_tokenizer=pre_tokenizer,
                    bpe_tokenizer=bpe_tokenizer,
                    src_pre_tokenizer=None,
                    src_bpe_tokenizer=None,
                    is_train_split=is_train,
                    epoch=epoch,
                    seed=self.args.seed,
                    append_eos=append_eos,
                )
                dsitem = self.create_modalitydatasetitem(
                    sup_speech_type, sup_speech_dataset
                )
                msets.append(dsitem)

            if self.args.sup_speech_s2s_train_subset != "":
                pre_tokenizer = self.build_tokenizer(self.data_s2s_cfg, msg="(s2s)")
                bpe_tokenizer = self.build_bpe(self.data_s2s_cfg, msg="(s2s)")

                # make sure self.data_cfg.prepend_tgt_lang_tag_no_change == self.args.add_tgt_lang_token
                src_dict, tgt_dict = _get_sup_src_tgt_dict(
                    self.src_dict, self.tgt_dict, True
                )
                sup_speech_s2s_dataset = SpeechToTextJointDatasetCreator.from_tsv(
                    self.args.sup_speech_s2s_data,
                    self.data_s2s_cfg,
                    self.args.sup_speech_s2s_train_subset,
                    tgt_dict=tgt_dict,
                    src_dict=src_dict,
                    pre_tokenizer=pre_tokenizer,
                    bpe_tokenizer=bpe_tokenizer,
                    src_pre_tokenizer=None,
                    src_bpe_tokenizer=None,
                    is_train_split=is_train,
                    epoch=epoch,
                    seed=self.args.seed,
                )
                dsitem = self.create_modalitydatasetitem(
                    "sup_speech_s2s", sup_speech_s2s_dataset
                )
                msets.append(dsitem)
            if self.args.unsup_speech_train_data != "":
                unsup_speech_dataset = FileAudioDatasetWrapper(
                    self.args.unsup_speech_train_data,
                    self.args.sample_rate,
                    max_sample_size=self.args.max_sample_size,
                    min_sample_size=self.args.min_sample_size,
                    normalize=False,
                )
                dsitem = self.create_modalitydatasetitem(
                    "unsup_speech", unsup_speech_dataset
                )
                msets.append(dsitem)

            pre_train_dataset = MultiModalityDataset(msets)
            self.datasets[split] = pre_train_dataset
        else:  # validation split, load them for each type of data
            if dtype == "text":
                text_dataset = self.load_dataset_only(
                    split, self.lang_pairs, epoch=epoch, combine=combine
                )
                dsitem = self.create_modalitydatasetitem("text", text_dataset)
                self.datasets[split] = MultiModalityDataset([dsitem])
            elif dtype == "bitext":
                bitext_dataset = self.load_dataset_only(
                    split,
                    self.lang_pairs_bitext,
                    do_mask=False,
                    epoch=epoch,
                    combine=combine,
                )
                dsitem = self.create_modalitydatasetitem("bitext", bitext_dataset)
                self.datasets[split] = MultiModalityDataset([dsitem])

            elif dtype in ("sup_speech_ctc", "sup_speech_ali"):
                assert self.args.sup_speech_valid_subset != ""
                pre_tokenizer = self.build_tokenizer(self.data_cfg)
                bpe_tokenizer = self.build_bpe(self.data_cfg)
                append_eos = True
                if dtype == "sup_speech_ctc":  # CTC mode
                    append_eos = False  # CTC doesn't need eos
                    assert self.args.use_sup_speech_ctc

                datasets = []
                for split_name in self.args.sup_speech_valid_subset.split(","):
                    src_dict, tgt_dict = _get_sup_src_tgt_dict(
                        self.src_dict, self.tgt_dict, False
                    )
                    datasets.append(
                        SpeechToTextJointDatasetCreator.from_tsv(
                            self.args.sup_speech_data,
                            self.data_cfg,
                            split_name,
                            tgt_dict=tgt_dict,
                            src_dict=src_dict,
                            pre_tokenizer=pre_tokenizer,
                            bpe_tokenizer=bpe_tokenizer,
                            src_pre_tokenizer=None,
                            src_bpe_tokenizer=None,
                            is_train_split=is_train,
                            epoch=epoch,
                            seed=self.args.seed,
                            append_eos=append_eos,
                        )
                    )

                dset = datasets[0] if len(datasets) == 1 else ConcatDataset(datasets)
                dsitem = self.create_modalitydatasetitem(dtype, dset)
                self.datasets[split] = MultiModalityDataset([dsitem])

            elif dtype == "sup_speech_s2s":
                assert self.args.sup_speech_s2s_valid_subset != ""
                pre_tokenizer = self.build_tokenizer(self.data_s2s_cfg)
                bpe_tokenizer = self.build_bpe(self.data_s2s_cfg)
                datasets = []
                for split_name in self.args.sup_speech_s2s_valid_subset.split(","):
                    src_dict, tgt_dict = _get_sup_src_tgt_dict(
                        self.src_dict, self.tgt_dict, True
                    )
                    datasets.append(
                        SpeechToTextJointDatasetCreator.from_tsv(
                            self.args.sup_speech_s2s_data,
                            self.data_s2s_cfg,
                            split_name,
                            tgt_dict=tgt_dict,
                            src_dict=src_dict,
                            pre_tokenizer=pre_tokenizer,
                            bpe_tokenizer=bpe_tokenizer,
                            src_pre_tokenizer=None,
                            src_bpe_tokenizer=None,
                            is_train_split=is_train,
                            epoch=epoch,
                            seed=self.args.seed,
                        )
                    )

                dset = datasets[0] if len(datasets) == 1 else ConcatDataset(datasets)
                dsitem = self.create_modalitydatasetitem("sup_speech_s2s", dset)
                self.datasets[split] = MultiModalityDataset([dsitem])
            elif dtype == "unsup_speech":
                assert self.args.unsup_speech_valid_data != ""
                unsup_speech_dataset = FileAudioDatasetWrapper(
                    self.args.unsup_speech_valid_data,
                    self.args.sample_rate,
                    max_sample_size=self.args.max_sample_size,
                    min_sample_size=self.args.min_sample_size,
                    normalize=False,
                )
                dsitem = self.create_modalitydatasetitem(
                    "unsup_speech", unsup_speech_dataset
                )
                self.datasets[split] = MultiModalityDataset([dsitem])
            else:
                raise ValueError(f"Unsupported type {dtype}")

    def get_sample_ratio(self, epoch):
        """Return the sampling ratios in effect at ``epoch``.

        Returns:
            ``(text, bitext, sup, sup_s2s, unsup)`` ratios. Each schedule is
            indexed by epoch; past its end the last value is used.
        """

        def ratio_at(schedule):
            return schedule[epoch] if len(schedule) > epoch else schedule[-1]

        return (
            ratio_at(self.text_ratio),
            ratio_at(self.bitext_ratio),
            ratio_at(self.sup_ratio),
            ratio_at(self.sup_s2s_ratio),
            ratio_at(self.unsup_ratio),
        )

    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=0,
        data_buffer_size=0,
        disable_iterator_cache=False,
        skip_remainder_batch=False,
        grouped_shuffling=False,
        update_epoch_batch_itr=False,
    ):
        """Build the epoch iterator for a ``MultiModalityDataset``.

        A dataset with a single modality is delegated to the parent task
        using that modality's own limits. Otherwise one batch sampler per
        modality is created, weighted by the sample ratio for ``epoch``, and
        combined in a ``GroupedEpochBatchIterator``.

        Raises:
            ValueError: if the dataset contains an unknown modality name.
        """
        assert isinstance(dataset, MultiModalityDataset)
        if len(dataset.id_to_mode) == 1:
            max_positions = dataset.max_positions[0]
            max_tokens = dataset.max_tokens[0]
            max_sentences = dataset.max_sentences[0]
            return super().get_batch_iterator(
                dataset,
                max_tokens,
                max_sentences,
                max_positions,
                ignore_invalid_inputs,
                required_batch_size_multiple,
                seed,
                num_shards,
                shard_id,
                num_workers,
                epoch,
                data_buffer_size,
                disable_iterator_cache,
                skip_remainder_batch=skip_remainder_batch,
                # Forward the remaining options instead of dropping them.
                grouped_shuffling=grouped_shuffling,
                update_epoch_batch_itr=update_epoch_batch_itr,
            )

        mult_ratio = []
        (
            text_ratio,
            bitext_ratio,
            sup_ratio,
            sup_s2s_ratio,
            unsup_ratio,
        ) = self.get_sample_ratio(epoch)
        for mode in dataset.id_to_mode:
            if mode in ("sup_speech_ctc", "sup_speech_ali"):
                mult_ratio.append(sup_ratio)
            elif mode == "sup_speech_s2s":
                mult_ratio.append(sup_s2s_ratio)
            elif mode == "text":
                mult_ratio.append(text_ratio)
            elif mode == "bitext":
                mult_ratio.append(bitext_ratio)
            elif mode == "unsup_speech":
                mult_ratio.append(unsup_ratio)
            else:
                # Skipping here would leave mult_ratio shorter than the
                # number of modalities.
                raise ValueError(f"{mode} is not supported")

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        batch_samplers = dataset.get_batch_samplers(
            mult_ratio, required_batch_size_multiple, seed
        )

        # return a reusable, sharded iterator
        epoch_iter = GroupedEpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_samplers=batch_samplers,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            mult_rate=max(self.args.update_freq) if self.args.same_data_update else 1,
            buffer_size=data_buffer_size,
            skip_remainder_batch=skip_remainder_batch,
        )
        self.dataset_to_epoch_iter[dataset] = {}  # refresh it every epoch
        return epoch_iter
@register_task("speech_text_joint_to_text")
class SpeechTextJointToTextTask(SpeechToTextTask):
    """
    Task for joint training speech and text to text.
    """

    @classmethod
    def add_args(cls, parser):
        """Add task-specific arguments to the parser."""
        super(SpeechTextJointToTextTask, cls).add_args(parser)
        ###
        parser.add_argument(
            "--parallel-text-data",
            default="",
            help="path to parallel text data directory",
        )
        parser.add_argument(
            "--max-tokens-text",
            type=int,
            metavar="N",
            help="maximum tokens for encoder text input ",
        )
        parser.add_argument(
            "--max-positions-text",
            type=int,
            metavar="N",
            default=400,
            help="maximum tokens for per encoder text input ",
        )
        parser.add_argument(
            "--langpairs",
            default=None,
            metavar="S",
            help='language pairs for text training, separated with ","',
        )
        parser.add_argument(
            "--speech-sample-ratio",
            default=1,
            type=float,
            metavar="N",
            help="Multiple Ratio for speech dataset with transcripts ",
        )
        parser.add_argument(
            "--text-sample-ratio",
            default=1,
            type=float,
            metavar="N",
            help="Multiple Ratio for text set ",
        )
        parser.add_argument(
            "--update-mix-data",
            action="store_true",
            help="use mixed data in one update when update-freq > 1",
        )
        parser.add_argument(
            "--load-speech-only",
            action="store_true",
            help="load speech data only",
        )
        parser.add_argument(
            "--mask-text-ratio",
            type=float,
            metavar="V",
            default=0.0,
            help="mask V source tokens for text only mode",
        )
        parser.add_argument(
            "--mask-text-type",
            default="random",
            choices=["random", "tail"],
            help="mask text typed",
        )
        parser.add_argument(
            "--noise-token",
            default="",
            help="noise token for masking src text tokens if mask-text-ratio > 0",
        )
        parser.add_argument(
            "--infer-target-lang",
            default="",
            metavar="S",
            help="target language for inference",
        )

    def __init__(self, args, src_dict, tgt_dict, infer_tgt_lang_id=None):
        super().__init__(args, tgt_dict)
        self.src_dict = src_dict
        self.data_cfg = S2TJointDataConfig(Path(args.data) / args.config_yaml)
        # Both dictionaries must agree on the pad and eos indices.
        assert self.tgt_dict.pad() == self.src_dict.pad()
        assert self.tgt_dict.eos() == self.src_dict.eos()
        self.speech_only = args.load_speech_only
        self._infer_tgt_lang_id = infer_tgt_lang_id

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Raises:
            FileNotFoundError: if the source or target dictionary is missing.
            ValueError: if parallel text data is given without ``--langpairs``.
        """
        data_cfg = S2TJointDataConfig(Path(args.data) / args.config_yaml)
        tgt_dict_path = Path(args.data) / data_cfg.vocab_filename
        src_dict_path = Path(args.data) / data_cfg.src_vocab_filename
        if (not os.path.isfile(src_dict_path)) or (not os.path.isfile(tgt_dict_path)):
            raise FileNotFoundError("Dict not found: {}".format(args.data))
        src_dict = Dictionary.load(src_dict_path.as_posix())
        tgt_dict = Dictionary.load(tgt_dict_path.as_posix())

        # Use the module logger rather than print so the sizes end up in the
        # same log stream as the rest of the task's messages.
        logger.info("| src dictionary: {} types".format(len(src_dict)))
        logger.info("| tgt dictionary: {} types".format(len(tgt_dict)))

        if args.parallel_text_data != "":
            # A relative text directory is resolved against args.data.
            if not os.path.isabs(args.parallel_text_data):
                args.parallel_text_data = os.path.join(
                    args.data, args.parallel_text_data
                )

            if args.langpairs is None:
                raise ValueError(
                    "Could not infer language pair, please provide it explicitly"
                )
        infer_tgt_lang_id = None
        if args.infer_target_lang != "" and data_cfg.prepend_tgt_lang_tag_no_change:
            tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format(
                args.infer_target_lang
            )
            infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag)
            assert infer_tgt_lang_id != tgt_dict.unk()
        return cls(args, src_dict, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id)

    def load_langpair_dataset(
        self, prepend_tgt_lang_tag=False, sampling_alpha=1.0, epoch=0
    ):
        """Load the ``train`` parallel text for every pair in ``--langpairs``.

        Args:
            prepend_tgt_lang_tag: replace the leading eos of the target with
                the target language tag.
            sampling_alpha: when not 1.0 and more than one pair is loaded,
                the pairs are resampled with ratios from
                ``SpeechToTextDatasetCreator.get_size_ratios``.
            epoch: forwarded to ``ResamplingDataset``.

        Returns:
            The single pair's dataset, or a ``ConcatDataset`` of all pairs.
        """
        lang_pairs = []
        text_dataset = None
        split = "train"
        for lp in self.args.langpairs.split(","):
            src, tgt = lp.split("-")
            text_dataset = load_langpair_dataset(
                self.args.parallel_text_data,
                split,
                src,
                self.src_dict,
                tgt,
                self.tgt_dict,
                combine=True,
                dataset_impl=None,
                upsample_primary=1,
                left_pad_source=False,
                left_pad_target=False,
                max_source_positions=self.args.max_positions_text,
                max_target_positions=self.args.max_target_positions,
                load_alignments=False,
                truncate_source=False,
            )
            if prepend_tgt_lang_tag:
                # TODO
                text_dataset = TransformEosLangPairDataset(
                    text_dataset,
                    src_eos=self.src_dict.eos(),
                    tgt_bos=self.tgt_dict.eos(),  # 'prev_output_tokens' starts with eos
                    new_tgt_bos=self.tgt_dict.index(LANG_TAG_TEMPLATE.format(tgt)),
                )
            lang_pairs.append(text_dataset)
        if len(lang_pairs) > 1:
            if sampling_alpha != 1.0:
                size_ratios = SpeechToTextDatasetCreator.get_size_ratios(
                    self.args.langpairs.split(","),
                    [len(s) for s in lang_pairs],
                    alpha=sampling_alpha,
                )
                lang_pairs = [
                    ResamplingDataset(
                        d,
                        size_ratio=r,
                        # Honor --seed so resampling follows the run's seed.
                        seed=self.args.seed,
                        epoch=epoch,
                        replace=(r >= 1.0),
                    )
                    for d, r in zip(lang_pairs, size_ratios)
                ]
            return ConcatDataset(lang_pairs)
        return text_dataset

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Generate with the inference target language id passed as ``bos_token``."""
        with torch.no_grad():
            return generator.generate(
                models,
                sample,
                prefix_tokens=prefix_tokens,
                constraints=constraints,
                bos_token=self._infer_tgt_lang_id,
            )

    def build_src_tokenizer(self, args):
        """Build the source pre-tokenizer from ``data_cfg.src_pre_tokenizer``."""
        logger.info(f"src-pre-tokenizer: {self.data_cfg.src_pre_tokenizer}")
        return encoders.build_tokenizer(Namespace(**self.data_cfg.src_pre_tokenizer))

    def build_src_bpe(self, args):
        """Build the source BPE tokenizer from ``data_cfg.src_bpe_tokenizer``."""
        logger.info(f"tokenizer: {self.data_cfg.src_bpe_tokenizer}")
        return encoders.build_bpe(Namespace(**self.data_cfg.src_bpe_tokenizer))

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        For training splits with ``--parallel-text-data`` set, the speech
        dataset and the (optionally masked) text dataset are combined in a
        ``MultiModalityDataset``; otherwise only the speech dataset is stored.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        is_train_split = split.startswith("train")
        pre_tokenizer = self.build_tokenizer(self.args)
        bpe_tokenizer = self.build_bpe(self.args)
        src_pre_tokenizer = self.build_src_tokenizer(self.args)
        src_bpe_tokenizer = self.build_src_bpe(self.args)
        ast_dataset = SpeechToTextJointDatasetCreator.from_tsv(
            self.args.data,
            self.data_cfg,
            split,
            self.tgt_dict,
            src_dict=None if self.speech_only else self.src_dict,
            pre_tokenizer=pre_tokenizer,
            bpe_tokenizer=bpe_tokenizer,
            src_pre_tokenizer=src_pre_tokenizer,
            src_bpe_tokenizer=src_bpe_tokenizer,
            is_train_split=is_train_split,
            epoch=epoch,
            seed=self.args.seed,
        )
        noise_token_id = -1
        text_dataset = None
        if self.args.parallel_text_data != "" and is_train_split:
            text_dataset = self.load_langpair_dataset(
                self.data_cfg.prepend_tgt_lang_tag_no_change,
                1.0,
                epoch=epoch,
            )
            if self.args.mask_text_ratio > 0:
                # add mask
                # Fall back to <unk> when no explicit noise token is given.
                noise_token_id = (
                    self.src_dict.unk()
                    if self.args.noise_token == ""
                    else self.src_dict.index(self.args.noise_token)
                )
                text_dataset = LangPairMaskDataset(
                    text_dataset,
                    src_bos=self.src_dict.bos(),
                    src_eos=self.src_dict.eos(),
                    noise_id=noise_token_id,
                    mask_ratio=self.args.mask_text_ratio,
                    mask_type=self.args.mask_text_type,
                )

        if text_dataset is not None:
            mdsets = [
                ModalityDatasetItem(
                    "sup_speech",
                    ast_dataset,
                    (self.args.max_source_positions, self.args.max_target_positions),
                    self.args.max_tokens,
                    self.args.batch_size,
                ),
                ModalityDatasetItem(
                    "text",
                    text_dataset,
                    (self.args.max_positions_text, self.args.max_target_positions),
                    self.args.max_tokens_text
                    if self.args.max_tokens_text is not None
                    else self.args.max_tokens,
                    self.args.batch_size,
                ),
            ]
            ast_dataset = MultiModalityDataset(mdsets)
        self.datasets[split] = ast_dataset

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.tgt_dict

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None if self.speech_only else self.src_dict

    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=0,
        data_buffer_size=0,
        disable_iterator_cache=False,
        skip_remainder_batch=False,
        grouped_shuffling=False,
        update_epoch_batch_itr=False,
    ):
        """Build the epoch iterator for ``dataset``.

        Plain datasets are delegated to the parent task. A
        ``MultiModalityDataset`` (speech + text) gets one batch sampler per
        modality, weighted by ``--speech-sample-ratio`` and
        ``--text-sample-ratio``.
        """
        if not isinstance(dataset, MultiModalityDataset):
            return super(SpeechTextJointToTextTask, self).get_batch_iterator(
                dataset,
                max_tokens,
                max_sentences,
                max_positions,
                ignore_invalid_inputs,
                required_batch_size_multiple,
                seed,
                num_shards,
                shard_id,
                num_workers,
                epoch,
                data_buffer_size,
                disable_iterator_cache,
                skip_remainder_batch=skip_remainder_batch,
                # Forward grouped_shuffling as well instead of dropping it.
                grouped_shuffling=grouped_shuffling,
                update_epoch_batch_itr=update_epoch_batch_itr,
            )

        mult_ratio = [self.args.speech_sample_ratio, self.args.text_sample_ratio]
        assert len(dataset.datasets) == 2

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        batch_samplers = dataset.get_batch_samplers(
            mult_ratio, required_batch_size_multiple, seed
        )

        # return a reusable, sharded iterator
        epoch_iter = GroupedEpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_samplers=batch_samplers,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            mult_rate=1 if self.args.update_mix_data else max(self.args.update_freq),
            buffer_size=data_buffer_size,
            skip_remainder_batch=skip_remainder_batch,
        )
        self.dataset_to_epoch_iter[dataset] = {}  # refresh it every epoch
        return epoch_iter
for the following work on speech-to-speech translation (S2ST): * [Direct speech-to-speech translation with discrete units (Lee et al. 2021)](docs/direct_s2st_discrete_units.md) * [Textless Speech-to-Speech Translation on Real Data (Lee et al. 2021)](docs/textless_s2st_real_data.md) * [Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation](docs/enhanced_direct_s2st_discrete_units.md) ================================================ FILE: examples/speech_to_speech/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import unity # noqa ================================================ FILE: examples/speech_to_speech/asr_bleu/README.md ================================================ # ASR-BLEU evaluation toolkit This toolkit provides a set of public ASR models used for evaluation of different speech-to-speech translation systems at FAIR. It enables easier score comparisons between different systems' outputs. The ASRGenerator wraps different CTC-based ASR models from HuggingFace and fairseq code bases. Torchaudio CTC decoder is built on top of it to decode given audio files. Please see `asr_model_cfgs.json` for a list of languages covered currently. The high-level pipeline is simple by design: given a lang tag, the script loads the ASR model, transcribes the model's predicted audio, and computes the BLEU score against the provided reference translations using sacrebleu. # Dependencies Please see `requirements.txt`. # Usage examples This toolkit has been used with: * Speechmatrix project: https://github.com/facebookresearch/fairseq/tree/ust/examples/speech_matrix. * Hokkien speech-to-speech translation project: https://github.com/facebookresearch/fairseq/tree/ust/examples/hokkien.
# Standalone run example High-level example, please substitute arguments per your case: ```bash python compute_asr_bleu.py --lang <LANG> \ --audio_dirpath <PATH_TO_AUDIO_DIR> \ --reference_path <PATH_TO_REFERENCES_FILE> \ --reference_format txt ``` For more details about arguments please see the script argparser help. ================================================ FILE: examples/speech_to_speech/asr_bleu/__init__.py ================================================ ================================================ FILE: examples/speech_to_speech/asr_bleu/asr_model_cfgs.json ================================================ { "en": { "oct22": { "desc": "Wav2Vec 2.0 Large (LV-60) + Self Training from https://github.com/facebookresearch/fairseq/tree/main/examples/wav2vec#pre-trained-models", "ckpt_path": "https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt", "dict_path": "https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt", "model_type": "fairseq", "lang": "en", "post_process": "collapse" } }, "hok": { "oct22": { "desc": "Hokkien ASR model, for details check [TODO add paper link]", "ckpt_path": "https://dl.fbaipublicfiles.com/ust_asr/hok/checkpoint_best.pt", "dict_path": "https://dl.fbaipublicfiles.com/ust_asr/hok/dict.ltr.txt", "model_type": "fairseq", "lang": "hok", "post_process": "none" } }, "es": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "model_type": "hf", "lang": "es", "post_process": "collapse" } }, "fr": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-large-fr-voxpopuli-french", "model_type": "hf", "lang": "fr", "post_process": "collapse" } }, "zh": { "oct22": { "model_path": "ydshieh/wav2vec2-large-xlsr-53-chinese-zh-cn-gpt", "model_type": "hf", "lang": "zh", "post_process": "collapse" } }, "tr": { "oct22": { "model_path": "cahya/wav2vec2-large-xlsr-turkish-artificial-cv", "model_type": "hf", "lang": "tr", "post_process": "collapse" } }, "ar": { "oct22": { "model_path": 
"jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "model_type": "hf", "lang": "ar", "post_process": "collapse" } }, "vi": { "oct22": { "model_path": "not-tanh/wav2vec2-large-xlsr-53-vietnamese", "model_type": "hf", "lang": "vi", "post_process": "collapse" } }, "de": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-german", "model_type": "hf", "lang": "de", "post_process": "collapse" } }, "pl": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-polish", "model_type": "hf", "lang": "pl", "post_process": "collapse" } }, "it": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-italian", "model_type": "hf", "lang": "it", "post_process": "collapse" } }, "pt": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-xls-r-1b-portuguese", "model_type": "hf", "lang": "pt", "post_process": "collapse" } }, "ro": { "oct22": { "model_path": "gigant/romanian-wav2vec2", "model_type": "hf", "lang": "ro", "post_process": "collapse" } }, "cs": { "oct22": { "model_path": "comodoro/wav2vec2-xls-r-300m-cs-250", "model_type": "hf", "lang": "cs", "post_process": "collapse" } }, "sk": { "oct22": { "model_path": "anuragshas/wav2vec2-xls-r-300m-sk-cv8-with-lm", "model_type": "hf", "lang": "sk", "post_process": "collapse" } }, "sl": { "oct22": { "model_path": "anuragshas/wav2vec2-xls-r-300m-sl-cv8-with-lm", "model_type": "hf", "lang": "sl", "post_process": "collapse" } }, "fi": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-finnish", "model_type": "hf", "lang": "fi", "post_process": "collapse" } }, "hu": { "oct22": { "model_path": "jonatasgrosman/wav2vec2-large-xlsr-53-hungarian", "model_type": "hf", "lang": "hu", "post_process": "collapse" } }, "et": { "oct22": { "model_path": "RASMUS/wav2vec2-xlsr-1b-et", "model_type": "hf", "lang": "et", "post_process": "collapse" } }, "lt": { "oct22": { "model_path": "sammy786/wav2vec2-xlsr-lithuanian", "model_type": "hf", "lang": "lt", "post_process": "collapse" } }, "nl": { "oct22": { "model_path": 
"jonatasgrosman/wav2vec2-xls-r-1b-dutch", "model_type": "hf", "lang": "nl", "post_process": "collapse" } }, "lv": { "oct22": { "model_path": "reach-vb/wav2vec2-large-xls-r-1B-common_voice7-lv-ft", "model_type": "hf", "lang": "lv", "post_process": "collapse" } }, "sv": { "oct22": { "model_path": "marinone94/xls-r-300m-sv-robust", "model_type": "hf", "lang": "sv", "post_process": "collapse" } }, "hr": { "oct22": { "model_path": "classla/wav2vec2-xls-r-parlaspeech-hr", "model_type": "hf", "lang": "hr", "post_process": "collapse" } } } ================================================ FILE: examples/speech_to_speech/asr_bleu/compute_asr_bleu.py ================================================ import os from typing import Dict, List import sacrebleu import pandas as pd from glob import glob from pathlib import Path from utils import retrieve_asr_config, ASRGenerator from tqdm import tqdm from argparse import ArgumentParser def merge_tailo_init_final(text): """ Hokkien ASR hypothesis post-processing. 
def extract_audio_for_eval(audio_dirpath: str, audio_format: str):
    """
    Collect the predicted audio files of a directory in reference order.

    The assumption is that `0_pred.wav` (or `0_spk<ID>_pred.wav` when the
    prediction was generated with a random speaker) corresponds to the
    reference at line position 0 of the reference manifest, and so on.

    Args:
        audio_dirpath: directory that contains the `*_pred.wav` files
        audio_format: naming scheme of the files; only "n_pred.wav" is supported
    Returns:
        list of audio file paths, where position i holds the file for sample i
    Raises:
        NotImplementedError: if `audio_format` is not "n_pred.wav"
        AssertionError: if sample i has neither `i_pred.wav` nor exactly one
            `i_spk*_pred.wav` file
    """
    if audio_format != "n_pred.wav":
        raise NotImplementedError

    audio_dir = Path(audio_dirpath)
    # Only the number of matches and name membership are needed, so keep a
    # set of basenames (O(1) lookups) instead of a sorted list of paths.
    available = {
        os.path.basename(fp) for fp in glob((audio_dir / "*_pred.wav").as_posix())
    }

    audio_list = []
    for i in range(len(available)):
        plain_name = f"{i}_pred.wav"
        if plain_name in available:
            audio_list.append((audio_dir / plain_name).as_posix())
            continue
        # Fall back to the audio generated with a random speaker. This is an
        # explicit branch (not `assert` + `except AssertionError`) so that it
        # keeps working when Python runs with -O and asserts are stripped.
        candidates = glob((audio_dir / f"{i}_spk*_pred.wav").as_posix())
        if len(candidates) != 1:
            # AssertionError is kept as the exception type for backward
            # compatibility with the previous assert-based check.
            raise AssertionError(
                f"expected {plain_name} or exactly one {i}_spk*_pred.wav "
                f"in {audio_dirpath}, found {len(candidates)} speaker files"
            )
        audio_list.append(candidates[0])
    return audio_list
def load_eval_data_from_tsv(eval_data_filepath: str):
    """
    Load a tab-separated evaluation manifest into a DataFrame.

    We may load the result of `compose_eval_data` directly if needed.

    Args:
        eval_data_filepath: path to the TSV manifest
    Returns:
        pandas DataFrame with the columns found in the file
    """
    # pandas has no top-level `from_csv`; `read_csv` is the reader.
    # quoting=3 (csv.QUOTE_NONE) mirrors the quoting used when this file's
    # manifests are written, so quote characters in references stay literal.
    # NOTE(review): a manifest saved with its index column will come back with
    # an extra unnamed first column - confirm whether callers care.
    eval_df = pd.read_csv(eval_data_filepath, sep="\t", quoting=3)
    return eval_df
parser = ArgumentParser( description="This script computes the ASR-BLEU metric between model's generated audio and the text reference sequences." ) parser.add_argument( "--lang", help="The target language used to initialize ASR model, see asr_model_cfgs.json for available languages", type=str, ) parser.add_argument( "--asr_version", type=str, default="oct22", help="For future support we add and extra layer of asr versions. The current most recent version is oct22 meaning October 2022", ) parser.add_argument( "--audio_dirpath", type=str, help="Path to the directory containing the audio predictions from the translation model", ) parser.add_argument( "--reference_path", type=str, help="Path to the file containing reference translations in the form of normalized text (to be compared to ASR predictions", ) parser.add_argument( "--reference_format", choices=["txt", "tsv"], help="Format of reference file. Txt means plain text format where each line represents single reference sequence", ) parser.add_argument( "--reference_tsv_column", default=None, type=str, help="If format is tsv, then specify the column name which contains reference sequence", ) parser.add_argument( "--audio_format", default="n_pred.wav", choices=["n_pred.wav"], help="Audio format n_pred.wav corresponds to names like 94_pred.wav or 94_spk7_pred.wav where spk7 is the speaker id", ) parser.add_argument( "--results_dirpath", default=None, type=str, help="If specified, the resulting BLEU score will be written to this file path as txt file", ) parser.add_argument( "--transcripts_path", default=None, type=str, help="If specified, the predicted transcripts will be written to this path as a txt file.", ) args = parser.parse_args() prediction_transcripts, bleu_score = run_asr_bleu(args) result_filename = f"{args.reference_format}_{args.lang}_bleu.txt" if args.results_dirpath is not None: if not Path(args.results_dirpath).exists(): Path(args.results_dirpath).mkdir(parents=True) with 
def retrieve_asr_config(lang_key: str, asr_version: str, json_path: str) -> dict:
    """
    Retrieve the ASR model config for one language and one ASR version

    Args:
        lang_key: the language code used as the top-level key of the json file
        asr_version: the version key nested under the language (e.g. "oct22")
        json_path: the path of the config json file
    Returns:
        The config dict stored under [lang_key][asr_version]
    Raises:
        KeyError: if the language or the version is not present in the file
    """
    with open(json_path, "r", encoding="utf-8") as f:
        asr_model_cfgs = json.load(f)
    try:
        return asr_model_cfgs[lang_key][asr_version]
    except KeyError as err:
        # Still a KeyError, but it now says which combination is missing.
        raise KeyError(
            f"No ASR config for lang={lang_key!r}, asr_version={asr_version!r} "
            f"in {json_path}"
        ) from err
vocabulary") try: from transformers import (AutoFeatureExtractor, AutoTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor) except ImportError: raise ImportError("Install transformers to load HF wav2vec model") model_path = model_cfg["model_path"] self.model = Wav2Vec2ForCTC.from_pretrained(model_path) self.tokenizer = AutoTokenizer.from_pretrained(model_path) self.preprocessor = AutoFeatureExtractor.from_pretrained(model_path) self.processor = Wav2Vec2Processor.from_pretrained(model_path) # extra unk tokens are there to make some models work e.g. Finnish ASR has some vocab issue vocab_list = [ self.tokenizer.decoder.get(i, f"{self.tokenizer.unk_token}1") for i in range(self.tokenizer.vocab_size) ] self.sampling_rate = self.preprocessor.sampling_rate self.normalize_input = self.preprocessor.do_normalize self.tokens = vocab_list self.sil_token = infer_silence_token(vocab_list) self.blank_token = self.tokenizer.pad_token def prepare_fairseq_model(self, model_cfg: dict) -> None: """ Prepare the fairseq asr model Args: model_cfg: the specific model config dict must have: (1) ckpt_path, (2) dict_path """ def download_file(url: str, cache_dir: Path): download_path = cache_dir / url.split("/")[-1] if not (cache_dir / url.split("/")[-1]).exists(): with DownloadProgressBar( unit="B", unit_scale=True, miniters=1, desc=url.split("/")[-1] ) as t: cache_dir.mkdir(parents=True, exist_ok=True) urllib.request.urlretrieve( url, filename=download_path.as_posix(), reporthook=t.update_to ) else: print(f"'{url}' exists in {cache_dir}") return download_path.as_posix() try: ckpt_path = model_cfg["ckpt_path"] dict_path = model_cfg["dict_path"] except KeyError: raise KeyError( "Fairseq model cfg must provide (1) ckpt_path, (2) dict_path" ) if re.search("^https", ckpt_path): ckpt_path = download_file(ckpt_path, self.cache_dirpath) if re.search("^https", dict_path): dict_path = download_file(dict_path, self.cache_dirpath) model, saved_cfg, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( 
@torch.inference_mode()
def load_audiofile(self, audio_path: str) -> torch.Tensor:
    """
    Load an audio file, downmix it to mono, then resample and normalize it

    Args:
        audio_path: the audio file path
    Returns:
        audio_waveform: the audio waveform as a torch.Tensor object; the
            leading channel dimension is kept (one row after downmixing)
    """
    audio_waveform, sampling_rate = torchaudio.load(audio_path)
    # The previous check compared the bound method `audio_waveform.dim` with 2,
    # which is always False, so multi-channel audio was never downmixed.
    # torchaudio.load returns (channel, time) by default, so channels are
    # averaged over dim 0. keepdim=True keeps a single-row 2D tensor, which is
    # what mono files already produce and what compute_emissions feeds to the
    # model as a batch of one.
    if audio_waveform.dim() == 2 and audio_waveform.size(0) > 1:
        audio_waveform = audio_waveform.mean(dim=0, keepdim=True)
    if self.sampling_rate != sampling_rate:
        audio_waveform = torchaudio.functional.resample(
            audio_waveform, sampling_rate, self.sampling_rate
        )
    if self.normalize_input:
        # following fairseq raw audio dataset
        audio_waveform = torch.nn.functional.layer_norm(
            audio_waveform, audio_waveform.shape
        )
    return audio_waveform
""" if self.use_cuda: audio_input = audio_input.to("cuda") if isinstance(self.model, fairseq.models.wav2vec.wav2vec2_asr.Wav2VecCtc): padding_mask = lengths_to_padding_mask(torch.tensor([audio_input.numel()])) emissions = self.model.w2v_encoder(audio_input, padding_mask)[ "encoder_out" ].transpose(0, 1) else: emissions = self.model(audio_input).logits return emissions def decode_emissions(self, emissions: torch.Tensor) -> str: """ Decode the emissions and apply post process functions Args: emissions: the input Tensor object Returns: hypo: the str as the decoded transcriptions """ emissions = emissions.cpu() results = self.decoder(emissions) # assuming the lexicon-free decoder and working with tokens hypo = self.decoder.idxs_to_tokens(results[0][0].tokens) hypo = self.post_process_fn(hypo) return hypo def transcribe_audiofile(self, audio_path: str, lower=True) -> str: """ Transcribe the audio into string Args: audio_path: the input audio waveform lower: the case of the transcriptions with lowercase as the default Returns: hypo: the transcription result """ asr_input = self.load_audiofile(audio_path) emissions = self.compute_emissions(asr_input) hypo = self.decode_emissions(emissions) return hypo.strip().lower() if lower else hypo.strip() ================================================ FILE: examples/speech_to_speech/benchmarking/README.md ================================================ # Benchmarking ## Overview The goal of this framework is to support benchmarking various speech to speech translation(S2ST) models in terms of runtime, max-memory consumption and total number of floating point operations(FLOPS). It is a generic framework and can be easily extended to support any fairseq models. To accurately benchmark the performance, core inference modules are re-implemented based on fairseq_cli/generate.py (core.py/Processing) and examples/speech_to_text/generate_waveform.py(core.py/SpeechGeneration. 
To ensure that the end-to-end models and cascaded models are compared fairly, for cascaded models we only consider the performance metrics for model inference at all stages, ignoring any intermediate data and I/O processing overhead. We run all the benchmarking runs on CPU, as it is generally used in production environments and also due to the lack of good benchmarking library support for GPUs. 1. Runtime: Average time in seconds to run model inference on an example from a given dataset. We use the [timeit](https://docs.python.org/3/library/timeit.html) library to measure the runtime. 2. Max memory: Maximum memory in MiB, averaged over running the model inference on all examples from the given dataset. We use the [memory_profiler](https://pypi.org/project/memory-profiler/) library to gather memory footprints for a code snippet and take the maximum to get the max memory used by the code. For cascaded models, we take the max over all stages to get the overall max_memory footprint. 3. FLOPS: We compute the average number of floating point operations needed to run model inference for an example from the given dataset. We use the [PAPI library](http://www.bnikolic.co.uk/blog/python/flops/2019/10/01/pytorch-count-flops.html) to benchmark the number of flops. ## CLI Commands ```{python} CUBLAS_WORKSPACE_CONFIG=:4096:8 python examples/speech_to_speech/benchmarking/get_metrics.py '' --config $config ``` ## Note: 1. The npy dataset is a list of samples saved as a .npy file. Each sample is a dictionary with id, net_input. 2.
The raw dataset is a list of raw audio paths similar to wav2vec2 input tsv file ```{python} sample: { "id": xx, "net_input": { "src_tokens": torch.tensor([]), "src_lengths": torch.tensor([]) } } ``` ================================================ FILE: examples/speech_to_speech/benchmarking/configs/2StageS2ST.yaml ================================================ general: dataset_path: $npy_dataset cpu: True model_type: 2StageS2ST dataset_size: 1 stage1: data: $data_bin_stage1 task: speech_to_text path: $checkpoint_stage1 config_yaml: config.yaml max_len_a: 2 max_len_b: 500 stage2: data: $data_bin_stage2 task: text_to_speech path: $checkpoint_stage2 config_yaml: config.yaml ================================================ FILE: examples/speech_to_speech/benchmarking/configs/3StageS2ST.yaml ================================================ general: dataset_path: $npy_dataset cpu: True model_type: 3StageS2ST max_len_a: 2 max_len_b: 500 dataset_size: 1 stage1: data: $data_bin_stage1 task: speech_to_text path: $checkpoint_stage1 config_yaml: config.yaml max_len_a: 2 max_len_b: 500 stage2: data: $data_bin_stage2 task: translation path: $checkpoint_stage2 config_yaml: config.yaml stage2: data: $data_bin_stage3 task: text_to_speech path: $checkpoint_stage3 config_yaml: config.yaml ================================================ FILE: examples/speech_to_speech/benchmarking/configs/DirectS2U.yaml ================================================ general: dataset_path: $npy_dataset_path cpu: True model_type: S2UT dataset_size: 5 dump_speech_waveforms_dir: $dump_waveforms_dir_path stage1: data: $data_bin task: speech_to_speech path: $checkpoint config_yaml: config.yaml max_len_b: 100000 beam: 10 target_is_code: True max_target_positions: 3000 target_code_size: 100 stage2: vocoder: $vocoder_path vocoder_cfg: $vocoder_cfg_json dur_prediction: True ================================================ FILE: examples/speech_to_speech/benchmarking/configs/S2T.yaml 
================================================ general: dataset_path: $npy_dataset cpu: True model_type: S2T dataset_size: 1 stage1: data: $data_bin task: speech_to_text path: $checkpoint config_yaml: config.yaml max_len_a: 2 max_len_b: 500 ================================================ FILE: examples/speech_to_speech/benchmarking/core.py ================================================ import timeit import logging import torch from pypapi import events, papi_high as high from memory_profiler import memory_usage from torch import nn from argparse import Namespace from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.data import data_utils as fairseq_data_utils from fairseq import checkpoint_utils, tasks, utils from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder from examples.hubert.simple_kmeans.dump_hubert_feature import HubertFeatureReader from examples.hubert.simple_kmeans.dump_km_label import ApplyKmeans from fairseq_cli.generate import get_symbols_to_strip_from_output import soundfile as sf import ast import json logging.basicConfig() logging.root.setLevel(logging.INFO) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) torch.manual_seed(1) torch.set_deterministic(True) class BenchmarkingBase(nn.Module): def __init__(self): nn.Module.__init__(self) self.s2x_task = None def warm_up(self, sample, repeat): """Warm up the model""" for _i in range(repeat): self.forward(sample) logger.info(f"Model warmed up by running inference {repeat} times") def benchmark_run_time(self, dataset, repeat): """Benchmark average runtime for the model by calling benchmark_run_time_single_sample function""" logger.info("Starting run time benchmarking") time_elapsed = 0 for i, sample in enumerate(dataset): time_elapsed += self.benchmark_run_time_single_sample(sample, repeat=repeat) if i % 100 == 0: logger.info(f"Benchmarked run time for {i}/{len(dataset)} samples") total_time_elapsed = time_elapsed / len(dataset) 
def count_flops(
    self,
    dataset,
    repeat,
):
    """Use PYPAPI library to count average flops for model inference.
    Note: It only works if the model is being run on cpu

    Every sample of `dataset` is run through `self.forward` `repeat` times
    while the PAPI_DP_OPS counter is active; the counted total is divided by
    `repeat * len(dataset)` and rounded.
    """
    logger.info("Starting flop counter")
    high.start_counters([events.PAPI_DP_OPS])
    try:
        for i, sample in enumerate(dataset):
            for _r in range(repeat):
                self.forward(sample)
            if i % 100 == 0:
                logger.info(f"Counted flops for {i}/{len(dataset)} samples")
    finally:
        # Always stop the counters, even if inference raises, so a failed run
        # does not leave them running for whatever is benchmarked next.
        flops = high.stop_counters()
    flops = round(flops[0] / (repeat * len(dataset)))
    return flops
Make sure the model outputs a waveform" ) class Processing(BenchmarkingBase): """Class similar to fairseq_cli/generate.py. Supports ASR, MT and ST model inference""" def __init__(self, args): super().__init__() self.use_cuda = not getattr(args, "cpu", False) self.setUp(args) self.training = False self.s2x_task = self.task def setUp(self, cfg): if isinstance(cfg, Namespace): cfg = convert_namespace_to_omegaconf(cfg) self.task = tasks.setup_task(cfg.task) self.tgt_dict = self.task.target_dictionary # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) models, _ = checkpoint_utils.load_model_ensemble( utils.split_paths(cfg.common_eval.path), arg_overrides={}, task=self.task, suffix=cfg.checkpoint.checkpoint_suffix, strict=False, num_shards=cfg.checkpoint.checkpoint_shard_count, ) if len(models) > 1: raise Exception("Currently loading multiple models is not supported") self.model = models[0] # Optimize model for generation if cfg.common.fp16: self.model.half() if self.use_cuda: self.model.cuda() self.model.prepare_for_inference_(cfg) self.generator = self.task.build_generator( [self.model], cfg.generation, extra_gen_cls_kwargs={}, ) # Handle tokenization and BPE self.tokenizer = self.task.build_tokenizer(cfg.tokenizer) self.bpe = self.task.build_bpe(cfg.bpe) self.remove_bpe = cfg.common_eval.post_process def encode_source(self, src): """Method to generate source tokens from a string""" if self.tokenizer is not None: src = self.tokenizer.encode(src) if self.bpe is not None: src = self.bpe.encode(src) src_tokens = self.task.source_dictionary.encode_line(src).long() src_lens = src_tokens.size(0) return { "net_input": { "src_tokens": src_tokens.view(1, src_lens), "src_lengths": torch.tensor([src_lens]), } } def decode_target(self, hypos): """Method to decode target string from tokens""" hypo_str = self.tgt_dict.string( hypos[0][0]["tokens"].int().cpu(), self.remove_bpe, get_symbols_to_strip_from_output(self.generator), ) if self.bpe is not 
class HubertUnitExtractor(BenchmarkingBase):
    """Class to extract discrete units from a waveform: HuBERT features
    followed by k-means cluster assignment"""

    def __init__(self, args):
        # The base initializer must run before the instance is used as an
        # nn.Module (it was previously skipped).
        super().__init__()
        self.feature_reader = HubertFeatureReader(
            args.hubert_ckpt_path, args.hubert_layer
        )
        self.kmeans = ApplyKmeans(args.hubert_km_path)
        # forward() reads these two from self; they were never set before.
        self.layer = args.hubert_layer
        self.max_chunk = self.feature_reader.max_chunk

    def forward(self, sample):
        """Return the list of k-means unit ids for `sample`.

        `sample` is sliced along dim 1 in chunks of `self.max_chunk` and each
        chunk goes through the HuBERT model up to layer `self.layer`.
        # NOTE(review): assumes sample is a (1, num_samples) waveform tensor - confirm
        """
        with torch.no_grad():
            feat = []
            for start in range(0, sample.size(1), self.max_chunk):
                x_chunk = sample[:, start : start + self.max_chunk]
                feat_chunk, _ = self.feature_reader.model.extract_features(
                    source=x_chunk,
                    padding_mask=None,
                    mask=False,
                    output_layer=self.layer,
                )
                feat.append(feat_chunk)
            # Keep the concatenated tensor; previously the result was
            # discarded and the Python list of chunks was passed to k-means.
            feat = torch.cat(feat, 1).squeeze(0)
        return self.kmeans(feat).tolist()
Supports models with speech generation as end goal (TTS, Direct S2ST models etc)""" def __init__(self, args): super().__init__() self.use_cuda = not getattr(args, "cpu", False) self.setUp(args) self.s2x_task = self.task def setUp(self, args): if args.task == "speech_to_speech": args.normalize_waveform = False self.task = tasks.setup_task(args) self.pre_tokenizer = self.task.build_tokenizer(args) self.bpe_tokenizer = self.task.build_bpe(args) try: self.src_dict = self.task.src_dict except Exception: self.src_dict = None ensemble, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( [args.path], arg_overrides=ast.literal_eval(args.model_overrides), task=self.task, strict=False, ) self.model = ensemble[0] if self.use_cuda: self.model.cuda() # criterion.cuda() self.model.eval() self.generator = self.task.build_generator( [self.model], args, ) def processTextInput(self, text): """Generate source tokens from text input""" if self.pre_tokenizer is not None: text = self.pre_tokenizer.encode(text) if self.bpe_tokenizer is not None: text = self.bpe_tokenizer.encode(text) target = self.src_dict.encode_line( text, add_if_not_exist=False, append_eos=True ).long() target = fairseq_data_utils.collate_tokens( [target], self.src_dict.pad(), self.src_dict.eos(), left_pad=False, move_eos_to_beginning=False, ) src_lengths = torch.tensor([target.size(1)], dtype=torch.long) prev_output_tokens = None sample = { "net_input": { "src_tokens": target, "src_lengths": src_lengths, "prev_output_tokens": prev_output_tokens, } } sample = utils.move_to_cuda(sample) if self.use_cuda else sample return sample def forward(self, sample): sample["speaker"] = None output = self.generator.generate(self.model, sample) # , has_targ=False return output class S2UT(BenchmarkingBase): """Class to support S2UT models. 
Also supports generating waveforms from the units predicted""" def __init__(self, s2u_args, vocoder_args=None): super().__init__() self.s2u = Processing(s2u_args) self.vocoder = None if vocoder_args: self.vocoder = GenerateWaveformFromCode(vocoder_args) self.vocoder_input = None def forward(self, sample): s2u_hypos = self.s2u(sample) s2u_output = self.s2u.decode_target(s2u_hypos) if not self.vocoder: return s2u_output units = self.vocoder.format_units(s2u_output) vocoder_output = self.vocoder(units) return vocoder_output def generate_s2u_outputs(self, dataset): return [self.s2u.decode_target(self.s2u(sample)) for sample in dataset] def compute_metrics(self, metric_type, dataset, repeat=None): """Generic function to compute metrics ignoring the io processing time""" if self.vocoder and not self.vocoder_input: self.s2u_output = self.generate_s2u_outputs(dataset) self.vocoder_input = self.vocoder.generate_vocoder_input(self.s2u_output) s2u_metrics = getattr(self.s2u, metric_type)( dataset, repeat, ) vocoder_metrics = 0 if self.vocoder: vocoder_metrics = getattr(self.vocoder, metric_type)( self.vocoder_input, repeat, ) print( f"metric_type = {metric_type} s2u_metrics = {s2u_metrics} \t vocoder_metrics = {vocoder_metrics}" ) if metric_type == "max_memory": return max(s2u_metrics, vocoder_metrics) else: return s2u_metrics + vocoder_metrics def benchmark_run_time(self, dataset, repeat): return self.compute_metrics("benchmark_run_time", dataset, repeat) def count_flops(self, dataset, repeat): return self.compute_metrics("count_flops", dataset, repeat) def max_memory(self, dataset, repeat): return self.compute_metrics("max_memory", dataset, repeat) class Cascaded2StageS2ST(BenchmarkingBase): """ST + TTS""" def __init__(self, s2t_args, tts_args): super().__init__() self.s2t = Processing(s2t_args) self.s2x_task = self.s2t.task self.tts = SpeechGeneration(tts_args) if tts_args else None self.training = False self.tts_inputs = None def forward(self, sample): if not self.tts: 
raise Exception( "Forward function is not callable without tts. Reinitialize the class with tts_args" ) s2t_hypos = self.s2t(sample) s2t_output = self.s2t.decode_target(s2t_hypos) tts_input = self.tts.processTextInput(s2t_output) tts_output = self.tts(tts_input) return tts_output def generate_s2t_outputs(self, dataset): """Process dataset and generate s2t outputs""" return [self.s2t.decode_target(self.s2t(sample)) for sample in dataset] def generate_tts_inputs(self, dataset): """Process dataset and generate tts inputs""" return [self.tts.processTextInput(sample) for sample in dataset] def compute_metrics(self, metric_type, dataset, repeat=None): """Generic function to compute metrics ignoring the io processing time""" if not self.tts_inputs: s2t_outputs = self.generate_s2t_outputs(dataset) self.tts_inputs = self.generate_tts_inputs(s2t_outputs) s2t_metrics = getattr(self.s2t, metric_type)( dataset, repeat, ) tts_metrics = getattr(self.tts, metric_type)( self.tts_inputs, repeat, ) print( f"metric_type = {metric_type} s2t_metrics = {s2t_metrics} \t tts_metrics = {tts_metrics}" ) if metric_type == "max_memory": return max(s2t_metrics, tts_metrics) else: return s2t_metrics + tts_metrics def benchmark_run_time(self, dataset, repeat): return self.compute_metrics("benchmark_run_time", dataset, repeat) def count_flops(self, dataset, repeat): return self.compute_metrics("count_flops", dataset, repeat) def max_memory(self, dataset, repeat): return self.compute_metrics("max_memory", dataset, repeat) class Cascaded3StageS2ST(Cascaded2StageS2ST): """ASR + MT + TTS""" def __init__(self, s2t_args, tts_args, mt_args): super().__init__(s2t_args, tts_args) self.mt = Processing(mt_args) self.mt_inputs = [] def forward(self, sample): s2t_hypos = self.s2t(sample) s2t_output = self.s2t.decode_target(s2t_hypos) mt_input = self.mt.encode_source(s2t_output) mt_hypos = self.mt(mt_input) mt_output = self.mt.decode_target(mt_hypos) tts_input = self.tts.processTextInput(mt_output) tts_output = 
self.tts(tts_input) return tts_output def generate_mt_inputs(self, dataset): """Process dataset to generate mt model inputs""" return [self.mt.encode_source(sample) for sample in dataset] def generate_mt_outputs(self, dataset): """Process dataset to generate mt model outputs""" return [self.mt.decode_target(self.mt(sample)) for sample in dataset] def compute_metrics(self, metric_type, dataset, repeat=None): """Generic function to compute metrics ignoring the io processing time""" if not self.tts_inputs: s2t_outputs = self.generate_s2t_outputs(dataset) self.mt_inputs = self.generate_mt_inputs(s2t_outputs) mt_outputs = self.generate_mt_outputs(self.mt_inputs) self.tts_inputs = self.generate_tts_inputs(mt_outputs) s2t_metrics = getattr(self.s2t, metric_type)( dataset, repeat, ) mt_metrics = getattr(self.mt, metric_type)(self.mt_inputs, repeat) tts_metrics = getattr(self.tts, metric_type)( self.tts_inputs, repeat, ) print( f"metric_type = {metric_type} s2t_metrics = {s2t_metrics} \t mt_metrics = {mt_metrics} \t tts_metrics = {tts_metrics}" ) if metric_type == "max_memory": return max(s2t_metrics, mt_metrics, tts_metrics) else: return s2t_metrics + mt_metrics + tts_metrics ================================================ FILE: examples/speech_to_speech/benchmarking/data_utils.py ================================================ from fairseq import tasks import numpy as np import logging import random from fairseq import options import torch import os import soundfile as sf from fairseq.data.audio.audio_utils import ( get_waveform, parse_path, ) logging.basicConfig() logging.root.setLevel(logging.INFO) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) random.seed(1) np.random.seed(1) random_number_generator = np.random.RandomState(30) def generate_random_data_sample(T, B=1, D=80): """Generate random data sample given the T, B, D values""" net_input = { "src_tokens": torch.tensor(random_number_generator.randn(B, T, D)).float(), "src_lengths": 
torch.tensor([T]), } return {"net_input": net_input} def generate_random_dataset(T_range_min, T_range_max, B=1, D=80, dataset_size=100): """Generate random dataset with T values within a given range, B, D""" T_values = [random.randint(T_range_min, T_range_max) for i in range(dataset_size)] dataset = [] for t in T_values: dataset.append(generate_random_data_sample(t, B, D)) return dataset, sum(T_values) / dataset_size def load_dataset_npy(file_name, dataset_size=None): """Load dataset from a .npy file.""" data = np.load(file_name, allow_pickle=True) if dataset_size: data = data[:dataset_size] return data def load_dataset_raw_to_waveforms( file_name, dataset_size=None, need_waveform=True, sample_rate=16000, read_using_soundfile=False, ): """Load raw dataset from w2v tsv file. Optionally get waveforms""" data = [] with open(file_name, "r") as fp: lines = fp.readlines() data = [ os.path.join(lines[0].strip(), line.strip().split("\t")[0]) for line in lines[1:] ] if dataset_size: data = data[:dataset_size] if not need_waveform: return data features = [] if read_using_soundfile: for _i, d in enumerate(data): wav = sf.read(d)[0] if wav.ndim == 2: wav = wav.mean(-1) features.append(torch.from_numpy(wav).float().view(1, -1)) else: for i, d in enumerate(data): _path, slice_ptr = parse_path(d) if len(slice_ptr) == 0: feat = get_waveform( _path, always_2d=True, output_sample_rate=sample_rate )[0] features.append( { "id": i, "net_input": { "src_tokens": torch.tensor(feat), "src_lengths": torch.tensor([feat.shape[1]]), }, } ) else: raise Exception("Currently unsupported data format") return features def load_dataset_task( args, batch_size=1, limit_size=None, ref_dataset=None, ): """Loads dataset based on args by creating a task""" if not args.data or not args.subset or not args.task: raise Exception( "Please provide necessary arguments to load the dataset - data, subset and task" ) task = tasks.setup_task(args) task.load_dataset(args.subset) if not limit_size: limit_size = 
len(task.dataset(args.subset)) iter = task.get_batch_iterator( dataset=task.dataset(args.subset), max_sentences=batch_size ).next_epoch_itr(shuffle=False) dataset = [] for i, sample in enumerate(iter): sample = { "id": task.datasets[args.subset].ids[sample["id"].item()], "net_input": { "src_tokens": sample["net_input"]["src_tokens"], "src_lengths": sample["net_input"]["src_lengths"], }, } dataset.append(sample) if i == limit_size - 1: break if ref_dataset: try: ids = get_ids_from_dataset(ref_dataset) except Exception as e: raise Exception(f"{e} - Cannot extract ids from reference dataset") filtered_dataset = [] for sample in dataset: if ( sample["id"] in ids or sample["id"][5:] in ids or f"dev_{sample['id']}" in ids ): filtered_dataset.append(sample) dataset = filtered_dataset max_len, min_len, avg_len = get_dataset_stats(dataset) print( f"{args.subset} dataset stats : num_samples={len(dataset)} max_len = {max_len} min_len = {min_len} avg_len = {avg_len}" ) return dataset def randomly_sample_subset(dataset, size=500): """Randomly sample subset from a dataset""" random_indices = [random.randint(0, len(dataset) - 1) for i in range(size)] return [dataset[i] for i in random_indices] def get_short_data_subset(dataset, size=500): """Get a subset of desired size by sorting based on src_lengths""" return sort_dataset(dataset)[:size] def get_long_data_subset(dataset, size=500): """Get a subset of desired size by sorting based on src_lengths descending""" return sort_dataset(dataset, reverse=True)[:size] def sort_dataset(dataset, reverse=False): return sorted( dataset, key=lambda x: x["net_input"]["src_lengths"].item(), reverse=reverse ) def save_dataset_npy(dataset, file_name): """Save a dataset as .npy file""" np.save(file_name, dataset) def get_dataset_stats(dataset): """Get stats about dataset based on src_lengths of samples""" max_len = 0 min_len = 100000 avg_len = 0 for d in dataset: max_len = max(max_len, d["net_input"]["src_lengths"].item()) min_len = min(min_len, 
d["net_input"]["src_lengths"].item()) avg_len += d["net_input"]["src_lengths"].item() return max_len, min_len, avg_len / len(dataset) def make_parser(): """ Additional args: 1. Provide the dataset dir path using --data. 2. Loading the dataset doesn't require config, provide --config-yaml to apply additional feature transforms """ parser = options.get_speech_generation_parser() parser.add_argument( "--subset", default=None, type=str, required=True, help="Subset to use for dataset generation", ) parser.add_argument( "--dataset-save-dir", default=None, type=str, required=False, help="Dir path in which the datasets are to be saved", ) parser.add_argument( "--ref-dataset", default=None, type=str, required=False, help="If provided, the ids in the reference dataset will be used to filter the new dataset generated.", ) parser.add_argument("--dataset-save-token", default="", type=str, required=False) options.add_generation_args(parser) return parser def get_ids_from_dataset(dataset): return {sample["id"]: 1 for sample in dataset} def cli_main(): parser = make_parser() args = options.parse_args_and_arch(parser) dataset = load_dataset_task(args) random_dataset = randomly_sample_subset(dataset) short_dataset = get_short_data_subset(dataset) long_dataset = get_long_data_subset(dataset) if args.dataset_save_token: args.dataset_save_token = f"_{args.dataset_save_token}_" if args.dataset_save_dir: save_dataset_npy( random_dataset, f"{args.dataset_save_dir}/random_dataset{args.dataset_save_token}w_ids.npy", ) save_dataset_npy( short_dataset, f"{args.dataset_save_dir}/short_dataset{args.dataset_save_token}w_ids.npy", ) save_dataset_npy( long_dataset, f"{args.dataset_save_dir}/long_dataset{args.dataset_save_token}w_ids.npy", ) if __name__ == "__main__": cli_main() ================================================ FILE: examples/speech_to_speech/benchmarking/get_metrics.py ================================================ import copy import torch import logging from argparse import 
Namespace import yaml from fairseq import options from examples.speech_to_speech.benchmarking.core import ( Processing, SpeechGeneration, Cascaded2StageS2ST, Cascaded3StageS2ST, S2UT, ) from examples.speech_to_speech.benchmarking.data_utils import ( load_dataset_npy, load_dataset_raw_to_waveforms, ) logging.basicConfig() logging.root.setLevel(logging.INFO) logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) torch.manual_seed(1) torch.set_deterministic(True) def make_parser(): """Note: As the names indicate use s2x_args(ex:ST, ASR etc) for models with speech input, x2s_args for models with speech output(ex:TTS) and mt_args for translation models (ex: mt, T2U etc). For direct S2ST models, use x2s_args to provide model details. """ parser = options.get_speech_generation_parser() parser.add_argument("--target-is-code", action="store_true", default=False) parser.add_argument("--config", type=str) parser.add_argument( "--model-type", default="S2U", choices=["S2S", "TTS", "S2UT", "MT", "S2T", "2StageS2ST", "3StageS2ST"], help="Choose one of the models. For model inference implementation, refer to core.py", ) parser.add_argument( "--dataset-path", type=str, help="""File to load dataset from. Assumes dataset is a list of samples. 
Each sample is a dict of format {'net_input':{'src_tokens':torch.tenor(),'src_lengths':torch.tensor()}}""", ) parser.add_argument( "--dataset-type", type=str, default="npy", choices=["npy", "raw"], help="""Type of input dataset file""", ) parser.add_argument( "--read-using-sf", type=str, default=False, help="""If sound file should be used to read the raw dataset""", ) parser.add_argument( "--dataset-size", default=None, type=int, help="Dataset size to use for benchmarking", ) parser.add_argument( "--dump-speech-waveforms-dir", default=None, type=str, help="Directory to dump the speech waveforms computed on the dataset.", ) parser.add_argument( "--dump-waveform-file-prefix", default="", type=str, help="File name prefix for the saved speech waveforms", ) parser.add_argument( "--feat-dim", default=80, type=int, help="Input feature dimension" ) parser.add_argument( "--target-sr", default=16000, type=int, help="Target sample rate for dumping waveforms", ) options.add_generation_args(parser) options.get_interactive_generation_parser(parser) return parser def cli_main(): parser = make_parser() args = options.parse_args_and_arch(parser) with open( args.config, "r", ) as f: config = yaml.load(f, Loader=yaml.FullLoader) dict_args = vars(args) dict_args.update(config["general"]) args = Namespace(**dict_args) i = 1 stage_args = [] while i <= 3: var = f"stage{i}" tmp_args = copy.deepcopy(dict_args) if var in config: tmp_args.update(config[var]) stage_args.append(Namespace(**tmp_args)) i += 1 else: break if args.model_type == "S2S" or args.model_type == "TTS": model = SpeechGeneration(stage_args[0]) elif args.model_type == "S2UT": model = S2UT(stage_args[0], stage_args[1] if len(stage_args) > 1 else None) elif args.model_type == "MT" or args.model_type == "S2T": model = Processing(stage_args[0]) elif args.model_type == "2StageS2ST": model = Cascaded2StageS2ST(stage_args[0], stage_args[1]) elif args.model_type == "3StageS2ST": model = Cascaded3StageS2ST(stage_args[0], 
stage_args[2], stage_args[1]) else: raise Exception(f"Currently unsupported model type {args.model_type}") print(f"Evaluating on dataset - {args.dataset_path}\n") if args.dataset_type == "npy": dataset = load_dataset_npy(args.dataset_path, dataset_size=args.dataset_size) elif args.dataset_type == "raw": dataset = load_dataset_raw_to_waveforms( args.dataset_path, dataset_size=args.dataset_size, read_using_soundfile=args.read_using_sf, ) else: raise Exception(f"Invalid dataset type {args.dataset_type}") model.warm_up(sample=dataset[0], repeat=2) run_time, memory, flops = model.gather_all_metrics(dataset, repeat=1) print(f"run_time = {run_time}sec \tmemory = {memory}MiB \tflops = {flops}") if args.dump_speech_waveforms_dir: model.dump_final_speech_output( dataset, args.dump_speech_waveforms_dir, lambda x: x, args.target_sr, prefix=args.dump_waveform_file_prefix, ) if __name__ == "__main__": cli_main() ================================================ FILE: examples/speech_to_speech/docs/data_augmentation.md ================================================ # Noise and audio augmentation techniques The noise and data augmentation techniques were written in an effort to understand how augmenatation can affect model robustness and performance in both clean and noisy settings. All transforms discussed in this section are subclasses of `AudioFeatureTransform`, `AudioWaveformTransform`, or `AudioDatasetTransform`. Each `Audio*Transform` has unique interaction with the data. If interested in implemented one's own transforms, it is highly advisable to review the differences (see [Adding your own transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#adding-your-own-transforms)). 
If only applying the in-built transforms, then one only needs to be mindful that the correct kind of transform is listed in the config (see [Using transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#using-transforms)). These transforms can be applied to instances of `SpeechToTextDataset`. ### Contents [In-built transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#in-built-transforms) [Benchmark studies](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#benchmark-studies) [Using transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#using-transforms) [Adding your own transforms](https://github.com/facebookresearch/fairseq/blob/main/examples/speech_to_speech/docs/data_augmentation.md#adding-your-own-transforms) ## In-built transforms ### 1. Utterance concatenation Utterance concatenation is a data augmentation technique introduced as ConcatAug in [Translatotron 2: High-quality direct speech-to-speech translation with voice preservation](https://arxiv.org/pdf/2107.08661.pdf). With some parameterized probability, samples are concatenated with one other randomly chosen sample from the whole dataset. In the positive (concatenation) case, accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]+source[j]` and `target=target[i]+target[j]`. In the negative (skip concatenation) case, accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]` and `target=target[i]` as usual. **Usage**: `concataugment` is an `AudioDatasetTransform` and has three configurable hyperparameters: - `rate`: probability that any single access will result in the positive (concatenation) case. Defaults to 0.25. - `max_tokens`: maximum number of tokens allowed for concatenated source sequences. 
This parameter is meant to limit the length of concatenated samples to avoid out-of-memory errors. Defaults to 300. - `attempts`: maximum number of invalid concatenation attempts before defaulting to the negative (skip concatenation) case. This parameter aims to limit excessive time spent trying to find candidate samples that are short enough to concatenate with. Defaults to 5. Please be wary of OOMs while using this augmentation technique; we used smaller batch sizes as a workaround to avoid OOMs. Batch size is determined by update frequency, batch size hyperparameter, and the number of GPUs, so you may want to alter these to this end. ### 2. Noise augmentation suite The four noise augmentation methods in this suite adhere to the following principle: with some parameterized probability, samples are overlaid with a noise track. The content of the noise track is specific to the method. Signal-to-noise ratio with which the noise track is overlaid is determined by choosing a value from a random uniform distribution with parameterized endpoints. The first three methods are based off data augmentation methods suggested in Section 3.3 of [X-Vectors: Robust DNN Embeddings for Speaker Recognition](https://danielpovey.com/files/2018_icassp_xvectors.pdf). #### 2.1. Music augmentation For music augmentation, the noise track consists of one file uniformly randomly selected from a corpus of music files. The music file is cut to size, including being repeated to fill the original sample length if necessary. **Usage**: `musicaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters: - `samples_path`: path where background music files are saved as audios (.wav files). No default. - `rate`: probability that any single access will result in the positive (background music) case. Defaults to 0.25. - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. 
Defaults to 5. - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. #### 2.2. Babble augmentation For babble augmentation, the noise track consists of multiple audios uniformly randomly selected from a corpus of speech files. The number of speech audios in the background track is chosen randomly with equal probability between 3 and 7 audios. **Usage**: `babbleaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters: - `samples_path`: path where background speech files are saved as audios (.wav files). No default. - `rate`: probability that any single access will result in the positive (background speech) case. Defaults to 0.25. - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. #### 2.3. Sporadic noise augmentation For sporadic noise augmentation, the noise track is mostly silent except for intermittent short clips of noise which are added at roughly a parameterized frequency. These clips are randomly chosen and cut from a corpus of noise files to lengths according to a parameterized Gaussian distribution. **Usage**: `sporadicnoiseaugment` is an `AudioWaveformTransform` and has seven configurable hyperparameters: - `samples_path`: path where background noise files are saved as audios (.wav files). No default. - `rate`: probability that any single access will result in the positive (add a sporadic noise track) case. Defaults to 0.25. - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. 
- `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. - `noise_rate`: rate in noises per second at which noise clip will be added to the original sample - `noise_len_mean`: mean of Gaussian normal distribution from which length of noise clip is chosen - `noise_len_std`: standard deviation of Gaussian normal distribution from which length of noise clip is chosen #### 2.4. Background noise augmentation For background noise augmentation, the noise track is a single track uniformly randomly selected from a corpus of noise files. The noise file is cut to size, including being repeated to fill the original sample length if necessary. **Usage**: `backgroundnoiseaugment` is an `AudioWaveformTransform` and has four configurable hyperparameters: - `samples_path`: path where background noise files are saved as audios (.wav files). No default. - `rate`: probability that any single access will result in the positive (background noise) case. Defaults to 0.25. - `snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. - `snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 15. ### 3. Mixed babble and background noise augmentation with recognizable source speaker This augmentation technique is based on Algorithm 1 in [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) and is similar to the noise augmentation suite techniques in that it has a background noise track. The noise track consists of either (1) another audio sample from the batch or (2) a background noise track. 
A key difference is the length of the noise track is chosen from a uniform random distribution between 0 and half of the original sample length. **Usage**: `noisyoverlapaugment` is an `AudioDatasetTransform` and has seven configurable hyperparameters: - `noises_path`: path where background noise files are saved as audios (.wav files). No default. - `rate`: probability that any single access will result in the positive (background noise) case. Defaults to 0.25. - `mixing_noise_rate`: probability that in a positive (background noise) case, the noise track will consist of background noise (rather than babble from the batch). Defaults to 0.1. - `noise_snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to -5. - `noise_snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add background noise to the original source. Defaults to 5. - `utterance_snr_min`: lower endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add **another audio from the batch** to the original source. Defaults to -5. - `utterance_snr_max`: higher endpoint of the range from which a signal-to-noise ratio is uniformly randomly chosen with which to add **another audio from the batch** to the original source. Defaults to 5. 
## Benchmark studies ### Evaluation on clean data Augmentation in training data|Hyperparameters|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|---|--- None||3.954|24.984|23.962|24.448 ConcatAugment|rate = 0.25, max_tokens = 3000, attempts = 5|3.940|25.322|26.124|26.19 BabbleAugment|rate = 0.25, MUSAN speech, snr_min = (-5), snr_max = 5|3.957|24.226|23.186|22.368| BackgroundNoiseAugment|rate = 0.1, MUSAN noises, snr_min = (-10), snr_max = 10|3.955|24.745|23.513|23.819 MusicAugment|rate = 0.25, MUSAN music, snr_min = 0, snr_max = 20|3.954|25.096|24.301|23.341| SporadicNoiseAugment|rate = 0.1, noise_rate = 0.25, MUSAN noises, snr_min = 10, snr_max = 35|3.954|24.924|23.951|23.484| MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|as above, except limited rates to sum to 0.25: music (0.074), background (0.029), babble (0.074), sporadic (0.029)|3.953|24.874|23.675|24.249| NoisyOverlapAugment|rate = 0.25, mixing_noise_rate = 0.5, MUSAN noises, utterance_snr_min = (-10), utterance_snr_max = 0, noise_snr_min = (-5), noise_snr_max = 20|3.954|24.949|24.015|23.768| ### Evaluation on data with music noise added at SNR = (-5) - 5 Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|--- None|3.954|15.785|21.105|16.944 ConcatAugment|3.940|17.186|23.255|18.24 BabbleAugment|3.957|19.158|22.064|17.116 BackgroundNoiseAugment|3.955|17.777|22.0|17.535| MusicAugment|3.954|20.345|23.126|19.433| SporadicNoiseAugment|3.954|15.927|21.382|14.736| MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|19.724|22.659|17.852| NoisyOverlapAugment|3.954|17.49|22.142|17.207| ### Evaluation on data with babble noise added at SNR = (-5) - 5 Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|--- None|3.954|4.092|13.514|5.13 ConcatAugment|3.940|5.493|15.835|6.893 BabbleAugment|3.957|16.12|21.097|13.996 
BackgroundNoiseAugment|3.955|4.691|15.784|5.982 MusicAugment|3.954|8.06|17.764|9.008 SporadicNoiseAugment|3.954|4.009|13.935|4.814 MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|14.692|20.882|14.45 NoisyOverlapAugment|3.954|4.032|16.434|7.284 ### Evaluation on data with sporadic noise added at SNR = (-5) - 5 Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|--- None|3.954|23.778|23.745|22.748 ConcatAugment|3.940|24.239|25.907|25.723 BabbleAugment|3.957|23.42|23.048|21.076 BackgroundNoiseAugment|3.955|23.998|23.467|22.494 MusicAugment|3.954|24.142|24.181|19.143 SporadicNoiseAugment|3.954|23.97|23.894|22.61 MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|24.118|23.59|23.717 NoisyOverlapAugment|3.954|24.265|24.103|23.167 ### Evaluation on data with background noise added at SNR = (-5) - 5 Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|--- None|3.954|20.201|22.525|19.66 ConcatAugment|3.940|20.904|24.706|21.353 BabbleAugment|3.957|20.687|22.374|18.907 BackgroundNoiseAugment|3.955|21.574|22.998|20.043 MusicAugment|3.954|21.65|23.529|19.87 SporadicNoiseAugment|3.954|20.578|22.577|19.096 MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|21.811|23.144|20.986 NoisyOverlapAugment|3.954|21.312|23.153|20.302 ### Evaluation on data with all four types of noises added at SNR = (-5) - 5, each applied with prob 0.5 Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|--- None|3.954|10.895|19.319|12.748 ConcatAugment|3.940|13.517|21.658|15.428 BabbleAugment|3.957|18.09|21.384|16.018 BackgroundNoiseAugment|3.955|12.837|20.719|13.933 MusicAugment|3.954|16.589|21.823|15.927 SporadicNoiseAugment|3.954|11.238|19.91|13.31 MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|18.636|21.935|17.845 
NoisyOverlapAugment|3.954|12.829|20.856|15.048 ### Evaluation on data with noisy overlap augment Augmentation in training data|Training loss|BLEU (covost)|BLEU (epst)|BLEU (mtedx) ---|---|---|---|--- None|3.954|21.245|22.24|20.994 ConcatAugment|3.940|21.611|24.247|23.068 BabbleAugment|3.957|21.867|21.987|20.099| BackgroundNoiseAugment|3.955|21.533|21.806|19.717| MusicAugment|3.954|21.823|22.643|20.847| SporadicNoiseAugment|3.954|21.373|22.381|20.672| MusicAugment + BabbleAugment + BackgroundNoiseAugment + SporadicNoiseAugment|3.953|22.206|22.414|21.375| NoisyOverlapAugment|3.954|23.371|23.396|22.627| ## Using transforms Transforms are configurable. 1. Please pay careful attention to the type of transform you are applying. - `concataugment` and `noisyoverlapaugment` are instances of `AudioDatasetTransform` and should be listed in the config under `dataset_transforms`. - `musicaugment`, `babbleaugment`, `sporadicnoiseaugment`, and `backgroundnoiseaugment` are instances of `AudioWaveformTransform` and should be listed under `waveform_transforms`. - Instances of `AudioFeatureTransform` should be listed under `feature_transforms`. 2. Feel free to apply these augmentations in different contexts, e.g., you may use a `_train` or `_eval` flag to specify when the transform will be applied. If the dataset at hand contains `train` in its name, those transforms under the `_train` flag will be applied; else, the remaining transforms will be applied. 
For example, you would add this to your config to apply the musicaugment transform to a training dataset: ```yaml musicaugment: samples_path: ${MUSIC_PATH} snr_min: 10 snr_max: 15 rate: 0.25 waveform_transforms: _train: - musicaugment ``` or add this to apply the concataugment transform: ```yaml concataugment: rate: 0.25 max_tokens: 3000 attempts: 5 dataset_transforms: _train: - concataugment ``` You may also want to add multiple of one type of transform; here, we add multiple `AudioWaveformTransform`s: ```yaml musicaugment: samples_path: ${MUSIC_PATH} snr_min: 5 snr_max: 20 rate: 0.25 backgroundnoiseaugment: samples_path: ${NOISES_PATH} snr_min: 10 snr_max: 20 rate: 0.1 sporadicnoiseaugment: samples_path: ${NOISES_PATH} snr_min: 5 snr_max: 15 rate: 0.1 noise_rate: 0.25 waveform_transforms: _train: - musicaugment - backgroundnoiseaugment - sporadicnoiseaugment ``` ## Adding your own transforms Note: We store transform implementations in `fairseq/data/audio/*_transforms` directories. You may refer to these as examples while implementing your own transform. ### Step 1. Picking the right class for your transform The integration into SpeechToTextDataset is quite different for each kind of transform, so it is important to understand which one is best suited to your purposes. **Feature transforms** `AudioFeatureTransform` is a base class which allows **some transform to be applied to audio spectrograms** in the data loading step. One thing to note is that the source data is either saved as `np.ndarrays` or as audio files, and is to be returned either as features (spectrogram) or waveform. If and only if the data is to be returned as a spectrogram, then `AudioFeatureTransform`s will be applied. **Waveform transforms** `AudioWaveformTransform` is a base class which allows some **transform to be applied to waveforms** in the data loading step. As mentioned above, there are two source and return types to data loading for this dataset. 
If and only if the data is saved in audio file format, then `AudioWaveformTransform`s will be applied, whichever return type is used. **Dataset transforms** `AudioDatasetTransform` is a base class for transforms **based on more than one item in a dataset**, ex. concatenation of two random samples in a dataset. Rather than being applied in a consistent way, i.e., to all features or to all waveforms, the integration of a dataset transform is entirely specific. Adding a dataset transform requires actually editing the `fairseq/data/audio/speech_to_text_dataset.py` file. ### Step 2. Setting up your transform (generic to all types of transforms) Now that you know which kind of transform you would like to use, we are ready to implement it. This step is generic for all transform types, i.e., `TRANSFORM_TYPE` may be any of `feature`, `waveform`, or `dataset`. We will show how to build utterance concatenation (an `AudioDatasetTransform`) as an example. Import the base class and registration function for your transform. ```python from fairseq.data.audio.dataset_transforms import ( AudioDatasetTransform, register_audio_dataset_transform ) ``` Define the class and register the transform. The name passed into the registration function is how your transform should be named in the config. ```python @register_audio_dataset_transform("concataugment") class ConcatAugment(AudioDatasetTransform): ``` We are now ready to add the basic important functions to our new class. In this example, `_DEFAULTS` refers to a dictionary with the default hyperparameter values that we defined. `from_config_dict` is called to instantiate the transform given hyperparameters from the config. 
```python @classmethod def from_config_dict(cls, config=None): _config = {} if config is None else config return ConcatAugment( _config.get("rate", _DEFAULTS["rate"]), _config.get("max_tokens", _DEFAULTS["max_tokens"]), _config.get("attempts", _DEFAULTS["attempts"]), ) ``` We edit the instantiation function `__init__` to track hyperparameters and do any setup work. ```python def __init__( self, rate=_DEFAULTS["rate"], max_tokens=_DEFAULTS["max_tokens"], attempts=_DEFAULTS["attempts"], ): self.rate, self.max_tokens, self.attempts = rate, max_tokens, attempts ``` Lastly `__repr__` gives how the transform will be reported in an output log. ```python def __repr__(self): return ( self.__class__.__name__ + "(" + ", ".join( [ f"rate={self.rate}", f"max_tokens={self.max_tokens}", f"attempts={self.attempts}", ] ) + ")" ) ``` ### Step 3. Adding the transform logic At this point, we are ready to implement the actual transform logic. The flow from here is different for each of the three transforms, so follow the path that is relevant to you. ### ...for feature transforms The final step is implementing the `__call__` function, which applies the transform logic and **returns** the spectrogram with transform applied. This supports and should take exactly **two arguments**: - `self` - `x` (np.ndarray): the spectrogram for one source sample. (This is a positional argument, so you can use another parameter name like `spectrogram` instead of `x`.) For example, this is the `__call__` function for GlobalCMVN (cepstral mean and variance normalization). ```python def __call__(self, x): x = np.subtract(x, self.mean) x = np.divide(x, self.std) return x ``` ### ...for waveform transforms The final step is implementing the `__call__` function, which applies the transform logic. 
This supports and should take exactly **three arguments**: - `self` - `source` (numpy.ndarray or torch.Tensor): source audio 2d waveform (channels x length) - `sample_rate` (optional, defaults to None): sample rate of `source` `__call__` **returns**: - transformed audio waveform - sample rate of transformed audio waveform For example, this is the `__call__` function for augmentations in the Noise Augmentation Suite. ```python def __call__(self, source, sample_rate=None): if np.random.random() > self.rate: return source, sample_rate noise = self._get_noise( source.shape, always_2d=True, use_sample_rate=sample_rate ) return self._mix(source, noise, rand_uniform(self.snr_min, self.snr_max)), sample_rate ``` ### ...for dataset transforms Dataset transforms are extremely flexible, and implementation involves directly integrating them into `fairseq/data/audio/speech_to_text_dataset.py` in transform-specific ways. There are two basic components: (1) check whether or not this transform is part of this dataset instance using `self.dataset_transforms.has_transform(TRANSFORM_CLS)`, and (2) if so, get the transform using `self.dataset_transforms.get_transform(TRANSFORM_CLS)` & apply it. Due to the case-by-case specificity, it is easier to demonstrate this by examples. #### Example: NoisyOverlapAugment This transform requires access to multiple items within the same batch at once. **Logic**: We still use the transform classes to keep away the transform logic. For example, `__call__` of `NoisyOverlapAugment` class takes a list of source tokens for items in a mini-batch, applies noise/utterance as dictated by the transform, and returns the list of transformed source tokens for items in the mini-batch. ```python def __call__(self, sources): for i, source in enumerate(sources): if np.random.random() > self.rate: continue pri = source.numpy() # ...
some transform code omitted pri[s_source : s_source + l] = np.add( pri[s_source : s_source + l], np.multiply(scl, sec[s_sec : s_sec + l]) ) sources[i] = torch.from_numpy(pri).float() return sources ``` **Integration**: The `collater` function for `SpeechToTextDataset` is responsible for preparing a mini-batch for training, so we integrate NOAug through adding a few lines to the top of this function: ```python def collater( self, samples: List[SpeechToTextDatasetItem], return_order: bool = False ) -> Dict: if len(samples) == 0: return {} indices = torch.tensor([x.index for x in samples], dtype=torch.long) sources = [x.source for x in samples] # NOAUG INTEGRATION BLOCK # (1) Check whether or not this transform is part of this dataset instance has_NOAug = self.dataset_transforms.has_transform(NoisyOverlapAugment) # (2) If so, get & apply the transform if has_NOAug and self.cfg.use_audio_input: NOAug = self.dataset_transforms.get_transform(NoisyOverlapAugment) sources = NOAug(sources) frames = _collate_frames(sources, self.cfg.use_audio_input) # sort samples by descending number of frames n_frames = torch.tensor([x.size(0) for x in sources], dtype=torch.long) n_frames, order = n_frames.sort(descending=True) indices = indices.index_select(0, order) frames = frames.index_select(0, order) # ... rest of function ``` #### Example: ConcatAugment This transform requires access to another item within the dataset at once. **Logic**: We abstract the logic for picking indices to concatenate by adding a `find_indices` function to the `ConcatAugment` class, which takes one index in the dataset and finds a compatible second index to concatenate source and target tokens. 
```python def find_indices(self, index: int, n_frames: List[int], n_samples: int): # skip conditions: application rate, max_tokens limit exceeded if np.random.random() > self.rate: return [index] if self.max_tokens and n_frames[index] > self.max_tokens: return [index] # pick second sample to concatenate for _ in range(self.attempts): index2 = np.random.randint(0, n_samples) if index2 != index and ( not self.max_tokens or n_frames[index] + n_frames[index2] < self.max_tokens ): return [index, index2] return [index] ``` **Integration**: `SpeechToTextDataset` uses a custom `__getitem__(self, index)` function (called in the background when you write `dataset[i]`). We edited this function (as well as `_get_source_audio` and `get_tokenized_tgt_text`) to achieve the desired transform effect where accessing `dataset[i]` will return a `SpeechToTextDatasetItem` where `source=source[i]+source[j]` and `target=target[i]+target[j]`. ```python def __getitem__(self, index: int) -> SpeechToTextDatasetItem: # CONCATAUGMENT INTEGRATION BLOCK # (1) Check whether or not this transform is part of this dataset instance has_concat = self.dataset_transforms.has_transform(ConcatAugment) # (2) If so, get & apply the transform if has_concat: concat = self.dataset_transforms.get_transform(ConcatAugment) indices = concat.find_indices(index, self.n_frames, self.n_samples) source = self._get_source_audio(indices if has_concat else index) source = self.pack_frames(source) target = None if self.tgt_texts is not None: tokenized = self.get_tokenized_tgt_text(indices if has_concat else index) target = self.tgt_dict.encode_line( # ... 
rest of function ``` ================================================ FILE: examples/speech_to_speech/docs/direct_s2st_discrete_units.md ================================================ # Direct speech-to-speech translation with discrete units We provide the implementation for speech-to-unit translation (S2UT) proposed in "[Direct speech-to-speech translation with discrete units (Lee et al. 2021)](https://arxiv.org/abs/2107.05604)" and also the transformer-based implementation of the speech-to-spectrogram translation (S2SPECT, or transformer-based [Translatotron](https://arxiv.org/abs/1904.06037)) baseline in the paper. ## Pretrained Models ### Unit-based HiFi-GAN Vocoder Unit config | Unit size | Vocoder dataset | Model |---|---|---|--- [HuBERT Base, Librispeech](https://github.com/fairinternal/fairseq-py/tree/main/examples/hubert), layer 6 | 100 | [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/hubert_base_100_lj/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/hubert_base_100_lj/config.json) ## Data preparation ### Target speech 0. (optional) To prepare S2S data from a speech-to-text translation (ST) dataset, see [fairseq-S^2](https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis) for pre-trained TTS models and instructions on how to train and decode TTS models. 1. Prepare two folders, `$SRC_AUDIO` and `$TGT_AUDIO`, with `${SPLIT}/${SAMPLE_ID}.wav` for source and target speech under each folder, separately. Note that for S2UT experiments, target audio sampling rate should be in 16,000 Hz, and for S2SPECT experiments, target audio sampling rate is recommended to be in 22,050 Hz. 2. 
To prepare target discrete units for S2UT model training, see [Generative Spoken Language Modeling (speech2unit)](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/speech2unit) for pre-trained k-means models, checkpoints, and instructions on how to decode units from speech. Set the output target unit files (`--out_quantized_file_path`) as `${TGT_AUDIO}/${SPLIT}.txt`. In [Lee et al. 2021](https://arxiv.org/abs/2107.05604), we use 100 units from the sixth layer (`--layer 6`) of the HuBERT Base model. ### Formatting data **Speech-to-speech data** _S2UT_ * Set `--reduce-unit` for training S2UT _reduced_ model * Pre-trained vocoder and config (`$VOCODER_CKPT`, `$VOCODER_CFG`) can be downloaded from the **Pretrained Models** section. They are not required if `--eval-inference` is not going to be set during model training. ``` # $SPLIT1, $SPLIT2, etc. are split names such as train, dev, test, etc. python examples/speech_to_speech/preprocessing/prep_s2ut_data.py \ --source-dir $SRC_AUDIO --target-dir $TGT_AUDIO --data-split $SPLIT1 $SPLIT2 \ --output-root $DATA_ROOT --reduce-unit \ --vocoder-checkpoint $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG ``` _S2SPECT_ ``` # $SPLIT1, $SPLIT2, etc. are split names such as train, dev, test, etc. python examples/speech_to_speech/preprocessing/prep_s2spect_data.py \ --source-dir $SRC_AUDIO --target-dir $TGT_AUDIO --data-split $SPLIT1 $SPLIT2 \ --output-root $DATA_ROOT ``` **Multitask data** * For each multitask `$TASK_NAME`, prepare `${DATA_ROOT}/${TASK_NAME}/${SPLIT}.tsv` files for each split following the format below: (Two tab separated columns. The sample_ids should match with the sample_ids for the speech-to-speech data in `${DATA_ROOT}/${SPLIT}.tsv`.) ``` id tgt_text sample_id_0 token1 token2 token3 ... sample_id_1 token1 token2 token3 ... ... ``` * For each multitask `$TASK_NAME`, prepare `${DATA_ROOT}/${TASK_NAME}/dict.txt`, a dictionary in fairseq format with all tokens for the targets for `$TASK_NAME`. 
* Create `config_multitask.yaml`. Below is an example of the config used for S2UT _reduced_ with Fisher experiments including two encoder multitasks (`source_letter`, `target_letter`) and one decoder CTC task (`decoder_target_ctc`). ``` source_letter: # $TASK_NAME decoder_type: transformer dict: ${DATA_ROOT}/source_letter/dict.txt data: ${DATA_ROOT}/source_letter encoder_layer: 6 loss_weight: 8.0 target_letter: decoder_type: transformer dict: ${DATA_ROOT}/target_letter/dict.txt data: ${DATA_ROOT}/target_letter encoder_layer: 8 loss_weight: 8.0 decoder_target_ctc: decoder_type: ctc dict: ${DATA_ROOT}/decoder_target_ctc/dict.txt data: ${DATA_ROOT}/decoder_target_ctc decoder_layer: 3 loss_weight: 1.6 ``` ## Training **Speech-to-unit translation (S2UT)** Here's an example for training Fisher S2UT models with 100 discrete units as target: ``` fairseq-train $DATA_ROOT \ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ --task speech_to_speech --target-is-code --target-code-size 100 --vocoder code_hifigan \ --criterion speech_to_unit --label-smoothing 0.2 \ --arch s2ut_transformer_fisher --share-decoder-input-output-embed \ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \ --train-subset train --valid-subset dev \ --save-dir ${MODEL_DIR} \ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-7 --warmup-updates 10000 \ --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 \ --max-update 400000 --max-tokens 20000 --max-target-positions 3000 --update-freq 4 \ --seed 1 --fp16 --num-workers 8 ``` * Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 4` to simulate training with 4 GPUs. * Set `--n-frames-per-step 5` to train an S2UT _stacked_ system with reduction ratio r=5. (Use `$DATA_ROOT` prepared without `--reduce-unit`.) 
* (optional) one can turn on tracking MCD loss during training for checkpoint selection by setting `--eval-inference --eval-args '{"beam": 1, "max_len_a": 1}' --best-checkpoint-metric mcd_loss`. It is recommended to sample a smaller subset as the validation set as MCD loss computation is time-consuming. **Speech-to-spectrogram translation (S2SPECT)** Here's an example for training Fisher S2SPECT models with reduction ratio r=5: ``` fairseq-train $DATA_ROOT \ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ --task speech_to_speech --n-frames-per-step 5 \ --criterion speech_to_spectrogram \ --arch s2spect_transformer_fisher --decoder-normalize-before \ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \ --train-subset train --valid-subset dev \ --save-dir ${MODEL_DIR} \ --eval-inference --best-checkpoint-metric mcd_loss \ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-7 --warmup-updates 10000 \ --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 --weight-decay 1e-6 \ --max-update 400000 --max-tokens 80000 --max-tokens-valid 30000 --required-batch-size-multiple 1 \ --max-target-positions 3000 --update-freq 16 \ --seed 1 --fp16 --num-workers 8 ``` * Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 16` to simulate training with 16 GPUs. * We recommend turning on MCD loss during training for the best checkpoint selection. **Unit-based HiFi-GAN vocoder** The vocoder is trained with the [speech-resynthesis repo](https://github.com/facebookresearch/speech-resynthesis). See [here](https://github.com/facebookresearch/speech-resynthesis/tree/main/examples/speech_to_speech_translation) for instructions on how to train the unit-based HiFi-GAN vocoder with duration prediction. The same vocoder can support waveform generation for both _reduced_ unit sequences (with `--dur-prediction` set during inference) and original unit sequences. ## Inference **Speech-to-unit translation (S2UT)** 1. 
Follow the same inference process as in [fairseq-S2T](https://github.com/pytorch/fairseq/tree/main/examples/speech_to_text) to generate unit sequences (`${RESULTS_PATH}/generate-${GEN_SUBSET}.txt`). ``` fairseq-generate $DATA_ROOT \ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ --task speech_to_speech --target-is-code --target-code-size 100 --vocoder code_hifigan \ --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \ --max-tokens 50000 \ --beam 10 --max-len-a 1 \ --results-path ${RESULTS_PATH} ``` * Set `--beam 1 --n-frames-per-step $r` for decoding with S2UT _stacked_ models. 2. Convert unit sequences to waveform. ``` grep "^D\-" ${RESULTS_PATH}/generate-${GEN_SUBSET}.txt | \ sed 's/^D-//ig' | sort -nk1 | cut -f3 \ > ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit python examples/speech_to_speech/generate_waveform_from_code.py \ --in-code-file ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit \ --vocoder $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG \ --results-path ${RESULTS_PATH} --dur-prediction ``` * Set `--dur-prediction` for generating audio for S2UT _reduced_ models. **Speech-to-spectrogram translation (S2SPECT)** Follow the same inference process as in [fairseq-S^2](https://github.com/pytorch/fairseq/tree/main/examples/speech_synthesis) to generate waveform. ``` # assume using a default Griffin-Lim vocoder python examples/speech_synthesis/generate_waveform.py $DATA_ROOT \ --config-yaml config.yaml --multitask-config-yaml config_multitask.yaml \ --task speech_to_speech --n-frames-per-step 5 \ --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \ --max-tokens 50000 \ --results-path ${RESULTS_PATH} --dump-waveforms --output-sample-rate 16000 ``` In addition to using the default Griffin-Lim vocoder, one can also finetune a HiFi-GAN vocoder for the S2SPECT model by following the instructions in the [HiFi-GAN repo](https://github.com/jik876/hifi-gan). **Multitask decoding** Coming soon. 
## Evaluation To evaluate speech translation output, we first apply ASR on the speech output and then compute BLEU score between the ASR decoded text and the references using sacreBLEU. **En** * ASR: We use the "[Wav2Vec 2.0 Large (LV-60) + Self Training / 960 hours / Libri-Light + Librispeech](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt)" En ASR model open-sourced by the [wav2vec](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) project. See [instructions](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model) on how to run inference with a wav2vec-based ASR model. The model is also available on [Hugging Face](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self). * Text normalization: We use the text cleaner at [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron) for pre-processing reference English text for ASR BLEU evaluation. ================================================ FILE: examples/speech_to_speech/docs/enhanced_direct_s2st_discrete_units.md ================================================ # Speech to speech translation (S2ST) We provide the implementation for speech-to-unit translation (S2UT) proposed in [Enhanced Direct Speech-to-Speech Translation Using Self-supervised Pre-training and Data Augmentation (Popuri et al. 2022)](https://arxiv.org/abs/2204.02967) and the various pretrained models used.
## Pretrained Models ### Unit extraction We used the multilingual HuBERT model open sourced in [Textless S2ST with Real Data](textless_s2st_real_data.md) ### Wav2vec 2.0 Language | Block type | Model size | Dataset | Model | --- | --- | --- | --- | --- | Es | Transformer | BASE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/transformer_B.pt) | Es | Transformer | LARGE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/transformer_L.pt) | Es | Conformer | LARGE | Voxpopuli | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/es/conformer_L.pt) | En | Transformer | BASE | Librilight| [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/en/transformer_B.pt) | En | Conformer | LARGE | Librilight | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/w2v2/en/conformer_L.pt) | ### Unit mBART Unit size | Dataset | Unit config | Model | --- | --- | --- | --- | 1000 | [Voxpopuli](https://aclanthology.org/2021.acl-long.80) En, Es unlabelled speech | [mbart_large](https://github.com/pytorch/fairseq/blob/f591cc94caa85098ccf125a4782f91125b6a086d/fairseq/models/bart/model.py#L368) |[ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/unit_mBART/checkpoint.pt) | ## Data preparation 1. To prepare data for S2UT finetuning, follow the steps from [Direct S2ST with Discrete Units](./direct_s2st_discrete_units.md) and format the data in the _S2UT_ format. Note that we use 1000 units from the eleventh layer (`--layer 11`) of the multilingual hubert model linked above instead 2. Run ``` var="id\taudio\tn_frames\ttgt_text\ttgt_n_frames" sed -i "1s/.*/$var/" ${SPLIT}.tsv ``` ## Training **Speech-to-unit translation (S2UT)** Here's an example for finetuning S2UT models with 1000 discrete units as target. 
You can download the sample [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/config.yaml) file and [vocabulary](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/dict.txt) for Es-En from here: ``` fairseq-train $DATA_ROOT \ --config-yaml config.yaml \ --task speech_to_text --arch xm_transformer\ --criterion l --label-smoothing 0.2 \ --share-decoder-input-output-embed --adaptor-n-layers 1 --normalize\ --dropout 0.1 --attention-dropout 0.1 --relu-dropout 0.1 \ --train-subset train --valid-subset dev \ --load-pretrained-decoder-from ${unit_mBART} --w2v-path ${wav2vec2.0} \ --mask-prob 0.3 --mask-channel-length 32 --mask-channel-prob 0.25\ --save-dir ${MODEL_DIR} --checkpoint-activations --encoder-proj \ --lr 0.0005 --dropout 0.1 --attention-dropout 0.1 --lr-scheduler inverse_sqrt\ --warmup-init-lr 1e-7 --warmup-updates 10000 \ --optimizer adam --adam-betas "(0.9,0.98)" --clip-norm 10.0 \ --max-update 20000 --max-tokens 4000 --max-tokens-valid 4000 --max-source-positions 4000 \ --max-target-positions 4000 --update-freq 120 \ --seed 1 --fp16 --num-workers 1 ``` * Adjust `--update-freq` accordingly for different #GPUs. In the above we set `--update-freq 120` to simulate training with 120 GPUs on a single GPU; scale it down proportionally when more GPUs are available (e.g., `--update-freq 15` with 8 GPUs). * In the above setting we finetune the model end to end, corresponding to the full setup in the paper. * To apply LNA-E partial finetuning, add `--finetune-w2v-params layer_norm,self_attn` * For LNA-D partial finetuning add `--finetune-decoder-params encoder_attn,layer_norm,self_attn`. To optionally freeze the encoder by k updates, use `--freeze-finetune-updates ${K}` * For LNA-E,D partial finetuning add both the above options. **Unit-based HiFi-GAN vocoder** We apply the open-sourced unit-based HiFi-GAN vocoders to convert the predicted unit sequences to waveform. They are open sourced in [Textless S2ST with Real Data](textless_s2st_real_data.md) ## Inference **Speech-to-unit translation (S2UT)** 1.
Follow the same inference process as in [fairseq-S2T](https://github.com/pytorch/fairseq/tree/main/examples/speech_to_text) to generate unit sequences (`${RESULTS_PATH}/generate-${GEN_SUBSET}.txt`). ``` fairseq-generate $DATA_ROOT \ --config-yaml config.yaml \ --task speech_to_text \ --path $MODEL_DIR/checkpoint_best.pt --gen-subset $GEN_SUBSET \ --max-tokens 10000 --max-source-positions 10000 --max-target-positions 10000\ --beam 10 --max-len-a 1 --max-len-b 200 \ --results-path ${RESULTS_PATH} ``` 2. Convert unit sequences to waveform. ``` grep "^D\-" ${RESULTS_PATH}/generate-${GEN_SUBSET}.txt | \ sed 's/^D-//ig' | sort -nk1 | cut -f3 \ > ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit python examples/speech_to_speech/generate_waveform_from_code.py \ --in-code-file ${RESULTS_PATH}/generate-${GEN_SUBSET}.unit \ --vocoder $VOCODER_CKPT --vocoder-cfg $VOCODER_CFG \ --results-path ${RESULTS_PATH} --dur-prediction ``` ## Evaluation To evaluate speech translation output, we first apply ASR on the speech output and then compute BLEU score between the ASR decoded text and the references using sacreBLEU. * Text normalization: We use the text cleaner at [https://github.com/keithito/tacotron](https://github.com/keithito/tacotron) for pre-processing reference English text for ASR BLEU evaluation. The text cleaner used for Spanish text normalization will be updated here shortly. * En ASR: We use the "[Wav2Vec 2.0 Large (LV-60) + Self Training / 960 hours / Libri-Light + Librispeech](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt)" En ASR model open-sourced by the [wav2vec](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec) project. The model is also available on [Hugging Face](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self).
* Es ASR: We use the [Wav2Vec2-Large-XLSR-53-Spanish](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) finetuned on spanish Common Voice Es ASR model open-sourced by Jonatasgrosman(<https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish>) on [Hugging Face](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-spanish). * See [instructions](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model) on how to run inference with a wav2vec-based ASR model. ## Finetuned Model Checkpoints ID | En - Es | Es - En | | --- | --- | --- | **S2UT systems without pre-training** S2UT with multitask | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//S2UT_w_multitask.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//S2UT_w_multitask.pt) | **S2UT systems with model pre-training** w2v2-L | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_only.pt ) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_only.pt) | w2v2-L + mBART (LNA-E) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LNE.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LNE.pt) | w2v2-L + mBART (LNA-D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LND.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LND.pt) | w2v2-L + mBART (LNA-E,D) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LNED.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LNED.pt) | **S2UT systems with model pre-training and data augmentation** w2v2-L + mBART (LNA-D) | 
[checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/en_es//w2v2_mbart_LND_w_ASR.pt) | [checkpoint](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/s2st_finetuning/es_en//w2v2_mbart_LND_w_ASR.pt) | Note: Some of the tasks use speech_to_text_sharded task which is yet to be open sourced. So make sure to override the task to speech_to_text to use those models. ================================================ FILE: examples/speech_to_speech/docs/textless_s2st_real_data.md ================================================ # Textless Speech-to-Speech Translation (S2ST) on Real Data We provide instructions and pre-trained models for the work "[Textless Speech-to-Speech Translation on Real Data (Lee et al. 2021)](https://arxiv.org/abs/2112.08352)". ## Pre-trained Models ### HuBERT Model | Pretraining Data | Model | Quantizer |---|---|---|--- mHuBERT Base | [VoxPopuli](https://github.com/facebookresearch/voxpopuli) En, Es, Fr speech from the 100k subset | [download](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3.pt) | [L11 km1000](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin) ### Unit-based HiFi-GAN vocoder Unit config | Unit size | Vocoder language | Dataset | Model |---|---|---|---|--- mHuBERT, layer 11 | 1000 | En | [LJSpeech](https://keithito.com/LJ-Speech-Dataset/) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json) mHuBERT, layer 11 | 1000 | Es | [CSS10](https://github.com/Kyubyong/css10) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/g_00500000), 
[config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/config.json) mHuBERT, layer 11 | 1000 | Fr | [CSS10](https://github.com/Kyubyong/css10) | [ckpt](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/g_00500000), [config](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/config.json) ### Speech normalizer Language | Training data | Target unit config | Model |---|---|---|--- En | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_10min.tar.gz) En | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_1h.tar.gz) En | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/en/en_10h.tar.gz) Es | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_10min.tar.gz) Es | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_1h.tar.gz) Es | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/es/es_10h.tar.gz) Fr | 10 mins | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_10min.tar.gz) Fr | 1 hr | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_1h.tar.gz) Fr | 10 hrs | mHuBERT, layer 11, km1000 | [download](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/speech_normalizer/fr/fr_10h.tar.gz) * Refer to the paper for the details of the training data. 
## Inference with Pre-trained Models ### Speech normalizer 1. Download the pre-trained models, including the dictionary, to `DATA_DIR`. 2. Format the audio data. ```bash # AUDIO_EXT: audio extension, e.g. wav, flac, etc. # Assume all audio files are at ${AUDIO_DIR}/*.${AUDIO_EXT} python examples/speech_to_speech/preprocessing/prep_sn_data.py \ --audio-dir ${AUDIO_DIR} --ext ${AUDIO_EXT} \ --data-name ${GEN_SUBSET} --output-dir ${DATA_DIR} \ --for-inference ``` 3. Run the speech normalizer and post-process the output. ```bash mkdir -p ${RESULTS_PATH} python examples/speech_recognition/new/infer.py \ --config-dir examples/hubert/config/decode/ \ --config-name infer_viterbi \ task.data=${DATA_DIR} \ task.normalize=false \ common_eval.results_path=${RESULTS_PATH}/log \ common_eval.path=${DATA_DIR}/checkpoint_best.pt \ dataset.gen_subset=${GEN_SUBSET} \ '+task.labels=["unit"]' \ +decoding.results_path=${RESULTS_PATH} \ common_eval.post_process=none \ +dataset.batch_size=1 \ common_eval.quiet=True # Post-process and generate output at ${RESULTS_PATH}/${GEN_SUBSET}.txt python examples/speech_to_speech/preprocessing/prep_sn_output_data.py \ --in-unit ${RESULTS_PATH}/hypo.units \ --in-audio ${DATA_DIR}/${GEN_SUBSET}.tsv \ --output-root ${RESULTS_PATH} ``` ### Unit-to-waveform conversion with unit vocoder The pre-trained vocoders can support generating audio for both full unit sequences and reduced unit sequences (i.e. with duplicated consecutive units removed). Set `--dur-prediction` for generating audio with reduced unit sequences. ```bash # IN_CODE_FILE contains one unit sequence per line. Units are separated by space. python examples/speech_to_speech/generate_waveform_from_code.py \ --in-code-file ${IN_CODE_FILE} \ --vocoder ${VOCODER_CKPT} --vocoder-cfg ${VOCODER_CFG} \ --results-path ${RESULTS_PATH} --dur-prediction ``` ## Training new models To be updated.
================================================
FILE: examples/speech_to_speech/generate_waveform_from_code.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import json
import logging
from pathlib import Path
import random
import soundfile as sf
import torch

from tqdm import tqdm

from fairseq import utils
from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder


logging.basicConfig()
logging.root.setLevel(logging.INFO)
# NOTE(review): logging.basicConfig() was already called above, so this second
# call presumably has no effect on the root handlers -- confirm before relying
# on it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def dump_result(args, sample_id, pred_wav, suffix=""):
    """Write one predicted waveform to ``args.results_path``.

    The output file is named ``{sample_id}{suffix}_pred.wav``. ``pred_wav`` is
    detached, moved to CPU and converted to a NumPy array before being handed
    to ``sf.write``.
    """
    sf.write(
        f"{args.results_path}/{sample_id}{suffix}_pred.wav",
        pred_wav.detach().cpu().numpy(),
        # Sample rate passed to soundfile is fixed here.
        # NOTE(review): it is not read from the vocoder config -- confirm that
        # every supported vocoder produces 16 kHz audio.
        16000,
    )


def load_code(in_file):
    """Read ``in_file`` and return one list of ints per line.

    Each line is stripped and split on whitespace, and every token is
    converted with ``int``; an empty line therefore yields an empty list.
    """
    with open(in_file) as f:
        out = [list(map(int, line.strip().split())) for line in f]
    return out


def main(args):
    """Generate a waveform for every code sequence in ``args.in_code_file``.

    Loads the vocoder config (JSON) from ``args.vocoder_cfg``, builds a
    ``CodeHiFiGANVocoder`` from ``args.vocoder``, and writes one wav file per
    input line into ``args.results_path`` (created if missing). CUDA is used
    when it is available and ``args.cpu`` is not set.
    """
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    with open(args.vocoder_cfg) as f:
        vocoder_cfg = json.load(f)
    vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg)
    if use_cuda:
        vocoder = vocoder.cuda()

    multispkr = vocoder.model.multispkr
    if multispkr:
        logger.info("multi-speaker vocoder")
        num_speakers = vocoder_cfg.get(
            "num_speakers", 200
        )  # following the default in codehifigan to set to 200
        # Only the upper bound is checked; -1 means "sample a random speaker"
        # (handled in the loop below).
        # NOTE(review): values below -1 are not rejected here.
        assert (
            args.speaker_id < num_speakers
        ), f"invalid --speaker-id ({args.speaker_id}) with total #speakers = {num_speakers}"

    data = load_code(args.in_code_file)
    Path(args.results_path).mkdir(exist_ok=True, parents=True)
    for i, d in tqdm(enumerate(data), total=len(data)):
        # Codes are reshaped to a single-row 2-D tensor (1 x sequence length).
        x = {
            "code": torch.LongTensor(d).view(1, -1),
        }
        suffix = ""
        if multispkr:
            # Draw a fresh random speaker per utterance when --speaker-id is -1.
            spk = (
                random.randint(0, num_speakers - 1)
                if args.speaker_id == -1
                else args.speaker_id
            )
            suffix = f"_spk{spk}"
            x["spkr"] = torch.LongTensor([spk]).view(1, 1)

        x = utils.move_to_cuda(x) if use_cuda else x
        wav = vocoder(x, args.dur_prediction)
        # The output file name uses the 0-based line index of the input file.
        dump_result(args, i, wav, suffix=suffix)


def cli_main():
    """Parse command-line arguments and run :func:`main`."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-code-file", type=str, required=True, help="one unit sequence per line"
    )
    parser.add_argument(
        "--vocoder", type=str, required=True, help="path to the CodeHiFiGAN vocoder"
    )
    parser.add_argument(
        "--vocoder-cfg",
        type=str,
        required=True,
        help="path to the CodeHiFiGAN vocoder config",
    )
    parser.add_argument("--results-path", type=str, required=True)
    parser.add_argument(
        "--dur-prediction",
        action="store_true",
        help="enable duration prediction (for reduced/unique code sequences)",
    )
    parser.add_argument(
        "--speaker-id",
        type=int,
        default=-1,
        help="Speaker id (for vocoder that supports multispeaker). Set to -1 to randomly sample speakers.",
    )
    parser.add_argument("--cpu", action="store_true", help="run on CPU")

    args = parser.parse_args()

    main(args)


if __name__ == "__main__":
    cli_main()

================================================
FILE: examples/speech_to_speech/preprocessing/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

================================================
FILE: examples/speech_to_speech/preprocessing/data_utils.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from pathlib import Path
from typing import List, Optional

from examples.speech_to_text.data_utils import S2TDataConfigWriter


def gen_config_yaml(
    manifest_root: Path,
    yaml_filename: str = "config.yaml",
    specaugment_policy: Optional[str] = "lb",
    feature_transform: Optional[List[str]] = None,
    input_channels: Optional[int] = 1,
    input_feat_per_channel: Optional[int] = 80,
    audio_root: str = "",
    vocoder_type: Optional[str] = None,
    vocoder_checkpoint: Optional[str] = None,
    vocoder_cfg: Optional[str] = None,
    extra=None,
):
    """Write a data config YAML into ``manifest_root`` via ``S2TDataConfigWriter``.

    Args:
        manifest_root: directory that receives the YAML (made absolute first).
        yaml_filename: file name of the YAML inside ``manifest_root``.
        specaugment_policy: one of ``"lb"``, ``"ld"``, ``"sm"``, ``"ss"`` to
            call the matching writer setter. Any non-``None`` value also
            appends ``"specaugment"`` to the ``_train`` feature transforms.
        feature_transform: transforms registered for all splits (``"*"``);
            ``None`` registers nothing for ``"*"``.
        input_channels: written when not ``None``.
        input_feat_per_channel: written when not ``None``.
        audio_root: written when non-empty.
        vocoder_type, vocoder_checkpoint, vocoder_cfg: when all three are
            given, a ``"vocoder"`` entry is added through ``set_extra``.
        extra: optional additional mapping passed to ``set_extra``.
    """
    manifest_root = manifest_root.absolute()
    writer = S2TDataConfigWriter(manifest_root / yaml_filename)

    if input_channels is not None:
        writer.set_input_channels(input_channels)
    if input_feat_per_channel is not None:
        writer.set_input_feat_per_channel(input_feat_per_channel)
    specaugment_setters = {
        "lb": writer.set_specaugment_lb_policy,
        "ld": writer.set_specaugment_ld_policy,
        "sm": writer.set_specaugment_sm_policy,
        "ss": writer.set_specaugment_ss_policy,
    }
    specaugment_setter = specaugment_setters.get(specaugment_policy, None)
    if specaugment_setter is not None:
        specaugment_setter()

    if feature_transform is None:
        feature_transform = []
    else:
        writer.set_feature_transforms("*", feature_transform)

    if specaugment_policy is not None:
        # training split gets the shared transforms plus specaugment
        writer.set_feature_transforms("_train", feature_transform + ["specaugment"])

    if len(audio_root) > 0:
        writer.set_audio_root(audio_root)

    if (
        vocoder_type is not None
        and vocoder_checkpoint is not None
        and vocoder_cfg is not None
    ):
        writer.set_extra(
            {
                "vocoder": {
                    "type": vocoder_type,
                    "config": vocoder_cfg,
                    "checkpoint": vocoder_checkpoint,
                }
            }
        )

    if extra is not None:
        writer.set_extra(extra)
    writer.flush()


def load_units(in_file):
    """Load ``sample_id|u1 u2 ...`` lines into a ``{sample_id: [units]}`` dict.

    Units are kept as strings. Blank lines (for example a trailing empty line)
    are skipped instead of aborting the whole load.

    Raises:
        ValueError: if a non-blank line contains no ``|`` separator.
    """
    out = {}
    with open(in_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # nothing to parse; unpacking "".split("|", 1) would fail
                continue
            sample_id, units = line.split("|", 1)
            out[sample_id] = units.split()

    return out


def process_units(units, reduce=False):
    """Optionally collapse runs of identical consecutive units.

    With ``reduce=False`` the input sequence is returned unchanged (the same
    object). With ``reduce=True`` a new list is returned in which every run of
    equal neighbours is kept once, e.g. ``1 1 1 2 2`` -> ``1 2``.
    """
    if not reduce:
        return units

    out = [u for i, u in enumerate(units) if i == 0 or u != units[i - 1]]
    return out

================================================ FILE: examples/speech_to_speech/preprocessing/prep_s2spect_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import logging import os from pathlib import Path import shutil import torchaudio import soundfile as sf from tqdm import tqdm import pandas as pd from examples.speech_synthesis.data_utils import extract_logmel_spectrogram from examples.speech_to_speech.preprocessing.data_utils import gen_config_yaml from examples.speech_to_text.data_utils import create_zip, get_zip_manifest, save_df_to_tsv from fairseq.data.audio.audio_utils import convert_waveform logger = logging.getLogger(__name__) MANIFEST_COLUMNS = ["id", "src_audio", "src_n_frames", "tgt_audio", "tgt_n_frames"] def prepare_target_data(args, tgt_audios): feature_name = "logmelspec80" zip_path = args.output_root / f"{feature_name}.zip" if zip_path.exists(): print(f"{zip_path} exists.") return zip_path feature_root = args.output_root / feature_name feature_root.mkdir(exist_ok=True) print("Extracting Mel spectrogram features...") for tgt_audio in tqdm(tgt_audios): sample_id = tgt_audio.stem waveform, sample_rate = torchaudio.load(tgt_audio.as_posix()) waveform, sample_rate = convert_waveform( waveform, sample_rate, normalize_volume=args.normalize_volume, to_sample_rate=args.sample_rate ) extract_logmel_spectrogram( waveform, sample_rate, feature_root / f"{sample_id}.npy", win_length=args.win_length, hop_length=args.hop_length, n_fft=args.n_fft, n_mels=args.n_mels, f_min=args.f_min, f_max=args.f_max ) print("ZIPing features...") create_zip(feature_root, zip_path) shutil.rmtree(feature_root) return zip_path def process(args): os.makedirs(args.output_root, exist_ok=True) manifest = {} tgt_audios = [] for split in args.data_split: 
print(f"Processing {split}...") manifest[split] = {c: [] for c in MANIFEST_COLUMNS} missing_tgt_audios = [] src_audios = list(args.source_dir.glob(f"{split}/*.wav")) for src_audio in tqdm(src_audios): sample_id = src_audio.stem tgt_audio = args.target_dir / split / f"{sample_id}.wav" if not tgt_audio.is_file(): missing_tgt_audios.append(sample_id) continue tgt_audios.append(tgt_audio) src_n_frames = sf.info(src_audio.as_posix()).frames manifest[split]["id"].append(sample_id) manifest[split]["src_audio"].append(src_audio.as_posix()) manifest[split]["src_n_frames"].append( src_n_frames // 160 ) # estimation of 10-ms frame for 16kHz audio print(f"Processed {len(manifest[split]['id'])} samples") if len(missing_tgt_audios) > 0: print( f"{len(missing_tgt_audios)} with missing target data (first 3 examples: {', '.join(missing_tgt_audios[:3])})" ) # Extract features and pack features into ZIP zip_path = prepare_target_data(args, tgt_audios) print("Fetching ZIP manifest...") tgt_audio_paths, tgt_audio_lengths = get_zip_manifest(zip_path) print("Generating manifest...") for split in args.data_split: print(f"Processing {split}...") for sample_id in tqdm(manifest[split]["id"]): manifest[split]["tgt_audio"].append(tgt_audio_paths[sample_id]) manifest[split]["tgt_n_frames"].append(tgt_audio_lengths[sample_id]) out_manifest = args.output_root / f"{split}.tsv" print(f"Writing manifest to {out_manifest}...") save_df_to_tsv(pd.DataFrame.from_dict(manifest[split]), out_manifest) # Generate config YAML win_len_t = args.win_length / args.sample_rate hop_len_t = args.hop_length / args.sample_rate extra = { "features": { "type": "spectrogram+melscale+log", "sample_rate": args.sample_rate, "eps": 1e-5, "n_mels": args.n_mels, "n_fft": args.n_fft, "window_fn": "hann", "win_length": args.win_length, "hop_length": args.hop_length, "win_len_t": win_len_t, "hop_len_t": hop_len_t, "f_min": args.f_min, "f_max": args.f_max, "n_stft": args.n_fft // 2 + 1 } } gen_config_yaml( args.output_root, 
audio_root=args.output_root.as_posix(), specaugment_policy="lb", feature_transform=["utterance_cmvn", "delta_deltas"], extra=extra, ) def main(): parser = argparse.ArgumentParser() parser.add_argument( "--source-dir", required=True, type=Path, help="source audio directory" ) parser.add_argument( "--target-dir", required=True, type=Path, help="target audio directory" ) parser.add_argument( "--data-split", default=["train", "valid", "test"], nargs="+", help="data split names", ) parser.add_argument( "--output-root", required=True, type=Path, help="output directory" ) # target feature related parser.add_argument("--win-length", type=int, default=1024) parser.add_argument("--hop-length", type=int, default=256) parser.add_argument("--n-fft", type=int, default=1024) parser.add_argument("--n-mels", type=int, default=80) parser.add_argument("--f-min", type=int, default=20) parser.add_argument("--f-max", type=int, default=8000) parser.add_argument("--sample-rate", type=int, default=22050) parser.add_argument("--normalize-volume", "-n", action="store_true") args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_speech/preprocessing/prep_s2ut_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse import logging from pathlib import Path import soundfile as sf from tqdm import tqdm import pandas as pd from examples.speech_to_speech.preprocessing.data_utils import ( gen_config_yaml, load_units, process_units, ) from examples.speech_to_text.data_utils import save_df_to_tsv logger = logging.getLogger(__name__) MANIFEST_COLUMNS = ["id", "src_audio", "src_n_frames", "tgt_audio", "tgt_n_frames"] def process(args): args.output_root.mkdir(exist_ok=True) print("Generating manifest...") for split in args.data_split: print(f"Processing {split}") # load target units target_unit_data = load_units(args.target_dir / f"{split}.txt") manifest = {c: [] for c in MANIFEST_COLUMNS} missing_tgt_audios = [] src_audios = list(args.source_dir.glob(f"{split}/*.wav")) for src_audio in tqdm(src_audios): sample_id = src_audio.stem if sample_id not in target_unit_data: missing_tgt_audios.append(sample_id) continue src_n_frames = sf.info(src_audio.as_posix()).frames manifest["id"].append(sample_id) manifest["src_audio"].append(src_audio.as_posix()) manifest["src_n_frames"].append( src_n_frames // 160 ) # estimation of 10-ms frame for 16kHz audio target_units = process_units(target_unit_data[sample_id], args.reduce_unit) manifest["tgt_audio"].append(" ".join(target_units)) manifest["tgt_n_frames"].append(len(target_units)) print(f"Processed {len(manifest['id'])} samples") if len(missing_tgt_audios) > 0: print( f"{len(missing_tgt_audios)} with missing target data (first 3 examples: {', '.join(missing_tgt_audios[:3])})" ) out_manifest = args.output_root / f"{split}.tsv" print(f"Writing manifest to {out_manifest}...") save_df_to_tsv(pd.DataFrame.from_dict(manifest), out_manifest) # Generate config YAML gen_config_yaml( args.output_root, specaugment_policy="lb", feature_transform=["utterance_cmvn"], vocoder_type="code_hifigan", vocoder_checkpoint=args.vocoder_checkpoint, vocoder_cfg=args.vocoder_cfg, ) def main(): parser = argparse.ArgumentParser() parser.add_argument( 
"--source-dir", required=True, type=Path, help="source audio directory" ) parser.add_argument( "--target-dir", required=True, type=Path, help="target audio directory" ) parser.add_argument( "--data-split", default=["train", "valid", "test"], nargs="+", help="data split names", ) parser.add_argument( "--output-root", required=True, type=Path, help="output directory" ) parser.add_argument( "--reduce-unit", action="store_true", help="reduce a target unit sequence to a unique unit sequence, i.e. '1 1 1 2 2' -> '1 2'", ) parser.add_argument( "--vocoder-checkpoint", default=None, type=str, help="vocoder checkpoint" ) parser.add_argument( "--vocoder-cfg", default=None, type=str, help="vocoder config file" ) args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_speech/preprocessing/prep_sn_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # # Adapted from examples/wav2vec/wav2vec_manifest.py """ Data preparation for the speech normalizer """ import argparse import glob import os import soundfile from examples.speech_to_speech.preprocessing.data_utils import load_units, process_units def process(args): assert ( args.for_inference or args.target_unit is not None ), "missing --target-unit or --for-inference" if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) dir_path = os.path.realpath(args.audio_dir) search_path = os.path.join(dir_path, "**/*." 
+ args.ext) if args.target_unit: unit_data = load_units(args.target_unit) with open(os.path.join(args.output_dir, f"{args.data_name}.tsv"), "w") as o_t, open( os.path.join(args.output_dir, f"{args.data_name}.unit"), "w" ) as o_u: print(dir_path, file=o_t) for fname in glob.iglob(search_path, recursive=True): file_path = os.path.realpath(fname) frames = soundfile.info(fname).frames print( "{}\t{}".format(os.path.relpath(file_path, dir_path), frames), file=o_t ) if args.for_inference: print("0", file=o_u) else: sample_id = os.path.basename(file_path)[: -len(args.ext) - 1] assert ( sample_id in unit_data ), f'{fname} does not have unit data in {args.target_unit}. Expecting sample_id "{sample_id}".' target_units = process_units(unit_data[sample_id], reduce=True) print(" ".join(target_units), file=o_u) def main(): parser = argparse.ArgumentParser() parser.add_argument("--audio-dir", required=True, type=str, help="audio directory") parser.add_argument("--ext", default="flac", type=str, help="audio extension") parser.add_argument( "--data-name", required=True, type=str, help="dataset name", ) parser.add_argument( "--output-dir", required=True, type=str, help="output directory" ) parser.add_argument( "--for-inference", action="store_true", help="set this if preparing data for running inference with a speech normalizer", ) parser.add_argument( "--target-unit", default=None, type=str, help="a file containing unit sequences in the format: sample_id|u1 u2 ...", ) args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_speech/preprocessing/prep_sn_output_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse from pathlib import Path from tqdm import tqdm def process(args): args.output_root.mkdir(exist_ok=True) # load units units = {} with open(args.in_unit) as f: for line in f: unit_seq, utt_id = line.strip().rsplit(" ", 1) utt_id = int(utt_id[6:-1]) # remove "(None-" units[utt_id] = unit_seq with open(args.in_audio) as f, open( args.output_root / f"{args.in_audio.stem}.txt", "w" ) as o: f.readline() for i, line in enumerate(tqdm(f.readlines())): audio, _ = line.strip().split("\t", 1) sample_id = Path(audio).stem o.write(f"{sample_id}|{units[i]}\n") def main(): parser = argparse.ArgumentParser() parser.add_argument( "--in-unit", required=True, type=Path, help="unit file (output from the speech normalizer)", ) parser.add_argument( "--in-audio", required=True, type=Path, help="tsv file (input to the normalizer)", ) parser.add_argument( "--output-root", required=True, type=Path, help="output directory" ) args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_speech/unity/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import sequence_generator # noqa from . import sequence_generator_multi_decoder # noqa ================================================ FILE: examples/speech_to_speech/unity/sequence_generator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import math import sys from typing import Dict, List, Optional import torch from torch import Tensor from fairseq.sequence_generator import EnsembleModel as EnsembleModelBase from fairseq.sequence_generator import SequenceGenerator as SequenceGeneratorBase class SequenceGenerator(SequenceGeneratorBase): def __init__( self, models, tgt_dict, beam_size=1, max_len_a=0, max_len_b=200, max_len=0, min_len=1, normalize_scores=True, len_penalty=1.0, unk_penalty=0.0, temperature=1.0, match_source_len=False, no_repeat_ngram_size=0, search_strategy=None, eos=None, symbols_to_strip_from_output=None, lm_model=None, lm_weight=1.0, tokens_to_suppress=(), ): """Generates translations of a given source sentence. Args: models (List[~fairseq.models.FairseqModel]): ensemble of models, currently support fairseq.models.TransformerModel for scripting beam_size (int, optional): beam width (default: 1) max_len_a/b (int, optional): generate sequences of maximum length ax + b, where x is the source length max_len (int, optional): the maximum length of the generated output (not including end-of-sentence) min_len (int, optional): the minimum length of the generated output (not including end-of-sentence) normalize_scores (bool, optional): normalize scores by the length of the output (default: True) len_penalty (float, optional): length penalty, where <1.0 favors shorter, >1.0 favors longer sentences (default: 1.0) unk_penalty (float, optional): unknown word penalty, where <0 produces more unks, >0 produces fewer (default: 0.0) temperature (float, optional): temperature, where values >1.0 produce more uniform samples and values <1.0 produce sharper samples (default: 1.0) match_source_len (bool, optional): outputs should match the source length (default: False) """ super().__init__( models=models, tgt_dict=tgt_dict, beam_size=beam_size, max_len_a=max_len_a, max_len_b=max_len_b, max_len=max_len, min_len=min_len, normalize_scores=normalize_scores, len_penalty=len_penalty, unk_penalty=unk_penalty, 
temperature=temperature, match_source_len=match_source_len, no_repeat_ngram_size=no_repeat_ngram_size, search_strategy=search_strategy, eos=eos, symbols_to_strip_from_output=symbols_to_strip_from_output, lm_model=lm_model, lm_weight=lm_weight, tokens_to_suppress=tokens_to_suppress, ) if isinstance(models, EnsembleModel): self.model = models else: self.model = EnsembleModel(models) self.model.set_decoder_beam_size(self.beam_size) self.model.eval() def _generate( self, sample: Dict[str, Dict[str, Tensor]], prefix_tokens: Optional[Tensor] = None, constraints: Optional[Tensor] = None, bos_token: Optional[int] = None, ): net_input = sample["net_input"] if "src_tokens" in net_input: src_tokens = net_input["src_tokens"] # length of the source text being the character length except EndOfSentence and pad # if src_lengths exists in net_input (speech_to_text dataset case), then use it if "src_lengths" in net_input: src_lengths = net_input["src_lengths"] else: src_lengths = ( (src_tokens.ne(self.eos) & src_tokens.ne(self.pad)) .long() .sum(dim=1) ) elif "source" in net_input: src_tokens = net_input["source"] src_lengths = ( net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) if net_input["padding_mask"] is not None else torch.tensor(src_tokens.size(-1)).to(src_tokens) ) elif "features" in net_input: src_tokens = net_input["features"] src_lengths = ( net_input["padding_mask"].size(-1) - net_input["padding_mask"].sum(-1) if net_input["padding_mask"] is not None else torch.tensor(src_tokens.size(-1)).to(src_tokens) ) else: raise Exception( "expected src_tokens or source in net input. 
input keys: " + str(net_input.keys()) ) if constraints is not None and not self.search.supports_constraints: raise NotImplementedError( "Target-side constraints were provided, but search method doesn't support them" ) # Initialize constraints, when active self.search.init_constraints(constraints, self.beam_size) # compute the encoder output for each beam with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"): encoder_outs = self.model.forward_encoder(net_input) finalized = self.generate_decoder( encoder_outs, src_tokens, src_lengths, sample, prefix_tokens, constraints, bos_token, ) return finalized def generate_decoder( self, encoder_outs, src_tokens, src_lengths, sample: Dict[str, Dict[str, Tensor]], prefix_tokens: Optional[Tensor] = None, constraints: Optional[Tensor] = None, bos_token: Optional[int] = None, aux_task_name="", encoder_outs_aug: Optional[ Tensor ] = None, # an additional/augmented encoder_outs ): incremental_states = torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [ torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) for i in range(self.model.models_size) ], ) # bsz: total number of sentences in beam # Note that src_tokens may have more than 2 dimensions (i.e. audio features) bsz, src_len = src_tokens.size()[:2] beam_size = self.beam_size decoder_name = f"{aux_task_name}_decoder" if aux_task_name else "decoder" max_len: int = -1 if self.match_source_len: max_len = src_lengths.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), self.max_len - 1, ) assert ( self.min_len <= max_len ), "min_len cannot be larger than max_len, please adjust these!" # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) # ensure encoder_outs is a List. 
assert encoder_outs is not None if encoder_outs_aug is not None: encoder_outs_aug = self.model.reorder_encoder_out( encoder_outs_aug, new_order ) # initialize buffers scores = ( torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() ) # +1 for eos; pad is never chosen for scoring tokens = ( torch.zeros(bsz * beam_size, max_len + 2) .to(src_tokens) .long() .fill_(self.pad) ) # +2 for eos and pad tokens[:, 0] = self.eos if bos_token is None else bos_token attn: Optional[Tensor] = None # A list that indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then cands_to_ignore would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. cands_to_ignore = ( torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) ) # forward and backward-compatible False mask # list of completed sentences finalized = torch.jit.annotate( List[List[Dict[str, Tensor]]], [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step # a boolean array indicating if the sentence at the index is finished or not finished = [False for i in range(bsz)] num_remaining_sent = bsz # number of sentences remaining # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = ( (torch.arange(0, bsz) * beam_size) .unsqueeze(1) .type_as(tokens) .to(src_tokens.device) ) cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device) reorder_state: Optional[Tensor] = None batch_idxs: Optional[Tensor] = None original_batch_idxs: Optional[Tensor] = None if "id" in sample and isinstance(sample["id"], Tensor): original_batch_idxs = sample["id"] else: original_batch_idxs = torch.arange(0, bsz).type_as(tokens) for step in range(max_len + 1): # one extra step for EOS marker # 
reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as( batch_idxs ) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size ) original_batch_idxs = original_batch_idxs[batch_idxs] self.model.reorder_incremental_state( incremental_states, reorder_state, decoder_name ) encoder_outs = self.model.reorder_encoder_out( encoder_outs, reorder_state ) if encoder_outs_aug is not None: encoder_outs_aug = self.model.reorder_encoder_out( encoder_outs_aug, reorder_state ) with torch.autograd.profiler.record_function( "EnsembleModel: forward_decoder" ): lprobs, avg_attn_scores = self.model.forward_decoder( tokens[:, : step + 1], encoder_outs, incremental_states, self.temperature, decoder_name=decoder_name, encoder_outs_aug=encoder_outs_aug, ) if self.lm_model is not None and not aux_task_name: lm_out = self.lm_model(tokens[:, : step + 1]) probs = self.lm_model.get_normalized_probs( lm_out, log_probs=True, sample=None ) probs = probs[:, -1, :] * self.lm_weight lprobs += probs lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs) lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty # handle max length constraint if step >= max_len: lprobs[:, : self.eos] = -math.inf lprobs[:, self.eos + 1 :] = -math.inf # handle prefix tokens (possibly with different lengths) if ( prefix_tokens is not None and step < prefix_tokens.size(1) and step < max_len ): lprobs, tokens, scores = self._prefix_tokens( step, lprobs, scores, tokens, prefix_tokens, beam_size ) else: if step < self.min_len: # minimum length constraint (does not apply if using prefix_tokens) lprobs[:, self.eos] = -math.inf if self.token_indices_to_suppress is not None: lprobs[:, self.token_indices_to_suppress] = -math.inf # Record attention scores, only support 
avg_attn_scores is a Tensor if avg_attn_scores is not None: if attn is None: attn = torch.empty( bsz * beam_size, avg_attn_scores.size(1), max_len + 2 ).to(scores) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs) eos_bbsz_idx = torch.empty(0).to( tokens ) # indices of hypothesis ending with eos (finished sentences) eos_scores = torch.empty(0).to( scores ) # scores of hypothesis ending with eos (finished sentences) if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) if self.repeat_ngram_blocker is not None: lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], tokens[:, : step + 1], original_batch_idxs, ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos # Shape of eos_mask: (batch size, beam size) eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) # only consider eos when it's among the top beam_size indices # Now we know what beam item(s) to finish # Shape: 1d list of absolute-numbered eos_bbsz_idx = torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] ) finalized_sents: List[int] = [] if eos_bbsz_idx.numel() > 0: eos_scores = torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size] ) finalized_sents = self.finalize_hypos( step, eos_bbsz_idx, eos_scores, tokens, scores, finalized, finished, beam_size, attn, src_lengths, max_len, ) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break if self.search.stop_on_max_len and step >= max_len: break assert step < 
max_len, f"{step} < {max_len}" # Remove finalized sentences (ones for which {beam_size} # finished hypotheses have been generated) from the batch. if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones( bsz, dtype=torch.bool, device=cand_indices.device ) batch_mask[finalized_sents] = False # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it batch_idxs = torch.arange( bsz, device=cand_indices.device ).masked_select(batch_mask) # Choose the subset of the hypothesized constraints that will continue self.search.prune_sentences(batch_idxs) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths = src_lengths[batch_idxs] cands_to_ignore = cands_to_ignore[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, attn.size(1), -1 ) bsz = new_bsz else: batch_idxs = None # Set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. # After, the min values per row are the top candidate active hypos # Rewrite the operator since the element wise or is not supported in torchscript. eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[: eos_mask.size(1)], ) # get the top beam_size active hypotheses, which are just # the hypos with the smallest values in active_mask. 
# {active_hypos} indicates which {beam_size} hypotheses # from the list of {2 * beam_size} candidates were # selected. Shapes: (batch size, beam size) new_cands_to_ignore, active_hypos = torch.topk( active_mask, k=beam_size, dim=1, largest=False ) # update cands_to_ignore to ignore any finalized hypos. cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] # Make sure there is at least one active item for each sentence in the batch. assert (~cands_to_ignore).any(dim=1).all() # update cands_to_ignore to ignore any finalized hypos # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam # can be selected more than once). active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses # Set the tokens for each beam (can select the same row more than once) tokens[:, : step + 1] = torch.index_select( tokens[:, : step + 1], dim=0, index=active_bbsz_idx ) # Select the next token for each of them tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather( cand_indices, dim=1, index=active_hypos ) if step > 0: scores[:, :step] = torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx ) scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather( cand_scores, dim=1, index=active_hypos ) # Update constraints based on which candidates were selected for the next beam self.search.update_constraints(active_hypos) # copy attention for active hypotheses if attn is not None: attn[:, :, : step + 2] = torch.index_select( attn[:, :, : step + 2], dim=0, index=active_bbsz_idx ) # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): scores = torch.tensor( [float(elem["score"].item()) for elem in finalized[sent]] ) _, sorted_scores_indices = 
torch.sort(scores, descending=True) finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] finalized[sent] = torch.jit.annotate( List[Dict[str, Tensor]], finalized[sent] ) return finalized class EnsembleModel(EnsembleModelBase): """A wrapper around an ensemble of models.""" def __init__(self, models): super().__init__(models) @torch.jit.export def forward_decoder( self, tokens, encoder_outs: List[Dict[str, List[Tensor]]], incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]], temperature: float = 1.0, decoder_name="decoder", encoder_outs_aug: List[Dict[str, List[Tensor]]] = None, ): log_probs = [] avg_attn: Optional[Tensor] = None encoder_out: Optional[Dict[str, List[Tensor]]] = None encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None for i, model in enumerate(self.models): if self.has_encoder(): encoder_out = encoder_outs[i] if encoder_outs_aug is not None: encoder_out_aug = encoder_outs_aug[i] # decode each model if self.has_incremental_states(): if encoder_out_aug is not None: decoder_out = getattr(model, decoder_name).forward( tokens, encoder_out=encoder_out, encoder_out_aug=encoder_out_aug, incremental_state=incremental_states[i], ) else: decoder_out = getattr(model, decoder_name).forward( tokens, encoder_out=encoder_out, incremental_state=incremental_states[i], ) else: if hasattr(model, decoder_name): decoder_out = getattr(model, decoder_name).forward( tokens, encoder_out=encoder_out ) else: decoder_out = model.forward(tokens) attn: Optional[Tensor] = None decoder_len = len(decoder_out) if decoder_len > 1 and decoder_out[1] is not None: if isinstance(decoder_out[1], Tensor): attn = decoder_out[1] else: attn_holder = decoder_out[1]["attn"] if isinstance(attn_holder, Tensor): attn = attn_holder elif attn_holder is not None: attn = attn_holder[0] if attn is not None: attn = attn[:, -1, :] decoder_out_tuple = ( decoder_out[0][:, -1:, :].div_(temperature), None if decoder_len <= 1 else decoder_out[1], ) probs = getattr(model, 
class MultiDecoderSequenceGenerator(nn.Module):
    """Two-pass generator that chains an MT (text) decoder with a T2U decoder.

    Two ``SequenceGenerator`` instances are built over the same ``models``:
    ``generator_mt`` searches over ``tgt_dict_mt`` (first pass) and
    ``generator`` searches over ``tgt_dict`` (second pass). ``_generate``
    feeds the best first-pass hypothesis into the second pass.
    """

    def __init__(
        self,
        models,
        tgt_dict,
        tgt_dict_mt,
        beam_size=1,
        beam_size_mt=1,
        max_len_a=0,
        max_len_b=200,
        max_len_a_mt=0,
        max_len_b_mt=200,
        max_len=0,
        min_len=1,
        normalize_scores=True,
        len_penalty=1.0,
        len_penalty_mt=1.0,
        unk_penalty=0.0,
        temperature=1.0,
        match_source_len=False,
        no_repeat_ngram_size=0,
        eos=None,
        eos_mt=None,
        symbols_to_strip_from_output=None,
        lm_model=None,
        lm_weight=1.0,
    ):
        """Generates translations of a given source sentence.

        Args:
            models (List[~fairseq.models.FairseqModel]): ensemble of models,
                currently support fairseq.models.TransformerModel for scripting
            tgt_dict: dictionary used by the second-pass generator and its
                beam search
            tgt_dict_mt: dictionary used by the first-pass (MT) generator and
                its beam search
            beam_size (int, optional): beam width of the second pass
                (default: 1)
            beam_size_mt (int, optional): beam width of the first pass
                (default: 1)
            max_len_a/b (int, optional): generate sequences of maximum length
                ax + b, where x is the source length for the second pass
            max_len_a_mt/b_mt (int, optional): generate sequences of maximum
                length ax + b, where x is the source length for the first pass
            max_len (int, optional): the maximum length of the generated output
                (not including end-of-sentence)
            min_len (int, optional): the minimum length of the generated output
                (not including end-of-sentence)
            normalize_scores (bool, optional): normalize scores by the length
                of the output (default: True)
            len_penalty (float, optional): length penalty in the second pass,
                where <1.0 favors shorter, >1.0 favors longer sentences
                (default: 1.0)
            len_penalty_mt (float, optional): length penalty in the first pass,
                where <1.0 favors shorter, >1.0 favors longer sentences
                (default: 1.0)
            unk_penalty (float, optional): unknown word penalty, where <0
                produces more unks, >0 produces fewer (default: 0.0)
            temperature (float, optional): temperature, where values
                >1.0 produce more uniform samples and values <1.0 produce
                sharper samples (default: 1.0)
            match_source_len (bool, optional): outputs should match the source
                length (default: False)
            no_repeat_ngram_size (int, optional): forwarded unchanged to both
                generators (default: 0)
            eos/eos_mt (optional): end-of-sentence override forwarded to the
                second-pass / first-pass generator respectively
            symbols_to_strip_from_output (optional): forwarded unchanged to
                both generators
            lm_model, lm_weight (optional): forwarded to the second-pass
                generator only; the first-pass generator is built without them
        """
        super().__init__()

        # Imported here rather than at module level, as in the original file.
        from examples.speech_to_speech.unity.sequence_generator import SequenceGenerator

        # Second-pass generator (searches over tgt_dict).
        self.generator = SequenceGenerator(
            models,
            tgt_dict,
            beam_size=beam_size,
            max_len_a=max_len_a,
            max_len_b=max_len_b,
            max_len=max_len,
            min_len=min_len,
            normalize_scores=normalize_scores,
            len_penalty=len_penalty,
            unk_penalty=unk_penalty,
            temperature=temperature,
            match_source_len=match_source_len,
            no_repeat_ngram_size=no_repeat_ngram_size,
            search_strategy=search.BeamSearch(tgt_dict),
            eos=eos,
            symbols_to_strip_from_output=symbols_to_strip_from_output,
            lm_model=lm_model,
            lm_weight=lm_weight,
        )
        # The public eos of this wrapper is the second-pass generator's eos.
        self.eos = self.generator.eos

        # First-pass (MT) generator (searches over tgt_dict_mt); note that no
        # lm_model / lm_weight is passed here.
        self.generator_mt = SequenceGenerator(
            models,
            tgt_dict_mt,
            beam_size=beam_size_mt,
            max_len_a=max_len_a_mt,
            max_len_b=max_len_b_mt,
            max_len=max_len,
            min_len=min_len,
            normalize_scores=normalize_scores,
            len_penalty=len_penalty_mt,
            unk_penalty=unk_penalty,
            temperature=temperature,
            match_source_len=match_source_len,
            no_repeat_ngram_size=no_repeat_ngram_size,
            search_strategy=search.BeamSearch(tgt_dict_mt),
            eos=eos_mt,
            symbols_to_strip_from_output=symbols_to_strip_from_output,
        )

    @torch.no_grad()
    def generate(
        self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs
    ) -> List[List[Dict[str, Tensor]]]:
        """Generate translations. Match the api of other fairseq generators.

        The ``models`` argument is accepted for API compatibility only; it is
        not used here (the generators built in ``__init__`` are used instead).

        Args:
            models (List[~fairseq.models.FairseqModel]): ensemble of models
            sample (dict): batch
            prefix_tokens (torch.LongTensor, optional): force decoder to begin
                with these tokens
            constraints (torch.LongTensor, optional): force decoder to include
                the list of constraints
            bos_token (int, optional): beginning of sentence token
                (default: self.eos)
        """
        return self._generate(sample, **kwargs)

    def _generate(
        self,
        sample: Dict[str, Dict[str, Tensor]],
        prefix_tokens: Optional[Tensor] = None,
        constraints: Optional[Tensor] = None,
        bos_token: Optional[int] = None,
    ):
        """Run the three-stage decode: MT decoder -> T2U encoder -> T2U decoder.

        Args:
            sample: batch; ``sample["net_input"]["src_tokens"]`` is required
                and ``sample["id"]`` is read when printing hypotheses.
            prefix_tokens, constraints, bos_token: forwarded unchanged to both
                ``generate_decoder`` calls.

        Returns:
            The finalized hypotheses returned by the second-pass
            ``self.generator.generate_decoder`` call.

        Raises:
            Exception: if ``src_tokens`` is missing from the net input.
            NotImplementedError: if constraints are given but the second-pass
                search does not support them.

        Side effects:
            Prints the detokenized best first-pass hypothesis of every
            sentence to stdout.
        """
        net_input = sample["net_input"]

        if "src_tokens" in net_input:
            src_tokens = net_input["src_tokens"]
            # length of the source text being the character length except EndOfSentence and pad
            # if src_lengths exists in net_input (speech_to_text dataset case), then use it
            if "src_lengths" in net_input:
                src_lengths = net_input["src_lengths"]
            else:
                src_lengths = (
                    (
                        src_tokens.ne(self.generator.eos)
                        & src_tokens.ne(self.generator.pad)
                    )
                    .long()
                    .sum(dim=1)
                )
        else:
            raise Exception(
                "expected src_tokens or source in net input. input keys: "
                + str(net_input.keys())
            )

        # Only the second-pass search is checked for constraint support.
        if constraints is not None and not self.generator.search.supports_constraints:
            raise NotImplementedError(
                "Target-side constraints were provided, but search method doesn't support them"
            )

        # Initialize constraints, when active
        self.generator.search.init_constraints(constraints, self.generator.beam_size)
        self.generator_mt.search.init_constraints(
            constraints, self.generator_mt.beam_size
        )

        # compute the encoder output for each beam
        with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"):
            encoder_outs = self.generator.model.forward_encoder(net_input)

        # The MT decoder is looked up by name on the single underlying model.
        single_model = self.generator.model.single_model
        mt_decoder = getattr(single_model, f"{single_model.mt_task_name}_decoder")

        # 1. MT decoder
        finalized_mt = self.generator_mt.generate_decoder(
            encoder_outs,
            src_tokens,
            src_lengths,
            sample,
            prefix_tokens,
            constraints,
            bos_token,
            aux_task_name=single_model.mt_task_name,
        )

        # extract decoder output corresponding to the best hypothesis
        max_tgt_len = max([len(hypo[0]["tokens"]) for hypo in finalized_mt])
        # Padded buffer for the first-pass output that is fed back to the MT
        # decoder; every position starts as the decoder's padding index.
        prev_output_tokens_mt = (
            src_tokens.new_zeros(src_tokens.shape[0], max_tgt_len)
            .fill_(mt_decoder.padding_idx)
            .int()
        )  # B x T
        for i, hypo in enumerate(finalized_mt):
            # Only the first hypothesis of each sentence is used.
            i_beam = 0
            tmp = hypo[i_beam]["tokens"].int()  # hyp + eos
            # Position 0 holds eos; the hypothesis (without its trailing eos)
            # is written starting at position 1.
            prev_output_tokens_mt[i, 0] = self.generator_mt.eos
            if tmp[-1] == self.generator_mt.eos:
                tmp = tmp[:-1]
            prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp

            # Detokenize for display: join dictionary symbols, turn "_" / "▁"
            # and "<unk>" into spaces, drop "<s>" / "</s>" and one leading space.
            text = "".join([self.generator_mt.tgt_dict[c] for c in tmp])
            text = text.replace("_", " ")
            text = text.replace("▁", " ")
            text = text.replace("<unk>", " ")
            text = text.replace("<s>", "")
            text = text.replace("</s>", "")
            if len(text) > 0 and text[0] == " ":
                text = text[1:]
            sample_id = sample["id"].tolist()[i]
            print("{} (None-{})".format(text, sample_id))

        # Re-run the MT decoder on the chosen hypotheses to obtain its hidden
        # features (features_only=True), then swap the first two dimensions.
        # NOTE(review): presumably B x T x C -> T x B x C, matching the
        # "T x B x C" note on the encoder_out entry below — confirm.
        x = mt_decoder(
            prev_output_tokens_mt,
            encoder_out=encoder_outs[0],
            features_only=True,
        )[0].transpose(0, 1)

        # Optional projection layer on the model.
        if getattr(single_model, "proj", None) is not None:
            x = single_model.proj(x)

        # A padding mask is only built when at least one position is padding.
        mt_decoder_padding_mask = None
        if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any():
            mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)

        # 2. T2U encoder
        if getattr(single_model, "synthesizer_encoder", None) is not None:
            t2u_encoder_out = single_model.synthesizer_encoder(
                x,
                mt_decoder_padding_mask,
            )
        else:
            # No T2U encoder: wrap the MT decoder features in an
            # encoder-output dictionary directly.
            t2u_encoder_out = {
                "encoder_out": [x],  # T x B x C
                "encoder_padding_mask": [mt_decoder_padding_mask]
                if mt_decoder_padding_mask is not None
                else [],  # B x T
                "encoder_embedding": [],
                "encoder_states": [],
                "src_tokens": [],
                "src_lengths": [],
            }

        # With augmented cross-attention the original encoder output is kept
        # and the T2U encoder output is passed alongside it; otherwise the T2U
        # encoder output replaces the encoder output.
        if getattr(single_model, "t2u_augmented_cross_attn", False):
            encoder_outs_aug = [t2u_encoder_out]
        else:
            encoder_outs = [t2u_encoder_out]
            encoder_outs_aug = None

        # 3. T2U decoder
        finalized = self.generator.generate_decoder(
            encoder_outs,
            src_tokens,
            src_lengths,
            sample,
            prefix_tokens,
            constraints,
            bos_token,
            encoder_outs_aug=encoder_outs_aug,
        )
        return finalized
Optionally, feature/audio files can be packed into uncompressed ZIP files (then accessed via byte offset and length) to improve I/O performance. Fairseq S2T also employs a YAML file for data related configurations: tokenizer type and dictionary path for the target text, feature transforms such as CMVN (cepstral mean and variance normalization) and SpecAugment, temperature-based resampling, etc. ## Model Training Fairseq S2T uses the unified `fairseq-train` interface for model training. It requires arguments `--task speech_to_text`, `--arch <model architecture in fairseq.models.speech_to_text.*>` and `--config-yaml <config YAML filename>`. ## Inference & Evaluation Fairseq S2T uses the unified `fairseq-generate`/`fairseq-interactive` interface for inference and evaluation. It requires arguments `--task speech_to_text` and `--config-yaml <config YAML filename>`. The interactive console takes audio paths (one per line) as inputs. ## Examples - [Speech Recognition (ASR) on LibriSpeech](docs/librispeech_example.md) - [Speech-to-Text Translation (ST) on MuST-C](docs/mustc_example.md) - [Speech-to-Text Translation (ST) on CoVoST 2](docs/covost_example.md) - [Speech-to-Text Translation (ST) on Multilingual TEDx](docs/mtedx_example.md) - [Simultaneous Speech-to-Text Translation (SimulST) on MuST-C](docs/simulst_mustc_example.md) ## Updates - 02/04/2021: Added interactive decoding (`fairseq-interactive`) support. Examples: [ASR (LibriSpeech)](docs/librispeech_example.md#interactive-decoding) and [ST (CoVoST 2)](docs/covost_example.md#interactive-decoding). - 01/08/2021: Several fixes for S2T Transformer model, inference-time de-tokenization, scorer configuration and data preparation scripts. We also add pre-trained models to the examples and revise the instructions. Breaking changes: the data preparation scripts now extract filterbank features without CMVN. CMVN is instead applied on-the-fly (defined in the config YAML). 
## What's Next - We are migrating the old fairseq [ASR example](../speech_recognition) into this S2T framework and merging the features from both sides. - The following papers also base their experiments on fairseq S2T. We are adding more examples for replication. - [Improving Cross-Lingual Transfer Learning for End-to-End Speech Recognition with Speech Translation (Wang et al., 2020)](https://arxiv.org/abs/2006.05474) - [Self-Supervised Representations Improve End-to-End Speech Translation (Wu et al., 2020)](https://arxiv.org/abs/2006.12124) - [Self-Training for End-to-End Speech Translation (Pino et al., 2020)](https://arxiv.org/abs/2006.02490) - [CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus (Wang et al., 2020)](https://arxiv.org/abs/2002.01320) - [Harnessing Indirect Training Data for End-to-End Automatic Speech Translation: Tricks of the Trade (Pino et al., 2019)](https://arxiv.org/abs/1909.06515) ## Citation Please cite as: ``` @inproceedings{wang2020fairseqs2t, title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq}, author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino}, booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations}, year = {2020}, } @inproceedings{ott2019fairseq, title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, year = {2019}, } ``` ================================================ FILE: examples/speech_to_text/data_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def gen_vocab(
    input_path: Path, output_path_prefix: Path, model_type="bpe",
    vocab_size=1000, special_symbols: Optional[List[str]] = None
):
    """Train a SentencePiece model and export a fairseq dictionary from it.

    Two files are produced next to ``output_path_prefix``: the SentencePiece
    ``.model`` (written by the trainer) and a ``.txt`` dictionary with one
    ``"<piece> 1"`` line per piece, excluding the four special tokens.

    Args:
        input_path: text file used to train the SentencePiece model.
        output_path_prefix: path prefix for the ``.model`` / ``.txt`` outputs.
        model_type: SentencePiece ``--model_type`` value.
        vocab_size: SentencePiece ``--vocab_size`` value.
        special_symbols: optional symbols passed as
            ``--user_defined_symbols``.

    Raises:
        AssertionError: if the trained model does not place the unk/pad/bos/eos
            tokens at the expected ids.
    """
    # Train SentencePiece Model
    arguments = [
        f"--input={input_path.as_posix()}",
        f"--model_prefix={output_path_prefix.as_posix()}",
        f"--model_type={model_type}",
        f"--vocab_size={vocab_size}",
        "--character_coverage=1.0",
        f"--num_threads={cpu_count()}",
        f"--unk_id={UNK_TOKEN_ID}",
        f"--bos_id={BOS_TOKEN_ID}",
        f"--eos_id={EOS_TOKEN_ID}",
        f"--pad_id={PAD_TOKEN_ID}",
    ]
    if special_symbols is not None:
        _special_symbols = ",".join(special_symbols)
        arguments.append(f"--user_defined_symbols={_special_symbols}")
    sp.SentencePieceTrainer.Train(" ".join(arguments))
    # Export fairseq dictionary
    spm = sp.SentencePieceProcessor()
    spm.Load(output_path_prefix.as_posix() + ".model")
    vocab = {i: spm.IdToPiece(i) for i in range(spm.GetPieceSize())}
    # Sanity check: the special tokens must sit at the ids requested above.
    assert (
        vocab.get(UNK_TOKEN_ID) == UNK_TOKEN
        and vocab.get(PAD_TOKEN_ID) == PAD_TOKEN
        and vocab.get(BOS_TOKEN_ID) == BOS_TOKEN
        and vocab.get(EOS_TOKEN_ID) == EOS_TOKEN
    )
    # The special tokens are left out of the exported dictionary file.
    vocab = {
        i: s
        for i, s in vocab.items()
        if s not in {UNK_TOKEN, BOS_TOKEN, EOS_TOKEN, PAD_TOKEN}
    }
    # Pieces routinely contain non-ASCII characters (e.g. the "▁" word
    # boundary marker), so the encoding must not depend on the locale.
    with open(output_path_prefix.as_posix() + ".txt", "w", encoding="utf-8") as f_out:
        for _, s in sorted(vocab.items(), key=lambda x: x[0]):
            f_out.write(f"{s} 1\n")
def get_zip_manifest(
        zip_path: Path,
        zip_root: Optional[Path] = None,
        is_audio=False
):
    """Index the members of an uncompressed ZIP by byte offset and length.

    Args:
        zip_path: path of the archive; joined onto ``zip_root`` for reading,
            but written as-is into the returned manifest entries.
        zip_root: optional directory ``zip_path`` is relative to.
        is_audio: if True members are validated/measured as audio (frame
            count via ``soundfile``); otherwise as ``.npy`` arrays (first
            dimension of the array).

    Returns:
        ``(paths, lengths)``: two dicts keyed by the member's file stem.
        ``paths`` maps to ``"<zip_path>:<data offset>:<file size>"`` and
        ``lengths`` to the number of frames.

    Raises:
        AssertionError: if a member's payload is shorter than 2 bytes or does
            not look like the expected data type.
    """
    _zip_path = Path.joinpath(zip_root or Path(""), zip_path)
    with zipfile.ZipFile(_zip_path, mode="r") as f:
        info = f.infolist()
    paths, lengths = {}, {}
    # Open the archive once for raw reads instead of once per member.
    with open(_zip_path, "rb") as f_raw:
        for i in tqdm(info):
            utt_id = Path(i.filename).stem
            # The payload starts after the 30-byte fixed local header, the
            # encoded file name and the extra field. Read both lengths from
            # the header itself: len(i.filename) counts characters rather than
            # encoded bytes, and the extra field need not be empty.
            f_raw.seek(i.header_offset)
            local_header = f_raw.read(30)
            name_len = int.from_bytes(local_header[26:28], "little")
            extra_len = int.from_bytes(local_header[28:30], "little")
            offset = i.header_offset + 30 + name_len + extra_len
            file_size = i.file_size
            paths[utt_id] = f"{zip_path.as_posix()}:{offset}:{file_size}"
            f_raw.seek(offset)
            byte_data = f_raw.read(file_size)
            assert len(byte_data) > 1
            if is_audio:
                assert is_sf_audio_data(byte_data), i
            else:
                assert is_npy_data(byte_data), i
            byte_data_fp = io.BytesIO(byte_data)
            if is_audio:
                lengths[utt_id] = sf.info(byte_data_fp).frames
            else:
                lengths[utt_id] = np.load(byte_data_fp).shape[0]
    return paths, lengths
def load_df_from_tsv(path: Union[str, Path]) -> pd.DataFrame:
    """Read a tab-separated manifest into a DataFrame.

    The first row is the header. Quote characters are treated as ordinary
    text, a backslash is the escape character, and empty cells stay empty
    strings instead of becoming NaN.

    Args:
        path: location of the TSV file, as a string or a ``Path``.

    Returns:
        The parsed manifest.
    """
    tsv_location = path.as_posix() if not isinstance(path, str) else path
    reader_options = {
        "sep": "\t",
        "header": 0,
        "encoding": "utf-8",
        "escapechar": "\\",
        "quoting": csv.QUOTE_NONE,
        "na_filter": False,
    }
    return pd.read_csv(tsv_location, **reader_options)
def load_tsv_to_dicts(path: Union[str, Path]) -> List[dict]:
    """Read a tab-separated file with a header row into a list of dicts.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``). The
    file is decoded as UTF-8, the same encoding the DataFrame-based TSV
    helpers in this module read and write, rather than the locale default.

    Args:
        path: location of the TSV file, as a string or a ``Path``.

    Returns:
        One ``dict`` per data row, keyed by the header names.
    """
    with open(path, "r", encoding="utf-8") as f:
        reader = csv.DictReader(
            f,
            delimiter="\t",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        rows = [dict(e) for e in reader]
    return rows
class S2TDataConfigWriter(object):
    """Accumulate S2T data configuration entries and dump them as YAML.

    Each ``set_*`` method stores one entry in ``self.config``; ``flush``
    writes the whole dictionary to ``yaml_path``.
    """

    DEFAULT_VOCAB_FILENAME = "dict.txt"
    DEFAULT_INPUT_FEAT_PER_CHANNEL = 80
    DEFAULT_INPUT_CHANNELS = 1

    def __init__(self, yaml_path: Path):
        """Create an empty config bound to ``yaml_path``.

        Raises:
            ImportError: if PyYAML is not installed.
        """
        try:
            import yaml
        except ImportError as e:
            # Fail with the install hint itself; merely printing it would be
            # followed by an unrelated UnboundLocalError on the next line.
            raise ImportError(
                "Please install PyYAML for S2T data config YAML files"
            ) from e
        self.yaml = yaml
        self.yaml_path = yaml_path
        self.config = {}

    def flush(self):
        """Write the accumulated config to ``self.yaml_path``."""
        with open(self.yaml_path, "w") as f:
            self.yaml.dump(self.config, f)

    def set_audio_root(self, audio_root=""):
        """Store the ``audio_root`` entry."""
        self.config["audio_root"] = audio_root

    def set_vocab_filename(self, vocab_filename: str = DEFAULT_VOCAB_FILENAME):
        """Store the ``vocab_filename`` entry."""
        self.config["vocab_filename"] = vocab_filename

    def set_specaugment(
        self,
        time_wrap_w: int,
        freq_mask_n: int,
        freq_mask_f: int,
        time_mask_n: int,
        time_mask_t: int,
        time_mask_p: float,
    ):
        """Store the ``specaugment`` entry from the given parameters."""
        self.config["specaugment"] = {
            "time_wrap_W": time_wrap_w,
            "freq_mask_N": freq_mask_n,
            "freq_mask_F": freq_mask_f,
            "time_mask_N": time_mask_n,
            "time_mask_T": time_mask_t,
            "time_mask_p": time_mask_p,
        }

    def set_specaugment_lb_policy(self):
        """Store the SpecAugment parameters of the "lb" policy."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=1,
            freq_mask_f=27,
            time_mask_n=1,
            time_mask_t=100,
            time_mask_p=1.0,
        )

    def set_specaugment_ld_policy(self):
        """Store the SpecAugment parameters of the "ld" policy."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=2,
            freq_mask_f=27,
            time_mask_n=2,
            time_mask_t=100,
            time_mask_p=1.0,
        )

    def set_specaugment_sm_policy(self):
        """Store the SpecAugment parameters of the "sm" policy."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=2,
            freq_mask_f=15,
            time_mask_n=2,
            time_mask_t=70,
            time_mask_p=0.2,
        )

    def set_specaugment_ss_policy(self):
        """Store the SpecAugment parameters of the "ss" policy."""
        self.set_specaugment(
            time_wrap_w=0,
            freq_mask_n=2,
            freq_mask_f=27,
            time_mask_n=2,
            time_mask_t=70,
            time_mask_p=0.2,
        )

    def set_input_channels(self, input_channels: int = DEFAULT_INPUT_CHANNELS):
        """Store the ``input_channels`` entry."""
        self.config["input_channels"] = input_channels

    def set_input_feat_per_channel(
        self, input_feat_per_channel: int = DEFAULT_INPUT_FEAT_PER_CHANNEL
    ):
        """Store the ``input_feat_per_channel`` entry."""
        self.config["input_feat_per_channel"] = input_feat_per_channel

    def set_bpe_tokenizer(self, bpe_tokenizer: Dict[str, Any]):
        """Store the ``bpe_tokenizer`` entry."""
        self.config["bpe_tokenizer"] = bpe_tokenizer

    def set_global_cmvn(self, stats_npz_path: str):
        """Store the ``global_cmvn`` entry pointing at the statistics file."""
        self.config["global_cmvn"] = {"stats_npz_path": stats_npz_path}

    def set_feature_transforms(self, split: str, transforms: List[str]):
        """Store the transform list for ``split`` under ``transforms``."""
        # Create the per-split mapping on first use.
        if "transforms" not in self.config:
            self.config["transforms"] = {}
        self.config["transforms"][split] = transforms

    def set_prepend_tgt_lang_tag(self, flag: bool = True):
        """Store the ``prepend_tgt_lang_tag`` entry."""
        self.config["prepend_tgt_lang_tag"] = flag

    def set_sampling_alpha(self, sampling_alpha: float = 1.0):
        """Store the ``sampling_alpha`` entry."""
        self.config["sampling_alpha"] = sampling_alpha

    def set_extra(self, data):
        """Merge ``data`` into the config, overwriting existing keys."""
        self.config.update(data)
Download our vocabulary files if you want to use our pre-trained models: - ASR: [En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_asr_vocab_char.zip) - ST: [Fr-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_fr_en_st_vocab_char.zip), [De-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_de_en_st_vocab_char.zip), [Es-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_es_en_st_vocab_char.zip), [Ca-En](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_ca_en_st_vocab_char.zip), [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_de_st_vocab_char.zip), [En-Ca](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_ca_st_vocab_char.zip), [En-Fa](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_fa_st_vocab_char.zip), [En-Et](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_et_st_vocab_char.zip) ## ASR #### Training We train an En ASR model for encoder pre-training of some of the ST models. ```bash fairseq-train ${COVOST_ROOT}/en \ --config-yaml config_asr_en.yaml --train-subset train_asr_en --valid-subset dev_asr_en \ --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 50000 --max-update 60000 \ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --report-accuracy --arch s2t_transformer_s --dropout 0.15 --optimizer adam --lr 2e-3 \ --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ --attn-type None --pos-enc-type ${POS_ENC_TYPE} ``` where `ASR_SAVE_DIR` is the checkpoint root path and `POS_ENC_TYPE` refers to the positional encoding to be used in the conformer encoder. Set it to `abs`, `rope` or `rel_pos` to use the absolute positional encoding, rotary positional encoding or relative positional encoding in the conformer layer respectively. The transformer encoder only supports absolute positional encoding and, by default, the transformer encoder will be used. To switch to the conformer encoder, set `--attn-type espnet` and `--pos-enc-type ${POS_ENC_TYPE}`.
We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. #### Inference & Evaluation ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${COVOST_ROOT}/en \ --config-yaml config_asr_en.yaml --gen-subset test_asr_en --task speech_to_text \ --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct ``` #### Results | --arch | --pos-enc-type | Params | En | Model | |---|---|---|---|---| | s2t_transformer_s | - | 31M | 25.6 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_asr_transformer_s.pt) | | s2t_conformer | rel_pos | 42.9M | 23.18| [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/rel_pos_asr_checkpoint_best.pt) | | s2t_conformer | rope | 42.1M | 23.8| [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/rope_pos_asr_checkpoint_best.pt) | | s2t_conformer | abs | 42.1M | 23.8| [Download](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_asr/abs_asr_checkpoint_best.pt) | ## ST #### Training Fr-En as example: ```bash fairseq-train ${COVOST_ROOT}/fr \ --config-yaml config_st_fr_en.yaml --train-subset train_st_fr_en --valid-subset dev_st_fr_en \ --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-update 30000 --max-tokens 40000 \ # --max-tokens 50000 for en-* --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ --arch s2t_transformer_s --encoder-freezing-updates 1000 --optimizer adam --lr 2e-3 \ --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ --attn-type None --pos-enc-type ${POS_ENC_TYPE} \ --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} ``` where `ST_SAVE_DIR` is the 
checkpoint root path and `POS_ENC_TYPE` refers to the positional encoding to be used in the conformer encoder. Set it to `abs`, `rope` or `rel_pos` to use the absolute positional encoding, rotary positional encoding or relative positional encoding in the conformer layer respectively. The transformer encoder only supports absolute positional encoding and, by default, the transformer encoder will be used. To switch to the conformer encoder, set `--attn-type espnet` and `--pos-enc-type ${POS_ENC_TYPE}`. Optionally load the pre-trained En ASR encoder for faster training and better performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. #### Inference & Evaluation Average the last 10 checkpoints and evaluate on test split: ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${COVOST_ROOT}/fr \ --config-yaml config_st_fr_en.yaml --gen-subset test_st_fr_en --task speech_to_text \ --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 50000 --beam 5 --scoring sacrebleu ``` ## Interactive Decoding Launch the interactive console via ```bash fairseq-interactive ${COVOST_ROOT}/fr --config-yaml config_st_fr_en.yaml \ --task speech_to_text --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 50000 --beam 5 ``` Type in WAV/FLAC/OGG audio paths (one per line) after the prompt.
#### Results | --arch | --pos-enc-type | Params | ASR PT | Fr-En | De-En | Es-En | Ca-En | En-De | En-Ca | En-Fa | En-Et | Model | |---|---|---|---|---|---|---|---|---|---|---|---|---| | s2t_transformer | - | 31M | Yes | [27.2](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_fr_en_st_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_de_en_st_transformer_s.pt) | [23.1](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_es_en_st_transformer_s.pt) | [19.3](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_ca_en_st_transformer_s.pt) | [16.1](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_de_st_transformer_s.pt) | [21.6](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_ca_st_transformer_s.pt) | [12.9](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_fa_st_transformer_s.pt) | [12.8](https://dl.fbaipublicfiles.com/fairseq/s2t/covost2_en_et_st_transformer_s.pt) | (<-Download) | | s2t_conformer | rel_pos | 42.9M | No | [28.32](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [18.21](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [25.98](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [21.13](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [20.37](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [25.89](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [15.59](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | [14.49](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rel_pos_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) | | s2t_conformer | rel_pos | 42.9M | Yes| 
[27.15](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [18.22](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [25.14](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [21.68](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [20.35](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [25.92](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [15.76](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | [16.52](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rel_pos_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) | | s2t_conformer | rope | 42.1M | No | [27.61](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [17.6](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [24.91](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [20.78](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rope_from_scratch_avg_last_10_checkpoint.pt) | [19.7](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rope_from_scratch_avg_last_10_checkpoint.pt) | [25.13](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rope_from_scratch_avg_last_10_checkpoint.pt) | [15.22](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rope_from_scratch_avg_last_10_checkpoint.pt) | [15.87](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rope_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) | | s2t_conformer | rope | 42.1M | Yes | 
[26.99](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [17.71](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [24.24](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [21.24](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/rope_asr_pt_avg_last_10_checkpoint.pt) | [19.9](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/rope_asr_pt_avg_last_10_checkpoint.pt) | [25.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/rope_asr_pt_avg_last_10_checkpoint.pt) | [15.58](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/rope_asr_pt_avg_last_10_checkpoint.pt) | [15.97](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/rope_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) | | s2t_conformer | abs | 42.1M | No | [27.45](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [17.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [25.01](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [20.26](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/abs_from_scratch_avg_last_10_checkpoint.pt) | [19.86](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/abs_from_scratch_avg_last_10_checkpoint.pt) | [25.25](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/abs_from_scratch_avg_last_10_checkpoint.pt) | [15.46](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/abs_from_scratch_avg_last_10_checkpoint.pt) | [15.81](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/abs_from_scratch_avg_last_10_checkpoint.pt) | (<-Download) | | s2t_conformer | abs | 42.1M | Yes| 
[26.52](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/fr_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [17.37](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/de_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [25.40](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/es_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [20.45](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/ca_en/abs_asr_pt_avg_last_10_checkpoint.pt) | [19.57](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_de/abs_asr_pt_avg_last_10_checkpoint.pt) | [25.40](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_ca/abs_asr_pt_avg_last_10_checkpoint.pt) | [15.17](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_fa/abs_asr_pt_avg_last_10_checkpoint.pt) | [15.83](https://dl.fbaipublicfiles.com/fairseq/conformer/covost2/en_et/abs_asr_pt_avg_last_10_checkpoint.pt) | (<-Download) | [[Back]](..) ================================================ FILE: examples/speech_to_text/docs/librispeech_example.md ================================================ [[Back]](..) # S2T Example: Speech Recognition (ASR) on LibriSpeech [LibriSpeech](https://www.danielpovey.com/files/2015_icassp_librispeech.pdf) is a de-facto standard English ASR benchmark. We provide competitive vanilla [Transformer](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) baselines. ## Data preparation Download and preprocess LibriSpeech data with ```bash # additional Python packages for S2T data processing/model training pip install pandas torchaudio sentencepiece python examples/speech_to_text/prep_librispeech_data.py \ --output-root ${LS_ROOT} --vocab-type unigram --vocab-size 10000 ``` where `LS_ROOT` is the root path for downloaded data as well as generated files (manifest, features, vocabulary and data configuration). 
[Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_vocab_unigram10000.zip) our vocabulary files if you want to use our pre-trained models. ## Training ```bash fairseq-train ${LS_ROOT} --save-dir ${SAVE_DIR} \ --config-yaml config.yaml --train-subset train-clean-100,train-clean-360,train-other-500 --valid-subset dev-clean,dev-other \ --num-workers 4 --max-tokens 40000 --max-update 300000 \ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ --arch s2t_transformer_s --share-decoder-input-output-embed \ --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt --warmup-updates 10000 \ --clip-norm 10.0 --seed 1 --update-freq 8 ``` where `SAVE_DIR` is the checkpoint root path. Here we use `--arch s2t_transformer_s` (31M parameters) as example. For better performance, you may switch to `s2t_transformer_m` (71M, with `--lr 1e-3`) or `s2t_transformer_l` (268M, with `--lr 5e-4`). We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. ## Inference & Evaluation Average the last 10 checkpoints and evaluate on the 4 splits (`dev-clean`, `dev-other`, `test-clean` and `test-other`): ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py --inputs ${SAVE_DIR} \ --num-epoch-checkpoints 10 \ --output "${SAVE_DIR}/${CHECKPOINT_FILENAME}" for SUBSET in dev-clean dev-other test-clean test-other; do fairseq-generate ${LS_ROOT} --config-yaml config.yaml --gen-subset ${SUBSET} \ --task speech_to_text --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 50000 --beam 5 --scoring wer done ``` ## Interactive Decoding Launch the interactive console via ```bash fairseq-interactive ${LS_ROOT} --config-yaml config.yaml --task speech_to_text \ --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 ``` Type in WAV/FLAC/OGG audio paths (one per line) after the prompt. 
## Results | --arch | Params | dev-clean | dev-other | test-clean | test-other | Model | |---|---|---|---|---|---|---| | s2t_transformer_s | 30M | 3.8 | 8.9 | 4.4 | 9.0 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_transformer_s.pt) | | s2t_transformer_m | 71M | 3.2 | 8.0 | 3.4 | 7.9 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_transformer_m.pt) | | s2t_transformer_l | 268M | 3.0 | 7.5 | 3.2 | 7.5 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/librispeech_transformer_l.pt) | [[Back]](..) ================================================ FILE: examples/speech_to_text/docs/mtedx_example.md ================================================ [[Back]](..) # S2T Example: Speech Translation (ST) on Multilingual TEDx [Multilingual TEDx](https://arxiv.org/abs/2102.01757) is a multilingual corpus for speech recognition and speech translation. The data is derived from TEDx talks in 8 source languages with translations to a subset of 5 target languages.
## Data Preparation [Download](http://openslr.org/100/) and unpack Multilingual TEDx data to a path `${MTEDX_ROOT}/${LANG_PAIR}`, then preprocess it with ```bash # additional Python packages for S2T data processing/model training pip install pandas torchaudio soundfile sentencepiece # Generate TSV manifests, features, vocabulary # and configuration for each language python examples/speech_to_text/prep_mtedx_data.py \ --data-root ${MTEDX_ROOT} --task asr \ --vocab-type unigram --vocab-size 1000 python examples/speech_to_text/prep_mtedx_data.py \ --data-root ${MTEDX_ROOT} --task st \ --vocab-type unigram --vocab-size 1000 # Add vocabulary and configuration for joint data # (based on the manifests and features generated above) python examples/speech_to_text/prep_mtedx_data.py \ --data-root ${MTEDX_ROOT} --task asr --joint \ --vocab-type unigram --vocab-size 8000 python examples/speech_to_text/prep_mtedx_data.py \ --data-root ${MTEDX_ROOT} --task st --joint \ --vocab-type unigram --vocab-size 8000 ``` The generated files (manifest, features, vocabulary and data configuration) will be added to `${MTEDX_ROOT}/${LANG_PAIR}` (per-language data) and `MTEDX_ROOT` (joint data). 
## ASR #### Training Spanish as example: ```bash fairseq-train ${MTEDX_ROOT}/es-es \ --config-yaml config_asr.yaml --train-subset train_asr --valid-subset valid_asr \ --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ --arch s2t_transformer_xs --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ --load-pretrained-encoder-from ${PRETRAINED_ENCODER} \ --skip-invalid-size-inputs-valid-test \ --keep-last-epochs 10 --update-freq 8 --patience 10 ``` For joint model (using ASR data from all 8 languages): ```bash fairseq-train ${MTEDX_ROOT} \ --config-yaml config_asr.yaml \ --train-subset train_es-es_asr,train_fr-fr_asr,train_pt-pt_asr,train_it-it_asr,train_ru-ru_asr,train_el-el_asr,train_ar-ar_asr,train_de-de_asr \ --valid-subset valid_es-es_asr,valid_fr-fr_asr,valid_pt-pt_asr,valid_it-it_asr,valid_ru-ru_asr,valid_el-el_asr,valid_ar-ar_asr,valid_de-de_asr \ --save-dir ${MULTILINGUAL_ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ --skip-invalid-size-inputs-valid-test \ --keep-last-epochs 10 --update-freq 8 --patience 10 \ --ignore-prefix-size 1 ``` where `MULTILINGUAL_ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. For multilingual models, we prepend target language ID token as target BOS, which should be excluded from the training loss via `--ignore-prefix-size 1`. 
#### Inference & Evaluation ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${MTEDX_ROOT}/es-es \ --config-yaml config_asr.yaml --gen-subset test --task speech_to_text \ --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ --skip-invalid-size-inputs-valid-test \ --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct --remove-bpe # For models trained on joint data CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${MULTILINGUAL_ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${MULTILINGUAL_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" for LANG in es fr pt it ru el ar de; do fairseq-generate ${MTEDX_ROOT} \ --config-yaml config_asr.yaml --gen-subset test_${LANG}-${LANG}_asr --task speech_to_text \ --prefix-size 1 --path ${MULTILINGUAL_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 40000 --beam 5 \ --skip-invalid-size-inputs-valid-test \ --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct --remove-bpe done ``` #### Results | Data | --arch | Params | Es | Fr | Pt | It | Ru | El | Ar | De | |--------------|--------------------|--------|------|------|------|------|------|-------|-------|-------| | Monolingual | s2t_transformer_xs | 10M | 46.4 | 45.6 | 54.8 | 48.0 | 74.7 | 109.5 | 104.4 | 111.1 | ## ST #### Training Es-En as example: ```bash fairseq-train ${MTEDX_ROOT}/es-en \ --config-yaml config_st.yaml --train-subset train_st --valid-subset valid_st \ --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ --arch s2t_transformer_xs --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ 
--load-pretrained-encoder-from ${PRETRAINED_ENCODER} \ --skip-invalid-size-inputs-valid-test \ --keep-last-epochs 10 --update-freq 8 --patience 10 ``` For multilingual model (all 12 directions): ```bash fairseq-train ${MTEDX_ROOT} \ --config-yaml config_st.yaml \ --train-subset train_el-en_st,train_es-en_st,train_es-fr_st,train_es-it_st,train_es-pt_st,train_fr-en_st,train_fr-es_st,train_fr-pt_st,train_it-en_st,train_it-es_st,train_pt-en_st,train_pt-es_st,train_ru-en_st \ --valid-subset valid_el-en_st,valid_es-en_st,valid_es-fr_st,valid_es-it_st,valid_es-pt_st,valid_fr-en_st,valid_fr-es_st,valid_fr-pt_st,valid_it-en_st,valid_it-es_st,valid_pt-en_st,valid_pt-es_st,valid_ru-en_st \ --save-dir ${MULTILINGUAL_ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-epoch 200 \ --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --dropout 0.3 --label-smoothing 0.1 \ --skip-invalid-size-inputs-valid-test \ --keep-last-epochs 10 --update-freq 8 --patience 10 \ --ignore-prefix-size 1 \ --load-pretrained-encoder-from ${PRETRAINED_ENCODER} ``` where `ST_SAVE_DIR` (`MULTILINGUAL_ST_SAVE_DIR`) is the checkpoint root path. The ST encoder is pre-trained by ASR for faster training and better performance: `--load-pretrained-encoder-from <(JOINT_)ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. For multilingual models, we prepend target language ID token as target BOS, which should be excluded from the training loss via `--ignore-prefix-size 1`. 
#### Inference & Evaluation Average the last 10 checkpoints and evaluate on the `test` split: ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${MTEDX_ROOT}/es-en \ --config-yaml config_st.yaml --gen-subset test --task speech_to_text \ --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 50000 --beam 5 --scoring sacrebleu --remove-bpe # For multilingual models python scripts/average_checkpoints.py \ --inputs ${MULTILINGUAL_ST_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" for LANGPAIR in es-en es-fr es-pt fr-en fr-es fr-pt pt-en pt-es it-en it-es ru-en el-en; do fairseq-generate ${MTEDX_ROOT} \ --config-yaml config_st.yaml --gen-subset test_${LANGPAIR}_st --task speech_to_text \ --prefix-size 1 --path ${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 40000 --beam 5 \ --skip-invalid-size-inputs-valid-test \ --scoring sacrebleu --remove-bpe done ``` For multilingual models, we force decoding from the target language ID token (as BOS) via `--prefix-size 1`. #### Results | Data | --arch | Params | Es-En | Es-Pt | Es-Fr | Fr-En | Fr-Es | Fr-Pt | Pt-En | Pt-Es | It-En | It-Es | Ru-En | El-En | |--------------|--------------------|-----|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------|-------| | Bilingual | s2t_transformer_xs | 10M | 7.0 | 12.2 | 1.7 | 8.9 | 10.6 | 7.9 | 8.1 | 8.7 | 6.4 | 1.0 | 0.7 | 0.6 | | Multilingual | s2t_transformer_s | 31M | 12.3 | 17.4 | 6.1 | 12.0 | 13.6 | 13.2 | 12.0 | 13.7 | 10.7 | 13.1 | 0.6 | 0.8 | ## Citation Please cite as: ``` @inproceedings{salesky2021mtedx, title={Multilingual TEDx Corpus for Speech Recognition and Translation}, author={Elizabeth Salesky and Matthew Wiesner and Jacob Bremerman and Roldano Cattoni and Matteo Negri and Marco Turchi and Douglas W. 
Oard and Matt Post}, booktitle={Proceedings of Interspeech}, year={2021}, } @inproceedings{wang2020fairseqs2t, title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq}, author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino}, booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations}, year = {2020}, } @inproceedings{ott2019fairseq, title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling}, author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli}, booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations}, year = {2019}, } ``` [[Back]](..) ================================================ FILE: examples/speech_to_text/docs/mustc_example.md ================================================ [[Back]](..) # S2T Example: Speech Translation (ST) on MuST-C [MuST-C](https://www.aclweb.org/anthology/N19-1202) is a multilingual speech-to-text translation corpus with 8-language translations on English TED talks. We match the state-of-the-art performance in [ESPNet-ST](https://arxiv.org/pdf/2004.10234.pdf) with a simpler model training pipeline.
## Data Preparation [Download](https://ict.fbk.eu/must-c) and unpack MuST-C data to a path `${MUSTC_ROOT}/en-${TARGET_LANG_ID}`, then preprocess it with ```bash # additional Python packages for S2T data processing/model training pip install pandas torchaudio soundfile sentencepiece # Generate TSV manifests, features, vocabulary # and configuration for each language python examples/speech_to_text/prep_mustc_data.py \ --data-root ${MUSTC_ROOT} --task asr \ --vocab-type unigram --vocab-size 5000 python examples/speech_to_text/prep_mustc_data.py \ --data-root ${MUSTC_ROOT} --task st \ --vocab-type unigram --vocab-size 8000 # Add vocabulary and configuration for joint data # (based on the manifests and features generated above) python examples/speech_to_text/prep_mustc_data.py \ --data-root ${MUSTC_ROOT} --task asr --joint \ --vocab-type unigram --vocab-size 10000 python examples/speech_to_text/prep_mustc_data.py \ --data-root ${MUSTC_ROOT} --task st --joint \ --vocab-type unigram --vocab-size 10000 ``` The generated files (manifest, features, vocabulary and data configuration) will be added to `${MUSTC_ROOT}/en-${TARGET_LANG_ID}` (per-language data) and `MUSTC_ROOT` (joint data). 
Download our vocabulary files if you want to use our pre-trained models: - ASR: [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_asr_vocab_unigram5000.zip), [En-Nl](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_asr_vocab_unigram5000.zip), [En-Es](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_asr_vocab_unigram5000.zip), [En-Fr](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_asr_vocab_unigram5000.zip), [En-It](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_asr_vocab_unigram5000.zip), [En-Pt](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_asr_vocab_unigram5000.zip), [En-Ro](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_asr_vocab_unigram5000.zip), [En-Ru](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_asr_vocab_unigram5000.zip), [Joint](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_vocab_unigram10000.zip) - ST: [En-De](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_st_vocab_unigram8000.zip), [En-Nl](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_st_vocab_unigram8000.zip), [En-Es](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_st_vocab_unigram8000.zip), [En-Fr](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_st_vocab_unigram8000.zip), [En-It](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_st_vocab_unigram8000.zip), [En-Pt](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_st_vocab_unigram8000.zip), [En-Ro](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_st_vocab_unigram8000.zip), [En-Ru](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_st_vocab_unigram8000.zip), [Multilingual](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_multilingual_st_vocab_unigram10000.zip) ## ASR #### Training En-De as example: ```bash fairseq-train ${MUSTC_ROOT}/en-de \ --config-yaml config_asr.yaml --train-subset train_asr --valid-subset dev_asr \ --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ --task speech_to_text --criterion label_smoothed_cross_entropy 
--label-smoothing 0.1 --report-accuracy \ --arch s2t_transformer_s --optimizer adam --lr 1e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 ``` For joint model (using ASR data from all 8 directions): ```bash fairseq-train ${MUSTC_ROOT} \ --config-yaml config_asr.yaml \ --train-subset train_de_asr,train_nl_asr,train_es_asr,train_fr_asr,train_it_asr,train_pt_asr,train_ro_asr,train_ru_asr \ --valid-subset dev_de_asr,dev_nl_asr,dev_es_asr,dev_fr_asr,dev_it_asr,dev_pt_asr,dev_ro_asr,dev_ru_asr \ --save-dir ${JOINT_ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ --arch s2t_transformer_s --optimizer adam --lr 1e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 ``` where `ASR_SAVE_DIR` (`JOINT_ASR_SAVE_DIR`) is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. 
#### Inference & Evaluation ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${MUSTC_ROOT}/en-de \ --config-yaml config_asr.yaml --gen-subset tst-COMMON_asr --task speech_to_text \ --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct # For models trained on joint data python scripts/average_checkpoints.py \ --inputs ${JOINT_ASR_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}" for LANG in de nl es fr it pt ro ru; do fairseq-generate ${MUSTC_ROOT} \ --config-yaml config_asr.yaml --gen-subset tst-COMMON_${LANG}_asr --task speech_to_text \ --path ${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \ --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct done ``` #### Results | Data | --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru | Model | |---|---|---|---|---|---|---|---|---|---|---|---| | Single | s2t_transformer_s | 31M | [18.2](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_asr_transformer_s.pt) | [17.6](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_asr_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_asr_transformer_s.pt) | [17.2](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_asr_transformer_s.pt) | [17.9](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_asr_transformer_s.pt) | [19.1](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_asr_transformer_s.pt) | [18.1](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_asr_transformer_s.pt) | [17.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_asr_transformer_s.pt) | (<-Download) | | Joint | s2t_transformer_m | 76M | 16.8 | 16.7 | 16.9 | 16.9 | 17.0 | 17.4 | 17.0 | 16.9 | 
[Download](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_joint_asr_transformer_m.pt) | ## ST #### Training En-De as example: ```bash fairseq-train ${MUSTC_ROOT}/en-de \ --config-yaml config_st.yaml --train-subset train_st --valid-subset dev_st \ --save-dir ${ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ --arch s2t_transformer_s --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} ``` For multilingual model (all 8 directions): ```bash fairseq-train ${MUSTC_ROOT} \ --config-yaml config_st.yaml \ --train-subset train_de_st,train_nl_st,train_es_st,train_fr_st,train_it_st,train_pt_st,train_ro_st,train_ru_st \ --valid-subset dev_de_st,dev_nl_st,dev_es_st,dev_fr_st,dev_it_st,dev_pt_st,dev_ro_st,dev_ru_st \ --save-dir ${MULTILINGUAL_ST_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ --task speech_to_text --criterion label_smoothed_cross_entropy --label-smoothing 0.1 --report-accuracy \ --arch s2t_transformer_s --ignore-prefix-size 1 --optimizer adam --lr 2e-3 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \ --load-pretrained-encoder-from ${JOINT_ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} ``` where `ST_SAVE_DIR` (`MULTILINGUAL_ST_SAVE_DIR`) is the checkpoint root path. The ST encoder is pre-trained by ASR for faster training and better performance: `--load-pretrained-encoder-from <(JOINT_)ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU. For multilingual models, we prepend target language ID token as target BOS, which should be excluded from the training loss via `--ignore-prefix-size 1`. 
#### Inference & Evaluation Average the last 10 checkpoints and evaluate on the `tst-COMMON` split: ```bash CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt python scripts/average_checkpoints.py \ --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" fairseq-generate ${MUSTC_ROOT}/en-de \ --config-yaml config_st.yaml --gen-subset tst-COMMON_st --task speech_to_text \ --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 50000 --beam 5 --scoring sacrebleu # For multilingual models python scripts/average_checkpoints.py \ --inputs ${MULTILINGUAL_ST_SAVE_DIR} --num-epoch-checkpoints 10 \ --output "${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME}" for LANG in de nl es fr it pt ro ru; do fairseq-generate ${MUSTC_ROOT} \ --config-yaml config_st.yaml --gen-subset tst-COMMON_${LANG}_st --task speech_to_text \ --prefix-size 1 --path ${MULTILINGUAL_ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --max-tokens 50000 --beam 5 --scoring sacrebleu done ``` For multilingual models, we force decoding from the target language ID token (as BOS) via `--prefix-size 1`. 
#### Results | Data | --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru | Model | |---|---|---|---|---|---|---|---|---|---|---|---| | Bilingual | s2t_transformer_s | 31M | [22.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_de_st_transformer_s.pt) | [27.3](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_nl_st_transformer_s.pt) | [27.2](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_es_st_transformer_s.pt) | [32.9](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_fr_st_transformer_s.pt) | [22.7](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_it_st_transformer_s.pt) | [28.1](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_pt_st_transformer_s.pt) | [21.9](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ro_st_transformer_s.pt) | [15.3](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_ru_st_transformer_s.pt) | (<-Download) | | Multilingual | s2t_transformer_m | 76M | 24.5 | 28.6 | 28.2 | 34.9 | 24.6 | 31.1 | 23.8 | 16.0 | [Download](https://dl.fbaipublicfiles.com/fairseq/s2t/mustc_multilingual_st_transformer_m.pt) | [[Back]](..) ================================================ FILE: examples/speech_to_text/docs/simulst_mustc_example.md ================================================ # Simultaneous Speech Translation (SimulST) on MuST-C This is a tutorial on training and evaluating a transformer *wait-k* simultaneous model on the MuST-C English-German dataset, from [SimulMT to SimulST: Adapting Simultaneous Text Translation to End-to-End Simultaneous Speech Translation](https://www.aclweb.org/anthology/2020.aacl-main.58.pdf). [MuST-C](https://www.aclweb.org/anthology/N19-1202) is a multilingual speech-to-text translation corpus with 8-language translations on English TED talks. ## Data Preparation This section introduces the data preparation for training and evaluation.
If you only want to evaluate the model, please jump to [Inference & Evaluation](#inference--evaluation) [Download](https://ict.fbk.eu/must-c) and unpack MuST-C data to a path `${MUSTC_ROOT}/en-${TARGET_LANG_ID}`, then preprocess it with ```bash # Additional Python packages for S2T data processing/model training pip install pandas torchaudio sentencepiece # Generate TSV manifests, features, vocabulary, # global cepstral mean and variance estimation, # and configuration for each language cd fairseq python examples/speech_to_text/prep_mustc_data.py \ --data-root ${MUSTC_ROOT} --task asr \ --vocab-type unigram --vocab-size 10000 \ --cmvn-type global python examples/speech_to_text/prep_mustc_data.py \ --data-root ${MUSTC_ROOT} --task st \ --vocab-type unigram --vocab-size 10000 \ --cmvn-type global ``` ## ASR Pretraining We need a pretrained offline ASR model. We assume the save directory of the ASR model is `${ASR_SAVE_DIR}`. The following command (and the subsequent training commands in this tutorial) assumes training on 1 GPU (you can also train on 8 GPUs and remove the `--update-freq 8` option). ``` fairseq-train ${MUSTC_ROOT}/en-de \ --config-yaml config_asr.yaml --train-subset train_asr --valid-subset dev_asr \ --save-dir ${ASR_SAVE_DIR} --num-workers 4 --max-tokens 40000 --max-update 100000 \ --task speech_to_text --criterion label_smoothed_cross_entropy --report-accuracy \ --arch convtransformer_espnet --optimizer adam --lr 0.0005 --lr-scheduler inverse_sqrt \ --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 ``` A pretrained ASR checkpoint can be downloaded [here](https://dl.fbaipublicfiles.com/simultaneous_translation/must_c_v1_en_de_pretrained_asr) ## Simultaneous Speech Translation Training ### Wait-K with fixed pre-decision module Fixed pre-decision indicates that the model operates its simultaneous policy on the boundaries of fixed chunks.
Here is an example of fixed pre-decision ratio 7 (the simultaneous decision is made every 7 encoder states) and a wait-3 policy model. We assume the save directory is `${ST_SAVE_DIR}`. ```bash fairseq-train ${MUSTC_ROOT}/en-de \ --config-yaml config_st.yaml --train-subset train_st --valid-subset dev_st \ --save-dir ${ST_SAVE_DIR} --num-workers 8 \ --optimizer adam --lr 0.0001 --lr-scheduler inverse_sqrt --clip-norm 10.0 \ --criterion label_smoothed_cross_entropy \ --warmup-updates 4000 --max-update 100000 --max-tokens 40000 --seed 2 \ --load-pretrained-encoder-from ${ASR_SAVE_DIR}/checkpoint_best.pt \ --task speech_to_text \ --arch convtransformer_simul_trans_espnet \ --simul-type waitk_fixed_pre_decision \ --waitk-lagging 3 \ --fixed-pre-decision-ratio 7 \ --update-freq 8 ``` ### Monotonic multihead attention with fixed pre-decision module ``` fairseq-train ${MUSTC_ROOT}/en-de \ --config-yaml config_st.yaml --train-subset train_st --valid-subset dev_st \ --save-dir ${ST_SAVE_DIR} --num-workers 8 \ --optimizer adam --lr 0.0001 --lr-scheduler inverse_sqrt --clip-norm 10.0 \ --warmup-updates 4000 --max-update 100000 --max-tokens 40000 --seed 2 \ --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --task speech_to_text \ --criterion latency_augmented_label_smoothed_cross_entropy \ --latency-weight-avg 0.1 \ --arch convtransformer_simul_trans_espnet \ --simul-type infinite_lookback_fixed_pre_decision \ --fixed-pre-decision-ratio 7 \ --update-freq 8 ``` ## Inference & Evaluation [SimulEval](https://github.com/facebookresearch/SimulEval) is used for evaluation. The following command is for evaluation. ``` git clone https://github.com/facebookresearch/SimulEval.git cd SimulEval pip install -e .
simuleval \ --agent ${FAIRSEQ}/examples/speech_to_text/simultaneous_translation/agents/fairseq_simul_st_agent.py --source ${SRC_LIST_OF_AUDIO} --target ${TGT_FILE} --data-bin ${MUSTC_ROOT}/en-de \ --config config_st.yaml \ --model-path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} \ --output ${OUTPUT} \ --scores ``` The source file `${SRC_LIST_OF_AUDIO}` is a list of paths of audio files. Assuming your audio files are stored at `/home/user/data`, it should look like this: ```bash /home/user/data/audio-1.wav /home/user/data/audio-2.wav ``` Each line of the target file `${TGT_FILE}` is the translation of the corresponding audio file. ```bash Translation_1 Translation_2 ``` The evaluation runs on the original MUSTC segmentation. The following command will generate the wav list and text file for an evaluation set `${SPLIT}` (chosen from `dev`, `tst-COMMON` and `tst-HE`) in MUSTC to `${EVAL_DATA}`. ```bash python ${FAIRSEQ}/examples/speech_to_text/seg_mustc_data.py \ --data-root ${MUSTC_ROOT} --lang de \ --split ${SPLIT} --task st \ --output ${EVAL_DATA} ``` The `--data-bin` and `--config` should be the same as in the previous section if you prepare the data from scratch. If you only need to run evaluation, a prepared data directory can be found [here](https://dl.fbaipublicfiles.com/simultaneous_translation/must_c_v1.0_en_de_databin.tgz). It contains - `spm_unigram10000_st.model`: a sentencepiece model binary. - `spm_unigram10000_st.txt`: the dictionary file generated by the sentencepiece model. - `gcmvn.npz`: the binary for global cepstral mean and variance. - `config_st.yaml`: the config yaml file. It looks like this. You will need to set the absolute paths for `sentencepiece_model` and `stats_npz_path` if the data directory is downloaded.
```yaml bpe_tokenizer: bpe: sentencepiece sentencepiece_model: ABS_PATH_TO_SENTENCEPIECE_MODEL global_cmvn: stats_npz_path: ABS_PATH_TO_GCMVN_FILE input_channels: 1 input_feat_per_channel: 80 sampling_alpha: 1.0 specaugment: freq_mask_F: 27 freq_mask_N: 1 time_mask_N: 1 time_mask_T: 100 time_mask_p: 1.0 time_wrap_W: 0 transforms: '*': - global_cmvn _train: - global_cmvn - specaugment vocab_filename: spm_unigram10000_st.txt ``` Notice that once a `--data-bin` is set, the `--config` is the base name of the config yaml, not the full path. Set `--model-path` to the model checkpoint. A pretrained checkpoint can be downloaded from [here](https://dl.fbaipublicfiles.com/simultaneous_translation/convtransformer_wait5_pre7), which is a wait-5 model with a pre-decision of 280 ms. The result of this model on `tst-COMMON` is: ```bash { "Quality": { "BLEU": 13.94974229366959 }, "Latency": { "AL": 1751.8031870037803, "AL_CA": 2338.5911762796536, "AP": 0.7931395378788959, "AP_CA": 0.9405103863210942, "DAL": 1987.7811616943081, "DAL_CA": 2425.2751560926167 } } ``` If `--output ${OUTPUT}` option is used, the detailed log and scores will be stored under the `${OUTPUT}` directory. The quality is measured by detokenized BLEU. So make sure that the predicted words sent to the server are detokenized. The latency metrics are * Average Proportion * Average Lagging * Differentiable Average Lagging Again they will also be evaluated on detokenized text. ================================================ FILE: examples/speech_to_text/prep_covost_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse import logging from pathlib import Path import shutil from tempfile import NamedTemporaryFile from typing import Optional, Tuple import pandas as pd import torchaudio from examples.speech_to_text.data_utils import ( create_zip, extract_fbank_features, filter_manifest_df, gen_config_yaml, gen_vocab, get_zip_manifest, load_df_from_tsv, save_df_to_tsv, ) from torch import Tensor from torch.utils.data import Dataset from torchaudio.datasets.utils import download_url, extract_archive from tqdm import tqdm log = logging.getLogger(__name__) MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"] class CoVoST(Dataset): """Create a Dataset for CoVoST (https://github.com/facebookresearch/covost). Args: root (str): root path to the dataset and generated manifests/features source_language (str): source (audio) language target_language (str, optional): target (text) language, None for no translation (default: None) version (int, optional): CoVoST version. (default: 2) download (bool, optional): Whether to download the dataset if it is not found at root path. (default: ``False``). 
""" COVOST_URL_TEMPLATE = ( "https://dl.fbaipublicfiles.com/covost/" "covost_v2.{src_lang}_{tgt_lang}.tsv.tar.gz" ) VERSIONS = {2} SPLITS = ["train", "dev", "test"] XX_EN_LANGUAGES = { 1: ["fr", "de", "nl", "ru", "es", "it", "tr", "fa", "sv-SE", "mn", "zh-CN"], 2: [ "fr", "de", "es", "ca", "it", "ru", "zh-CN", "pt", "fa", "et", "mn", "nl", "tr", "ar", "sv-SE", "lv", "sl", "ta", "ja", "id", "cy", ], } EN_XX_LANGUAGES = { 1: [], 2: [ "de", "tr", "fa", "sv-SE", "mn", "zh-CN", "cy", "ca", "sl", "et", "id", "ar", "ta", "lv", "ja", ], } def __init__( self, root: str, split: str, source_language: str, target_language: Optional[str] = None, version: int = 2, ) -> None: assert version in self.VERSIONS and split in self.SPLITS assert source_language is not None self.no_translation = target_language is None if not self.no_translation: assert "en" in {source_language, target_language} if source_language == "en": assert target_language in self.EN_XX_LANGUAGES[version] else: assert source_language in self.XX_EN_LANGUAGES[version] else: # Hack here so that we can get "split" column from CoVoST TSV. # Note that we use CoVoST train split for ASR which is an extension # to Common Voice train split. 
target_language = "de" if source_language == "en" else "en" self.root: Path = Path(root) cv_tsv_path = self.root / "validated.tsv" assert cv_tsv_path.is_file() covost_url = self.COVOST_URL_TEMPLATE.format( src_lang=source_language, tgt_lang=target_language ) covost_archive = self.root / Path(covost_url).name if not covost_archive.is_file(): download_url(covost_url, self.root.as_posix(), hash_value=None) extract_archive(covost_archive.as_posix()) cv_tsv = load_df_from_tsv(cv_tsv_path) covost_tsv = load_df_from_tsv( self.root / Path(covost_url).name.replace(".tar.gz", "") ) df = pd.merge( left=cv_tsv[["path", "sentence", "client_id"]], right=covost_tsv[["path", "translation", "split"]], how="inner", on="path", ) if split == "train": df = df[(df["split"] == split) | (df["split"] == f"{split}_covost")] else: df = df[df["split"] == split] data = df.to_dict(orient="index").items() data = [v for k, v in sorted(data, key=lambda x: x[0])] self.data = [] for e in data: try: path = self.root / "clips" / e["path"] _ = torchaudio.info(path.as_posix()) self.data.append(e) except RuntimeError: pass def __getitem__( self, n: int ) -> Tuple[Tensor, int, str, str, Optional[str], str, str]: """Load the n-th sample from the dataset. 
Args: n (int): The index of the sample to be loaded Returns: tuple: ``(waveform, sample_rate, sentence, translation, speaker_id, sample_id)`` """ data = self.data[n] path = self.root / "clips" / data["path"] waveform, sample_rate = torchaudio.load(path) sentence = data["sentence"] translation = None if self.no_translation else data["translation"] speaker_id = data["client_id"] _id = data["path"].replace(".mp3", "") return waveform, sample_rate, sentence, translation, speaker_id, _id def __len__(self) -> int: return len(self.data) def process(args): root = Path(args.data_root).absolute() / args.src_lang if not root.is_dir(): raise NotADirectoryError(f"{root} does not exist") # Extract features feature_root = root / "fbank80" feature_root.mkdir(exist_ok=True) for split in CoVoST.SPLITS: print(f"Fetching split {split}...") dataset = CoVoST(root, split, args.src_lang, args.tgt_lang) print("Extracting log mel filter bank features...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): extract_fbank_features( waveform, sample_rate, feature_root / f"{utt_id}.npy" ) # Pack features into ZIP zip_path = root / "fbank80.zip" print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] task = f"asr_{args.src_lang}" if args.tgt_lang is not None: task = f"st_{args.src_lang}_{args.tgt_lang}" for split in CoVoST.SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = CoVoST(root, split, args.src_lang, args.tgt_lang) for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): manifest["id"].append(utt_id) manifest["audio"].append(audio_paths[utt_id]) manifest["n_frames"].append(audio_lengths[utt_id]) manifest["tgt_text"].append(src_utt if args.tgt_lang is None else tgt_utt) manifest["speaker"].append(speaker_id) is_train_split = split.startswith("train") if is_train_split: 
train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) save_df_to_tsv(df, root / f"{split}_{task}.tsv") # Generate vocab vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{task}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), root / spm_filename_prefix, args.vocab_type, args.vocab_size ) # Generate config YAML gen_config_yaml( root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{task}.yaml", specaugment_policy="lb", ) # Clean up shutil.rmtree(feature_root) def main(): parser = argparse.ArgumentParser() parser.add_argument( "--data-root", "-d", required=True, type=str, help="data root with sub-folders for each language <root>/<src_lang>" ) parser.add_argument( "--vocab-type", default="unigram", required=True, type=str, choices=["bpe", "unigram", "char"], ), parser.add_argument("--vocab-size", default=1000, type=int) parser.add_argument("--src-lang", "-s", required=True, type=str) parser.add_argument("--tgt-lang", "-t", type=str) args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_text/prep_librispeech_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse import logging from pathlib import Path import shutil from tempfile import NamedTemporaryFile import pandas as pd from examples.speech_to_text.data_utils import ( create_zip, extract_fbank_features, gen_config_yaml, gen_vocab, get_zip_manifest, save_df_to_tsv, ) from torchaudio.datasets import LIBRISPEECH from tqdm import tqdm log = logging.getLogger(__name__) SPLITS = [ "train-clean-100", "train-clean-360", "train-other-500", "dev-clean", "dev-other", "test-clean", "test-other", ] MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"] def process(args): out_root = Path(args.output_root).absolute() out_root.mkdir(exist_ok=True) # Extract features feature_root = out_root / "fbank80" feature_root.mkdir(exist_ok=True) for split in SPLITS: print(f"Fetching split {split}...") dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True) print("Extracting log mel filter bank features...") for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset): sample_id = f"{spk_id}-{chapter_no}-{utt_no}" extract_fbank_features( wav, sample_rate, feature_root / f"{sample_id}.npy" ) # Pack features into ZIP zip_path = out_root / "fbank80.zip" print("ZIPing features...") create_zip(feature_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in SPLITS: manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = LIBRISPEECH(out_root.as_posix(), url=split) for _, _, utt, spk_id, chapter_no, utt_no in tqdm(dataset): sample_id = f"{spk_id}-{chapter_no}-{utt_no}" manifest["id"].append(sample_id) manifest["audio"].append(audio_paths[sample_id]) manifest["n_frames"].append(audio_lengths[sample_id]) manifest["tgt_text"].append(utt.lower()) manifest["speaker"].append(spk_id) save_df_to_tsv( pd.DataFrame.from_dict(manifest), out_root / f"{split}.tsv" ) if split.startswith("train"): train_text.extend(manifest["tgt_text"]) 
# Generate vocab vocab_size = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), out_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML gen_config_yaml( out_root, spm_filename=spm_filename_prefix + ".model", specaugment_policy="ld" ) # Clean up shutil.rmtree(feature_root) def main(): parser = argparse.ArgumentParser() parser.add_argument("--output-root", "-o", required=True, type=str) parser.add_argument( "--vocab-type", default="unigram", required=True, type=str, choices=["bpe", "unigram", "char"], ), parser.add_argument("--vocab-size", default=10000, type=int) args = parser.parse_args() process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_text/prep_mtedx_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import logging import os from pathlib import Path import shutil from itertools import groupby from tempfile import NamedTemporaryFile from typing import Tuple import pandas as pd import soundfile as sf from examples.speech_to_text.data_utils import ( create_zip, extract_fbank_features, filter_manifest_df, gen_config_yaml, gen_vocab, get_zip_manifest, load_df_from_tsv, save_df_to_tsv, ) import torch from torch.utils.data import Dataset from tqdm import tqdm from fairseq.data.audio.audio_utils import get_waveform, convert_waveform log = logging.getLogger(__name__) MANIFEST_COLUMNS = [ "id", "audio", "n_frames", "tgt_text", "speaker", "tgt_lang" ] class mTEDx(Dataset): """ Create a Dataset for Multilingual TEDx. 
Each item is a tuple of the form: waveform, sample_rate, source utterance, target utterance, speaker_id, utterance_id """ SPLITS = ["train", "valid", "test"] LANGPAIRS = ["es-es", "fr-fr", "pt-pt", "it-it", "ru-ru", "el-el", "ar-ar", "de-de", "es-en", "es-fr", "es-pt", "es-it", "fr-en", "fr-es", "fr-pt", "pt-en", "pt-es", "it-en", "it-es", "ru-en", "el-en"] def __init__(self, root: str, lang: str, split: str) -> None: assert split in self.SPLITS and lang in self.LANGPAIRS _root = Path(root) / f"{lang}" / "data" / split wav_root, txt_root = _root / "wav", _root / "txt" assert _root.is_dir() and wav_root.is_dir() and txt_root.is_dir() # Load audio segments try: import yaml except ImportError: print( "Please install PyYAML to load the Multilingual TEDx YAML files" ) with open(txt_root / f"{split}.yaml") as f: segments = yaml.load(f, Loader=yaml.BaseLoader) # Load source and target utterances src, tgt = lang.split("-") for _lang in [src, tgt]: with open(txt_root / f"{split}.{_lang}") as f: utterances = [r.strip() for r in f] assert len(segments) == len(utterances) for i, u in enumerate(utterances): segments[i][_lang] = u # Gather info self.data = [] for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]): wav_filename = wav_filename.replace(".wav", ".flac") wav_path = wav_root / wav_filename sample_rate = sf.info(wav_path.as_posix()).samplerate seg_group = sorted(_seg_group, key=lambda x: float(x["offset"])) for i, segment in enumerate(seg_group): offset = int(float(segment["offset"]) * sample_rate) n_frames = int(float(segment["duration"]) * sample_rate) _id = f"{wav_path.stem}_{i}" self.data.append( ( wav_path.as_posix(), offset, n_frames, sample_rate, segment[src], segment[tgt], segment["speaker_id"], tgt, _id, ) ) def __getitem__( self, n: int ) -> Tuple[torch.Tensor, int, str, str, str, str, str]: wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, tgt_lang, \ utt_id = self.data[n] waveform, _ = get_waveform(wav_path, frames=n_frames, 
start=offset) waveform = torch.from_numpy(waveform) return waveform, sr, src_utt, tgt_utt, spk_id, tgt_lang, utt_id def __len__(self) -> int: return len(self.data) def process(args): root = Path(args.data_root).absolute() for lang in mTEDx.LANGPAIRS: cur_root = root / f"{lang}" if not cur_root.is_dir(): print(f"{cur_root.as_posix()} does not exist. Skipped.") continue # Extract features audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80") audio_root.mkdir(exist_ok=True) for split in mTEDx.SPLITS: print(f"Fetching split {split}...") dataset = mTEDx(root.as_posix(), lang, split) if args.use_audio_input: print("Converting audios...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): tgt_sample_rate = 16_000 _wavform, _ = convert_waveform( waveform, sample_rate, to_mono=True, to_sample_rate=tgt_sample_rate ) sf.write( (audio_root / f"{utt_id}.flac").as_posix(), _wavform.numpy(), tgt_sample_rate ) else: print("Extracting log mel filter bank features...") for waveform, sample_rate, _, _, _, _, utt_id in tqdm(dataset): extract_fbank_features( waveform, sample_rate, audio_root / f"{utt_id}.npy" ) # Pack features into ZIP zip_path = cur_root / f"{audio_root.name}.zip" print("ZIPing audios/features...") create_zip(audio_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest(zip_path) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in mTEDx.SPLITS: is_train_split = split.startswith("train") manifest = {c: [] for c in MANIFEST_COLUMNS} ds = mTEDx(args.data_root, lang, split) for _, _, src_utt, tgt_utt, spk_id, tgt_lang, utt_id in tqdm(ds): manifest["id"].append(utt_id) manifest["audio"].append(audio_paths[utt_id]) manifest["n_frames"].append(audio_lengths[utt_id]) manifest["tgt_text"].append( src_utt if args.task == "asr" else tgt_utt ) manifest["speaker"].append(spk_id) manifest["tgt_lang"].append(tgt_lang) if is_train_split: train_text.extend(manifest["tgt_text"]) df = 
pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv") # Generate vocab v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML if args.use_audio_input: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy=None, extra={"use_audio_input": True} ) else: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="lb", ) # Clean up shutil.rmtree(audio_root) def process_joint(args): cur_root = Path(args.data_root) assert all((cur_root / f"{lang}").is_dir() for lang in mTEDx.LANGPAIRS), \ "do not have downloaded data available for all languages" # Generate vocab vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for lang in mTEDx.LANGPAIRS: tsv_path = cur_root / f"{lang}" / f"train_{args.task}.tsv" df = load_df_from_tsv(tsv_path) for t in df["tgt_text"]: f.write(t + "\n") special_symbols = None if args.joint: # Add tgt_lang tags to dict special_symbols = list( {f'<lang:{lang.split("-")[1]}>' for lang in mTEDx.LANGPAIRS} ) gen_vocab( Path(f.name), cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, special_symbols=special_symbols ) # Generate config YAML gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="ld", prepend_tgt_lang_tag=(args.joint), ) # Make symbolic links to manifests for lang in mTEDx.LANGPAIRS: for split in 
mTEDx.SPLITS: src_path = cur_root / f"{lang}" / f"{split}_{args.task}.tsv" desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv" if not desc_path.is_symlink(): os.symlink(src_path, desc_path) def main(): parser = argparse.ArgumentParser() parser.add_argument("--data-root", "-d", required=True, type=str) parser.add_argument( "--vocab-type", default="unigram", required=True, type=str, choices=["bpe", "unigram", "char"], ), parser.add_argument("--vocab-size", default=8000, type=int) parser.add_argument("--task", type=str, choices=["asr", "st"]) parser.add_argument("--joint", action="store_true", help="") parser.add_argument("--use-audio-input", action="store_true") args = parser.parse_args() if args.joint: process_joint(args) else: process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_text/prep_mustc_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import logging import os from pathlib import Path import shutil from itertools import groupby from tempfile import NamedTemporaryFile from typing import Tuple import numpy as np import pandas as pd import soundfile as sf from examples.speech_to_text.data_utils import ( create_zip, extract_fbank_features, filter_manifest_df, gen_config_yaml, gen_vocab, get_zip_manifest, load_df_from_tsv, save_df_to_tsv, cal_gcmvn_stats, ) import torch from torch.utils.data import Dataset from tqdm import tqdm from fairseq.data.audio.audio_utils import get_waveform, convert_waveform log = logging.getLogger(__name__) MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"] class MUSTC(Dataset): """ Create a Dataset for MuST-C. 
Each item is a tuple of the form: waveform, sample_rate, source utterance, target utterance, speaker_id, utterance_id """ SPLITS = ["train", "dev", "tst-COMMON", "tst-HE"] LANGUAGES = ["de", "es", "fr", "it", "nl", "pt", "ro", "ru"] def __init__(self, root: str, lang: str, split: str) -> None: assert split in self.SPLITS and lang in self.LANGUAGES _root = Path(root) / f"en-{lang}" / "data" / split wav_root, txt_root = _root / "wav", _root / "txt" assert _root.is_dir() and wav_root.is_dir() and txt_root.is_dir() # Load audio segments try: import yaml except ImportError: print("Please install PyYAML to load the MuST-C YAML files") with open(txt_root / f"{split}.yaml") as f: segments = yaml.load(f, Loader=yaml.BaseLoader) # Load source and target utterances for _lang in ["en", lang]: with open(txt_root / f"{split}.{_lang}") as f: utterances = [r.strip() for r in f] assert len(segments) == len(utterances) for i, u in enumerate(utterances): segments[i][_lang] = u # Gather info self.data = [] for wav_filename, _seg_group in groupby(segments, lambda x: x["wav"]): wav_path = wav_root / wav_filename sample_rate = sf.info(wav_path.as_posix()).samplerate seg_group = sorted(_seg_group, key=lambda x: x["offset"]) for i, segment in enumerate(seg_group): offset = int(float(segment["offset"]) * sample_rate) n_frames = int(float(segment["duration"]) * sample_rate) _id = f"{wav_path.stem}_{i}" self.data.append( ( wav_path.as_posix(), offset, n_frames, sample_rate, segment["en"], segment[lang], segment["speaker_id"], _id, ) ) def __getitem__( self, n: int ) -> Tuple[torch.Tensor, int, str, str, str, str]: wav_path, offset, n_frames, sr, src_utt, tgt_utt, spk_id, \ utt_id = self.data[n] waveform, _ = get_waveform(wav_path, frames=n_frames, start=offset) waveform = torch.from_numpy(waveform) return waveform, sr, src_utt, tgt_utt, spk_id, utt_id def __len__(self) -> int: return len(self.data) def process(args): root = Path(args.data_root).absolute() for lang in MUSTC.LANGUAGES: cur_root 
= root / f"en-{lang}" if not cur_root.is_dir(): print(f"{cur_root.as_posix()} does not exist. Skipped.") continue # Extract features audio_root = cur_root / ("flac" if args.use_audio_input else "fbank80") audio_root.mkdir(exist_ok=True) for split in MUSTC.SPLITS: print(f"Fetching split {split}...") dataset = MUSTC(root.as_posix(), lang, split) if args.use_audio_input: print("Converting audios...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): tgt_sample_rate = 16_000 _wavform, _ = convert_waveform( waveform, sample_rate, to_mono=True, to_sample_rate=tgt_sample_rate ) sf.write( (audio_root / f"{utt_id}.flac").as_posix(), _wavform.T.numpy(), tgt_sample_rate ) else: print("Extracting log mel filter bank features...") gcmvn_feature_list = [] if split == 'train' and args.cmvn_type == "global": print("And estimating cepstral mean and variance stats...") for waveform, sample_rate, _, _, _, utt_id in tqdm(dataset): features = extract_fbank_features( waveform, sample_rate, audio_root / f"{utt_id}.npy" ) if split == 'train' and args.cmvn_type == "global": if len(gcmvn_feature_list) < args.gcmvn_max_num: gcmvn_feature_list.append(features) if split == 'train' and args.cmvn_type == "global": # Estimate and save cmv stats = cal_gcmvn_stats(gcmvn_feature_list) with open(cur_root / "gcmvn.npz", "wb") as f: np.savez(f, mean=stats["mean"], std=stats["std"]) # Pack features into ZIP zip_path = cur_root / f"{audio_root.name}.zip" print("ZIPing audios/features...") create_zip(audio_root, zip_path) print("Fetching ZIP manifest...") audio_paths, audio_lengths = get_zip_manifest( zip_path, is_audio=args.use_audio_input, ) # Generate TSV manifest print("Generating manifest...") train_text = [] for split in MUSTC.SPLITS: is_train_split = split.startswith("train") manifest = {c: [] for c in MANIFEST_COLUMNS} dataset = MUSTC(args.data_root, lang, split) for _, _, src_utt, tgt_utt, speaker_id, utt_id in tqdm(dataset): manifest["id"].append(utt_id) 
manifest["audio"].append(audio_paths[utt_id]) manifest["n_frames"].append(audio_lengths[utt_id]) manifest["tgt_text"].append( src_utt if args.task == "asr" else tgt_utt ) manifest["speaker"].append(speaker_id) if is_train_split: train_text.extend(manifest["tgt_text"]) df = pd.DataFrame.from_dict(manifest) df = filter_manifest_df(df, is_train_split=is_train_split) save_df_to_tsv(df, cur_root / f"{split}_{args.task}.tsv") # Generate vocab v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for t in train_text: f.write(t + "\n") gen_vocab( Path(f.name), cur_root / spm_filename_prefix, args.vocab_type, args.vocab_size, ) # Generate config YAML if args.use_audio_input: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy=None, extra={"use_audio_input": True} ) else: gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="lb", cmvn_type=args.cmvn_type, gcmvn_path=( cur_root / "gcmvn.npz" if args.cmvn_type == "global" else None ), ) # Clean up shutil.rmtree(audio_root) def process_joint(args): cur_root = Path(args.data_root) assert all( (cur_root / f"en-{lang}").is_dir() for lang in MUSTC.LANGUAGES ), "do not have downloaded data available for all 8 languages" # Generate vocab vocab_size_str = "" if args.vocab_type == "char" else str(args.vocab_size) spm_filename_prefix = f"spm_{args.vocab_type}{vocab_size_str}_{args.task}" with NamedTemporaryFile(mode="w") as f: for lang in MUSTC.LANGUAGES: tsv_path = cur_root / f"en-{lang}" / f"train_{args.task}.tsv" df = load_df_from_tsv(tsv_path) for t in df["tgt_text"]: f.write(t + "\n") special_symbols = None if args.task == 'st': special_symbols = [f'<lang:{lang}>' for lang in MUSTC.LANGUAGES] gen_vocab( Path(f.name), cur_root / spm_filename_prefix, 
args.vocab_type, args.vocab_size, special_symbols=special_symbols ) # Generate config YAML gen_config_yaml( cur_root, spm_filename=spm_filename_prefix + ".model", yaml_filename=f"config_{args.task}.yaml", specaugment_policy="ld", prepend_tgt_lang_tag=(args.task == "st"), ) # Make symbolic links to manifests for lang in MUSTC.LANGUAGES: for split in MUSTC.SPLITS: src_path = cur_root / f"en-{lang}" / f"{split}_{args.task}.tsv" desc_path = cur_root / f"{split}_{lang}_{args.task}.tsv" if not desc_path.is_symlink(): os.symlink(src_path, desc_path) def main(): parser = argparse.ArgumentParser() parser.add_argument("--data-root", "-d", required=True, type=str) parser.add_argument( "--vocab-type", default="unigram", required=True, type=str, choices=["bpe", "unigram", "char"], ), parser.add_argument("--vocab-size", default=8000, type=int) parser.add_argument("--task", type=str, choices=["asr", "st"]) parser.add_argument("--joint", action="store_true", help="") parser.add_argument( "--cmvn-type", default="utterance", choices=["global", "utterance"], help="The type of cepstral mean and variance normalization" ) parser.add_argument( "--gcmvn-max-num", default=150000, type=int, help="Maximum number of sentences to use to estimate global mean and " "variance" ) parser.add_argument("--use-audio-input", action="store_true") args = parser.parse_args() if args.joint: process_joint(args) else: process(args) if __name__ == "__main__": main() ================================================ FILE: examples/speech_to_text/seg_mustc_data.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse import logging from pathlib import Path import soundfile as sf from examples.speech_to_text.prep_mustc_data import ( MUSTC ) from tqdm import tqdm log = logging.getLogger(__name__) def main(args): root = Path(args.data_root).absolute() lang = args.lang split = args.split cur_root = root / f"en-{lang}" assert cur_root.is_dir(), ( f"{cur_root.as_posix()} does not exist. Skipped." ) dataset = MUSTC(root.as_posix(), lang, split) output = Path(args.output).absolute() output.mkdir(exist_ok=True) f_text = open(output / f"{split}.{lang}", "w") f_wav_list = open(output / f"{split}.wav_list", "w") for waveform, sample_rate, _, text, _, utt_id in tqdm(dataset): sf.write( output / f"{utt_id}.wav", waveform.squeeze(0).numpy(), samplerate=int(sample_rate) ) f_text.write(text + "\n") f_wav_list.write(str(output / f"{utt_id}.wav") + "\n") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--data-root", "-d", required=True, type=str) parser.add_argument("--task", required=True, type=str, choices=["asr", "st"]) parser.add_argument("--lang", required=True, type=str) parser.add_argument("--output", required=True, type=str) parser.add_argument("--split", required=True, choices=MUSTC.SPLITS) args = parser.parse_args() main(args) ================================================ FILE: examples/speech_to_text/simultaneous_translation/agents/fairseq_simul_st_agent.py ================================================ import math import os import json import numpy as np import torch import torchaudio.compliance.kaldi as kaldi import yaml from fairseq import checkpoint_utils, tasks from fairseq.file_io import PathManager try: from simuleval import READ_ACTION, WRITE_ACTION, DEFAULT_EOS from simuleval.agents import SpeechAgent from simuleval.states import ListEntry, SpeechStates except ImportError: print("Please install simuleval 'pip install simuleval'") SHIFT_SIZE = 10 WINDOW_SIZE = 25 SAMPLE_RATE = 16000 FEATURE_DIM = 80 BOW_PREFIX = "\u2581" 
class OnlineFeatureExtractor:
    """
    Extract speech feature on the fly.

    Raw samples are buffered across calls; once at least one analysis window
    is available, Kaldi-style filterbank features are computed and the
    samples that are still needed for the next window are kept.
    """

    def __init__(self, args):
        # shift_size / window_size are multiplied by sample_rate / 1000
        # below, i.e. they are expressed in milliseconds.
        self.shift_size = args.shift_size
        self.window_size = args.window_size
        assert self.window_size >= self.shift_size

        self.sample_rate = args.sample_rate
        self.feature_dim = args.feature_dim
        self.num_samples_per_shift = int(self.shift_size * self.sample_rate / 1000)
        self.num_samples_per_window = int(self.window_size * self.sample_rate / 1000)
        # Float-valued conversion from a duration in ms to a sample count.
        self.len_ms_to_samples = lambda x: x * self.sample_rate / 1000
        self.previous_residual_samples = []
        # Optional mapping with "mean" and "std" entries; None disables CMVN.
        self.global_cmvn = args.global_cmvn

    def clear_cache(self):
        """Drop the samples buffered from previous calls."""
        self.previous_residual_samples = []

    def __call__(self, new_samples):
        """Consume ``new_samples`` (a list) and return the new features.

        Returns ``None`` while fewer than ``num_samples_per_window`` samples
        are buffered; otherwise a tensor built from the filterbank output
        (after :meth:`transform`).
        """
        samples = self.previous_residual_samples + new_samples
        if len(samples) < self.num_samples_per_window:
            # Not enough audio for a single window yet: keep buffering.
            self.previous_residual_samples = samples
            return

        # Part of a window that overlaps with the following window.
        overlap = self.len_ms_to_samples(self.window_size - self.shift_size)

        # num_frames is the number of frames from the new segment
        num_frames = math.floor(
            (len(samples) - overlap) / self.num_samples_per_shift
        )

        # the number of frames used for feature extraction
        # including some part of the previous segment
        effective_num_samples = int(
            num_frames * self.len_ms_to_samples(self.shift_size) + overlap
        )

        input_samples = samples[:effective_num_samples]
        # Everything from the start of the next (incomplete) frame onwards
        # is carried over to the next call.
        self.previous_residual_samples = samples[
            num_frames * self.num_samples_per_shift:
        ]

        # Re-seeding before every fbank call keeps the output reproducible
        # (presumably because of the random dither in fbank -- TODO confirm).
        torch.manual_seed(1)
        output = kaldi.fbank(
            torch.FloatTensor(input_samples).unsqueeze(0),
            num_mel_bins=self.feature_dim,
            frame_length=self.window_size,
            frame_shift=self.shift_size,
        ).numpy()

        output = self.transform(output)

        return torch.from_numpy(output)

    def transform(self, input):
        """Apply global mean/variance normalization when stats are set."""
        if self.global_cmvn is None:
            return input

        mean = self.global_cmvn["mean"]
        std = self.global_cmvn["std"]

        x = np.subtract(input, mean)
        x = np.divide(x, std)
        return x


class TensorListEntry(ListEntry):
    """
    Data structure to store a list of tensor.

    Appended tensors are concatenated along dimension 0 into one tensor that
    is kept in ``self.value``.
    """

    def append(self, value):
        # The first chunk replaces the empty initial value instead of being
        # concatenated with it.
        if len(self.value) == 0:
            self.value = value
            return

        self.value = torch.cat([self.value] + [value], dim=0)

    def info(self):
        """Return a summary dict (type, length and tensor size, if any)."""
        return {
            "type": str(self.new_value_type),
            "length": self.__len__(),
            "value": "" if type(self.value) is list else self.value.size(),
        }


class FairseqSimulSTAgent(SpeechAgent):
    """SimulEval speech agent that drives a fairseq simultaneous ST model."""

    speech_segment_size = 40  # in ms, 4 pooling ratio * 10 ms step size

    def __init__(self, args):
        """Load the model and set up online feature extraction.

        CMVN statistics are looked up in the data config (``--config``) and
        in ``--global-stats``; when both are given, ``--global-stats`` wins.
        """
        super().__init__(args)

        self.eos = DEFAULT_EOS

        self.gpu = getattr(args, "gpu", False)

        self.args = args

        self.load_model_vocab(args)

        # Models with a pre-decision module read several segments per
        # decision, so the segment size is scaled by that ratio.
        if getattr(
            self.model.decoder.layers[0].encoder_attn,
            'pre_decision_ratio',
            None
        ) is not None:
            self.speech_segment_size *= (
                self.model.decoder.layers[0].encoder_attn.pre_decision_ratio
            )

        args.global_cmvn = None
        if args.config:
            with open(os.path.join(args.data_bin, args.config), "r") as f:
                config = yaml.load(f, Loader=yaml.BaseLoader)

            if "global_cmvn" in config:
                args.global_cmvn = np.load(config["global_cmvn"]["stats_npz_path"])

        if args.global_stats:
            with PathManager.open(args.global_stats, "r") as f:
                global_cmvn = json.loads(f.read())
            # OnlineFeatureExtractor reads ``args.global_cmvn``. Previously
            # the statistics were only stored on ``self`` and therefore never
            # reached the feature extractor.
            args.global_cmvn = {
                "mean": global_cmvn["mean"],
                "std": global_cmvn["stddev"],
            }

        # Kept as an attribute for backward compatibility with code that
        # inspected ``agent.global_cmvn``.
        self.global_cmvn = args.global_cmvn

        self.feature_extractor = OnlineFeatureExtractor(args)

        self.max_len = args.max_len

        self.force_finish = args.force_finish

        # Inference only: disable autograd globally.
        torch.set_grad_enabled(False)

    def build_states(self, args, client, sentence_id):
        # Initialize states here, for example add customized entry to states
        # This function will be called at beginning of every new sentence
        states = SpeechStates(args, client, sentence_id, self)
        self.initialize_states(states)
        return states

    def to_device(self, tensor):
        """Move ``tensor`` to the GPU when ``self.gpu`` is set, else to CPU."""
        if self.gpu:
            return tensor.cuda()
        else:
            return tensor.cpu()

    @staticmethod
    def add_args(parser):
        """Register the agent's command-line options on ``parser``."""
        # fmt: off
        parser.add_argument('--model-path', type=str, required=True,
                            help='path to your pretrained model.')
        parser.add_argument("--data-bin", type=str, required=True,
                            help="Path of data binary")
        parser.add_argument("--config", type=str, default=None,
                            help="Path to config yaml file")
        parser.add_argument("--global-stats", type=str, default=None,
                            help="Path to json file containing cmvn stats")
        parser.add_argument("--tgt-splitter-type", type=str, default="SentencePiece",
                            help="Subword splitter type for target text")
        parser.add_argument("--tgt-splitter-path", type=str, default=None,
                            help="Subword splitter model path for target text")
        parser.add_argument("--user-dir", type=str, default="examples/simultaneous_translation",
                            help="User directory for simultaneous translation")
        parser.add_argument("--max-len", type=int, default=200,
                            help="Max length of translation")
        parser.add_argument("--force-finish", default=False, action="store_true",
                            help="Force the model to finish the hypothsis if the source is not finished")
        parser.add_argument("--shift-size", type=int, default=SHIFT_SIZE,
                            help="Shift size of feature extraction window.")
        parser.add_argument("--window-size", type=int, default=WINDOW_SIZE,
                            help="Window size of feature extraction window.")
        parser.add_argument("--sample-rate", type=int, default=SAMPLE_RATE,
                            help="Sample rate")
        parser.add_argument("--feature-dim", type=int, default=FEATURE_DIM,
                            help="Acoustic feature dimension.")

        # fmt: on
        return parser

    def load_model_vocab(self, args):
        """Load the checkpoint at ``args.model_path`` and the target dictionary.

        Raises:
            IOError: if the checkpoint file does not exist.
        """
        filename = args.model_path
        if not os.path.exists(filename):
            raise IOError("Model file not found: {}".format(filename))

        state = checkpoint_utils.load_checkpoint_to_cpu(filename)

        # Point the stored task config at the local data directory / config.
        task_args = state["cfg"]["task"]
        task_args.data = args.data_bin

        if args.config is not None:
            task_args.config_yaml = args.config

        task = tasks.setup_task(task_args)

        # build model for ensemble
        # The full state dict is loaded below, so the separately pretrained
        # encoder/decoder checkpoints are not needed.
        state["cfg"]["model"].load_pretrained_encoder_from = None
        state["cfg"]["model"].load_pretrained_decoder_from = None
        self.model = task.build_model(state["cfg"]["model"])
        self.model.load_state_dict(state["model"], strict=True)
        self.model.eval()
        self.model.share_memory()

        if self.gpu:
            self.model.cuda()

        # Set dictionary
        self.dict = {}
        self.dict["tgt"] = task.target_dictionary

    def initialize_states(self, states):
        """Reset the per-sentence buffers (features, targets, decoder cache)."""
        self.feature_extractor.clear_cache()
        states.units.source = TensorListEntry()
        states.units.target = ListEntry()
        states.incremental_states = dict()

    def segment_to_units(self, segment, states):
        # Convert speech samples to features
        features = self.feature_extractor(segment)
        if features is not None:
            return [features]
        else:
            return []

    def units_to_segment(self, units, states):
        # Merge sub word to full word.
        if self.model.decoder.dictionary.eos() == units[0]:
            return DEFAULT_EOS

        segment = []
        if None in units.value:
            units.value.remove(None)

        # NOTE(review): ``units`` is popped while it is being iterated and a
        # None index still reaches ``dictionary.string``; the logic is kept
        # exactly as it was -- confirm the intended behaviour before changing.
        for index in units:
            if index is None:
                units.pop()
            token = self.model.decoder.dictionary.string([index])
            if token.startswith(BOW_PREFIX):
                if len(segment) == 0:
                    segment += [token.replace(BOW_PREFIX, "")]
                else:
                    # A new word starts: emit the word collected so far and
                    # pop as many units as it had pieces.
                    for j in range(len(segment)):
                        units.pop()

                    string_to_return = ["".join(segment)]

                    if self.model.decoder.dictionary.eos() == units[0]:
                        string_to_return += [DEFAULT_EOS]

                    return string_to_return
            else:
                segment += [token.replace(BOW_PREFIX, "")]

        # Flush everything when the hypothesis ended with EOS or grew past
        # the maximum length.
        if (
            len(units) > 0
            and self.model.decoder.dictionary.eos() == units[-1]
            or len(states.units.target) > self.max_len
        ):
            tokens = [self.model.decoder.dictionary.string([unit]) for unit in units]
            return ["".join(tokens).replace(BOW_PREFIX, "")] + [DEFAULT_EOS]

        return None

    def update_model_encoder(self, states):
        """Re-run the encoder over all source features read so far."""
        if len(states.units.source) == 0:
            return
        src_indices = self.to_device(
            states.units.source.value.unsqueeze(0)
        )
        src_lengths = self.to_device(
            torch.LongTensor([states.units.source.value.size(0)])
        )

        states.encoder_states = self.model.encoder(src_indices, src_lengths)
        torch.cuda.empty_cache()

    def update_states_read(self, states):
        # Happens after a read action.
        self.update_model_encoder(states)

    def policy(self, states):
        """Return READ_ACTION or WRITE_ACTION for the current state."""
        # Nothing has been encoded yet: more input is needed first.
        if not getattr(states, "encoder_states", None):
            return READ_ACTION

        # Decoder input: EOS as the start symbol followed by the targets
        # produced so far (None placeholders are skipped).
        tgt_indices = self.to_device(
            torch.LongTensor(
                [self.model.decoder.dictionary.eos()]
                + [x for x in states.units.target.value if x is not None]
            ).unsqueeze(0)
        )

        states.incremental_states["steps"] = {
            "src": states.encoder_states["encoder_out"][0].size(0),
            "tgt": 1 + len(states.units.target),
        }

        states.incremental_states["online"] = {"only": torch.tensor(not states.finish_read())}

        x, outputs = self.model.decoder.forward(
            prev_output_tokens=tgt_indices,
            encoder_out=states.encoder_states,
            incremental_state=states.incremental_states,
        )

        states.decoder_out = x

        states.decoder_out_extra = outputs

        torch.cuda.empty_cache()

        if outputs.action == 0:
            return READ_ACTION
        else:
            return WRITE_ACTION

    def predict(self, states):
        """Return the argmax token index for the last decoder position.

        Returns ``None`` instead of EOS when ``force_finish`` is set and the
        source has not been fully read yet.
        """
        decoder_states = states.decoder_out

        lprobs = self.model.get_normalized_probs(
            [decoder_states[:, -1:]], log_probs=True
        )

        index = lprobs.argmax(dim=-1)

        index = index[0, 0].item()

        if (
            self.force_finish
            and index == self.model.decoder.dictionary.eos()
            and not states.finish_read()
        ):
            # If we want to force finish the translation
            # (don't stop before finish reading), return a None
            # self.model.decoder.clear_cache(states.incremental_states)
            index = None

        return index
## Pre-trained models Description | Dataset | Model | Test set(s) ---|---|---|--- Stories with Convolutional Model <br> ([Fan et al., 2018](https://arxiv.org/abs/1805.04833)) | [WritingPrompts](https://dl.fbaipublicfiles.com/fairseq/data/writingPrompts.tar.gz) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.bz2) | [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/stories_test.tar.bz2) We provide sample stories generated by the [convolutional seq2seq model](https://dl.fbaipublicfiles.com/fairseq/data/seq2seq_stories.txt) and [fusion model](https://dl.fbaipublicfiles.com/fairseq/data/fusion_stories.txt) from [Fan et al., 2018](https://arxiv.org/abs/1805.04833). The corresponding prompts for the fusion model can be found [here](https://dl.fbaipublicfiles.com/fairseq/data/fusion_prompts.txt). Note that there are unk in the file, as we modeled a small full vocabulary (no BPE or pre-training). We did not use these unk prompts for human evaluation. ## Dataset The dataset can be downloaded like this: ```bash cd examples/stories curl https://dl.fbaipublicfiles.com/fairseq/data/writingPrompts.tar.gz | tar xvzf - ``` and contains a train, test, and valid split. The dataset is described here: https://arxiv.org/abs/1805.04833. We model only the first 1000 words of each story, including one newLine token. ## Example usage First we will preprocess the dataset. Note that the dataset release is the full data, but the paper models the first 1000 words of each story. 
Here is example code that trims the dataset to the first 1000 words of each story: ```python data = ["train", "test", "valid"] for name in data: with open(name + ".wp_target") as f: stories = f.readlines() stories = [" ".join(i.split()[0:1000]) for i in stories] with open(name + ".wp_target", "w") as o: for line in stories: o.write(line.strip() + "\n") ``` Once we've trimmed the data we can binarize it and train our model: ```bash # Binarize the dataset: export TEXT=examples/stories/writingPrompts fairseq-preprocess --source-lang wp_source --target-lang wp_target \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --destdir data-bin/writingPrompts --padding-factor 1 --thresholdtgt 10 --thresholdsrc 10 # Train the model: fairseq-train data-bin/writingPrompts -a fconv_self_att_wp --lr 0.25 --optimizer nag --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau --decoder-attention True --encoder-attention False --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 --source-lang wp_source --target-lang wp_target --gated-attention True --self-attention True --project-input True --pretrained False # Train a fusion model: # add the arguments: --pretrained True --pretrained-checkpoint path/to/checkpoint # Generate: # Note: to load the pretrained model at generation time, you need to pass in a model-override argument to communicate to the fusion model at generation time where you have placed the pretrained checkpoint. By default, it will load the exact path of the fusion model's pretrained model from training time. You should use model-override if you have moved the pretrained model (or are using our provided models). If you are generating from a non-fusion model, the model-override argument is not necessary. 
fairseq-generate data-bin/writingPrompts --path /path/to/trained/model/checkpoint_best.pt --batch-size 32 --beam 1 --sampling --sampling-topk 10 --temperature 0.8 --nbest 1 --model-overrides "{'pretrained_checkpoint':'/path/to/pretrained/model/checkpoint'}" ``` ## Citation ```bibtex @inproceedings{fan2018hierarchical, title = {Hierarchical Neural Story Generation}, author = {Fan, Angela and Lewis, Mike and Dauphin, Yann}, booktitle = {Conference of the Association for Computational Linguistics (ACL)}, year = 2018, } ``` ================================================ FILE: examples/textless_nlp/dgslm/README.md ================================================ # Generative Spoken Dialogue Language Modeling [[paper]](https://arxiv.org/abs/2203.16502) [[demo samples]](https://speechbot.github.io/dgslm/index.html) [[blog]](https://ai.facebook.com/blog/generating-chit-chat-including-laughs-yawns-ums-and-other-nonverbal-cues-from-raw-audio/) This repo contains the code and pre-trained models for the paper _Generative Spoken Dialogue Language Modeling_. <details> <summary>Paper abstract </summary> > We introduce dGSLM, the first "textless" model able to generate audio samples of naturalistic spoken dialogues. It uses recent work on unsupervised spoken unit discovery coupled with a dual-tower transformer architecture with cross-attention trained on 2000 hours of two-channel raw conversational audio (Fisher dataset) without any text or labels. We show that our model is able to generate speech, laughter and other paralinguistic signals in the two channels simultaneously and reproduces more naturalistic and fluid turn taking compared to a text-based cascaded model. </details> ## [Speech-to-Unit Encoder for dGSLM: The Fisher HuBERT model](hubert_fisher/) The [hubert_fisher](hubert_fisher/) repository contains the pre-trained models and recipes to produce discrete units for the dGSLM model.
## [Unit-to-Speech Decoder for dGSLM](vocoder_hifigan/) The [vocoder_hifigan](vocoder_hifigan/) repo contains the vocoder and recipes to synthesize the waveform from the discrete units. ## Spoken Dialogue Transformer Language Model (SpeechDLM) ### Pre-trained model We share the pre-trained model checkpoint for the best configuration in the paper (DLM-5 model, with Edge Unit Prediction & Delayed Duration Prediction objectives), dubbed as `SpeechDLM`, trained on 2000 hours of the Fisher dataset: | Pre-trained SpeechDLM model trained on Fisher dataset | |-----------------------------------------------| |[model checkpoint](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/speech_dlm/speech_dlm_base.pt) - [dictionary 1](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/speech_dlm/dict.unitA.txt) - [dictionary 2](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/speech_dlm/dict.unitB.txt)| The two dictionary files correspond to the two channels, and actually have the same content.
### Sample from a trained model You can sample from a trained SpeechDLM model interactively : ```python from fairseq.models.speech_dlm import SpeechDLM # Load SpeechDLM model speech_dlm = SpeechDLM.from_pretrained( model_name_or_path='/path/to/model/dir', checkpoint_file='speech_dlm_base.pt', data_name_or_path='/path/to/data/dir' ) # Disable dropout speech_dlm.eval() # Move model to GPU speech_dlm.cuda() # Define the input sequences input_sequences = [{ 'unitA': '7 376 376 133 178 486 486 486 486 486 486 486 486 2 486', 'unitB': '7 499 415 177 7 7 7 7 7 7 136 136 289 289 408' }] # Sample from the SpeechDLM model generated_units = speech_dlm.sample( input_sequences, max_len_a = 0, max_len_b = 500, sampling=True, beam=5, ) # >> {'unitA': '7 376 376 133 178 486 486 486 486 486 486 486 486 2 486 486 178 486 486 2 2 376 376 486 486 486 376 376 387 387 ...', # >> 'unitB': '7 499 415 177 7 7 7 7 7 7 136 136 289 289 408 32 428 95 356 141 331 439 350 350 192 331 445 202 104 104 ...'} ``` Or using the `sample_speech_dlm.py` script : ```bash python sample_speech_dlm.py \ --in-file $INPUT_CODE_FILE --out-file $OUTPUT_FILE \ --ckpt $CHECKPOINT_PATH --data $DATA_DIR ``` where each line of INPUT_CODE_FILE is a dictionary with keys `'audio', 'unitA', 'unitB'` as follows : ``` {'audio': 'file_1', 'unitA': '8 8 ... 352 352', 'unitB': '217 8 ... 8 8'} {'audio': 'file_2', 'unitA': '5 5 ... 65 65', 'unitB': '6 35 ... 8 9'} ... ``` This code file can be created with the script `create_code_file.py` (using the outputs of `quantize_with_kmeans.py` [here](hubert_fisher/#encode-audio-to-discrete-units)) : ```bash python examples/textless_nlp/dgslm/create_code_file.py \ $CHANNEL1_UNITS $CHANNEL2_UNITS $OUTPUT_CODE_FILE ``` ### Training a SpeechDLM model #### 1) Data preparation First, you need to prepare the raw dataset.
For each `split` (train, valid), you need two files corresponding to two channels (namely `unitA` and `unitB` for example) containing the units from each channel separately. Make sure that the two files have the same number of lines and that each pair of corresponding lines has the same number of units. Here is an example of `.unitA` file : ``` 7 376 376 133 178 486 486 486 486 376 ``` and the corresponding `.unitB` file : ``` 7 499 415 177 7 7 7 136 331 445 ``` These two files can be obtained using the [example command](hubert_fisher/#encode-audio-to-discrete-units) of hubert fisher, with the `--hide-fname` option added. The raw dataset directory should contain the following files : ``` train.unitA valid.unitA train.unitB valid.unitB ``` Next preprocess/binarize the data with `fairseq-preprocess`, but make sure to preprocess each channel separately, and **rename** the preprocessed files under the following format `${split}.${channel}.{bin, idx}`. Each channel also needs a separate dictionary file under the name `dict.${channel}.txt`.
Here is an example pre-processing code : ```bash # Preprocess the first channel (unitA) fairseq-preprocess --source-lang unitA \ --only-source \ --trainpref $RAW_DATA_DIR/train \ --validpref $RAW_DATA_DIR/valid \ --destdir $BIN_DATA_DIR \ --workers 20 # Preprocess the second channel (unitB) and reuse the dictionary from the first channel fairseq-preprocess --source-lang unitB \ --srcdict $BIN_DATA_DIR/dict.unitA.txt \ --only-source \ --trainpref $RAW_DATA_DIR/train \ --validpref $RAW_DATA_DIR/valid \ --destdir $BIN_DATA_DIR \ --workers 20 # Rename the bin & index files for channel in unitA unitB; do for split in train valid; do mv $BIN_DATA_DIR/${split}.${channel}-None.${channel}.bin $BIN_DATA_DIR/${split}.${channel}.bin mv $BIN_DATA_DIR/${split}.${channel}-None.${channel}.idx $BIN_DATA_DIR/${split}.${channel}.idx done done ``` Finally, the preprocessed (bin) dataset directory should contain the following files : ``` dict.unitA.txt train.unitA.idx train.unitA.bin valid.unitA.idx valid.unitA.bin dict.unitB.txt train.unitB.idx train.unitB.bin valid.unitB.idx valid.unitB.bin ``` #### 2) Train the model To train the SpeechDLM (with the configuration as the pre-trained model) on 2 GPUs : ```bash fairseq-train $BIN_DATA_DIR \ --save-dir $CHECKPOINT_DIR \ --tensorboard-logdir $CHECKPOINT_DIR \ --task speech_dlm_task --channels unitA,unitB \ --next-unit-prediction "False" --edge-unit-prediction "True" \ --duration-prediction "True" --delayed-duration-target "True" \ --criterion speech_dlm_criterion \ --arch speech_dlm --decoder-cross-layers 4 \ --share-decoder-input-output-embed \ --dropout 0.1 --attention-dropout 0.1 \ --optimizer adam --adam-betas "(0.9, 0.98)" --clip-norm 1.0 \ --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 \ --max-tokens 18432 --tokens-per-sample 6144 --sample-break-mode none \ --update-freq 16 --num-workers 4 --skip-invalid-size-inputs-valid-test \ --max-update 250000 --warmup-updates 20000 \ --save-interval-updates 10000 
def main():
    """
    Create code file with the following format:
        {'audio': 'file1', 'unitA': 'file1_chnl1_units', 'unitB': 'file1_chnl2_units'}
        {'audio': 'file2', 'unitA': 'file2_chnl1_units', 'unitB': 'file2_chnl2_units'}
        ...

    Given the input units files
        - channel1_units_file:
            file1|file1_chnl1_units
            file2|file2_chnl1_units
            ...
        - channel2_units_file:
            file1|file1_chnl2_units
            file2|file2_chnl2_units
            ...

    Raises:
        ValueError: if the two units files do not have the same number of
            lines.
        AssertionError: if corresponding lines differ in number of units or
            in base file name.
    """
    # Local import: the module header only imports argparse.
    from itertools import zip_longest

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "channel1_units_file",
        type=str,
        help="Units of the first channel.",
    )
    parser.add_argument(
        "channel2_units_file",
        type=str,
        help="Units of the second channel.",
    )
    parser.add_argument(
        "output_file",
        type=str,
        help="Output file.",
    )
    parser.add_argument(
        "--channels",
        type=str,
        default='unitA,unitB',
        help="Comma-separated list of the channel names to create in the code"
             "(Default: 'unitA,unitB').",
    )

    args = parser.parse_args()

    channel_names = args.channels.split(',')
    if len(channel_names) < 2:
        # Fail before any file is touched instead of with an IndexError
        # in the middle of the loop.
        parser.error("--channels needs two comma-separated channel names")

    # File names are expected to end with "-channel1" / "-channel2"; that
    # many characters are stripped to recover the shared base name.
    suffix_len = len("-channel1")

    with open(args.channel1_units_file) as funit1, \
            open(args.channel2_units_file) as funit2, \
            open(args.output_file, 'w') as fout:
        # zip_longest (instead of zip) exposes files of different length,
        # which would otherwise be truncated silently.
        for lineno, (line1, line2) in enumerate(zip_longest(funit1, funit2), start=1):
            if line1 is None or line2 is None:
                raise ValueError(
                    f"Mismatch number of lines: one units file ends after "
                    f"line {lineno - 1} while the other one continues"
                )

            # The units never contain '|', so split on the last separator
            # to tolerate file names that do.
            fname1, units1 = line1.strip().rsplit('|', 1)
            fname2, units2 = line2.strip().rsplit('|', 1)
            assert len(units1.split()) == len(units2.split()), \
                f"Mismatch units length ({len(units1.split())} vs {len(units2.split())})"
            base_fname1 = fname1[:-suffix_len]
            base_fname2 = fname2[:-suffix_len]
            assert base_fname1 == base_fname2, \
                f"Mismatch filenames ({base_fname1} vs {base_fname2}). " \
                f"Expected $filename-channel1 and $filename-channel2 in two files"
            code = {
                "audio" : base_fname1,
                channel_names[0] : units1,
                channel_names[1] : units2,
            }
            fout.write(str(code))
            fout.write("\n")
    print(f"Codes written to {args.output_file}")
# Hubert tokenizer
class HubertTokenizer:
    """Turn audio files into strings of discrete units.

    Features are read with a ``HubertFeatureReader`` and quantized with an
    ``ApplyKmeans`` model.
    """

    def __init__(
        self,
        hubert_path,
        hubert_layer,
        km_path,
        use_cuda=True,
    ):
        self.feature_extractor = HubertFeatureReader(hubert_path, hubert_layer, use_cuda=use_cuda)
        self.quantizer = ApplyKmeans(km_path)
        if not use_cuda:
            # Keep the k-means parameters on the CPU when CUDA is not used.
            self.quantizer.C = self.quantizer.C.cpu()
            self.quantizer.Cnorm = self.quantizer.Cnorm.cpu()

    def wav2code(self, path, channel_id=1):
        """Return the units of one channel of ``path`` as a space-joined string."""
        feat = self.feature_extractor.get_feats(path, channel_id=channel_id)
        code = self.quantizer(feat)
        return ' '.join(map(str, code))

    def wav2codes(self, path):
        """Return ``[units_of_channel_1, units_of_channel_2]`` for ``path``."""
        codes = [
            self.wav2code(path, channel_id=1),
            self.wav2code(path, channel_id=2)
        ]
        return codes


# Vocoder
class HifiganVocoder:
    """Synthesize waveforms from discrete units with a ``CodeHiFiGANVocoder``."""

    def __init__(
        self,
        vocoder_path,
        vocoder_cfg_path,
        use_cuda=True,
    ):
        with open(vocoder_cfg_path) as f:
            cfg = json.load(f)
        self.vocoder = CodeHiFiGANVocoder(vocoder_path, cfg).eval()
        self.use_cuda = use_cuda
        if self.use_cuda:
            self.vocoder.cuda()

    def code2wav(self, code, speaker_id=0, pred_dur=False):
        """Decode one unit sequence into a NumPy waveform.

        Args:
            code: space-separated unit string or an iterable of ints.
            speaker_id: speaker index, only used by multi-speaker vocoders.
            pred_dur: forwarded to the vocoder call as its second argument.
        """
        if isinstance(code, str):
            code = list(map(int, code.split()))
        inp = {"code": torch.LongTensor(code).view(1, -1)}
        if self.vocoder.model.multispkr:
            inp["spkr"] = torch.LongTensor([speaker_id]).view(1, 1)
        if self.use_cuda:
            inp = utils.move_to_cuda(inp)
        return self.vocoder(inp, pred_dur).detach().cpu().numpy()

    def codes2wav(self, codes, speaker_ids=(0, 4), pred_dur=False):
        """Decode two unit sequences and stack them into one array.

        Args:
            codes: a pair of unit sequences, or a dict whose two values are
                the unit sequences.
            speaker_ids: speaker index for each of the two channels. The
                default is an immutable tuple so that it cannot be shared
                and modified between calls; it is only ever indexed.
            pred_dur: forwarded to :meth:`code2wav`.
        """
        if isinstance(codes, dict):
            codes = list(codes.values())
        assert len(codes) == 2
        wav1 = self.code2wav(codes[0], speaker_ids[0], pred_dur)
        wav2 = self.code2wav(codes[1], speaker_ids[1], pred_dur)
        wav = np.stack([wav1, wav2])
        return wav
================================================ FILE: examples/textless_nlp/dgslm/hubert_fisher/README.md ================================================ # Dialogue Speech-to-Unit Encoder for dGSLM: The Fisher HuBERT model For the speech2unit encoder, we train a [HuBERT model](https://arxiv.org/pdf/2106.07447.pdf) on the [Fisher dataset](http://www.lrec-conf.org/proceedings/lrec2004/pdf/767.pdf) for 3 iterations (see [our paper](https://arxiv.org/pdf/2203.16502.pdf) for more details) and train a k-means model with 500 units on the layer 12 features of the HuBERT model. ## Model checkpoints The pre-trained HuBERT and k-means model checkpoints can be found here: | Fisher HuBERT model | k-means model | |---------------------|---------------| |[download](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/hubert/hubert_fisher.pt)|[download](https://dl.fbaipublicfiles.com/textless_nlp/dgslm/checkpoints/hubert/hubert_fisher_km_500.bin)| ## Encode audio to discrete units Below is an example command to encode a stereo dataset to discrete units using the pre-trained model checkpoints : ```bash for CHANNEL_ID in 1 2; do python examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py \ --feature_type hubert \ --kmeans_model_path path/to/hubert_fisher_km_500.bin \ --acoustic_model_path path/to/hubert_fisher.pt \ --layer 12 \ --manifest_path $MANIFEST_FILE \ --out_quantized_file_path ${OUTPUT_FILE}-channel${CHANNEL_ID} \ --extension $EXTENSION \ --channel_id $CHANNEL_ID done ``` where MANIFEST_FILE is the output of [wav2vec manifest script](https://github.com/facebookresearch/fairseq/blob/main/examples/wav2vec/wav2vec_manifest.py), which can be obtained through the following command : ``` python examples/wav2vec/wav2vec_manifest.py --valid-percent=0.0 $AUDIO_DIR --dest=$OUTPUT_DIR --ext=$EXTENSION ``` Otherwise, you can encode an audio file in python interactively with the HubertTokenizer class : ```python # Load the Hubert tokenizer from 
def load_data(in_file):
    """Read one Python-literal dict per line from ``in_file``.

    Blank lines (e.g. a trailing empty line) are skipped instead of making
    ``ast.literal_eval`` fail.
    """
    with open(in_file) as f:
        data = [ast.literal_eval(line) for line in map(str.strip, f) if line]
    return data


def write_data(out_file, data):
    """Write ``str(d)`` for every item of ``data``, one per line."""
    with open(out_file, 'w') as f:
        for d in data:
            f.write(str(d))
            f.write('\n')


def limit(codes, n):
    """Keep only the first ``n`` units of every channel in ``codes``."""
    new_codes = {}
    for k, v in codes.items():
        new_codes[k] = ' '.join(v.split()[:n])
    return new_codes


def main(args):
    """Sample continuations from a SpeechDLM checkpoint and write them out.

    Args:
        args: namespace produced by :func:`cli_main`.
    """
    logger.info(args)

    use_cuda = torch.cuda.is_available()

    # Load the data
    data = load_data(args.in_file)
    channels = args.channels.split(',')
    unit_sequences = [{
        channels[0]: d[channels[0]],
        channels[1]: d[channels[1]],
    } for d in data]
    fnames = [d['audio'] for d in data]
    print(f"Found {len(data)} sequences from {args.in_file}")

    # Limit the prefix size
    if args.prefix_size is not None:
        print(f"Limit the prefix size to {args.prefix_size}")
        unit_sequences = [limit(codes, args.prefix_size) for codes in unit_sequences]

    # Load model from ckpt
    print(f"Loading the SpeechDLM model from {args.ckpt}")
    model = SpeechDLM.from_pretrained(
        model_name_or_path=os.path.dirname(args.ckpt),
        checkpoint_file=os.path.basename(args.ckpt),
        data_name_or_path=args.data
    )
    model.eval()
    if use_cuda:
        model.cuda()

    # Set batch sizes
    model.cfg.dataset.max_tokens = args.batch_max_tokens
    model.max_positions = args.batch_max_positions
    if args.batch_max_sentences is not None:
        model.cfg.dataset.batch_size = args.batch_max_sentences

    # Set seed (if needed)
    if args.seed is not None:
        utils.set_torch_seed(args.seed)

    # Sample from the SpeechDLM model
    print(f"Generating {len(unit_sequences)} sequences with SpeechDLM model...\n"
          f"Generation args: sampling={(not args.beam_search)}, "
          f"sampling_topk={args.sampling_topk}, sampling_topp={args.sampling_topp}, "
          f"beam={args.beam_size}, min_len={args.min_len}, "
          f"max_len_a={args.max_len_a}, max_len_b={args.max_len_b}, "
          f"temperature={args.temperature}, dur_temperature={args.dur_temperature}, "
          f"seed={args.seed}")
    generated_units = model.sample(
        unit_sequences,
        sampling=(not args.beam_search),
        sampling_topk=args.sampling_topk,
        sampling_topp=args.sampling_topp,
        beam=args.beam_size,
        max_len_a=args.max_len_a,
        max_len_b=args.max_len_b,
        min_len=args.min_len,
        temperature=args.temperature,
        duration_temperature=args.dur_temperature,
        verbose=args.verbose,
        skip_invalid_size_inputs=args.skip_invalid_size_batch,
    )

    # Create the generated sequences
    generated_data = []
    for fname, gen_units in zip(fnames, generated_units):
        d = {
            "audio" : fname+'-generated',
            **gen_units
        }
        generated_data.append(d)

    # Write the generated sequences
    print(f"Write the generated units to {args.out_file}")
    if args.out_file:
        # dirname is '' for a bare file name, and os.makedirs('') raises
        # FileNotFoundError, so only create the directory when there is one.
        out_dir = os.path.dirname(args.out_file)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        write_data(args.out_file, generated_data)


def cli_main():
    """Parse the command-line options and run :func:`main`."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-file",
        type=str,
        required=True,
        help="Input file following the same format of the output from create_code_file.py",
    )
    parser.add_argument(
        "--ckpt",
        type=str,
        required=True,
        help="Path to the model checkpoint."
    )
    parser.add_argument(
        "--data",
        type=str,
        required=True,
        help="path to the model data dir (containing dict files)",
    )
    parser.add_argument(
        "--out-file",
        type=str,
        required=True,
        help="Path of the output file.",
    )
    parser.add_argument(
        "--channels",
        type=str,
        default='unitA,unitB',
        help="Comma-separated list of the channel names"
             "(Default: 'unitA,unitB').",
    )
    parser.add_argument("--prefix-size", type=int, default=None,
                        help='Limit the prefix size')

    # Batch sizes
    parser.add_argument("--batch-max-tokens", type=int, default=9216,
                        help='maximum number of tokens considered in a batch')
    parser.add_argument("--batch-max-positions", type=int, default=6144,
                        help='maximum number of tokens allowed for a sentence in a batch')
    parser.add_argument("--batch-max-sentences", type=int, default=None,
                        help='maximum number of sentences considered in a batch')
    parser.add_argument("--skip-invalid-size-batch", action='store_true',
                        help='skip sentences with more tokens than --batch-max-positions')

    # Generation args
    parser.add_argument("--beam-search", action='store_true',
                        help='perform beam search instead of sampling')
    parser.add_argument("--beam-size", type=int, default=5,
                        help="beam width (used in both sampling and beam search mode) "
                             "(default: 5)")
    parser.add_argument("--sampling-topk", type=int, default=-1,
                        help="only sample from top-k candidates (default: -1, non applied)")
    parser.add_argument("--sampling-topp", type=float, default=-1.0,
                        help="only sample among the smallest set of elements whose cumulative "
                             "probability mass exceeds p (default: -1.0, non applied)")
    parser.add_argument("--max-len-a", type=int, default=0,
                        help="generate sequences of maximum length ax + b, "
                             "where x is the source length (default: 0)")
    parser.add_argument("--max-len-b", type=int, default=500,
                        help="generate sequences of maximum length ax + b, "
                             "where x is the source length (default: 500 ~ 10s)")
    # The help text used to be a copy of the max-length description.
    parser.add_argument("--min-len", type=int, default=1,
                        help="minimum length of the generated sequences (default: 1)")
    parser.add_argument("--temperature", type=float, default=1.0,
                        help="temperature when generating unit tokens (default: 1.0)")
    parser.add_argument("--dur-temperature", type=float, default=1.0,
                        help="temperature when generating duration tokens (default: 1.0)")
    parser.add_argument("--verbose", action='store_true',
                        help="print the scores given by the model to generated sequences")
    parser.add_argument("--seed", type=int, default=123,
                        help="seed of the generation model")

    args = parser.parse_args()

    main(args)
def load_data(in_file):
    """Load the unit-code records stored one per line in ``in_file``.

    Every non-blank line is parsed with ``ast.literal_eval`` and must
    therefore be a Python literal (the README example uses one dict per
    line, e.g. ``{'audio': 'file_1', 'unitA': '8 8', 'unitB': '217 8'}``).

    Blank lines (including a trailing empty line at the end of the file) are
    skipped; feeding them to ``ast.literal_eval`` would raise ``SyntaxError``.

    Args:
        in_file: path of the text file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        ValueError, SyntaxError: if a non-blank line is not a valid literal.
    """
    data = []
    with open(in_file) as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines instead of crashing in literal_eval.
                continue
            data.append(ast.literal_eval(line))
    return data
def code2wav(vocoder, code, speaker_id, use_cuda=True):
    """Run the vocoder on a single sequence of discrete unit codes.

    Args:
        vocoder: callable vocoder exposing ``model.multispkr``; it is invoked
            with a dict holding the model inputs.
        code: either a whitespace-separated string of integer codes or an
            already-parsed sequence of integers.
        speaker_id: speaker index, only passed on when
            ``vocoder.model.multispkr`` is truthy.
        use_cuda: when True, the inputs are moved to the GPU first.

    Returns:
        Whatever ``vocoder`` returns for the prepared inputs.
    """
    if isinstance(code, str):
        code = [int(token) for token in code.split()]

    # "code" is reshaped to (1, T); "spkr" (when present) to (1, 1).
    inp = {"code": torch.LongTensor(code).view(1, -1)}
    if vocoder.model.multispkr:
        inp["spkr"] = torch.LongTensor([speaker_id]).view(1, 1)

    if use_cuda:
        inp = utils.move_to_cuda(inp)

    return vocoder(inp)
If not set, the audios will be stored following the 'audio' field specified in the input file", ) parser.add_argument("--channel1-spk", type=int, default=0, help="Speaker of the first channel",) parser.add_argument("--channel2-spk", type=int, default=4, help="Speaker of the second channel",) parser.add_argument("--mix", action="store_true", help="Mix the two channels to create output mono files") parser.add_argument("--cpu", action="store_true", help="run on CPU") args = parser.parse_args() main(args) if __name__ == "__main__": cli_main() ================================================ FILE: examples/textless_nlp/gslm/README.md ================================================ # Generative Spoken Language Modeling * [Paper](https://arxiv.org/abs/2102.01192) * [Demo](https://speechbot.github.io/gslm/index.html) We build and evaluate generative speech2speech systems using [Log Mel Filterbank](https://pytorch.org/audio/stable/compliance.kaldi.html#fbank), [Modified CPC](https://github.com/facebookresearch/CPC_audio), [HuBERT Base](https://github.com/pytorch/fairseq/tree/main/examples/hubert) and [Wav2Vec 2.0 Large](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). Our system is composed of three components, namely, *speech2unit*, *ulm* and *unit2speech*. We explain the models and usage of these components in their respective sub-directories. See the links below. ## Speech to Unit Model (speech2unit) Speech to unit model is used for quantizing raw speech into learned discrete speech units. [More details](speech2unit) ## Unit Language Model (ulm) Unit Language Model is a generative language model trained on discrete speech units. [More details](ulm) ## Unit to Speech Model (unit2speech) Unit to speech model is used for synthesizing speech from discrete speech units. [More details](unit2speech) ## Metrics We show how to compute ASR-based metrics as well as zero-shot metrics proposed in our paper [here](metrics).
## Tools We share two tools to resynthesize a given spoken utterance, and generate novel spoken language given a spoken prompt. [More details](tools) ================================================ FILE: examples/textless_nlp/gslm/metrics/README.md ================================================ # GSLM Metrics ## ASR Metrics The suite of metrics here uses an ASR model to transcribe the synthesized speech into text, and then uses text-based metrics. We also use word error rate from ASR transcription itself as one of the metrics. [More details](asr_metrics) ## ABX Metrics We use [ABX](https://www.semanticscholar.org/paper/ABX-Discriminability-Measures-and-Applications-Schatz/13d3537228f728c1063cc83743cb118bba3367a0) to evaluate how well-separated phonetic categories are with quantized representations. [More details](abx_metrics) ## sWUGGY and sBLIMP We refer to the [ZeroSpeech challenge](https://www.zerospeech.com/2021/track_s.html#scoring-based-metrics) for details on the sWUGGY and sBLIMP metrics. ================================================ FILE: examples/textless_nlp/gslm/metrics/abx_metrics/README.md ================================================ # ABX-based evaluation ABX is used to evaluate the quality of the obtained discrete units. The life cycle of the ABX-based evaluation for the Speech-to-Unit contains the following steps: 1. Train an acoustic model (or use an existing acoustic model) ([description](./../..)) 2. Perform quantization of speech by learning a K-means clustering model ([description](./../..)) 3. Compute discrete features for ABX computation using the learned clusters 4. Compute the ABX score over the discrete features taking advantage of [libri-light's ABX evaluation script][ll-abx] Here we assume that you have already gone through the first two steps and focus solely on extracting features and computing ABX scores.
## Libri-light setup Follow [libri-light's instructions][ll-instructions] for installation and [ABX evaluation setup][ll-abx] (including the download of the data items required for ABX computation). ## Computing ABX ### Dumping quantized features The first step for the ABX computation is to dump the quantized representations corresponding to the test files. ```shell TYPE="hubert" LAYER=6 CKPT_PATH="<PATH_TO_HUBERT_MODEL_CHECKPOINT_FILE>" KM_MODEL_PATH="<PATH_TO_PRETRAINED_KM_MODEL_FILE>" SUBSET="dev-clean" MANIFEST="<PATH_TO_MANIFEST_FOR_LS_DEV-CLEAN>" DATA_DIR="<PATH_TO_DIR_TO_STORE_FEATURES>/$SUBSET" PYTHONPATH=. python examples/textless_nlp/gslm/metrics/abx_metrics/dump_abx_feats.py \ --feature_type $TYPE \ --kmeans_model_path $KM_MODEL_PATH \ --checkpoint_path $CKPT_PATH \ --layer $LAYER \ --manifest_path $MANIFEST \ --out_dir_path $DATA_DIR \ --extension ".flac" ``` Again, the manifest file follows the same structure as elsewhere in the codebase. ### Compute ABX with Libri-light Use libri-light's `eval_ABX.py` script (within the appropriate environment set up) as follows: ```shell LIBRILIGHT_ROOT="<PATH_TO_LIBRILIGHT>" SUBSET="dev-clean" DATA_DIR="<PATH_TO_DIR_TO_STORE_FEATURES>/$SUBSET" ITEM_FILE_PATH="$LIBRILIGHT_ROOT/eval/ABX_data/$SUBSET.item" OUT_DIR="<PATH_TO_DIR_TO_STORE_ABX_SCORES>/$SUBSET" FILE_EXTENSION=".npy" FEATURE_SIZE=0.02 # depends on the model used PYTHONPATH=$LIBRILIGHT_ROOT \ python $LIBRILIGHT_ROOT/eval/eval_ABX.py \ $DATA_DIR \ $ITEM_FILE_PATH \ --file_extension $FILE_EXTENSION \ --feature_size $FEATURE_SIZE \ --out $OUT_DIR \ --mode "all" ``` Note that `FEATURE_SIZE` will depend on the model type you are using to extract the acoustic features: * For HuBERT and Wav2Vec2.0, use `FEATURE_SIZE=0.02` * For CPC and Log Mel, use `FEATURE_SIZE=0.01` If you have a GPU available, make sure you add the `--cuda` flag for faster computation.
def get_logger():
    """Set up INFO-level logging with a timestamped format and return this module's logger."""
    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s] [%(levelname)s]: %(message)s",
    )
    return logging.getLogger(__name__)
def main(args, logger):
    """Quantize acoustic features with a K-means model and dump one-hot codes.

    Features are extracted for the utterances listed in
    ``args.manifest_path``; each utterance's features are passed to the
    K-means model's ``predict`` and the resulting cluster indices are
    one-hot encoded and saved to ``<args.out_dir_path>/<name>.npy``, where
    ``<name>`` is the audio file's base name without ``args.extension``.

    Args:
        args: parsed command-line namespace. Reads ``feature_type``,
            ``checkpoint_path``, ``layer``, ``manifest_path``,
            ``kmeans_model_path``, ``out_dir_path`` and ``extension``.
        logger: logger used for progress messages.
    """
    # Feature extraction
    logger.info(f"Extracting {args.feature_type} acoustic features...")
    features_batch = get_features(
        feature_type=args.feature_type,
        checkpoint_path=args.checkpoint_path,
        layer=args.layer,
        manifest_path=args.manifest_path,
        sample_pct=1.0,
        flatten=False,
    )
    logger.info(f"Features extracted for {len(features_batch)} utterances.\n")
    logger.info(f"Dimensionality of representation = {features_batch[0].shape[1]}")

    logger.info(f"Loading K-means model from {args.kmeans_model_path} ...")
    # Close the model file deterministically instead of leaking the handle.
    with open(args.kmeans_model_path, "rb") as model_file:
        kmeans_model = joblib.load(model_file)
    kmeans_model.verbose = False

    _, fnames, _ = get_audio_files(args.manifest_path)

    os.makedirs(args.out_dir_path, exist_ok=True)
    logger.info(f"Writing quantized features to {args.out_dir_path}")
    for i, feats in enumerate(features_batch):
        pred = kmeans_model.predict(feats)
        emb = one_hot(pred, kmeans_model.n_clusters)

        base_fname = os.path.basename(fnames[i])
        # str.rstrip() removes any trailing characters found in its argument
        # (a character set), so "calf.flac".rstrip(".flac") would yield "".
        # Remove the extension only when it is an actual suffix.
        if args.extension and base_fname.endswith(args.extension):
            base_fname = base_fname[: -len(args.extension)]

        output_path = os.path.join(args.out_dir_path, f"{base_fname}.npy")
        with open(output_path, "wb") as f:
            np.save(f, emb)
"__main__": parser = get_parser() args = parser.parse_args() logger = get_logger() logger.info(args) main(args, logger) ================================================ FILE: examples/textless_nlp/gslm/metrics/asr_metrics/README.md ================================================ # ASR-based evaluation Overall, the life cycle of the ASR-based evaluation for an ULM contains the following steps: 1. Training an ULM and sampling from it [[description]](./../../ulm) 2. Running UTS on the sampled unit sequences [[description]](./../../unit2speech) 3. Pre-processing for the ASR (down-sampling to 16 kHz, aligning length of the generated audio with ground-truth utterances) 4. Running ASR 5. Calculation of the post-ASR evaluation metrics Here we assume that you have already gone through the first two steps and focus on the rest. ## Preprocessing ### Down-sampling to 16 kHz The bulk conversion can be done by running ```bash python $FAIRSEQ_ROOT/examples/textless_nlp/gslm/unit2speech/convert_to_16k.py $UTS_OUTPUT $UTS_OUTPUT_DOWNSAMPLE ``` where `$UTS_OUTPUT` specifies the directory with the generated audio and `$UTS_OUTPUT_DOWNSAMPLE` is the directory where downsampled audio will be saved. ### Matching by length This step is somewhat optional. However, if you want to compare the fluency and diversity of a generated speech utterance to that of the ground-truth speech with the same prefix, it is a good idea to force them to be of the same length. ```bash python $FAIRSEQ_ROOT/examples/textless_nlp/gslm/metrics/asr_metrics/misc/cut_as.py \ --samples_dir=$UTS_OUTPUT_DOWNSAMPLE --out_dir=$UTS_OUTPUT_DOWNSAMPLE_CUT \ --prompts_description=data/ground_truth_continuation_dev.json ``` Here `ground_truth_continuation_dev.json` is a JSON file with ground-truth text from LibriSpeech dev-clean, associated with some meta-data (assuming the evaluation is done on dev-clean). This file can be downloaded [[here]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/ground_truth_continuation_dev.json).
A similar file for test-clean is [[here]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/ground_truth_continuation_test.json). These files are used for the evaluation and contain texts for audio sequences that are at least 6s long. ## Running ASR We use a pre-trained wav2vec model to run the ASR step. We first need to prepare manifest files which, roughly, tell the ASR system which files we want to transcribe. You can find more details and download the `960h_scratch.pt` checkpoint [[here]](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md). To run ASR, you will also need to install KenLM and the Flashlight decoder, and download the KenLM 4-gram English language model. ```bash python $FAIRSEQ_ROOT/examples/wav2vec/wav2vec_manifest.py \ $UTS_OUTPUT_DOWNSAMPLE_CUT --valid-percent 0.0 --dest $MANIFEST_DIR --ext wav ``` where `$UTS_OUTPUT_DOWNSAMPLE_CUT` specifies the directory with the preprocessed UTS outputs and `$MANIFEST_DIR` is the output directory. We will be running an out-of-the-box evaluation script which requires ground-truth transcripts to measure quality metrics.
We are only interested in the transcripts (and we don't have ground-truth outputs for when our ULM generated!), hence we will just generate some dummy transcripts instead: ```bash cp $FAIRSEQ_ROOT/examples/textless_nlp/gslm/asr_metrics/misc/dict.ltr.txt $MANIFEST_DIR python $FAIRSEQ_ROOT/examples/textless_nlp/gslm/asr_metrics/misc/dummy_asr_data.py --tsv=$MANIFEST_DIR/train.tsv \ --output-dir=$MANIFEST_DIR ``` Now we are ready for running ASR: ``` mkdir -p asr python $FAIRSEQ_ROOT/examples/speech_recognition/infer.py \ $MANIFEST_DIR \ --task audio_pretraining --nbest 1 --path 960h_scratch.pt \ --gen-subset=train --results-path $PATH_TO_ASR_OUTPUT \ --w2l-decoder kenlm --lm-model 4-gram.bin \ --lexicon librispeech/lexicon_ltr.lst --word-score -1 \ --sil-weight 0 --lm-weight 2 --criterion ctc --labels ltr --max-tokens 300000 --remove-bpe letter ``` where `lexicon_ltr.lst` is the LibriSpeech lexicon and `$PATH_TO_ASR_OUTPUT` is the output directory (can be downloaded [[here]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/lexicon_ltr.lst)). ## Evaluation metrics We run evaluation on the 1_000 shortest sequences that are at least 6s long. To filter those from the ASR transcript, we additionally provide each metric script with the paths to the manifest and `ground_truth_continuation_*` files. ### Perplexity (PPX) To get a PPX metric estimate on an ASR transcript, you need to run the following command: ```bash python ppx.py $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt --cut-tail\ --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json ``` where `--cut-tail` tells the script to ignore the last token on each line (ASR puts the sequence ID there). 
### Self- and Auto-BLEU ```bash python self_bleu.py $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt --cut-tail \ --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json ``` ### Continuation-BLEU ```bash python continuation_eval.py --asr-transcript $PATH_TO_ASR_OUTPUT/hypo.word-960h_scratch.pt-train.txt \ --manifest=$MANIFEST_DIR/train.tsv --prompts-description=data/ground_truth_continuation_dev.json ``` ### AUC Based on the metrics calculated above, we can estimate the AUC of the perplexity/diversity trade-off. We provide an illustration in a [Colab notebook](https://colab.research.google.com/drive/1pVPfOVax_PU3MkYdHRSsa-SI8GBUldNt?usp=sharing). ================================================ FILE: examples/textless_nlp/gslm/metrics/asr_metrics/continuation_eval.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def main():
    """Print Continuation-BLEU statistics for an ASR transcript.

    Reads the ground-truth description JSON (``--prompts-description``), the
    manifest (``--manifest``) and the ASR transcript (``--asr-transcript``),
    groups the transcribed continuations by prompt name, and prints

    * the mean number of words per transcribed continuation, and
    * the median 2-gram BLEU (no length penalty, geometric averaging) over
      the ``--take-shortest`` prompts with the smallest first description
      field, together with the standard error of the scores.

    Blank transcript lines are skipped, and a message is printed instead of
    dividing by zero when nothing can be evaluated.
    """
    # NLTK produces warnings
    warnings.filterwarnings("ignore")

    args = get_args()

    with open(args.prompts_description, 'r') as fin:
        original_continuations = json.load(fin)

    # value[0] is checked against 6.0 below (a duration in seconds);
    # value[1] is the ground-truth text that is split into words further down.
    sequence2length = [(k, v[0]) for k, v in original_continuations.items()]
    assert all(float(v) >= 6.0 for (_, v) in sequence2length)  # 6 seconds

    sequence2length.sort(key=lambda x: x[1])
    to_take = {name for name, _ in sequence2length[:args.take_shortest]}

    with open(args.manifest, 'r') as fin:
        fin.readline()  # the first manifest line is not a file entry
        # Map the 0-based entry index to the part of the line before "__".
        linenum2file = {i: l.split("__")[0] for (i, l) in enumerate(fin)}

    max_files = max(linenum2file.keys())
    continuations = defaultdict(list)

    total_words = 0
    n_transcripts = 0
    with open(args.asr_transcript, 'r') as fin:
        for line in fin:
            tokens = line.split()
            if not tokens:
                # A blank line has no trailing ID token to parse.
                continue
            n_transcripts += 1

            # The last token carries the sequence ID: the text after the
            # first '-' with its final character dropped
            # (presumably of the form "(None-123)" - TODO confirm).
            sequence_id = int(tokens[-1].split('-')[1][:-1])

            assert sequence_id <= max_files

            sequence_name = linenum2file[sequence_id]

            words = tokens[:-1]
            continuations[sequence_name].append(words)
            # Count only the words, not the trailing ID token.
            total_words += len(words)

    if n_transcripts == 0:
        print(f'No transcripts found in {args.asr_transcript}')
        return

    mean_length_after = total_words / n_transcripts
    print(f'Mean length of continuations, in words: {mean_length_after}')

    metric_values = []
    for k, candidates in continuations.items():
        if k not in to_take:
            continue

        ground_truth = original_continuations[k][1].split()

        # The generated candidates are passed as the references and the
        # ground-truth continuation as the hypothesis.
        bleu = sentence_bleu(candidates, ground_truth, weights=(
            0.5, 0.5), no_length_penalty=True, averaging_mode="geometric")
        metric_values.append(bleu)

    n = len(metric_values)
    if n == 0:
        print('No transcribed continuation matches the selected prompts')
        return
    print(
        f'Median BLEU over {n} examples: {np.median(metric_values)} +- {np.std(metric_values) / np.sqrt(n)}')
'being', 'under', 'the', 'command', 'of', 'the', 'Party'] >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was', ... 'interested', 'in', 'world', 'history'] >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history', ... 'because', 'he', 'read', 'the', 'book'] >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]] >>> hypotheses = [hyp1, hyp2] >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS 0.5920... The example below show that corpus_bleu() is different from averaging sentence_bleu() for hypotheses >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1) >>> score2 = sentence_bleu([ref2a], hyp2) >>> (score1 + score2) / 2 # doctest: +ELLIPSIS 0.6223... :param list_of_references: a corpus of lists of reference sentences, w.r.t. hypotheses :type list_of_references: list(list(list(str))) :param hypotheses: a list of hypothesis sentences :type hypotheses: list(list(str)) :param weights: weights for unigrams, bigrams, trigrams and so on :type weights: list(float) :param smoothing_function: :type smoothing_function: SmoothingFunction :param auto_reweigh: Option to re-normalize the weights uniformly. :type auto_reweigh: bool :return: The corpus-level BLEU score. :rtype: float """ # Before proceeding to compute BLEU, perform sanity checks. p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches. p_denominators = Counter() # Key = ngram order, and value = no. of ngram in ref. hyp_lengths, ref_lengths = 0, 0 assert len(list_of_references) == len(hypotheses), ( "The number of hypotheses and their reference(s) should be the " "same " ) # Iterate through each hypothesis and their corresponding references. 
for references, hypothesis in zip(list_of_references, hypotheses): # For each order of ngram, calculate the numerator and # denominator for the corpus-level modified precision. for i, _ in enumerate(weights, start=1): p_i = modified_precision(references, hypothesis, i) p_numerators[i] += p_i.numerator p_denominators[i] += p_i.denominator # Calculate the hypothesis length and the closest reference length. # Adds them to the corpus-level hypothesis and reference counts. hyp_len = len(hypothesis) hyp_lengths += hyp_len ref_lengths += closest_ref_length(references, hyp_len) # Calculate corpus-level brevity penalty. if no_length_penalty and averaging_mode == 'geometric': bp = 1.0 elif no_length_penalty and averaging_mode == 'arithmetic': bp = 0.0 else: assert not no_length_penalty assert averaging_mode != 'arithmetic', 'Not sure how to apply length penalty when aurithmetic mode' bp = brevity_penalty(ref_lengths, hyp_lengths) # Uniformly re-weighting based on maximum hypothesis lengths if largest # order of n-grams < 4 and weights is set at default. if auto_reweigh: if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25): weights = (1 / hyp_lengths,) * hyp_lengths # Collects the various precision values for the different ngram orders. p_n = [ Fraction(p_numerators[i], p_denominators[i], _normalize=False) for i, _ in enumerate(weights, start=1) ] # Returns 0 if there's no matching n-grams # We only need to check for p_numerators[1] == 0, since if there's # no unigrams, there won't be any higher order ngrams. if p_numerators[1] == 0: return 0 # If there's no smoothing, set use method0 from SmoothinFunction class. if not smoothing_function: smoothing_function = SmoothingFunction().method0 # Smoothen the modified precision. # Note: smoothing_function() may convert values into floats; # it tries to retain the Fraction object as much as the # smoothing method allows. 
def cut(src, tgt, l):
    """Copy the audio at ``src`` to ``tgt``, truncated to at most ``l`` seconds.

    The audio must be sampled at 16 kHz. It is always written to ``tgt``;
    when it is shorter than the requested duration it is saved uncut.

    Args:
        src: path of the input audio file.
        tgt: path of the output audio file.
        l: target duration in seconds.

    Returns:
        1 if the source holds at least ``l`` seconds of audio (and was
        truncated to that length), 0 otherwise.
    """
    waveform, sample_rate = torchaudio.load(str(src))
    assert sample_rate == 16_000

    # NOTE(review): squeeze() assumes single-channel audio - confirm.
    waveform = waveform.squeeze()
    n_target = int(l * sample_rate)

    long_enough = n_target <= waveform.size(0)
    if long_enough:
        waveform = waveform[:n_target]

    torchaudio.save(str(tgt), waveform.unsqueeze(0), sample_rate)
    return 1 if long_enough else 0
name_prompt in description, f'Cannot find {name_prompt}!' target_length = description[name_prompt][0] tgt_f = tgt_dir / (src_f.name) is_long_enough = cut(src_f, tgt_f, target_length) sufficiently_long += is_long_enough if not is_long_enough: print(f'{src_f} is not long enough') total_files += 1 print( f'Total files: {total_files}; sufficiently long: {sufficiently_long}') if __name__ == '__main__': main() ================================================ FILE: examples/textless_nlp/gslm/metrics/asr_metrics/misc/dict.ltr.txt ================================================ | 94802 E 51860 T 38431 A 33152 O 31495 N 28855 I 28794 H 27187 S 26071 R 23546 D 18289 L 16308 U 12400 M 10685 W 10317 C 9844 F 9062 G 8924 Y 8226 P 6890 B 6339 V 3936 K 3456 ' 1023 X 636 J 598 Q 437 Z 213 ================================================ FILE: examples/textless_nlp/gslm/metrics/asr_metrics/ppx.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def get_target_sequences(manifest, ground_truth, to_take=1000):
    """Select manifest rows whose prompt is among the shortest ground truths.

    Args:
        manifest: path to a manifest file; its first line is skipped and the
            first whitespace-separated token of every other line is a file
            path whose basename starts with ``<prompt_name>__``.
        ground_truth: path to a JSON file mapping prompt name to a list whose
            first element is the duration in seconds.
        to_take: how many of the shortest prompts to keep.

    Returns:
        Set of zero-based row indices (counted after the skipped first line)
        whose prompt name is among the `to_take` shortest ones.
    """
    import json
    import pathlib

    with open(ground_truth, 'r') as fin:
        continuations = json.load(fin)

    name_and_length = [(name, info[0]) for name, info in continuations.items()]
    assert all(float(length) >= 6.0 for _, length in name_and_length)  # 6 seconds

    shortest = sorted(name_and_length, key=lambda pair: pair[1])[:to_take]
    wanted_names = {name for name, _ in shortest}

    with open(manifest, 'r') as f:
        # The first line of the manifest is skipped, not a data row.
        rows = f.readlines()[1:]

    selected = {
        row_idx
        for row_idx, row in enumerate(rows)
        if pathlib.Path(row.split()[0]).name.split('__')[0] in wanted_names
    }

    print(f'Took {len(selected)} ids')
    return selected
for x in lines] if args.cut_tail: lines = [' '.join(x.split()[:-1]) for x in lines] lines = [x.strip().lower() for x in lines] def get_logprob(sent): return \ lm.score(sent)['positional_scores'].mean().neg().item() logprobs = [get_logprob(l) for l in lines] filtered = [x for x in logprobs if not np.isnan(x)] if len(filtered) != len(logprobs): warnings.warn("NaNs detected!") logprobs = filtered perplexities = [np.exp(l) for l in logprobs] for name, stats in [('logprob', logprobs), ('perplexity', perplexities)]: mean = np.mean(stats) sem = np.std(stats) / np.sqrt(len(stats)) median = np.median(stats) interval = list(np.percentile(stats, [10, 90])) mean, sem, median, percentile10, percentile90 = [ round(x, 2) for x in [mean, sem, median] + interval] print(name) print(f"\tMean {mean} +- {sem}") print( f"\tMedian {median}, 90% confidence interval {percentile10}...{percentile90}") if __name__ == '__main__': main() ================================================ FILE: examples/textless_nlp/gslm/metrics/asr_metrics/self_auto_bleu.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def get_self_bleu(utterances, averaging_mode, weights):
    """Score every utterance against all the other utterances.

    Each utterance in turn is used as the hypothesis and the remaining
    utterances as the references for `sentence_bleu`, with the length
    penalty disabled.

    Args:
        utterances: list of utterances (sliced and concatenated, so it must
            be a list-like sequence).
        averaging_mode: forwarded to `sentence_bleu`.
        weights: n-gram weights forwarded to `sentence_bleu`.

    Returns:
        List with one score per utterance, in input order.
    """
    return [
        sentence_bleu(
            utterances[:idx] + utterances[idx + 1:],
            hypothesis,
            weights,
            no_length_penalty=True,
            averaging_mode=averaging_mode,
        )
        for idx, hypothesis in enumerate(utterances)
    ]
def auto_bleu(sentence, weights, mean_mode='arithmetic'):
    """Measure how much a sentence repeats its own n-grams.

    For every order n = 1..len(weights), computes the fraction of the
    sentence's n-grams that also occur somewhere else in the sentence
    without overlapping the n-gram's own token span, then combines the
    per-order fractions with `weights`.

    Args:
        sentence: sequence of tokens (must support `len` and slicing).
        weights: one weight per n-gram order, starting at unigrams.
        mean_mode: 'arithmetic' (weighted sum) or 'geometric' (weighted
            product).

    Returns:
        0 for sentences with at most one token, otherwise the combined score.
        Orders longer than the sentence contribute a fraction of 0 (they used
        to yield NaN from a 0/0 division).

    Raises:
        ValueError: if `mean_mode` is not one of the two supported modes.
    """
    if len(sentence) <= 1:
        return 0

    N = len(weights)

    bleu_n = np.zeros([N])
    for n in range(N):
        order = n + 1
        # All n-grams of this order, indexed by their start position.
        targ_ngrams = list(zip(*(sentence[k:] for k in range(order))))
        if not targ_ngrams:
            # Sentence is shorter than the n-gram order: nothing to match,
            # keep the fraction at 0 instead of dividing by zero.
            continue
        for p, ngram in enumerate(targ_ngrams):
            # n-grams lying entirely in sentence[:p] start at 0..p-order;
            # those lying entirely in sentence[p+order:] start at p+order.
            rest_ngrams = targ_ngrams[:max(p - n, 0)] + targ_ngrams[p + order:]
            # compute the nb of matching ngrams
            bleu_n[n] += ngram in rest_ngrams
        bleu_n[n] /= len(targ_ngrams)  # average them to get a proportion

    weights = np.array(weights)
    if mean_mode == 'arithmetic':
        return (bleu_n * weights).sum()
    elif mean_mode == 'geometric':
        return (bleu_n ** weights).prod()
    else:
        raise ValueError(f'Unknown agggregation mode {mean_mode}')
def run_f(task_params):
    """Unpack a ``(function, utterances)`` pair and apply the function.

    `main` passes this to `Pool.map`, so each work item arrives as a single
    tuple argument.

    Args:
        task_params: 2-tuple of a callable and the argument to call it with.

    Returns:
        Whatever the callable returns.
    """
    metric_fn, utterances = task_params
    result = metric_fn(utterances)
    return result
K-Means Model | Download Link |-|- Log Mel Filterbank + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/km50/km.bin) Log Mel Filterbank + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/km100/km.bin) Log Mel Filterbank + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/km200/km.bin) Modified CPC + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km50/km.bin) Modified CPC + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km100/km.bin) Modified CPC + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km200/km.bin) HuBERT Base + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km50/km.bin) HuBERT Base + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km100/km.bin) HuBERT Base + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km200/km.bin) wav2vec 2.0 Large + KM50 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/km50/km.bin) wav2vec 2.0 Large + KM100 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/km100/km.bin) wav2vec 2.0 Large + KM200 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/km200/km.bin) ### Quantization For quantizing speech with a given acoustic representation, please follow the steps below. 1. Learn K-means clustering model ``` N_CLUSTERS=<number_of_clusters_used_for_kmeans> TYPE=<one_of_logmel/cpc/hubert/w2v2> CKPT_PATH=<path_of_pretrained_acoustic_model> LAYER=<layer_of_acoustic_model_to_extract_features_from> MANIFEST=<tab_separated_manifest_of_audio_files_for_training_kmeans> KM_MODEL_PATH=<output_path_of_the_kmeans_model> PYTHONPATH=. 
Note: the manifest file lists the paths and lengths of the input audio files. Its format is as follows:
```
<path_of_root_directory_containing_audio_files>
<relative_path_of_audio_file_1>\t<number_of_frames_1>
<relative_path_of_audio_file_2>\t<number_of_frames_2>
...
```
def get_logger():
    """Set up INFO-level logging and return this module's logger.

    `logging.basicConfig` is called with a "[time] [level]: message" format
    before the module-level logger is fetched.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="[%(asctime)s] [%(levelname)s]: %(message)s",
    )
    return logging.getLogger(__name__)
def train_kmeans(kmeans_model, features_batch):
    """Fit `kmeans_model` on `features_batch` and time the fit.

    Args:
        kmeans_model: object exposing ``fit(features)``.
        features_batch: training data handed to ``fit`` unchanged.

    Returns:
        Tuple ``(kmeans_model, time_taken)`` where `time_taken` is the
        duration of the fit in minutes, rounded to two decimals.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments during a long fit.
    start_time = time.perf_counter()
    kmeans_model.fit(features_batch)
    # True division: floor division discarded the fractional minutes and made
    # the two-decimal rounding meaningless.
    time_taken = round((time.perf_counter() - start_time) / 60, 2)
    return kmeans_model, time_taken
def get_parser():
    """Build the command-line parser for the feature-dumping script.

    Returns:
        An `argparse.ArgumentParser` exposing ``--feature_type``,
        ``--manifest_path``, ``--out_features_path``, ``--checkpoint_path``,
        ``--layer`` and ``--sample_pct``.
    """
    parser = argparse.ArgumentParser(
        description="Compute and dump log mel fbank features."
    )
    parser.add_argument(
        "--feature_type",
        type=str,
        choices=["logmel", "hubert", "w2v2", "cpc"],
        default=None,
        help="Acoustic feature type",
    )
    parser.add_argument(
        "--manifest_path",
        type=str,
        default=None,
        help="Manifest file containing the root dir and file names",
    )
    # Registered exactly once: a second add_argument() for the same option
    # string makes argparse raise ArgumentError while building the parser.
    parser.add_argument(
        "--out_features_path",
        type=str,
        default=None,
        help="Features file path to write to",
    )
    parser.add_argument(
        "--checkpoint_path",
        type=str,
        help="Pretrained acoustic model checkpoint",
    )
    parser.add_argument(
        "--layer",
        type=int,
        help="The layer of the pretrained model to extract features from",
        default=-1,
    )
    parser.add_argument(
        "--sample_pct",
        type=float,
        help="Percent data to use for K-means training",
        default=0.1,
    )
    return parser
def get_logger():
    """Configure logging for the script and hand back the module logger.

    Messages are formatted as "[time] [level]: message" and the level passed
    to `logging.basicConfig` is INFO.
    """
    fmt = "[%(asctime)s] [%(levelname)s]: %(message)s"
    logging.basicConfig(level=logging.INFO, format=fmt)
    module_logger = logging.getLogger(__name__)
    return module_logger
def main(args, logger):
    """Quantize acoustic features with a trained K-means model.

    Features are either loaded from ``args.features_path`` or extracted with
    `get_features`; every utterance is then mapped to cluster ids by the
    K-means model stored at ``args.kmeans_model_path`` and one line per
    utterance is written to ``args.out_quantized_file_path``
    (``<name>|<ids>``, or just ``<ids>`` when ``args.hide_fname`` is set).

    Args:
        args: parsed command-line namespace (see `get_parser`).
        logger: logger used for progress messages.
    """
    # Feature extraction
    if args.features_path is not None:
        logger.info(f"Loading acoustic features from {args.features_path}...")
        features_batch = np.load(args.features_path)
    else:
        logger.info(f"Extracting {args.feature_type} acoustic features...")
        features_batch = get_features(
            feature_type=args.feature_type,
            checkpoint_path=args.acoustic_model_path,
            layer=args.layer,
            manifest_path=args.manifest_path,
            sample_pct=1.0,
            flatten=False,
            channel_id=int(args.channel_id) if args.channel_id else None,
        )
        logger.info(
            f"Features extracted for {len(features_batch)} utterances.\n"
        )
        logger.info(
            f"Dimensionality of representation = {features_batch[0].shape[1]}"
        )

    # K-means model
    logger.info(f"Loading K-means model from {args.kmeans_model_path} ...")
    # Close the model file deterministically instead of leaking the handle.
    with open(args.kmeans_model_path, "rb") as model_file:
        kmeans_model = joblib.load(model_file)
    kmeans_model.verbose = False

    _, fnames, _ = get_audio_files(args.manifest_path)

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    out_dir = os.path.dirname(args.out_quantized_file_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    print(f"Writing quantized predictions to {args.out_quantized_file_path}")

    # str.rstrip() removes a *set of characters*, not a suffix, so it would
    # also eat trailing name characters (e.g. "calf.flac" -> ""); cut the
    # exact extension instead.
    suffix = "." + args.extension.lstrip(".")
    with open(args.out_quantized_file_path, "w") as fout:
        for i, feats in enumerate(features_batch):
            pred = kmeans_model.predict(feats)
            pred_str = " ".join(str(p) for p in pred)
            base_fname = os.path.basename(fnames[i])
            if base_fname.endswith(suffix):
                base_fname = base_fname[: -len(suffix)]
            if args.channel_id is not None:
                base_fname = base_fname + f'-channel{args.channel_id}'
            if not args.hide_fname:
                fout.write(f"{base_fname}|{pred_str}\n")
            else:
                fout.write(f"{pred_str}\n")
def get_audio_files(manifest_path: str) -> Tuple[str, List[str], List[int]]:
    """Parse a tab-separated audio manifest.

    The first line of the manifest is the root directory; every following
    non-blank line is ``<file name>\\t<size>``.

    Args:
        manifest_path: path of the manifest file.

    Returns:
        Tuple ``(root_dir, fnames, sizes)`` with the file names and integer
        sizes in manifest order.

    Raises:
        AssertionError: if a non-blank line does not have exactly two
            tab-separated columns.
        ValueError: if the second column is not an integer.
    """
    fnames, sizes = [], []
    with open(manifest_path, "r") as f:
        root_dir = f.readline().strip()
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank/trailing empty lines, as the feature
                # extraction code reading the same manifest does.
                continue
            items = stripped.split("\t")
            assert (
                len(items) == 2
            ), f"File must have two columns separated by tab. Got {line}"
            fnames.append(items[0])
            sizes.append(int(items[1]))
    return root_dir, fnames, sizes
def load_cpc_model(checkpoint_path, layer=None):
    """Build a `CPCModel` and load its weights from a checkpoint.

    The checkpoint is expected to be a dict with a ``"weights"`` state dict
    and a ``"config"`` mapping providing ``hiddenEncoder``, ``hiddenGar`` and
    ``nLevelsGRU``.

    Args:
        checkpoint_path: path of the checkpoint file.
        layer: if not None, overrides ``config["nLevelsGRU"]``, i.e. the
            number of recurrent layers the autoregressive network is built
            with.

    Returns:
        The model with weights loaded non-strictly (so keys that do not
        match the built architecture are ignored) and ``model.config`` set
        to the (possibly overridden) config.
    """
    # Load onto the CPU so checkpoints saved from a GPU also open on
    # CPU-only hosts; the caller moves the model to CUDA when requested.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    weights = state_dict["weights"]
    config = state_dict["config"]
    if layer is not None:
        config["nLevelsGRU"] = layer

    encoder = CPCEncoder(config["hiddenEncoder"])
    ar_net = CPCAR(
        config["hiddenEncoder"], config["hiddenGar"], False, config["nLevelsGRU"]
    )

    model = CPCModel(encoder, ar_net)
    model.load_state_dict(weights, strict=False)
    model.config = config
    return model
class CPCAR(nn.Module):
    """Autoregressive network of the CPC model: a batch-first LSTM.

    When `keep_hidden` is true, the recurrent state returned by a forward
    pass is detached and fed back as the initial state of the next one.
    """

    def __init__(self, dim_encoded, dim_output, keep_hidden, num_layers):
        super().__init__()
        self.baseNet = nn.LSTM(
            input_size=dim_encoded,
            hidden_size=dim_output,
            num_layers=num_layers,
            batch_first=True,
        )
        self.hidden = None
        self.keep_hidden = keep_hidden

    def get_output_dim(self):
        """Return the hidden size of the LSTM."""
        return self.baseNet.hidden_size

    def forward(self, x):
        """Run the LSTM over `x` and return its output sequence."""
        try:
            self.baseNet.flatten_parameters()
        except RuntimeError:
            # Best effort only: carry on if the parameters cannot be flattened.
            pass

        output, state = self.baseNet(x, self.hidden)
        if not self.keep_hidden:
            return output

        # Detach so the stored state does not keep the previous graph alive.
        if isinstance(state, tuple):
            self.hidden = tuple(part.detach() for part in state)
        else:
            self.hidden = state.detach()
        return output
torch.sqrt(var + 1e-08) return cpc_feature ================================================ FILE: examples/textless_nlp/gslm/speech2unit/pretrained/hubert_feature_reader.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch import fairseq import soundfile as sf import torch.nn.functional as F class HubertFeatureReader: """ Wrapper class to run inference on HuBERT model. Helps extract features for a given audio file. """ def __init__(self, checkpoint_path, layer, max_chunk=1600000, use_cuda=True): ( model, cfg, task, ) = fairseq.checkpoint_utils.load_model_ensemble_and_task( [checkpoint_path] ) self.model = model[0].eval() self.task = task self.layer = layer self.max_chunk = max_chunk self.use_cuda = use_cuda if self.use_cuda: self.model.cuda() def read_audio(self, path, ref_len=None, channel_id=None): wav, sr = sf.read(path) if channel_id is not None: assert wav.ndim == 2, \ f"Expected stereo input when channel_id is given ({path})" assert channel_id in [1, 2], \ "channel_id is expected to be in [1, 2]" wav = wav[:, channel_id-1] if wav.ndim == 2: wav = wav.mean(-1) assert wav.ndim == 1, wav.ndim assert sr == self.task.cfg.sample_rate, sr if ref_len is not None and abs(ref_len - len(wav)) > 160: print(f"ref {ref_len} != read {len(wav)} ({path})") return wav def get_feats(self, file_path, ref_len=None, channel_id=None): x = self.read_audio(file_path, ref_len, channel_id) with torch.no_grad(): x = torch.from_numpy(x).float() if self.use_cuda: x = x.cuda() if self.task.cfg.normalize: x = F.layer_norm(x, x.shape) x = x.view(1, -1) feat = [] for start in range(0, x.size(1), self.max_chunk): x_chunk = x[:, start: start + self.max_chunk] feat_chunk, _ = self.model.extract_features( source=x_chunk, padding_mask=None, mask=False, output_layer=self.layer, ) feat.append(feat_chunk) return 
class LogMelFeatureReader:
    """
    Computes Kaldi-style filterbank features for a given audio file.
    No pretrained model is involved; extra constructor arguments
    (e.g. checkpoint_path, layer) are accepted and ignored.
    """

    def __init__(self, *args, **kwargs):
        # Only these two keyword arguments are used; both are optional.
        self.num_mel_bins = kwargs.get("num_mel_bins", 80)
        self.frame_length = kwargs.get("frame_length", 25.0)

    def get_feats(self, file_path, channel_id=None):
        """Read `file_path` and return its filterbank features.

        Args:
            file_path: audio file readable by `soundfile`.
            channel_id: 1 or 2 to pick one channel of a stereo file; None to
                use the file as mono (multi-channel audio is averaged).

        Returns:
            The tensor produced by `kaldi.fbank` for the selected waveform.
        """
        wav, sr = sf.read(file_path)
        if channel_id is not None:
            assert wav.ndim == 2, \
                f"Expected stereo input when channel_id is given ({file_path})"
            assert channel_id in [1, 2], \
                "channel_id is expected to be in [1, 2]"
            wav = wav[:, channel_id-1]
        if wav.ndim == 2:
            # Down-mix to mono like the other feature readers do; fbank is
            # fed a single (1, num_samples) waveform below.
            wav = wav.mean(-1)
        feats = torch.from_numpy(wav).float()
        feats = kaldi.fbank(
            feats.unsqueeze(0),
            num_mel_bins=self.num_mel_bins,
            frame_length=self.frame_length,
            sample_frequency=sr,
        )
        return feats
def get_feature_reader(feature_type):
    """
    Map a feature-type name to the reader class that extracts it.

    Args:
        feature_type: one of "logmel", "hubert", "w2v2" or "cpc".

    Returns:
        The matching reader class (not an instance); the caller is
        responsible for constructing it.

    Raises:
        NotImplementedError: if ``feature_type`` is none of the names above.
    """
    # Guard clauses: each class name is only looked up when its branch hits.
    if feature_type == "logmel":
        return LogMelFeatureReader
    if feature_type == "hubert":
        return HubertFeatureReader
    if feature_type == "w2v2":
        return Wav2VecFeatureReader
    if feature_type == "cpc":
        return CpcFeatureReader
    raise NotImplementedError(f"{feature_type} is not supported.")
def get_and_dump_features(
    feature_type,
    checkpoint_path,
    layer,
    manifest_path,
    sample_pct,
    flatten,
    out_features_path,
    channel_id=None,
):
    """
    Extract features for every file of a manifest and save them to disk.

    The features are computed with ``get_features`` and written with
    ``np.save`` to ``out_features_path``; a copy of the manifest is placed
    in the same directory.

    Args:
        feature_type: feature name understood by ``get_feature_reader``.
        checkpoint_path: acoustic model checkpoint handed to the reader.
        layer: model layer handed to the reader.
        manifest_path: manifest listing the audio files.
        sample_pct: fraction of the manifest entries to use.
        flatten: if true, features of all files are concatenated.
        out_features_path: destination of the saved features.
        channel_id: optional channel forwarded to the reader. It defaults to
            ``None`` so existing callers keep working; previously the
            argument was not forwarded at all, so the call to
            ``get_features`` (which requires it) raised ``TypeError``.

    Returns:
        The extracted features, exactly as returned by ``get_features``.
    """
    # Feature extraction
    features_batch = get_features(
        feature_type=feature_type,
        checkpoint_path=checkpoint_path,
        layer=layer,
        manifest_path=manifest_path,
        sample_pct=sample_pct,
        flatten=flatten,
        channel_id=channel_id,
    )

    # Save features
    out_dir_path = os.path.dirname(out_features_path)
    if out_dir_path:
        # dirname is "" for a bare file name, and os.makedirs("") raises.
        os.makedirs(out_dir_path, exist_ok=True)
    try:
        shutil.copyfile(
            manifest_path,
            os.path.join(out_dir_path, os.path.basename(manifest_path)),
        )
    except shutil.SameFileError:
        # The manifest already lives next to the features; nothing to copy.
        pass
    # NOTE(review): with flatten=False this is a list of per-file arrays that
    # may differ in length; recent NumPy versions may refuse to build such a
    # ragged array implicitly -- confirm against the NumPy version in use.
    np.save(out_features_path, features_batch)

    return features_batch
""" def __init__(self, checkpoint_path, layer, use_cuda=True): state = fairseq.checkpoint_utils.load_checkpoint_to_cpu( checkpoint_path ) w2v_args = state["args"] self.task = fairseq.tasks.setup_task(w2v_args) model = self.task.build_model(w2v_args) model.load_state_dict(state["model"], strict=True) model.eval() self.model = model self.layer = layer self.use_cuda = use_cuda if self.use_cuda: self.model.cuda() def read_audio(self, fname, channel_id=None): wav, sr = sf.read(fname) if channel_id is not None: assert wav.ndim == 2, \ f"Expected stereo input when channel_id is given ({fname})" assert channel_id in [1, 2], \ "channel_id is expected to be in [1, 2]" wav = wav[:, channel_id-1] if wav.ndim == 2: wav = wav.mean(-1) assert wav.ndim == 1, wav.ndim assert sr == self.task.cfg.sample_rate, sr return wav def get_feats(self, file_path, channel_id=None): x = self.read_audio(file_path, channel_id) with torch.no_grad(): source = torch.from_numpy(x).view(1, -1).float() if self.use_cuda: source = source.cuda() res = self.model( source=source, mask=False, features_only=True, layer=self.layer ) return res["layer_results"][self.layer][0].squeeze(1) ================================================ FILE: examples/textless_nlp/gslm/tools/README.md ================================================ # GSLM Tools ## Resynthesis You can use the command line tool below to input an audio file and get the resynthesized audio. This tool implements the unsupervised method for resynthesis described in the paper. The way to invoke the command line tool is shown below. 
``` FAIRSEQ_ROOT=<path_to_your_fairseq_repo_root> TYPE=<one_of_logmel/cpc/hubert/w2v2> ACOUSTIC_MODEL_PATH=<path_of_pretrained_acoustic_model> LAYER=<layer_of_acoustic_model_to_extract_features_from> KM_MODEL_PATH=<output_path_of_the_kmeans_model> TTS_MODEL_PATH=<unit2speech_model_file_path> # A text file containing the codes, one per line CODE_DICT_PATH=<unit2speech_code_dict_path> WAVEGLOW_PATH=<path_where_you_have_downloaded_waveglow_checkpoint> PYTHONPATH=${FAIRSEQ_ROOT}:${FAIRSEQ_ROOT}/examples/textless_nlp/gslm/unit2speech python ${FAIRSEQ_ROOT}/examples/textless_nlp/gslm/tools/resynthesize_speech.py \ --feature_type $TYPE \ --acoustic_model_path $ACOUSTIC_MODEL_PATH \ --layer $LAYER \ --kmeans_model_path $KM_MODEL_PATH \ --tts_model_path $TTS_MODEL_PATH \ --code_dict_path $CODE_DICT_PATH \ --waveglow_path $WAVEGLOW_PATH \ --max_decoder_steps 2000 ``` ================================================ FILE: examples/textless_nlp/gslm/tools/resynthesize_speech.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def main(args, logger):
    """
    Interactively resynthesize audio files through the GSLM pipeline.

    The acoustic feature reader, the K-means quantizer, the Tacotron
    unit-to-speech model and the WaveGlow vocoder are loaded once. The
    function then repeatedly prompts on stdin for an input audio path and an
    output path; each input is converted to acoustic features, quantized to
    unit ids, synthesized back to audio and the denoised waveform is written
    to the output path.

    Args:
        args: parsed command-line namespace (see ``get_parser``).
        logger: logger used for progress messages.

    The loop ends when stdin reaches end-of-file.
    """
    # Acoustic Model
    logger.info(f"Loading acoustic model from {args.acoustic_model_path}...")
    feature_reader_cls = get_feature_reader(args.feature_type)
    reader = feature_reader_cls(
        checkpoint_path=args.acoustic_model_path, layer=args.layer
    )

    # K-means Model
    logger.info(f"Loading K-means model from {args.kmeans_model_path} ...")
    # NOTE: joblib.load unpickles arbitrary objects; only point this at
    # trusted model files.
    with open(args.kmeans_model_path, "rb") as kmeans_file:
        kmeans_model = joblib.load(kmeans_file)
    kmeans_model.verbose = False

    # TTS Model
    logger.info(f"Loading TTS model from {args.tts_model_path}...")
    tacotron_model, sample_rate, hparams = load_tacotron(
        tacotron_model_path=args.tts_model_path,
        max_decoder_steps=args.max_decoder_steps,
    )

    # Waveglow Model
    logger.info(f"Loading Waveglow model from {args.waveglow_path}...")
    waveglow, denoiser = load_waveglow(waveglow_path=args.waveglow_path)

    # Dataset: fall back to the user-supplied code dict when the path stored
    # in the checkpoint's hparams does not exist on this machine.
    if not os.path.exists(hparams.code_dict):
        hparams.code_dict = args.code_dict_path
    tts_dataset = TacotronInputDataset(hparams)

    iters = 0
    while True:
        try:
            in_file_path = input(
                "Input: Enter the full file path of audio file...\n"
            )
            out_file_path = input(
                "Output: Enter the full file path of audio file...\n"
            )
        except EOFError:
            # stdin closed (piped input exhausted or Ctrl-D): stop cleanly
            # instead of dying with a traceback.
            logger.info("End of input reached, exiting.")
            break

        feats = reader.get_feats(in_file_path).cpu().numpy()
        iters += 1
        # Periodic cleanup. The previous `iters == 1000` test fired exactly
        # once over the lifetime of the process.
        if iters % 1000 == 0:
            gc.collect()
            torch.cuda.empty_cache()

        quantized_units = kmeans_model.predict(feats)
        quantized_units_str = " ".join(map(str, quantized_units))

        tts_input = tts_dataset.get_tensor(quantized_units_str)
        _mel, _aud, aud_dn, _has_eos = synthesize_audio(
            tacotron_model,
            waveglow,
            denoiser,
            tts_input.unsqueeze(0),
            strength=args.denoiser_strength,
        )
        sf.write(out_file_path, aud_dn[0].cpu().float().numpy(), sample_rate)
        logger.info("Resynthesis done!\n")
## Pre-trained models Using the links below, you can download pre-trained models for various unit types and vocabulary sizes: | | 50 | 100 | 200 |-|-|-|- | LogMel Filterbank | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/lm_km50/logmel50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/lm_km100/logmel100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/lm_km200/logmel200_lm.tgz) | Modified CPC | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/lm_km50/cpc50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/lm_km100/cpc100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/lm_km200/cpc200_lm.tgz) | HuBERT | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km50/hubert50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km100/hubert100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/lm_km200/hubert200_lm.tgz) | Wav2Vec 2.0 | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/lm_km50/w2v2_50_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/lm_km100/w2v2_100_lm.tgz) | [download](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/lm_km200/w2v2_200_lm.tgz) ## Preprocessing data Assuming that unit-transcribed train, valid, and test sets are located in `data/train.txt`, `data/valid.txt`, and `data/test.txt`, respectively, we run the following command to get a preprocessed version of the dataset in `data-bin`: ```bash fairseq-preprocess --only-source \ --trainpref data/train.txt --validpref data/valid.txt --testpref data/test.txt \ --destdir data-bin/ --workers 40 ``` As a result, the `data-bin` directory should appear. ## Fitting a Unit Language Model (ULM) As an ULM, we train a standard fairseq Transformer LM.
Assuming 8 GPUs used for training, a good starting point for an ULM training would be: ```bash fairseq-train data-bin/ \ --task=language_modeling \ --arch=transformer_lm_big \ --share-decoder-input-output-embed \ --dropout=0.1 \ --attention-dropout=0.1 \ --optimizer=adam \ --adam-betas='(0.9, 0.98)' \ --clip-norm=1.0 \ --lr=0.0005 \ --lr-scheduler=inverse_sqrt \ --warmup-updates=4000 \ --warmup-init-lr=1e-07 \ --tokens-per-sample=3072 \ --update-freq=16 \ --max-tokens=4096 \ --num-workers=4 \ --skip-invalid-size-inputs-valid-test \ --max-update=500000 \ --log-interval=10 \ --seed=100501 \ --fp16 \ --sample-break-mode=eos ``` This command will train a Transformer-large model (12 layers). You can train other standard LM models provided by fairseq, e.g. specify `--arch=transformer_lm` to train a smaller (6-layer) Transformer model. When training with a different number of GPUs, it might be a good idea to adjust the `update-freq` parameter. To save the GPU memory at an expense of additional computation, it can be useful to enable activation checkpointing with `--checkpoint-activations`. ## Sampling from an ULM Once an ULM was trained, we can use it for generating new utterances. Suppose, that the prompts are given in a file named `prompts.txt`. Then we can sample continuations by running the following command: ```bash python sample.py data-bin/ \ --path=checkpoints/checkpoint_best.pt --task=language_modeling --sampling --temperature=0.7 \ --seed=1 --prompts=prompts.txt --output=samples.txt --max-len-a=0 --max-len-b=500 \ --prefix-size=-1 --batch-size=16 --fp16 --samples-per-prompt=10 ``` Here, `--prefix-size` controls the number of tokens that are used to prime the ULM. When set to a positive value, the sampling script will take first `prefix-size` tokens to prompt the ULM; with `0` it runs unconditional sampling and with `-1` the entire prompt is used. 
`--samples-per-prompt` specifies how many utterances are generated with every prompt which can be useful when generating multiple prompt continuations. In this command, `--max-len-a` and `--max-len-b` control the number of generated tokens. When using a pretrained model from above, `data-bin` should point to the unpacked directory (with `dict.txt` file). Evaluation-time, to generate prompts, we used utterances from LibriSpeech dev-clean and test-clean that are longer than 6s. We took first 3s from an utterance as a prompt. Unit transcripts of those prompts can be downloaded here: [[dev]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/dev_prompts.tgz) [[test]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/eval_data/test_prompts.tgz) ================================================ FILE: examples/textless_nlp/gslm/ulm/sample.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def main(args):
    """
    Sample continuations for every prompt and write them to a file.

    ``args.prompts`` is a text file with one ``<id>|<units>`` entry per line;
    ``args.output`` receives one ``<id>__<hypothesis index>|<units>`` line
    per generated hypothesis. When sampling is enabled,
    ``args.samples_per_prompt`` is used as both beam size and n-best. With
    ``args.debug`` only the first 10 prompts are processed.

    Args:
        args: namespace produced by ``cli_main``; it is converted to an
            OmegaConf config when the installed fairseq provides
            ``convert_namespace_to_omegaconf``.

    Raises:
        ValueError: if a non-blank prompt line contains no ``|`` separator.
    """
    arg_prompts = args.prompts
    arg_output = args.output
    arg_debug = args.debug
    arg_sample_size = args.samples_per_prompt

    try:
        from fairseq.dataclass.utils import convert_namespace_to_omegaconf
    except ImportError:
        # fairseq version without the dataclass config layer.
        pass
    else:
        # Conversion errors are real errors: everything below relies on the
        # converted layout (args.common, args.generation, ...), so they are
        # no longer swallowed.
        args = convert_namespace_to_omegaconf(args)

    if args.common.seed is not None:
        np.random.seed(args.common.seed)
        utils.set_torch_seed(args.common.seed)

    if args.generation.sampling:
        args.generation.nbest = args.generation.beam = arg_sample_size

    task = tasks.setup_task(args.task)

    overrides = ast.literal_eval(args.common_eval.model_overrides)

    models, _model_args = checkpoint_utils.load_model_ensemble(
        args.common_eval.path.split(os.pathsep),
        arg_overrides=overrides,
        task=task,
        suffix=getattr(args, "checkpoint_suffix", ""),
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        model.prepare_for_inference_(args)
        model.cuda()

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(args.generation.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models]
    )

    with open(arg_prompts, 'r') as fin:
        lines = fin.readlines()

    seq_id = []
    prompts = []
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            # A blank (e.g. trailing) line used to crash with IndexError.
            continue
        name, separator, prompt = line.partition('|')
        if not separator:
            raise ValueError(
                f"{arg_prompts}:{line_no}: expected '<id>|<units>', got {line!r}"
            )
        seq_id.append(name)
        prompts.append(prompt)

    if args.generation.prefix_size >= 0:
        prompts = [
            ' '.join(prompt.split()[:args.generation.prefix_size])
            for prompt in prompts
        ]

    if arg_debug:
        prompts = prompts[:10]

    generator = task.build_generator(models, args.generation)

    # Both the output file and the progress bar are closed (and the file
    # flushed) even if generation fails half-way.
    with open(arg_output, 'w') as output_file, \
            tqdm.tqdm(total=len(prompts)) as pbar:
        start_id = 0
        for batch in make_batches(prompts, args, task, max_positions):
            src_tokens = batch.src_tokens.cuda()
            src_lengths = batch.src_lengths.cuda()
            sample = {
                'net_input': {
                    'src_tokens': src_tokens,
                    'src_lengths': src_lengths,
                },
            }

            translations = task.inference_step(generator, models, sample)

            # NOTE(review): prompts are identified by their running position
            # (start_id + offset in batch), not by batch.ids; this presumably
            # relies on batches arriving in input order with nothing
            # filtered out -- confirm.
            results = []
            for offset, hypos in enumerate(translations):
                prompt_tokens = utils.strip_pad(src_tokens[offset], tgt_dict.pad())
                results.append((offset + start_id, prompt_tokens, hypos))

            # sort output to match input order
            for prompt_idx, prompt_tokens, hypos in sorted(results, key=lambda x: x[0]):
                src_str = ""
                if src_dict is not None:
                    src_str = src_dict.string(
                        prompt_tokens, args.common_eval.post_process)

                # Process top predictions
                for hypo_id, hypo in enumerate(hypos):
                    _hypo_tokens, hypo_str, _alignment = utils.post_process_prediction(
                        hypo_tokens=hypo['tokens'].int().cpu(),
                        src_str=src_str,
                        alignment=hypo['alignment'],
                        align_dict=align_dict,
                        tgt_dict=tgt_dict,
                        remove_bpe=args.common_eval.post_process,
                    )
                    print(f'{seq_id[prompt_idx]}__{hypo_id}|{hypo_str}', file=output_file)
                # One tick per prompt, matching total=len(prompts).
                pbar.update(1)
            start_id += len(results)
options.get_interactive_generation_parser() parser.add_argument('--prompts', type=str, default=None, required=True) parser.add_argument('--output', type=str, default=None, required=True) parser.add_argument('--debug', action='store_true') parser.add_argument('--samples-per-prompt', type=int, default=1) args = options.parse_args_and_arch(parser) np.random.seed(args.seed) utils.set_torch_seed(args.seed) main(args) if __name__ == '__main__': cli_main() ================================================ FILE: examples/textless_nlp/gslm/unit2speech/README.md ================================================ # Unit to Speech Model (unit2speech) Unit to speech model is modified Tacotron2 model that learns to synthesize speech from discrete speech units. All models are trained on quantized [LJSpeech](https://keithito.com/LJ-Speech-Dataset/). Upstream Units | Download Links | model md5 |-|-|- Log Mel Filterbank + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km50/code_dict) | 932b3b8527c0125f5f964b57762eba49 Log Mel Filterbank + KM100 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km100/code_dict) | cde0b0d278a39011d0acbd5df27abdf4 Log Mel Filterbank + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/logmel/tts_km200/code_dict) | dba0f1d4de64bc7976718834010b23e7 Modified CPC + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km50/code_dict) | a585e8dd8890ea56164f17635dd8e613 Modified CPC + KM100 | 
[model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km100/code_dict) | 5c0ee2869b4f483d17f37f1a41a548e0 Modified CPC + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/tts_km200/code_dict) | 2f0c9951cf37020d9464514bff48bc5d HuBERT Base + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km50/code_dict) | 85ffce8baec5aa90035ab696fe676fce HuBERT Base + KM100 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km100/code_dict) | df4a9c6ffd1bb00c91405432c234aba3 HuBERT Base + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/tts_km200/code_dict) | ac72f2c0c563589819bec116c7f8d274 wav2vec 2.0 Large + KM50 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km50/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km50/code_dict) | e3503d0ad822b2c24b89f68b857fedff wav2vec 2.0 Large + KM100 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km100/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km100/code_dict) | eb3666e456ae4c96bf2a1eec825c13ed wav2vec 2.0 Large + KM200 | [model](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km200/tts_checkpoint_best.pt) - [code_dict](https://dl.fbaipublicfiles.com/textless_nlp/gslm/w2v2/tts_km200/code_dict) | 777d343e963c4d64f04d78eef032f4e8 ## Run inference using a unit2speech model * Install librosa, 
def convert16k(inputfile, outputfile16k):
    """
    Resample one audio file to a 16 kHz WAV file by running ``sox``.

    The command executed is
    ``sox -c 1 -b 16 <inputfile> -t wav <outputfile16k> rate 16k``.

    The arguments are passed to ``sox`` as a list, so paths containing
    spaces, quotes or other shell-special characters reach it unchanged
    (the previous string + ``shlex.split`` approach broke such paths apart).

    Args:
        inputfile: path of the audio file to convert.
        outputfile16k: path of the WAV file to create.

    Returns:
        The exit status of ``sox`` (0 on success). Callers that ignore the
        return value behave as before.
    """
    command = [
        'sox', '-c', '1', '-b', '16', str(inputfile),
        '-t', 'wav', str(outputfile16k),
        'rate', '16k',
    ]
    return subprocess.call(command)
parser.add_argument('--extension', type=str, default='wav', help='Audio file extension in the input. Default: mp3') args = parser.parse_args() # Find all sequences print(f"Finding all audio files with extension '{args.extension}' from {args.input_dir}...") audio_files = find_all_files(args.input_dir, args.extension) print(f"Done! Found {len(audio_files)} files.") # Convert to relative path audio_files = [os.path.relpath(file[-1], start=args.input_dir) for file in audio_files] # Create all the directories needed rel_dirs_set = set([os.path.dirname(file) for file in audio_files]) for rel_dir in rel_dirs_set: Path(os.path.join(args.output_dir, rel_dir)).mkdir(parents=True, exist_ok=True) # Converting wavs files print("Converting the audio to wav files...") bar = progressbar.ProgressBar(maxval=len(audio_files)) bar.start() start_time = time() for index, file in enumerate(audio_files): bar.update(index) input_file = os.path.join(args.input_dir, file) output_file = os.path.join(args.output_dir, os.path.splitext(file)[0]+".wav") convert16k(input_file, output_file) bar.finish() print(f"...done {len(audio_files)} files in {time()-start_time} seconds.") ================================================ FILE: examples/textless_nlp/gslm/unit2speech/glow.py ================================================ # ***************************************************************************** # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. 
class WaveGlowLoss(torch.nn.Module):
    """
    Training loss computed from the outputs of ``WaveGlow.forward``.

    For ``model_output = (z, log_s_list, log_det_W_list)`` the loss is

        sum(z * z) / (2 * sigma^2) - sum(log_s) - sum(log_det_W)

    divided by the number of elements of ``z``.

    Args:
        sigma: scale used in the quadratic term (default 1.0).
    """

    def __init__(self, sigma=1.0):
        super(WaveGlowLoss, self).__init__()
        self.sigma = sigma

    def forward(self, model_output):
        """
        Args:
            model_output: tuple ``(z, log_s_list, log_det_W_list)``; ``z`` is
                a 3-D tensor and the two lists hold one entry per flow.

        Returns:
            A scalar tensor with the normalized loss.
        """
        z, log_s_list, log_det_W_list = model_output

        # Accumulate out-of-place. The previous `+=` on an alias of
        # log_det_W_list[0] overwrote the caller's tensor, and empty lists
        # left both totals unbound.
        log_s_total = 0
        log_det_W_total = 0
        for i, log_s in enumerate(log_s_list):
            log_s_total = log_s_total + torch.sum(log_s)
            log_det_W_total = log_det_W_total + log_det_W_list[i]

        loss = (
            torch.sum(z * z) / (2 * self.sigma * self.sigma)
            - log_s_total
            - log_det_W_total
        )
        return loss / (z.size(0) * z.size(1) * z.size(2))
class WN(torch.nn.Module):
    """
    WaveNet-like network used by the affine coupling layers.

    Unlike WaveNet the convolutions do not have to be causal, and the
    dilation is never reset: it doubles from each layer to the next.
    """

    def __init__(self, n_in_channels, n_mel_channels, n_layers, n_channels,
                 kernel_size):
        super(WN, self).__init__()
        assert kernel_size % 2 == 1
        assert n_channels % 2 == 0
        conv1d = torch.nn.Conv1d
        weight_norm = torch.nn.utils.weight_norm

        self.n_layers = n_layers
        self.n_channels = n_channels
        self.in_layers = torch.nn.ModuleList()
        self.res_skip_layers = torch.nn.ModuleList()

        # 1x1 input projection.
        self.start = weight_norm(
            conv1d(n_in_channels, n_channels, 1), name='weight')

        # A zero-initialized output projection makes the affine coupling
        # layers do nothing at first, which helps training stability.
        output_proj = conv1d(n_channels, 2 * n_in_channels, 1)
        output_proj.weight.data.zero_()
        output_proj.bias.data.zero_()
        self.end = output_proj

        # One 1x1 conv produces the conditioning for all layers at once;
        # forward() slices out 2*n_channels per layer.
        self.cond_layer = weight_norm(
            conv1d(n_mel_channels, 2 * n_channels * n_layers, 1),
            name='weight')

        last = n_layers - 1
        for layer_idx in range(n_layers):
            dilation = 2 ** layer_idx
            # kernel_size is odd, so this division is exact.
            padding = (kernel_size - 1) * dilation // 2
            self.in_layers.append(weight_norm(
                conv1d(n_channels, 2 * n_channels, kernel_size,
                       dilation=dilation, padding=padding),
                name='weight'))

            # The last layer only feeds the skip path, so it needs no
            # residual half.
            out_channels = n_channels if layer_idx == last else 2 * n_channels
            self.res_skip_layers.append(weight_norm(
                conv1d(n_channels, out_channels, 1), name='weight'))

    def forward(self, forward_input):
        """
        Args:
            forward_input: tuple ``(audio, spect)``.

        Returns:
            The output of the final ``end`` convolution applied to the
            accumulated skip connections.
        """
        audio, spect = forward_input
        audio = self.start(audio)
        skip_sum = torch.zeros_like(audio)
        n_channels_tensor = torch.IntTensor([self.n_channels])
        width = 2 * self.n_channels

        spect = self.cond_layer(spect)

        last = self.n_layers - 1
        layer_pairs = zip(self.in_layers, self.res_skip_layers)
        for layer_idx, (in_layer, res_skip_layer) in enumerate(layer_pairs):
            begin = layer_idx * width
            gated = fused_add_tanh_sigmoid_multiply(
                in_layer(audio),
                spect[:, begin:begin + width, :],
                n_channels_tensor)

            res_skip_acts = res_skip_layer(gated)
            if layer_idx == last:
                skip_sum = skip_sum + res_skip_acts
            else:
                audio = audio + res_skip_acts[:, :self.n_channels, :]
                skip_sum = skip_sum + res_skip_acts[:, self.n_channels:, :]

        return self.end(skip_sum)
torch.nn.ModuleList() n_half = int(n_group/2) # Set up layers with the right sizes based on how many dimensions # have been output already n_remaining_channels = n_group for k in range(n_flows): if k % self.n_early_every == 0 and k > 0: n_half = n_half - int(self.n_early_size/2) n_remaining_channels = n_remaining_channels - self.n_early_size self.convinv.append(Invertible1x1Conv(n_remaining_channels)) self.WN.append(WN(n_half, n_mel_channels*n_group, **WN_config)) self.n_remaining_channels = n_remaining_channels # Useful during inference def forward(self, forward_input): """ forward_input[0] = mel_spectrogram: batch x n_mel_channels x frames forward_input[1] = audio: batch x time """ spect, audio = forward_input # Upsample spectrogram to size of audio spect = self.upsample(spect) assert(spect.size(2) >= audio.size(1)) if spect.size(2) > audio.size(1): spect = spect[:, :, :audio.size(1)] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) audio = audio.unfold(1, self.n_group, self.n_group).permute(0, 2, 1) output_audio = [] log_s_list = [] log_det_W_list = [] for k in range(self.n_flows): if k % self.n_early_every == 0 and k > 0: output_audio.append(audio[:,:self.n_early_size,:]) audio = audio[:,self.n_early_size:,:] audio, log_det_W = self.convinv[k](audio) log_det_W_list.append(log_det_W) n_half = int(audio.size(1)/2) audio_0 = audio[:,:n_half,:] audio_1 = audio[:,n_half:,:] output = self.WN[k]((audio_0, spect)) log_s = output[:, n_half:, :] b = output[:, :n_half, :] audio_1 = torch.exp(log_s)*audio_1 + b log_s_list.append(log_s) audio = torch.cat([audio_0, audio_1],1) output_audio.append(audio) return torch.cat(output_audio,1), log_s_list, log_det_W_list def infer(self, spect, sigma=1.0): spect = self.upsample(spect) # trim conv artifacts. 
maybe pad spec to kernel multiple time_cutoff = self.upsample.kernel_size[0] - self.upsample.stride[0] spect = spect[:, :, :-time_cutoff] spect = spect.unfold(2, self.n_group, self.n_group).permute(0, 2, 1, 3) spect = spect.contiguous().view(spect.size(0), spect.size(1), -1).permute(0, 2, 1) if spect.type() == 'torch.cuda.HalfTensor': audio = torch.cuda.HalfTensor(spect.size(0), self.n_remaining_channels, spect.size(2)).normal_() else: audio = torch.cuda.FloatTensor(spect.size(0), self.n_remaining_channels, spect.size(2)).normal_() audio = torch.autograd.Variable(sigma*audio) for k in reversed(range(self.n_flows)): n_half = int(audio.size(1)/2) audio_0 = audio[:,:n_half,:] audio_1 = audio[:,n_half:,:] output = self.WN[k]((audio_0, spect)) s = output[:, n_half:, :] b = output[:, :n_half, :] audio_1 = (audio_1 - b)/torch.exp(s) audio = torch.cat([audio_0, audio_1],1) audio = self.convinv[k](audio, reverse=True) if k % self.n_early_every == 0 and k > 0: if spect.type() == 'torch.cuda.HalfTensor': z = torch.cuda.HalfTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() else: z = torch.cuda.FloatTensor(spect.size(0), self.n_early_size, spect.size(2)).normal_() audio = torch.cat((sigma*z, audio),1) audio = audio.permute(0,2,1).contiguous().view(audio.size(0), -1).data return audio @staticmethod def remove_weightnorm(model): waveglow = model for WN in waveglow.WN: WN.start = torch.nn.utils.remove_weight_norm(WN.start) WN.in_layers = remove(WN.in_layers) WN.cond_layer = torch.nn.utils.remove_weight_norm(WN.cond_layer) WN.res_skip_layers = remove(WN.res_skip_layers) return waveglow def remove(conv_list): new_conv_list = torch.nn.ModuleList() for old_conv in conv_list: old_conv = torch.nn.utils.remove_weight_norm(old_conv) new_conv_list.append(old_conv) return new_conv_list ================================================ FILE: examples/textless_nlp/gslm/unit2speech/multiproc.py ================================================ import os import time import torch 
import sys
import subprocess

# Multi-process launcher: re-runs the given command line once per visible GPU,
# appending --n_gpus, --group_name and a distinct --rank to each copy.
# The last CLI argument is used as the log directory.
argslist = list(sys.argv)[1:]
log_dir = argslist[-1]
num_gpus = torch.cuda.device_count()
argslist.append('--n_gpus={}'.format(num_gpus))
workers = []
# Log files opened for ranks > 0; closed once every worker has exited.
log_files = []
job_id = time.strftime("%Y_%m_%d-%H%M%S")
argslist.append("--group_name=group_{}".format(job_id))

print("GPU log directory is {}".format(log_dir))
os.makedirs(log_dir, exist_ok=True)
try:
    for i in range(num_gpus):
        # Build a per-worker argument list instead of mutating the shared one.
        worker_args = argslist + ['--rank={}'.format(i)]
        if i == 0:
            # Rank 0 inherits this process's stdout.
            stdout = None
        else:
            stdout = open(
                "{}/{}_GPU_{}.log".format(log_dir, job_id, i), "w")
            log_files.append(stdout)
        print(worker_args)
        p = subprocess.Popen(
            [str(sys.executable)] + worker_args, stdout=stdout)
        workers.append(p)

    for p in workers:
        p.wait()
finally:
    # Children hold their own copies of the descriptors, so closing the
    # parent's handles here never truncates a worker's log.
    for log_file in log_files:
        log_file.close()


# ================================================
# FILE: examples/textless_nlp/gslm/unit2speech/synthesize_audio_from_units.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import logging
import os

import soundfile as sf

from examples.textless_nlp.gslm.unit2speech.tts_data import (
    TacotronInputDataset,
)
from examples.textless_nlp.gslm.unit2speech.utils import (
    load_quantized_audio_from_file,
    load_tacotron,
    load_waveglow,
    synthesize_audio,
)


def get_logger():
    """Configure root logging at INFO level and return this module's logger."""
    log_format = "[%(asctime)s] [%(levelname)s]: %(message)s"
    logging.basicConfig(format=log_format, level=logging.INFO)
    logger = logging.getLogger(__name__)
    return logger


def get_parser():
    """Build the command-line parser for unit-to-audio synthesis."""
    parser = argparse.ArgumentParser(
        description="Wav2Vec 2.0 speech generator."
    )
    parser.add_argument(
        "--quantized_unit_path",
        type=str,
        help="Quantized unit file path to use for inference",
    )
    parser.add_argument(
        "--tts_model_path",
        type=str,
        help="TTS model file path to use for inference",
    )
    parser.add_argument(
        "--waveglow_path",
        type=str,
        help="Path to the waveglow checkpoint (vocoder).",
    )
    parser.add_argument(
        "--code_dict_path",
        type=str,
        help="Code dict file path to use for inference",
    )
    parser.add_argument("--max_decoder_steps", type=int, default=2000)
    parser.add_argument("--denoiser_strength", type=float, default=0.1)
    parser.add_argument(
        "--out_audio_dir",
        type=str,
        help="Output directory to dump audio files",
    )
    return parser


def main(args, logger):
    """Synthesize one wav file per entry of the quantized-unit file.

    Args:
        args: parsed namespace produced by ``get_parser()``.
        logger: logger used for progress messages.
    """
    # Load quantized audio
    logger.info(f"Loading quantized audio from {args.quantized_unit_path}...")
    names_batch, quantized_units_batch = load_quantized_audio_from_file(
        file_path=args.quantized_unit_path
    )

    logger.info(f"Loading TTS model from {args.tts_model_path}...")
    tacotron_model, sample_rate, hparams = load_tacotron(
        tacotron_model_path=args.tts_model_path,
        max_decoder_steps=args.max_decoder_steps,
    )

    logger.info(f"Loading Waveglow model from {args.waveglow_path}...")
    waveglow, denoiser = load_waveglow(waveglow_path=args.waveglow_path)

    # Fall back to the CLI-provided code dict when the path stored in the
    # checkpoint's hparams does not exist on this machine.
    if not os.path.exists(hparams.code_dict):
        hparams.code_dict = args.code_dict_path
    tts_dataset = TacotronInputDataset(hparams)

    # Make sure the output directory exists before the first write.
    os.makedirs(args.out_audio_dir, exist_ok=True)

    for name, quantized_units in zip(names_batch, quantized_units_batch):
        quantized_units_str = " ".join(map(str, quantized_units))
        tts_input = tts_dataset.get_tensor(quantized_units_str)
        # Only the third returned value (bound to aud_dn) is written out.
        _, _, aud_dn, _ = synthesize_audio(
            tacotron_model,
            waveglow,
            denoiser,
            tts_input.unsqueeze(0),
            strength=args.denoiser_strength,
        )
        out_file_path = os.path.join(args.out_audio_dir, f"{name}.wav")
        sf.write(out_file_path, aud_dn[0].cpu().float().numpy(), sample_rate)


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()
    logger = get_logger()
    logger.info(args)
    main(args, logger)
# ================================================
# FILE: examples/textless_nlp/gslm/unit2speech/tacotron2/__init__.py
# ================================================

# ================================================
# FILE: examples/textless_nlp/gslm/unit2speech/tacotron2/audio_processing.py
# ================================================
import torch
import numpy as np
from scipy.signal import get_window
import librosa.util as librosa_util


def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
                     n_fft=800, dtype=np.float32, norm=None):
    """
    # from librosa 0.6
    Compute the sum-square envelope of a window function at a given hop length.

    This is used to estimate modulation effects induced by windowing
    observations in short-time fourier transforms.

    Parameters
    ----------
    window : string, tuple, number, callable, or list-like
        Window specification, as in `get_window`

    n_frames : int > 0
        The number of analysis frames

    hop_length : int > 0
        The number of samples to advance between frames

    win_length : [optional]
        The length of the window function.  By default, this matches `n_fft`.

    n_fft : int > 0
        The length of each analysis frame.

    dtype : np.dtype
        The data type of the output

    norm : [optional]
        Normalization mode forwarded to `librosa.util.normalize` before the
        window is squared.

    Returns
    -------
    wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
        The sum-squared envelope of the window function
    """
    if win_length is None:
        win_length = n_fft

    n = n_fft + hop_length * (n_frames - 1)
    x = np.zeros(n, dtype=dtype)

    # Compute the squared window at the desired length
    win_sq = get_window(window, win_length, fftbins=True)
    win_sq = librosa_util.normalize(win_sq, norm=norm)**2
    # `size` is passed by keyword: it is keyword-only in newer librosa
    # releases and accepted by keyword in older ones as well.
    win_sq = librosa_util.pad_center(win_sq, size=n_fft)

    # Fill the envelope
    for i in range(n_frames):
        sample = i * hop_length
        x[sample:min(n, sample + n_fft)] += \
            win_sq[:max(0, min(n_fft, n - sample))]
    return x


def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    Estimate a signal from magnitudes by iterating transform/inverse,
    starting from a random phase.

    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    n_iters: number of transform/inverse refinement iterations
    """
    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    # Keep the initial phase on the same device as the magnitudes so the
    # inverse transform does not mix CPU and GPU tensors.
    angles = torch.from_numpy(angles).to(magnitudes.device)
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for _ in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal


def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """
    Return log(clamp(x, min=clip_val) * C).

    PARAMS
    ------
    C: compression factor
    clip_val: lower bound applied to x before taking the log
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression(x, C=1):
    """
    Return exp(x) / C.

    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


# ================================================
# FILE: examples/textless_nlp/gslm/unit2speech/tacotron2/cleaners.py
# ================================================
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and
eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as
the "cleaners" hyperparameter. Some cleaners are English-specific. You'll
typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated
     to ASCII using the Unidecode library
     (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you
     should also update the symbols in symbols.py to match your data).
'''

import re
from unidecode import unidecode
from .numbers import normalize_numbers


# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


def expand_abbreviations(text):
    '''Replace each known "abbr." form with its spelled-out word.'''
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def expand_numbers(text):
    '''Delegate number expansion to normalize_numbers.'''
    return normalize_numbers(text)


def lowercase(text):
    '''Return text lowercased.'''
    return text.lower()


def collapse_whitespace(text):
    '''Replace every run of whitespace with a single space.'''
    return re.sub(_whitespace_re, ' ', text)


def convert_to_ascii(text):
    '''Transliterate text to ASCII with unidecode.'''
    return unidecode(text)


def basic_cleaners(text):
    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def transliteration_cleaners(text):
    '''Pipeline for non-English text that transliterates to ASCII.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text


def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.'''
    text = convert_to_ascii(text)
    text = lowercase(text)
    text = expand_numbers(text)
    text = expand_abbreviations(text)
    text = collapse_whitespace(text)
    return text
================================================ FILE: examples/textless_nlp/gslm/unit2speech/tacotron2/cmudict.py ================================================ """ from https://github.com/keithito/tacotron """ import re valid_symbols = [ 'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH' ] _valid_symbol_set = set(valid_symbols) class CMUDict: '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict''' def __init__(self, file_or_path, keep_ambiguous=True): if isinstance(file_or_path, str): with open(file_or_path, encoding='latin-1') as f: entries = _parse_cmudict(f) else: entries = _parse_cmudict(file_or_path) if not keep_ambiguous: entries = {word: pron for word, pron in entries.items() if len(pron) == 1} self._entries = entries def __len__(self): return len(self._entries) def lookup(self, word): '''Returns list of ARPAbet pronunciations of the given word.''' return self._entries.get(word.upper()) _alt_re = re.compile(r'\([0-9]+\)') def _parse_cmudict(file): cmudict = {} for line in file: if len(line) and (line[0] >= 'A' and line[0] <= 'Z' or line[0] == "'"): parts = line.split(' ') word = re.sub(_alt_re, '', parts[0]) pronunciation = _get_pronunciation(parts[1]) if pronunciation: if word in cmudict: cmudict[word].append(pronunciation) else: cmudict[word] = [pronunciation] return cmudict def _get_pronunciation(s): parts = s.strip().split(' ') for part in parts: if part not in _valid_symbol_set: return None return ' '.join(parts) 
================================================ FILE: examples/textless_nlp/gslm/unit2speech/tacotron2/layers.py ================================================ import torch from librosa.filters import mel as librosa_mel_fn from .audio_processing import dynamic_range_compression from .audio_processing import dynamic_range_decompression from .stft import STFT from .utils import get_mask_from_lengths class LinearNorm(torch.nn.Module): def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'): super(LinearNorm, self).__init__() self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias) torch.nn.init.xavier_uniform_( self.linear_layer.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) def forward(self, x): return self.linear_layer(x) class ConvNorm(torch.nn.Module): def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=None, dilation=1, bias=True, w_init_gain='linear'): super(ConvNorm, self).__init__() if padding is None: assert(kernel_size % 2 == 1) padding = int(dilation * (kernel_size - 1) / 2) self.conv = torch.nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, bias=bias) torch.nn.init.xavier_uniform_( self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain)) def forward(self, signal): conv_signal = self.conv(signal) return conv_signal class GlobalAvgPool(torch.nn.Module): def __init__(self): super(GlobalAvgPool, self).__init__() def forward(self, x, lengths=None): """Average pooling across time steps (dim=1) with optionally lengths. Args: x: torch.Tensor of shape (N, T, ...) 
lengths: None or torch.Tensor of shape (N,) dim: dimension to pool """ if lengths is None: return x.mean(dim=1, keepdim=False) else: mask = get_mask_from_lengths(lengths).type(x.type()).to(x.device) mask_shape = list(mask.size()) + [1 for _ in range(x.ndimension()-2)] mask = mask.reshape(*mask_shape) numer = (x * mask).sum(dim=1, keepdim=False) denom = mask.sum(dim=1, keepdim=False) return numer / denom class TacotronSTFT(torch.nn.Module): def __init__(self, filter_length=1024, hop_length=256, win_length=1024, n_mel_channels=80, sampling_rate=22050, mel_fmin=0.0, mel_fmax=8000.0): super(TacotronSTFT, self).__init__() self.n_mel_channels = n_mel_channels self.sampling_rate = sampling_rate self.stft_fn = STFT(filter_length, hop_length, win_length) mel_basis = librosa_mel_fn( sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax) mel_basis = torch.from_numpy(mel_basis).float() self.register_buffer('mel_basis', mel_basis) def spectral_normalize(self, magnitudes): output = dynamic_range_compression(magnitudes) return output def spectral_de_normalize(self, magnitudes): output = dynamic_range_decompression(magnitudes) return output def mel_spectrogram(self, y): """Computes mel-spectrograms from a batch of waves PARAMS ------ y: Variable(torch.FloatTensor) with shape (B, T) in range [-1, 1] RETURNS ------- mel_output: torch.FloatTensor of shape (B, n_mel_channels, T) """ assert(torch.min(y.data) >= -1) assert(torch.max(y.data) <= 1) magnitudes, phases = self.stft_fn.transform(y) magnitudes = magnitudes.data mel_output = torch.matmul(self.mel_basis, magnitudes) mel_output = self.spectral_normalize(mel_output) return mel_output ================================================ FILE: examples/textless_nlp/gslm/unit2speech/tacotron2/model.py ================================================ from math import sqrt import torch import torch.distributions as distr from torch.autograd import Variable from torch import nn from torch.nn import functional as F from .layers 
import ConvNorm, LinearNorm, GlobalAvgPool from .utils import to_gpu, get_mask_from_lengths class LocationLayer(nn.Module): def __init__(self, attention_n_filters, attention_kernel_size, attention_dim): super(LocationLayer, self).__init__() padding = int((attention_kernel_size - 1) / 2) self.location_conv = ConvNorm(2, attention_n_filters, kernel_size=attention_kernel_size, padding=padding, bias=False, stride=1, dilation=1) self.location_dense = LinearNorm(attention_n_filters, attention_dim, bias=False, w_init_gain='tanh') def forward(self, attention_weights_cat): processed_attention = self.location_conv(attention_weights_cat) processed_attention = processed_attention.transpose(1, 2) processed_attention = self.location_dense(processed_attention) return processed_attention class Attention(nn.Module): def __init__(self, attention_rnn_dim, embedding_dim, attention_dim, attention_location_n_filters, attention_location_kernel_size): super(Attention, self).__init__() self.query_layer = LinearNorm(attention_rnn_dim, attention_dim, bias=False, w_init_gain='tanh') self.memory_layer = LinearNorm(embedding_dim, attention_dim, bias=False, w_init_gain='tanh') self.v = LinearNorm(attention_dim, 1, bias=False) self.location_layer = LocationLayer(attention_location_n_filters, attention_location_kernel_size, attention_dim) self.score_mask_value = -float("inf") def get_alignment_energies(self, query, processed_memory, attention_weights_cat): """ PARAMS ------ query: decoder output (batch, n_mel_channels * n_frames_per_step) processed_memory: processed encoder outputs (B, T_in, attention_dim) attention_weights_cat: cumulative and prev. 
att weights (B, 2, max_time) RETURNS ------- alignment (batch, max_time) """ processed_query = self.query_layer(query.unsqueeze(1)) processed_attention_weights = self.location_layer(attention_weights_cat) energies = self.v(torch.tanh( processed_query + processed_attention_weights + processed_memory)) energies = energies.squeeze(-1) return energies def forward(self, attention_hidden_state, memory, processed_memory, attention_weights_cat, mask): """ PARAMS ------ attention_hidden_state: attention rnn last output memory: encoder outputs processed_memory: processed encoder outputs attention_weights_cat: previous and cummulative attention weights mask: binary mask for padded data """ alignment = self.get_alignment_energies( attention_hidden_state, processed_memory, attention_weights_cat) if mask is not None: alignment.data.masked_fill_(mask, self.score_mask_value) attention_weights = F.softmax(alignment, dim=1) attention_context = torch.bmm(attention_weights.unsqueeze(1), memory) attention_context = attention_context.squeeze(1) return attention_context, attention_weights class Prenet(nn.Module): def __init__(self, in_dim, sizes): super(Prenet, self).__init__() in_sizes = [in_dim] + sizes[:-1] self.layers = nn.ModuleList( [LinearNorm(in_size, out_size, bias=False) for (in_size, out_size) in zip(in_sizes, sizes)]) def forward(self, x): for linear in self.layers: x = F.dropout(F.relu(linear(x)), p=0.5, training=True) return x class Postnet(nn.Module): """Postnet - Five 1-d convolution with 512 channels and kernel size 5 """ def __init__(self, hparams): super(Postnet, self).__init__() self.convolutions = nn.ModuleList() self.convolutions.append( nn.Sequential( ConvNorm(hparams.n_mel_channels, hparams.postnet_embedding_dim, kernel_size=hparams.postnet_kernel_size, stride=1, padding=int((hparams.postnet_kernel_size - 1) / 2), dilation=1, w_init_gain='tanh'), nn.BatchNorm1d(hparams.postnet_embedding_dim)) ) for i in range(1, hparams.postnet_n_convolutions - 1): 
self.convolutions.append( nn.Sequential( ConvNorm(hparams.postnet_embedding_dim, hparams.postnet_embedding_dim, kernel_size=hparams.postnet_kernel_size, stride=1, padding=int((hparams.postnet_kernel_size - 1) / 2), dilation=1, w_init_gain='tanh'), nn.BatchNorm1d(hparams.postnet_embedding_dim)) ) self.convolutions.append( nn.Sequential( ConvNorm(hparams.postnet_embedding_dim, hparams.n_mel_channels, kernel_size=hparams.postnet_kernel_size, stride=1, padding=int((hparams.postnet_kernel_size - 1) / 2), dilation=1, w_init_gain='linear'), nn.BatchNorm1d(hparams.n_mel_channels)) ) def forward(self, x): for i in range(len(self.convolutions) - 1): x = F.dropout(torch.tanh(self.convolutions[i](x)), 0.5, self.training) x = F.dropout(self.convolutions[-1](x), 0.5, self.training) return x class Encoder(nn.Module): """Encoder module: - Three 1-d convolution banks - Bidirectional LSTM """ def __init__(self, hparams): super(Encoder, self).__init__() convolutions = [] for _ in range(hparams.encoder_n_convolutions): conv_layer = nn.Sequential( ConvNorm(hparams.encoder_embedding_dim, hparams.encoder_embedding_dim, kernel_size=hparams.encoder_kernel_size, stride=1, padding=int((hparams.encoder_kernel_size - 1) / 2), dilation=1, w_init_gain='relu'), nn.BatchNorm1d(hparams.encoder_embedding_dim)) convolutions.append(conv_layer) self.convolutions = nn.ModuleList(convolutions) self.lstm = nn.LSTM(hparams.encoder_embedding_dim, int(hparams.encoder_embedding_dim / 2), 1, batch_first=True, bidirectional=True) def forward(self, x, input_lengths): for conv in self.convolutions: x = F.dropout(F.relu(conv(x)), 0.5, self.training) x = x.transpose(1, 2) # pytorch tensor are not reversible, hence the conversion input_lengths = input_lengths.cpu().numpy() x = nn.utils.rnn.pack_padded_sequence( x, input_lengths, batch_first=True) self.lstm.flatten_parameters() outputs, _ = self.lstm(x) outputs, _ = nn.utils.rnn.pad_packed_sequence( outputs, batch_first=True) return outputs def inference(self, x): 
for conv in self.convolutions: x = F.dropout(F.relu(conv(x)), 0.5, self.training) x = x.transpose(1, 2) self.lstm.flatten_parameters() outputs, _ = self.lstm(x) return outputs class AudioEncoder(nn.Module): def __init__(self, hparams): super(AudioEncoder, self).__init__() assert hparams.lat_dim > 0 convolutions = [] inp_dim = hparams.n_mel_channels for _ in range(hparams.lat_n_convolutions): conv_layer = nn.Sequential( ConvNorm(inp_dim, hparams.lat_n_filters, kernel_size=hparams.lat_kernel_size, stride=1, padding=int((hparams.lat_kernel_size - 1) / 2), dilation=1, w_init_gain='tanh'), nn.BatchNorm1d(hparams.lat_n_filters)) inp_dim = hparams.lat_n_filters convolutions.append(conv_layer) self.convolutions = nn.ModuleList(convolutions) self.lstm = nn.LSTM(hparams.lat_n_filters, int(hparams.lat_n_filters / 2), hparams.lat_n_blstms, batch_first=True, bidirectional=True) self.pool = GlobalAvgPool() self.mu_proj = LinearNorm(hparams.lat_n_filters, hparams.lat_dim) self.logvar_proj = LinearNorm(hparams.lat_n_filters, hparams.lat_dim) self.lat_dim = hparams.lat_dim def forward(self, x, lengths): """ Args: x (torch.Tensor): (B, F, T) """ for conv in self.convolutions: x = F.dropout(F.tanh(conv(x)), 0.5, self.training) x = x.transpose(1, 2) # (B, T, D) # x may not be sorted by length. 
Sort->process->unsort max_len = x.size(1) assert max_len == torch.max(lengths).item() lengths, perm_idx = lengths.sort(0, descending=True) x = x[perm_idx] x = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True) self.lstm.flatten_parameters() outputs, _ = self.lstm(x) outputs, _ = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True) _, unperm_idx = perm_idx.sort(0) outputs = outputs[unperm_idx] # (B, T, D) lengths = lengths[unperm_idx] # (B, T, D) outputs = self.pool(outputs, lengths) # (B, D) mu = self.mu_proj(outputs) logvar = self.logvar_proj(outputs) z = distr.Normal(mu, logvar).rsample() return z, mu, logvar class Decoder(nn.Module): def __init__(self, hparams): super(Decoder, self).__init__() self.n_mel_channels = hparams.n_mel_channels self.n_frames_per_step = hparams.n_frames_per_step self.encoder_embedding_dim = hparams.encoder_embedding_dim self.obs_dim = hparams.obs_dim self.lat_dim = hparams.lat_dim self.attention_rnn_dim = hparams.attention_rnn_dim self.decoder_rnn_dim = hparams.decoder_rnn_dim self.prenet_dim = hparams.prenet_dim self.max_decoder_steps = hparams.max_decoder_steps self.gate_threshold = hparams.gate_threshold self.p_attention_dropout = hparams.p_attention_dropout self.p_decoder_dropout = hparams.p_decoder_dropout self.prenet = Prenet( hparams.n_mel_channels * hparams.n_frames_per_step, [hparams.prenet_dim, hparams.prenet_dim]) self.attention_rnn = nn.LSTMCell( hparams.prenet_dim + hparams.encoder_embedding_dim, hparams.attention_rnn_dim) self.attention_layer = Attention( hparams.attention_rnn_dim, hparams.encoder_embedding_dim, hparams.attention_dim, hparams.attention_location_n_filters, hparams.attention_location_kernel_size) encoder_tot_dim = (hparams.encoder_embedding_dim + \ hparams.lat_dim + hparams.obs_dim) self.decoder_rnn = nn.LSTMCell( hparams.attention_rnn_dim + encoder_tot_dim, hparams.decoder_rnn_dim, 1) self.linear_projection = LinearNorm( hparams.decoder_rnn_dim + encoder_tot_dim, hparams.n_mel_channels * 
hparams.n_frames_per_step) self.gate_layer = LinearNorm( hparams.decoder_rnn_dim + encoder_tot_dim, 1, bias=True, w_init_gain='sigmoid') def get_go_frame(self, memory): """ Gets all zeros frames to use as first decoder input PARAMS ------ memory: decoder outputs RETURNS ------- decoder_input: all zeros frames """ B = memory.size(0) decoder_input = Variable(memory.data.new( B, self.n_mel_channels * self.n_frames_per_step).zero_()) return decoder_input def initialize_decoder_states(self, memory, obs_and_lat, mask): """ Initializes attention rnn states, decoder rnn states, attention weights, attention cumulative weights, attention context, stores memory and stores processed memory PARAMS ------ memory: Encoder outputs obs_and_lat: Observed and latent attribute embeddings mask: Mask for padded data if training, expects None for inference """ B = memory.size(0) MAX_TIME = memory.size(1) self.attention_hidden = Variable(memory.data.new( B, self.attention_rnn_dim).zero_()) self.attention_cell = Variable(memory.data.new( B, self.attention_rnn_dim).zero_()) self.decoder_hidden = Variable(memory.data.new( B, self.decoder_rnn_dim).zero_()) self.decoder_cell = Variable(memory.data.new( B, self.decoder_rnn_dim).zero_()) self.attention_weights = Variable(memory.data.new( B, MAX_TIME).zero_()) self.attention_weights_cum = Variable(memory.data.new( B, MAX_TIME).zero_()) self.attention_context = Variable(memory.data.new( B, self.encoder_embedding_dim).zero_()) self.memory = memory self.processed_memory = self.attention_layer.memory_layer(memory) self.obs_and_lat = obs_and_lat self.mask = mask def parse_decoder_inputs(self, decoder_inputs): """ Prepares decoder inputs, i.e. mel outputs PARAMS ------ decoder_inputs: inputs used for teacher-forced training, i.e. 
mel-specs RETURNS ------- inputs: processed decoder inputs """ # (B, n_mel_channels, T_out) -> (B, T_out, n_mel_channels) decoder_inputs = decoder_inputs.transpose(1, 2) decoder_inputs = decoder_inputs.view( decoder_inputs.size(0), int(decoder_inputs.size(1)/self.n_frames_per_step), -1) # (B, T_out, n_mel_channels) -> (T_out, B, n_mel_channels) decoder_inputs = decoder_inputs.transpose(0, 1) return decoder_inputs def parse_decoder_outputs(self, mel_outputs, gate_outputs, alignments): """ Prepares decoder outputs for output PARAMS ------ mel_outputs: gate_outputs: gate output energies alignments: RETURNS ------- mel_outputs: gate_outpust: gate output energies alignments: """ # (T_out, B) -> (B, T_out) alignments = torch.stack(alignments).transpose(0, 1) # (T_out, B) -> (B, T_out) gate_outputs = torch.stack(gate_outputs).transpose(0, 1) gate_outputs = gate_outputs.contiguous() # (T_out, B, n_mel_channels) -> (B, T_out, n_mel_channels) mel_outputs = torch.stack(mel_outputs).transpose(0, 1).contiguous() # decouple frames per step mel_outputs = mel_outputs.view( mel_outputs.size(0), -1, self.n_mel_channels) # (B, T_out, n_mel_channels) -> (B, n_mel_channels, T_out) mel_outputs = mel_outputs.transpose(1, 2) return mel_outputs, gate_outputs, alignments def decode(self, decoder_input): """ Decoder step using stored states, attention and memory PARAMS ------ decoder_input: previous mel output RETURNS ------- mel_output: gate_output: gate output energies attention_weights: """ cell_input = torch.cat((decoder_input, self.attention_context), -1) self.attention_hidden, self.attention_cell = self.attention_rnn( cell_input, (self.attention_hidden, self.attention_cell)) self.attention_hidden = F.dropout( self.attention_hidden, self.p_attention_dropout, self.training) attention_weights_cat = torch.cat( (self.attention_weights.unsqueeze(1), self.attention_weights_cum.unsqueeze(1)), dim=1) self.attention_context, self.attention_weights = self.attention_layer( self.attention_hidden, 
def forward(self, memory, obs_and_lat, decoder_inputs, memory_lengths):
    """ Teacher-forced decoder pass used during training

    PARAMS
    ------
    memory: Encoder outputs
    obs_and_lat: Observed and latent attribute embeddings
    decoder_inputs: Ground-truth mel-specs fed back as decoder inputs
    memory_lengths: Encoder output lengths, used to build the attention mask

    RETURNS
    -------
    mel_outputs: mel outputs from the decoder
    gate_outputs: gate outputs from the decoder
    alignments: sequence of attention weights from the decoder
    """
    # Prepend the go frame to the teacher-forcing frames, then run the
    # prenet once over the whole sequence.
    go_frame = self.get_go_frame(memory).unsqueeze(0)
    teacher_frames = self.parse_decoder_inputs(decoder_inputs)
    prenet_frames = self.prenet(torch.cat((go_frame, teacher_frames), dim=0))

    # True marks padded encoder positions.
    padding_mask = ~get_mask_from_lengths(memory_lengths)
    self.initialize_decoder_states(memory, obs_and_lat, mask=padding_mask)

    mel_outputs, gate_outputs, alignments = [], [], []
    # The last prenet frame is never consumed: it has no target to predict.
    n_steps = prenet_frames.size(0) - 1
    for step in range(n_steps):
        mel_output, gate_output, attention_weights = self.decode(
            prenet_frames[step])
        mel_outputs.append(mel_output.squeeze(1))
        gate_outputs.append(gate_output.squeeze())
        alignments.append(attention_weights)

    return self.parse_decoder_outputs(mel_outputs, gate_outputs, alignments)
class Tacotron2(nn.Module):
    """Tacotron2 with optional observed-attribute and latent conditioning.

    Wires together a symbol embedding, ``Encoder``, ``Decoder`` and
    ``Postnet``. An observed-label embedding is created when
    ``hparams.obs_dim > 0`` and an ``AudioEncoder`` when
    ``hparams.lat_dim > 0``; whichever of the two exist are concatenated
    and handed to the decoder as extra conditioning.
    """

    def __init__(self, hparams):
        super().__init__()
        self.mask_padding = hparams.mask_padding
        self.fp16_run = hparams.fp16_run
        self.n_mel_channels = hparams.n_mel_channels
        self.n_frames_per_step = hparams.n_frames_per_step

        # text encoder embedding, initialised uniformly in [-bound, bound]
        n_symbols = hparams.n_symbols
        symbol_dim = hparams.symbols_embedding_dim
        self.embedding = nn.Embedding(n_symbols, symbol_dim)
        bound = sqrt(3.0) * sqrt(2.0 / (n_symbols + symbol_dim))
        self.embedding.weight.data.uniform_(-bound, bound)

        # observed attribute embedding (optional), same init scheme
        self.obs_embedding = None
        if hparams.obs_dim > 0:
            n_obs, obs_dim = hparams.obs_n_class, hparams.obs_dim
            self.obs_embedding = nn.Embedding(n_obs, obs_dim)
            bound = sqrt(3.0) * sqrt(2.0 / (n_obs + obs_dim))
            self.obs_embedding.weight.data.uniform_(-bound, bound)

        self.encoder = Encoder(hparams)
        self.decoder = Decoder(hparams)
        self.postnet = Postnet(hparams)

        # latent attribute encoder (optional)
        self.lat_encoder = None
        if hparams.lat_dim > 0:
            self.lat_encoder = AudioEncoder(hparams)

    @staticmethod
    def _join_conditioning(obs, lat):
        """Concatenate the non-None embeddings on the last axis, else None."""
        present = [emb for emb in (obs, lat) if emb is not None]
        if not present:
            return None
        return torch.cat(present, dim=-1)

    def parse_batch(self, batch):
        """Move a collated batch through ``to_gpu`` and split it.

        Returns ``(model_inputs, targets)`` where ``model_inputs`` is the
        tuple consumed by ``forward`` and ``targets`` is
        ``(mel_padded, gate_padded)``.
        """
        text, text_lens, obs, mels, gates, mel_lens = batch
        text = to_gpu(text).long()
        text_lens = to_gpu(text_lens).long()
        obs = to_gpu(obs).long()
        longest_text = torch.max(text_lens.data).item()
        mels = to_gpu(mels).float()
        gates = to_gpu(gates).float()
        mel_lens = to_gpu(mel_lens).long()

        model_inputs = (text, text_lens, obs, mels, longest_text, mel_lens)
        targets = (mels, gates)
        return model_inputs, targets

    def parse_output(self, outputs, output_lengths=None):
        """Mask padded output frames in place and return ``outputs``.

        Nothing is changed unless ``self.mask_padding`` is set and
        ``output_lengths`` is given. Otherwise the first two outputs are
        zero-filled at padded positions and the third (gate energies) is
        filled with 1e3 there.
        """
        if not self.mask_padding or output_lengths is None:
            return outputs

        pad = ~get_mask_from_lengths(output_lengths)
        # replicate the (B, T) mask across mel channels -> (B, n_mel, T)
        mel_pad = pad.expand(self.n_mel_channels, pad.size(0), pad.size(1))
        mel_pad = mel_pad.permute(1, 0, 2)

        outputs[0].data.masked_fill_(mel_pad, 0.0)
        outputs[1].data.masked_fill_(mel_pad, 0.0)
        outputs[2].data.masked_fill_(mel_pad[:, 0, :], 1e3)  # gate energies
        return outputs

    def forward(self, inputs):
        """Training pass over the input tuple produced by ``parse_batch``.

        Returns the list ``[mel_outputs, mel_outputs_postnet, gate_outputs,
        alignments, lat_mu, lat_logvar]`` after ``parse_output`` masking;
        the last two are None when there is no latent encoder.
        """
        (text_inputs, text_lengths, obs_labels,
         mels, max_len, output_lengths) = inputs
        text_lengths = text_lengths.data
        output_lengths = output_lengths.data

        embedded_inputs = self.embedding(text_inputs).transpose(1, 2)
        encoder_outputs = self.encoder(embedded_inputs, text_lengths)

        obs = None
        if self.obs_embedding is not None:
            obs = self.obs_embedding(obs_labels)

        lat = lat_mu = lat_logvar = None
        if self.lat_encoder is not None:
            lat, lat_mu, lat_logvar = self.lat_encoder(mels, output_lengths)

        conditioning = self._join_conditioning(obs, lat)

        mel_outputs, gate_outputs, alignments = self.decoder(
            encoder_outputs, conditioning, mels, memory_lengths=text_lengths)

        # the postnet predicts a residual on top of the decoder mels
        residual = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + residual

        return self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments,
             lat_mu, lat_logvar],
            output_lengths)

    def inference(self, inputs, obs_labels=None, lat=None, ret_has_eos=False):
        """Autoregressive synthesis.

        Missing ``obs_labels`` default to all-zero labels. When a latent
        encoder exists and ``lat`` is not given, an all-zero latent of
        width ``self.lat_encoder.lat_dim`` is used. Returns the list
        ``[mel_outputs, mel_outputs_postnet, gate_outputs, alignments]``,
        with ``has_eos`` appended when ``ret_has_eos`` is true.
        """
        embedded_inputs = self.embedding(inputs).transpose(1, 2)
        encoder_outputs = self.encoder.inference(embedded_inputs)

        if obs_labels is None:
            obs_labels = torch.LongTensor(len(inputs))
            obs_labels = obs_labels.to(inputs.device).zero_()

        obs = None
        if self.obs_embedding is not None:
            obs = self.obs_embedding(obs_labels)

        if self.lat_encoder is not None and lat is None:
            lat = torch.FloatTensor(len(inputs), self.lat_encoder.lat_dim)
            lat = lat.to(inputs.device).zero_().type(encoder_outputs.type())

        conditioning = self._join_conditioning(obs, lat)

        mel_outputs, gate_outputs, alignments, has_eos = \
            self.decoder.inference(
                encoder_outputs, conditioning, ret_has_eos=True)

        residual = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + residual

        outputs = self.parse_output(
            [mel_outputs, mel_outputs_postnet, gate_outputs, alignments])

        if ret_has_eos:
            return outputs + [has_eos]
        return outputs
def normalize_numbers(text):
    """Rewrite numeric expressions in ``text`` using the module's rules.

    The substitutions run in a fixed order: comma removal, pounds,
    dollars, decimal points, ordinals and finally plain integers. Later
    rules see the output of earlier ones.
    """
    rewrites = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)
    return text
class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft

    Short-time Fourier transform implemented with 1-D convolutions. The
    forward and inverse Fourier bases are built once in ``__init__`` and
    registered as buffers, so they follow the module across devices.
    """

    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        """
        PARAMS
        ------
        filter_length: FFT size; also the convolution kernel width
        hop_length: stride between consecutive frames
        win_length: window length; must not exceed ``filter_length``
        window: window name passed to ``scipy.signal.get_window``, or None
            to leave the bases unwindowed
        """
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        # keep the non-redundant half of the spectrum, real rows stacked
        # on top of imaginary rows
        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            # `size` is passed by keyword: newer librosa releases made it
            # keyword-only, and older ones accept the keyword form too.
            fft_window = pad_center(fft_window, size=filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        """Return ``(magnitude, phase)`` of ``input_data``.

        ``input_data`` is indexed as (batch, samples). The signal is
        reflect-padded by ``filter_length / 2`` on both sides before
        framing. ``phase`` is computed from detached data and carries no
        gradient.
        """
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        half = int(self.filter_length / 2)
        # a temporary extra axis lets F.pad apply 2-D reflect padding
        input_data = F.pad(
            input_data.unsqueeze(1),
            (half, half, 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            self.forward_basis,
            stride=self.hop_length,
            padding=0)

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.atan2(imag_part.data, real_part.data)

        return magnitude, phase

    def inverse(self, magnitude, phase):
        """Reconstruct a waveform from ``magnitude`` and ``phase``.

        Frames are overlap-added with a transposed convolution. When a
        window is configured, the window-sum modulation is divided out and
        the result is rescaled by ``filter_length / hop_length``. The
        ``filter_length / 2`` samples added by ``transform`` are trimmed
        from both ends.
        """
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            self.inverse_basis,
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            # follow the input's device rather than the default CUDA device
            window_sum = torch.from_numpy(window_sum).to(magnitude.device)
            inverse_transform[:, :, approx_nonzero_indices] /= \
                window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        half = int(self.filter_length / 2)
        inverse_transform = inverse_transform[:, :, half:]
        inverse_transform = inverse_transform[:, :, :-half:]

        return inverse_transform

    def forward(self, input_data):
        """Round-trip ``input_data`` through ``transform`` and ``inverse``.

        The intermediate spectrum is kept on ``self.magnitude`` and
        ``self.phase``.
        """
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
def code_to_sequence(code, code_dict, collapse_code):
    '''Converts a sequence of code tokens to their integer IDs.

    Tokens missing from ``code_dict`` are dropped silently.

      Args:
        code: iterable of code tokens
        code_dict: mapping from code token to integer ID
        collapse_code: if true, a token equal to the previously kept token
          is skipped, so runs of a repeated code collapse to one ID

      Returns:
        List of integer IDs
    '''
    if collapse_code:
        prev_c = None
        sequence = []
        for c in code:
            if c in code_dict and c != prev_c:
                sequence.append(code_dict[c])
                prev_c = c
    else:
        sequence = [code_dict[c] for c in code if c in code_dict]
        if len(sequence) < 0.95 * len(code):
            # This string is never %-formatted, so the percent sign must
            # not be doubled (the old text printed a literal "5%%").
            print('WARNING : over 5% codes are OOV')

    return sequence
def get_mask_from_lengths(lengths):
    """Build a boolean validity mask from a 1-D tensor of lengths.

    Row ``i`` of the result is True at positions ``0 .. lengths[i] - 1``
    and False afterwards; the number of columns is ``max(lengths)``.

    The position index is created on ``lengths.device``, so this works on
    CPU-only hosts as well as on any CUDA device. (It used to be
    allocated unconditionally with ``torch.cuda.LongTensor``.)
    """
    max_len = torch.max(lengths).item()
    ids = torch.arange(max_len, dtype=torch.long, device=lengths.device)
    mask = (ids < lengths.unsqueeze(1))
    return mask
# Used to measure the time taken for multiple events
class Timer:
    """Wall-clock timer that averages repeated measurements per key.

    Call ``start(key)`` / ``stop(key)`` around each event; ``value()``
    returns the mean duration in seconds for every key in ``keys``.
    """

    def __init__(self, keys):
        self.keys = keys
        self.n = {}
        self.running_time = {}
        self.total_time = {}
        self.reset()

    def start(self, key):
        """Record the start time of an event for ``key``."""
        self.running_time[key] = time.time()
        return self

    def stop(self, key):
        """Close the event opened by ``start(key)`` and add its duration."""
        # Accumulate: value() divides by the number of events, so
        # overwriting the total here would report last_duration / n
        # instead of the mean.
        self.total_time[key] += time.time() - self.running_time[key]
        self.n[key] += 1
        self.running_time[key] = None
        return self

    def reset(self):
        """Clear all accumulated measurements."""
        for k in self.keys:
            self.total_time[k] = 0
            self.running_time[k] = None
            self.n[k] = 0
        return self

    def value(self):
        """Return ``{key: mean seconds per event}``.

        Raises ValueError if any key has no completed event.
        """
        vals = {}
        for k in self.keys:
            if self.n[k] == 0:
                raise ValueError("Trying to divide by zero in TimeMeter")
            else:
                vals[k] = self.total_time[k] / self.n[k]
        return vals
class TacotronInputDataset:
    """Turns raw input strings into ID tensors for Tacotron2.

    Works in text mode (``hparams.text_or_code == "text"``, the default
    when the attribute is absent) or in code mode, where whitespace
    separated units are mapped through a code dictionary.
    """

    def __init__(self, hparams, append_str=""):
        input_kind = getattr(hparams, "text_or_code", "text")
        self.is_text = input_kind == "text"
        if not self.is_text:
            # the code dictionary is only needed for unit inputs
            self.code_dict = load_code_dict(
                hparams.code_dict, hparams.add_sos, hparams.add_eos
            )
            self.code_key = hparams.code_key
        self.add_sos = hparams.add_sos
        self.add_eos = hparams.add_eos
        self.collapse_code = hparams.collapse_code
        self.append_str = append_str

    def process_code(self, inp_str):
        """Split ``inp_str`` into units, add SOS/EOS if configured, map to IDs."""
        prefix = [SOS_TOK] if self.add_sos else []
        suffix = [EOS_TOK] if self.add_eos else []
        units = prefix + inp_str.split() + suffix
        return code_to_sequence(units, self.code_dict, self.collapse_code)

    def process_text(self, inp_str):
        """Map ``inp_str`` to symbol IDs using the ``english_cleaners`` cleaner."""
        cleaner_names = ["english_cleaners"]
        return text_to_sequence(inp_str, cleaner_names)

    def get_tensor(self, inp_str):
        """Return a long tensor of IDs for ``inp_str`` plus ``append_str``."""
        full_str = inp_str + self.append_str
        encode = self.process_text if self.is_text else self.process_code
        ids = encode(full_str)
        return torch.from_numpy(np.array(ids)).long()

    def __len__(self):
        # NOTE(review): `self.data` is not assigned anywhere in this class,
        # so this raises AttributeError unless a caller sets it - confirm.
        return len(self.data)
def load_quantized_audio_from_file(file_path):
    """Read a ``<name>|<unit> <unit> ...`` file into two parallel lists.

    Args:
        file_path: path to a text file with one utterance per line.

    Returns:
        ``(base_fname_batch, quantized_units_batch)``: the names and, for
        each name, the list of integer units.

    Raises:
        ValueError: if a non-blank line has no ``|`` separator or a unit
            is not an integer.

    Blank lines are skipped, and units may be separated by any run of
    whitespace. Both cases previously raised.
    """
    base_fname_batch, quantized_units_batch = [], []
    with open(file_path) as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            # Units are digits only, so the last '|' is the separator.
            base_fname, quantized_units_str = line.rsplit("|", 1)
            quantized_units = [int(q) for q in quantized_units_str.split()]
            base_fname_batch.append(base_fname)
            quantized_units_batch.append(quantized_units)
    return base_fname_batch, quantized_units_batch
Abdelrahman Mohamed, Emmanuel Dupoux, Wei-Ning Hsu, 2021. arxiv/2109.03264 [[arxiv]](https://arxiv.org/abs/2109.03264). `*` denotes equal contribution. You can find demo samples [[here]](https://speechbot.github.io/pgslm/index.html). <details> <summary>If you find this code useful, please consider citing our work using this bibtex </summary> ``` @misc{Kharitonov2021, title={Text-Free Prosody-Aware Generative Spoken Language Modeling}, author={Eugene Kharitonov and Ann Lee and Adam Polyak and Yossi Adi and Jade Copet and Kushal Lakhotia and Tu-Anh Nguyen and Morgane Rivière and Abdelrahman Mohamed and Emmanuel Dupoux and Wei-Ning Hsu}, year={2021}, eprint={2109.03264}, archivePrefix={arXiv}, primaryClass={cs.CL} } ``` </details> ## Additional requirements The following packages are required in addition to fairseq; they are installable with pip: ```bash pip install AMFM-decompy SoundFile scipy sklearn torchaudio npy-append-array ``` ## Data preprocessing ### Prepare unit pseudo-text transcriptions of the audio To get unit transcripts of the speech data we rely on the preprocessing steps of the [GSLM](https://github.com/pytorch/fairseq/tree/main/examples/textless_nlp/gslm/speech2unit/) work. Firstly, we will need to prepare manifest files for the dataset we want to preprocess: ``` mkdir manifests/ python examples/wav2vec/wav2vec_manifest.py --valid-percent=0.0 $DATA_PATH --dest=manifests/train/ ``` Next, we need a pre-trained HuBERT-base-ls960 model [[download]](https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt) and a corresponding kmeans-100 quantizer [[download]](https://dl.fbaipublicfiles.com/textless_nlp/gslm/hubert/km100/km.bin).
Having those we can quantize the dataset: ``` python examples/textless_nlp/gslm/speech2unit/clustering/quantize_with_kmeans.py \ --feature_type hubert \ --kmeans_model_path km.bin \ --acoustic_model_path hubert_base_ls960.pt \ --layer 6 \ --manifest_path manifests/train/train.tsv \ --out_quantized_file_path manifests/train/units ``` Finally, by running ``` python examples/textless_nlp/pgslm/scripts/join_units_manifest.py --manifest=manifests/train/train.tsv --units=manifests/train/units --output=train.txt ``` We will get the training data description `train.txt` in the format that pGSLM expects. The above steps have to be repeated for dev/test sets. Importantly, we rely on an assumption that the directories are structured as in LibriSpeech, i.e. the file paths follow the `<spk_id>/<session_id>/<sample_id>.wav` format. ### Preprocess data for pGSLM The very first step is to obtain the F0 quantization bins. Assume the vocoder training manifest is `vocoder_train.txt` (in pGSLM data format prepared with the same process above). We prepare the quantized F0 from the vocoder training data by running ```sh bash examples/textless_nlp/pgslm/scripts/prepare_f0_quantization.sh \ vocoder_train.txt <sample_rate> 32 <preprocessed_dir> <output_prefix> # we use 32 bins in the paper ``` - `<sample_rate>`: sampling rate of the audio files in the manifest - `<preprocessed_dir>`: where to output the output files - `<output_prefix>`: prefix of the output files The script will generate - `<output_prefix>.f0_stat.pt`: the speaker-level F0 statistics, which can be used in vocoder training - `<output_prefix>_mean_norm_log_f0_bin.th`: the quantized F0, which should be used in `prepare_data.sh` below **Note:** See "Pre-trained models" for the pre-computed speaker-level F0 statistics and quantized F0 bins. We suggest using the pre-computed statistics for the data preparation below in order to take advantage of the pre-trained vocoder for waveform generation. Next prepare the pGSLM data. 
Assume train/valid/test manifests are `{train,valid,test}.txt`. Here is an example of how to preprocess data: ```sh bash examples/textless_nlp/pgslm/scripts/prepare_data.sh \ train.txt valid.txt test.txt <n_unit> <hop_size> <sample_rate> \ <preprocessed_dir>/<output_prefix>_mean_norm_log_f0_bin.th <preprocessed_dir> ``` - `<n_unit>`: discrete unit vocabulary size (we used a kmeans quantizer with the number of units equal to 100 in the example above) - `<hop_size>`: downsampling rate relative to the waveform (e.g., 320 for HuBERT units) - `<sample_rate>`: sampling rate of the audio files in the manifest - `<preprocessed_dir>`: where to output the preprocessed files This will create the dataset json config used for the next section at `<preprocessed_dir>/data_config.json`. Note that the example script uses only one thread to compute F0, which can take _very long_ for preprocessing large datasets. It is suggested to distribute jobs over multiple nodes/processes with `--nshards=x` and `--rank=z` (where z is in [1, x]) in `preprocess_f0.py`, and set `--nshards_list=x` in `prepare_data.py` correspondingly to collect sharded F0 data. Now, everything is ready for training a model. 
## Training Multi-Stream Transformer Unit Language Model (MS-TLM) Below is an example command that trains Multi-Stream Transformer Language Model (MS-TLM) on a prepared dataset: ```bash DATASET=data_config.json fairseq-train $DATASET \ --task=speech_unit_modeling \ --arch="transformer_ulm_tiny" \ --criterion=speech_unit_lm_criterion \ --share-decoder-input-output-embed \ --dropout=0.1 \ --attention-dropout=0.1 \ --optimizer="adam" \ --adam-betas="(0.9, 0.98)" \ --clip-norm=1.0 \ --lr=0.0005 \ --lr-scheduler="inverse_sqrt" \ --warmup-updates=4000 \ --warmup-init-lr=1e-07 \ --tokens-per-sample=3072 \ --max-tokens=3072 \ --update-freq=4 \ --max-epoch=70 \ --num-workers=0 \ --skip-invalid-size-inputs-valid-test \ --loss-weights="1.0;0.5;0.0" \ --ignore-f0-input \ --checkpoint-activations \ --fp16 \ --max-target-positions=4096 \ --stream-shifts="1,1" \ --log-f0 --normalize-f0-mean --interpolate-f0 \ --ignore-unused-valid-subsets \ --discrete-duration --discrete-f0 ``` Some of the important parameters that are specific to MS-TLM: * `arch`: specifies the Transformer architecture used. Supported options are: * `transformer_ulm_tiny` - a tiny model that can be used for debugging; it has 2 layers, 1 attention head, FFN and embedding dimensions of 64, * `transformer_ulm` - a base model with 6 layers, 8 heads, embedding dimension 512, and FFN dimensionality of 2048, * `transformer_ulm_big` - the largest model we experiment with in the paper: 12-layer/16 heads, 1024/4096 embedding and FFN dimensions; * `loss-weights`: this parameter sets importance weights (must be non-negative) for the components of the loss that correspond to unit, duration, and F0 streams. To turn off a component of the loss, its weight has to be set to 0. For instance, to predict only unit stream the parameter should be set to "1;0;0"; * `stream-shifts`: specifies relative shifts of the two prosodic streams w.r.t. the unit stream (duration and F0, respectively). 
No shift corresponds to "0,0"; * `ignore-duration-input`/`ignore-f0-input`: setting these flags would zero out the corresponding input streams; * `max-token-duration`: duration values would be max-capped by the specified value; * `discrete-duration`/`discrete-f0`: whether duration and F0 streams should be quantized; * `log-f0`, `normalize-f0-mean`, `normalize-f0-std`, `interpolate-f0`: configure how the F0 stream is treated. `log-f0` sets up modelling in the log-space, `normalize-f0-mean`/`normalize-f0-std` control per-speaker normalization, and `interpolate-f0` enables F0 interpolation for unvoiced regions where F0 was set to 0; * `mask-dur-prob`, `mask-f0-prob`, `mask-dur-seg-prob`, `mask-f0-seg-prob`, `mask-unit-seg-prob`, `mask-unit-seg-leng`: this family of parameters sets the probabilities of masking individual steps and spans on each stream as well as the lengths of the masked spans. ## Pre-trained models ### MS-TLM Below you can find checkpoints for four best-performing models from the paper (IDs 9..12 in Table 1). These models are trained on Hubert-100 transcripts of the LibriLight-6K dataset. They have the prosody streams shifted by 1 w.r.t. the unit stream. All models predict all three streams (units, duration, and F0), but two of them only have the unit stream in their input. 
| | Continuous prosody | Quantized prosody | |-------------------|--------------------|-------------------| | No prosody input | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/continuous_no_prosody_shift_1_1.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/discrete_no_prosody_shift_1_1.pt) | | Has prosody input | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/continuous_prosody_shift_1_1.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/ulm_checkpoints/discrete_prosody_shift_1_1.pt)| The optimal per-stream sampling temperatures/scaling parameters that we have identified for these models, in the (`T-token, T-duration, T-f0`) format: | | Continuous prosody | Quantized prosody | |-------------------|--------------------|-------------------| | No prosody input | 0.7, 0.125, 0.0003125| 0.7, 0.25, 0.5 | | Has prosody input | 0.7, 0.125, 0.00125 | 0.7, 0.25, 0.7 | ## Vocoder | Units | Prosody | F0 stats | Checkpoint | Config | |-------------------|---------|--------------|------------|--------| | HuBERT-base-ls960, kmeans-100 | [[Quantized 32 bins]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_seg_bin.th) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/f0_stats.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/naive_quant_32_norm_log_seg_hubert/checkpoint.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/naive_quant_32_norm_log_seg_hubert/config.json) | | HuBERT-base-ls960, kmeans-100 | Continuous | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/f0_stats.pt) | [[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_hubert/checkpoint.pt) | 
[[download]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_hubert/config.json) | ## Evaluating a trained model Evaluation is done with the `eval/cont_metrics.py` script. As described in the paper, there are several metrics used. **Teacher-forced metrics** ```bash SET=valid CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt DATA=data_config.json python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \ --metric=teacher_force_everything \ --path=$CHECKPOINT_PATH \ --batch-size=16 \ --fp16 \ --seed=111 \ --eval-subset=$SET \ --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th --dequantize-prosody ``` (Using this command, our provided `discrete_prosody_shift_1_1.pt` checkpoint should produce `{'token_loss': 1.408..., 'duration_loss': 0.5424..., 'f0_loss': 0.0474...}` on LibriSpeech dev-clean). The parameters `--f0-discretization-bounds=mean_norm_log_f0_seg_bin.th --dequantize-prosody` are specific to quantized-prosody models. They signal that the prosody streams must be decoded into the continuous domain before calculating correlation. The bounds file is the same `*_mean_norm_log_f0_bin.th` file that we prepared before. The `mean_norm_log_f0_seg_bin.th` file we used with the pre-trained models can be downloaded [[here]](https://dl.fbaipublicfiles.com/textless_nlp/pgslm/vocoder/blizzard2013/mean_norm_log_f0_seg_bin.th). **Consistency (aka Correlation) metrics** The following command estimates correlation between mean values of the F0 stream in the prompt and in the generated continuation (unit and duration streams are fixed). 
```bash T_F0=0.7 EXPLOSION=20 SET=test CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt DATA=data_config.json python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \ --prefix-length=150 \ --metric=correlation \ --path=$CHECKPOINT_PATH \ --batch-size=16 \ --fp16 \ --seed=111 \ --teacher-force-tokens \ --teacher-force-duration \ --min-length=300 \ --batch-explosion-rate=$EXPLOSION \ --T-f0=$T_F0 \ --eval-subset=$SET \ --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th \ --dequantize-prosody --n-workers=8 ``` (Using this command, our provided `discrete_prosody_shift_1_1.pt` checkpoint should produce `{...'F0 corr': 0.315 ..}` on LibriSpeech test-clean). * By using flags `--teacher-force-tokens, --teacher-force-duration, --teacher-force-f0` one can calculate correlations along each stream while having the other two streams fixed to ground-truth values (or freeze all three streams to get ground-truth correlation values); * The parameters `T-f0`, `T-duration`, and `T-token` specify per-stream temperatures and, in the case of continuous-valued prosody, the scaling parameter of the corresponding Laplace distribution (setting a temperature to 0 will enforce greedy sampling); * `min-length` filters out sequences that are shorter than 300 duration units (i.e. 6s in the case of Hubert units); * `prefix-length` specifies that we want to use the first 150 duration units as the prompt (i.e. 3s in the case of Hubert units) **Correctness (aka Continuation) and Expressiveness (aka Std) metrics** By running the following command, we can get minMAE and Std for the log-F0 stream for the model with quantized prosody. 
```bash DATA=data_config.json EXPLOSION=20 SET=test CHECKPOINT_PATH=discrete_prosody_shift_1_1.pt T_F0=0.7 python examples/textless_nlp/pgslm/eval/cont_metrics.py $DATA \ --prefix-length=150 \ --metric=continuation \ --path=$CHECKPOINT_PATH \ --batch-size=16 \ --fp16 \ --seed=111 \ --batch-explosion-rate=$EXPLOSION \ --teacher-force-tokens \ --teacher-force-duration \ --T-f0=$T_F0 \ --eval-subset=$SET \ --f0-discretization-bounds=mean_norm_log_f0_seg_bin.th --dequantize-prosody ``` (Using this command, our provided `discrete_prosody_shift_1_1.pt` checkpoint should produce `{...'F0 MAE': 0.0772, 'F0 Std': 0.1489...}` on LibriSpeech test-clean). Again, by setting `--teacher-force-tokens, --teacher-force-duration, --teacher-force-f0` we can calculate Token BLEU for the token stream (when `--teacher-force-duration` & `--teacher-force-f0` are on) and per-stream min MAE for each prosody stream individually. Finally, `cont_metrics.py` allows to specify the number of workers (e.g., `n-workers=8`) which allows to speed up the computation by spreading multiple worker processes over the available GPUs. **Cont Word BLEU** We used the code and the evaluation protocol of [(Lakhotia et al., 2021)](https://arxiv.org/abs/2102.01192). ## Sampling from a trained model To get (prompted or not) samples from a trained model it is enough to run `sample.py`: ```bash CHECKPOINT_PATH=checkpoints/checkpoint_best.pt DATASET=examples/textless_nlp/pgslm/repro/dataset/data_config.json python examples/textless_nlp/pgslm/sample/sample.py $DATASET \ --output=$SAMPLES \ --path=$CHECKPOINT_PATH \ --sampling \ --T-token=0.7 \ --T-duration=0.25 \ --T-f0=0.7 \ --max-length=500 \ --prefix-length=150 \ --subset=valid \ --seed=1 \ --match-duration \ --code-type=hubert \ --batch-explosion-rate=2 ``` Some useful parameters: * `T-token`, `T-duration`, `T-f0` specify sampling temperature for the three streams. 
Setting a temperature to `0` switches sampling to the greedy (argmax) one; * `prefix-length`: length of the prompt, measured in timesteps (e.g. for Hubert (CPC) each timestep is 20 (10) ms); * `subset`: which subset of the dataset to use as prompts (can be `train`, `valid`, `test`); * `teacher-force-tokens`, `teacher-force-duration`, `teacher-force-f0`: if set, at each autoregressive step, ground-truth values replace the produced one; * `short-curcuit`: replace sampling by ground-truth inputs; * `match-duration`: forces the produced sample to have the same duration (in time), as the entire sequence (beyond the prompt if there is any); * `batch-explosion-rate`: number of samples per prompt; * `f0-discretization-bounds`: path to a file with quantization boundaries. If it is set, F0 values are de-quantized back to the continuous domain (the model must be a quantized one); * `max-length` sets the maximal number of segment steps to be produced. Note that `sample.py` automatically uses all available GPUs, to avoid that please use environment variable `CUDA_VISIBLE_DEVICES`. ## Vocoding samples To generate audios for output from `sample.py` (`$IN_FILE`): ```bash python examples/textless_nlp/pgslm/generate_waveform.py \ --in-file=$IN_FILE \ --vocoder=$VOCODER \ --vocoder-cfg=$VOCODER_CFG \ --results-path=$RESULTS_PATH ``` See the "Vocoder" section above for `$VOCODER` and `$VOCODER_CFG`. ================================================ FILE: examples/textless_nlp/pgslm/data_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class Stat:
    """Running sums for the mean/std of values and of their logarithms.

    Sums of ``x``, ``x**2``, ``log(x)`` and ``log(x)**2`` are accumulated over
    all calls to :meth:`update`, so the statistics can be read at any time.
    When ``keep_raw`` is true, every chunk passed to :meth:`update` is stored
    as well and can be retrieved through :attr:`raw_data`.
    """

    def __init__(self, keep_raw=False):
        self.x = 0.0
        self.x2 = 0.0
        self.z = 0.0  # z = logx
        self.z2 = 0.0
        self.n = 0.0  # number of accumulated elements (frames)
        self.u = 0.0  # number of update() calls (utterances)
        self.keep_raw = keep_raw
        self.raw = []

    def update(self, new_x):
        """Add one chunk of values (an object with tensor-like ``log``/``sum``)."""
        new_z = new_x.log()

        self.x += new_x.sum()
        self.x2 += (new_x**2).sum()
        self.z += new_z.sum()
        self.z2 += (new_z**2).sum()
        self.n += len(new_x)
        self.u += 1

        if self.keep_raw:
            self.raw.append(new_x)

    @property
    def mean(self):
        return self.x / self.n

    @property
    def std(self):
        # population std: sqrt(E[x^2] - E[x]^2)
        return (self.x2 / self.n - self.mean**2) ** 0.5

    @property
    def mean_log(self):
        return self.z / self.n

    @property
    def std_log(self):
        return (self.z2 / self.n - self.mean_log**2) ** 0.5

    @property
    def n_frms(self):
        return self.n

    @property
    def n_utts(self):
        return self.u

    @property
    def raw_data(self):
        assert self.keep_raw, "does not support storing raw data!"
        return torch.cat(self.raw)


class F0Stat(Stat):
    """``Stat`` variant that ignores ``None`` inputs and zero-valued frames."""

    def update(self, new_x):
        # assume unvoiced frames are 0 and consider only voiced frames
        if new_x is not None:
            super().update(new_x[new_x != 0])


def dump_speaker_f0_stat(speaker_to_f0_stat, out_prefix):
    """Save per-speaker F0 statistics to ``{out_prefix}.f0_stat.pt``.

    Args:
        speaker_to_f0_stat: mapping from speaker to an object exposing
            ``mean``, ``std``, ``mean_log`` and ``std_log``.
        out_prefix: prefix of the output path.

    Returns:
        The dictionary that was written with ``torch.save``.

    Raises:
        AssertionError: if the output file already exists (never overwritten).
    """
    path = f"{out_prefix}.f0_stat.pt"
    assert not os.path.exists(path)

    d = {
        speaker: {
            "f0_mean": speaker_to_f0_stat[speaker].mean,
            "f0_std": speaker_to_f0_stat[speaker].std,
            "logf0_mean": speaker_to_f0_stat[speaker].mean_log,
            "logf0_std": speaker_to_f0_stat[speaker].std_log,
        }
        for speaker in speaker_to_f0_stat
    }
    torch.save(d, path)

    return d


def load_audio_path(path):
    """Return the ``"audio"`` field of every record in a manifest file.

    Every non-blank line of ``path`` must be a Python literal dictionary that
    contains an ``"audio"`` key. Lines are parsed with ``ast.literal_eval``,
    so a manifest can only carry data and never executable code.

    Raises:
        ValueError / SyntaxError: if a line is not a valid Python literal.
        KeyError: if a record has no ``"audio"`` key.
    """
    import ast  # local import: the module header does not import ``ast``

    audio_paths = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if not line:
                # tolerate blank / trailing empty lines
                continue
            sample = ast.literal_eval(line)
            audio_paths.append(sample["audio"])

    return audio_paths


def load_f0(f0_dir, nshards):
    """Merge the F0 shards ``{f0_dir}/f0_{rank}_{nshards}.pt`` into one dict.

    Shards are numbered from 1 to ``nshards`` inclusive; entries of later
    shards override earlier ones on key collision (``dict.update``).
    """
    path_to_f0 = {}
    for rank in tqdm(range(1, nshards + 1), desc="load f0"):
        f0_shard_path = f"{f0_dir}/f0_{rank}_{nshards}.pt"
        shard_path_to_f0 = torch.load(f0_shard_path)
        path_to_f0.update(shard_path_to_f0)
    return path_to_f0
@torch.no_grad()
def teacher_force_everything(
    args, dataset, model, criterion, tgt_dict, rank, world_size
):
    """Accumulate teacher-forced losses of the token, duration and F0 streams.

    Only positions from ``args.prefix_length - 1`` onwards contribute. With
    ``args.dequantize_prosody`` the duration/F0 predictions are taken as the
    argmax bin (F0 is additionally decoded by ``Naive_F0_Decoder``) and
    compared with ``mae_loss``; otherwise the criterion's own duration/F0
    loss functions are used.

    Returns:
        ``(values, normalizers)``: ``values`` holds the summed token, duration
        and F0 losses; ``normalizers`` repeats the number of unmasked duration
        positions three times. The caller divides (after optional all-reduce).
    """
    prefix = args.prefix_length

    f0_decoder = None
    if args.dequantize_prosody:
        assert dataset.discrete_f0
        print("Reporting MAE for a discrete model")
        f0_decoder = Naive_F0_Decoder(
            args.f0_discretization_bounds, dataset.config.f0_vq_n_units
        ).cuda()

    dataset = InferenceDataset(
        dataset,
        prefix=args.prefix_length,
        only_prefix=False,
        filter_short=True,
        presort_by_length=True,
    )
    sampler = (
        None
        if world_size == 1
        else DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
    )
    dataloader = DataLoader(
        dataset,
        args.batch_size,
        shuffle=False,
        collate_fn=dataset.collater,
        sampler=sampler,
    )

    total_token_loss, total_duration_loss, total_f0_loss, total_tokens = (
        0.0,
        0.0,
        0.0,
        0.0,
    )

    for i, batch in enumerate(dataloader, start=1):
        batch = move_to_cuda(batch)
        output = model(**batch["net_input"])

        tokens, durations, f0 = output["token"], output["duration"], output["f0"]
        # Drop only a trailing singleton axis. A bare ``squeeze()`` would also
        # remove the batch axis for a batch holding a single sample and break
        # the ``[:, prefix - 1:]`` indexing below.
        # NOTE(review): presumes continuous outputs are (batch, time, 1) - confirm.
        durations, f0 = durations.squeeze(-1), f0.squeeze(-1)

        token_loss = nll_loss(
            tokens[:, prefix - 1 :],
            batch["target"][:, prefix - 1 :].contiguous(),
            batch["mask"][:, prefix - 1 :].contiguous(),
            reduce=True,
        )

        if args.dequantize_prosody:
            durations = durations.argmax(dim=-1)
            duration_loss = mae_loss(
                durations[:, prefix - 1 :].contiguous().float(),
                batch["dur_target"][:, prefix - 1 :].contiguous().float(),
                batch["dur_mask"][:, prefix - 1 :].contiguous(),
                reduce=True,
            )
        else:
            duration_loss = criterion.dur_loss_fn(
                durations[:, prefix - 1 :].contiguous(),
                batch["dur_target"][:, prefix - 1 :].contiguous(),
                batch["dur_mask"][:, prefix - 1 :].contiguous(),
                reduce=True,
            )

        if f0_decoder:
            # decode the most likely bin back to a continuous F0 value
            f0 = f0.argmax(dim=-1)
            f0 = f0_decoder(f0).squeeze(-1)

            f0_target = batch["raw_f0"]
            f0_loss = mae_loss(
                f0[:, prefix - 1 :].contiguous(),
                f0_target[:, prefix - 1 :].contiguous(),
                batch["f0_mask"][:, prefix - 1 :].contiguous(),
                reduce=True,
            )
        else:
            f0_loss = criterion.f0_loss_fn(
                f0[:, prefix - 1 :].contiguous(),
                batch["f0_target"][:, prefix - 1 :].contiguous(),
                batch["f0_mask"][:, prefix - 1 :].contiguous(),
                reduce=True,
            )

        n_tokens = (~batch["dur_mask"])[:, prefix - 1 :].sum()

        total_token_loss += token_loss.item()
        total_duration_loss += duration_loss.item()
        total_f0_loss += f0_loss.item()

        total_tokens += n_tokens.item()
        if args.debug and i > 5:
            break

    values = torch.tensor([total_token_loss, total_duration_loss, total_f0_loss])
    normalizers = torch.tensor([total_tokens for _ in range(3)])

    return values, normalizers
def get_bleu(produced_tokens, target_tokens, tgt_dict):
    """Return a 3-gram sentence BLEU between the target and the sampled rows.

    Trailing pad/eos symbols of ``target_tokens`` (and the same number of
    trailing columns of ``produced_tokens``) are dropped first. The produced
    rows are passed to ``sentence_bleu`` as the references and the target as
    the hypothesis.

    Args:
        produced_tokens: 2-D tensor, one sampled continuation per row.
        target_tokens: 1-D tensor with as many entries as ``produced_tokens``
            has columns.
        tgt_dict: dictionary providing ``pad()``, ``eos()`` and ``string()``.
    """
    assert target_tokens.ndim == 1
    assert produced_tokens.size(1) == target_tokens.size(0)

    # we can have padding due to shifted channels
    shift = 0
    for token in reversed(target_tokens.cpu().tolist()):
        if token in [tgt_dict.pad(), tgt_dict.eos()]:
            shift += 1
        else:
            break

    # Slice by explicit length: ``x[:-shift]`` with shift == 0 would be
    # ``x[:0]`` and silently discard the whole sequence.
    keep = target_tokens.size(0) - shift
    target_tokens = target_tokens[:keep]
    produced_tokens = produced_tokens[:, :keep]

    string_target = tgt_dict.string(target_tokens).split()
    string_candidates = [
        tgt_dict.string(produced_tokens[i, :]).split()
        for i in range(produced_tokens.size(0))
    ]

    bleu3 = sentence_bleu(
        references=string_candidates,
        hypothesis=string_target,
        weights=(1.0 / 3, 1.0 / 3, 1.0 / 3),
    )
    return bleu3


@torch.no_grad()
def continuation(args, dataset, model, criterion, tgt_dict, rank, world_size):
    """Sample continuations of each prompt and accumulate per-stream statistics.

    Each example is replicated ``args.batch_explosion_rate`` times, continued
    with ``do_sampling`` and compared with the ground truth. For MAE/NLL the
    minimum over the replicas is accumulated; sums and sums of squares of the
    produced durations/F0 are accumulated so the caller can derive a std.

    Returns:
        ``(values, normalizers)``: nine accumulated statistics (token BLEU,
        duration NLL/MAE, F0 NLL/MAE, F0 sum/sum_sq, duration sum/sum_sq) and
        their normalizers (sentence count for BLEU, token count for the rest).
    """
    is_discrete_duration = dataset.discrete_dur
    is_discrete_f0 = dataset.discrete_f0

    f0_decoder = None
    if args.dequantize_prosody:
        assert dataset.discrete_f0
        print("Reporting MAE F0 for a discrete model")
        f0_decoder = Naive_F0_Decoder(
            args.f0_discretization_bounds, dataset.config.f0_vq_n_units
        ).cuda()

    dataset = InferenceDataset(
        dataset, args.prefix_length, filter_short=True, presort_by_length=True
    )
    sampler = (
        None
        if world_size == 1
        else DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
    )
    # one prompt per step; it is replicated by explode_batch below
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collater,
        sampler=sampler,
    )

    Ts = args.T_token, args.T_duration, args.T_f0
    decoder = TemperatureDecoder(
        Ts, discrete_dur=is_discrete_duration, discrete_f0=is_discrete_f0
    )

    running_stats = SimpleNamespace(
        token_bleu=0.0,
        duration_nll=0.0,
        duration_mae=0.0,
        f0_nll=0.0,
        f0_mae=0.0,
        n_tokens=0.0,
        n_sentences=0.0,
        f0_sum=0.0,
        f0_sum_sq=0.0,
        dur_sum=0.0,
        dur_sum_sq=0.0,
    )

    for i, batch in enumerate(dataloader):
        batch = explode_batch(batch, args.batch_explosion_rate)
        bsz = batch["target"].size(0)

        batch = move_to_cuda(batch)
        prefix = batch["prefix"][0]

        max_length_to_unroll = batch["target"].size(1)
        prefix_length = batch["net_input"]["src_tokens"].size(1)
        steps = max_length_to_unroll - prefix_length + 1

        assert steps > 0
        produced_tokens, produced_durations, produced_f0, outputs = do_sampling(
            model,
            batch,
            tgt_dict.eos(),
            decoder,
            autoregressive_steps=steps,
            teacher_force_tokens=args.teacher_force_tokens,
            teacher_force_duration=args.teacher_force_duration,
            teacher_force_f0=args.teacher_force_f0,
        )

        # a teacher-forced stream must reproduce the target (after the first step)
        if args.teacher_force_tokens:
            assert (produced_tokens[:, 1:] == batch["target"]).all()
        if args.teacher_force_duration:
            assert (produced_durations[:, 1:] == batch["dur_target"]).all()
        if args.teacher_force_f0:
            assert (produced_f0[:, 1:] == batch["f0_target"]).all()

        dur_target = batch["dur_target"][:, prefix - 1 :].contiguous()
        f0_target = batch["f0_target"][:, prefix - 1 :].contiguous()

        f0_mask = batch["f0_mask"][:, prefix - 1 :].contiguous()
        dur_mask = batch["dur_mask"][:, prefix - 1 :].contiguous()

        duration_mae = mae_loss(
            produced_durations[:, prefix:].float(),
            dur_target.float(),
            dur_mask,
            reduce=False,
        )
        # best (minimal) error over the replicas of the same prompt
        min_duration_mae = duration_mae.view(bsz, -1).sum(dim=-1).min(dim=0)[0]
        running_stats.duration_mae += min_duration_mae

        running_stats.dur_sum += (
            produced_durations[:, prefix:].float() * (~dur_mask)
        ).sum() / args.batch_explosion_rate
        running_stats.dur_sum_sq += (
            produced_durations[:, prefix:].float() * (~dur_mask)
        ).pow(2.0).sum() / args.batch_explosion_rate

        if is_discrete_duration:
            duration_loss = criterion.dur_loss_fn(
                torch.stack([x[1] for x in outputs], dim=1),
                dur_target,
                dur_mask,
                reduce=False,
            )
            min_duration_loss = duration_loss.view(bsz, -1).sum(dim=-1).min(dim=0)[0]
            running_stats.duration_nll += min_duration_loss

        if f0_decoder:  # can only exist for discrete F0 models
            decoded_produced_f0 = f0_decoder(produced_f0[:, prefix:])
            decoded_f0_target = batch["raw_f0"][:, prefix - 1 :].contiguous()

            if produced_f0.ndim == 3:
                decoded_produced_f0 = decoded_produced_f0.squeeze(2)
                decoded_f0_target = decoded_f0_target.squeeze(2)

            f0_mae = mae_loss(
                decoded_produced_f0, decoded_f0_target, f0_mask, reduce=False
            )
            f0_mae = f0_mae.view(bsz, -1).sum(dim=-1).min(dim=0)[0]
            running_stats.f0_mae += f0_mae

            f0_loss = criterion.f0_loss_fn(
                torch.stack([x[2] for x in outputs], dim=1),
                f0_target.long(),
                f0_mask,
                reduce=False,
            )
            f0_loss = f0_loss.view(bsz, -1).sum(dim=-1).min(dim=0)[0]
            running_stats.f0_nll += f0_loss

            running_stats.f0_sum += (
                decoded_produced_f0 * (~f0_mask)
            ).sum() / args.batch_explosion_rate
            running_stats.f0_sum_sq += (decoded_produced_f0 * (~f0_mask)).pow(
                2.0
            ).sum() / args.batch_explosion_rate
        else:
            # NOTE(review): this branch handles F0, yet the assert checks the
            # duration flag; `is_discrete_f0` may have been intended - confirm.
            assert not is_discrete_duration

            f0_loss = mae_loss(
                produced_f0[:, prefix:], f0_target, f0_mask, reduce=False
            )
            f0_loss = f0_loss.view(bsz, -1).sum(dim=-1).min(dim=0)[0]
            running_stats.f0_mae += f0_loss

            # NOTE(review): unlike the discrete branch, these sums are not
            # multiplied by ~f0_mask - confirm this is intentional.
            running_stats.f0_sum += (
                produced_f0[:, prefix:].sum() / args.batch_explosion_rate
            )
            running_stats.f0_sum_sq += (
                produced_f0[:, prefix:].pow(2.0).sum() / args.batch_explosion_rate
            )

        running_stats.n_tokens += (~dur_mask)[0, ...].sum()

        token_loss = get_bleu(
            produced_tokens[:, prefix:], batch["target"][0, prefix - 1 :], tgt_dict
        )
        running_stats.token_bleu += token_loss
        running_stats.n_sentences += 1

        if args.debug:
            break

    values = torch.tensor(
        [
            running_stats.token_bleu,
            running_stats.duration_nll,
            running_stats.duration_mae,
            running_stats.f0_nll,
            running_stats.f0_mae,
            running_stats.f0_sum,
            running_stats.f0_sum_sq,
            running_stats.dur_sum,
            running_stats.dur_sum_sq,
        ]
    )
    normalizers = torch.tensor(
        [running_stats.n_sentences] + [running_stats.n_tokens] * 8
    )

    return values, normalizers


@torch.no_grad()
def correlation(args, dataset, model, criterion, tgt_dict, rank, world_size):
    """Collect mean duration/F0 of each prompt and of its continuation.

    For every (replicated) example the mean duration and the mean non-zero F0
    are computed over the first ``prefix`` steps of the target and over the
    produced steps after the prefix. With ``args.copy_target`` the targets
    themselves are used instead of sampling.

    Returns:
        ``(mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont)``,
        four concatenated CPU tensors, to be correlated by the caller.
    """
    is_discrete_duration = dataset.discrete_dur
    is_discrete_f0 = dataset.discrete_f0

    f0_decoder = None
    if is_discrete_f0:
        assert dataset.discrete_f0
        f0_decoder = Naive_F0_Decoder(
            args.f0_discretization_bounds, dataset.config.f0_vq_n_units
        ).cuda()

    if is_discrete_f0:
        assert f0_decoder  # correlation on tokens is meaningless

    dataset = InferenceDataset(
        dataset,
        args.prefix_length,
        filter_short=True,
        presort_by_length=True,
        min_length=args.min_length,
    )
    sampler = (
        None
        if world_size == 1
        else DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
    )
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collater,
        sampler=sampler,
    )

    Ts = args.T_token, args.T_duration, args.T_f0
    decoder = TemperatureDecoder(
        Ts, discrete_dur=is_discrete_duration, discrete_f0=is_discrete_f0
    )

    mean_dur_prefix, mean_dur_cont = [], []
    mean_f0_prefix, mean_f0_cont = [], []

    for batch in dataloader:
        batch = explode_batch(batch, args.batch_explosion_rate)
        batch = move_to_cuda(batch)

        assert len(batch["prefix"]) == 1

        if args.teacher_force_tokens:
            autoregressive_steps = batch["target"].size(1) - args.prefix_length - 1
        else:
            autoregressive_steps = args.max_length - args.prefix_length  # + max_shift?

        if args.copy_target:
            produced_durations, produced_f0 = batch["dur_target"], batch["f0_target"]
        else:
            _, produced_durations, produced_f0, outputs = do_sampling(
                model,
                batch,
                tgt_dict.eos(),
                decoder,
                autoregressive_steps=autoregressive_steps,
                teacher_force_tokens=args.teacher_force_tokens,
                teacher_force_duration=args.teacher_force_duration,
                teacher_force_f0=args.teacher_force_f0,
            )

            # first tokens actually correspond to BOS
            produced_durations = produced_durations[:, 1:]
            produced_f0 = produced_f0[:, 1:]

        dur_target = batch["dur_target"]
        if is_discrete_duration:
            produced_durations = produced_durations.float()
            dur_target = dur_target.float()

        if is_discrete_f0:
            produced_f0 = f0_decoder(produced_f0).squeeze(-1)
            f0_target = batch["raw_f0"]
        else:
            f0_target = batch["f0_target"]

        # prefix values
        prefix = batch["prefix"][0]
        dur_prefix_mean = dur_target[:, :prefix].sum(dim=-1) / (
            (~batch["dur_mask"][:, :prefix]).sum(dim=-1)
        )

        # zero F0 marks unvoiced frames; exclude them from the frame count
        non_voiced = f0_target[:, :prefix] == 0.0
        f0_mask = batch["f0_mask"][:, :prefix].logical_or(non_voiced)
        f0_prefix_mean = f0_target[:, :prefix].sum(dim=-1) / ((~f0_mask).sum(dim=-1))

        # continuation values
        dur_cont_mean = produced_durations[:, prefix:].sum(dim=-1) / (
            (~batch["dur_mask"][:, prefix:]).sum(dim=-1)
        )

        non_voiced = produced_f0[:, prefix:] == 0.0
        f0_mask = non_voiced
        f0_cont_mean = produced_f0[:, prefix:].sum(dim=-1) / ((~f0_mask).sum(dim=-1))

        assert not f0_cont_mean.isnan().any()

        mean_dur_prefix.append(dur_prefix_mean.cpu())
        mean_dur_cont.append(dur_cont_mean.cpu())

        mean_f0_prefix.append(f0_prefix_mean.cpu())
        mean_f0_cont.append(f0_cont_mean.cpu())

        if args.debug and len(mean_dur_prefix) > 10:
            break

    mean_dur_prefix, mean_dur_cont = torch.cat(mean_dur_prefix), torch.cat(
        mean_dur_cont
    )
    mean_f0_prefix, mean_f0_cont = torch.cat(mean_f0_prefix), torch.cat(mean_f0_cont)

    return mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont


def main(rank, world_size, args):
    """Worker entry point: load the model and data, run one metric, report it.

    Args:
        rank: index of this worker process.
        world_size: total number of workers; a process group (gloo backend)
            is initialised only when it is greater than 1.
        args: parsed command-line namespace from ``cli_main``.
    """
    start = time.time()

    if world_size > 1:
        torch.distributed.init_process_group(
            backend="gloo", init_method="env://", world_size=world_size, rank=rank
        )
        torch.cuda.set_device(rank % torch.cuda.device_count())

    raw_args = args

    args = convert_namespace_to_omegaconf(args)
    if args.common.seed is not None:
        np.random.seed(args.common.seed)
        utils.set_torch_seed(args.common.seed)

    models, model_args, task = checkpoint_utils.load_model_ensemble_and_task(
        [raw_args.path], arg_overrides={"data": args.task.data}
    )

    tgt_dict = task.target_dictionary

    for model in models:
        model.prepare_for_inference_(args)
        model.cuda().eval()
        if raw_args.fp16:
            model = model.half()
    # only the first model of the ensemble is evaluated
    model = models[0]

    config = ExpressiveCodeDataConfig(args.task.data)

    dataset = CodeDataset(
        manifest=config.manifests[raw_args.eval_subset],
        dictionary=task.source_dictionary,
        dur_dictionary=task.source_duration_dictionary,
        f0_dictionary=task.source_f0_dictionary,
        config=config,
        discrete_dur=task.cfg.discrete_duration,
        discrete_f0=task.cfg.discrete_f0,
        log_f0=task.cfg.log_f0,
        normalize_f0_mean=task.cfg.normalize_f0_mean,
        normalize_f0_std=task.cfg.normalize_f0_std,
        interpolate_f0=task.cfg.interpolate_f0,
        shifts=task.cfg.stream_shifts,
        return_filename=True,
        strip_filename=False,
        return_continuous_f0=raw_args.dequantize_prosody,
    )

    if raw_args.filter_names:
        dataset = FilterNamesDataset(dataset, raw_args.filter_names)

    criterion = task.build_criterion(model_args.criterion)

    name2metric = {
        "continuation": continuation,
        "teacher_force_everything": teacher_force_everything,
        "correlation": correlation,
    }

    name2keys = {
        "continuation": (
            "Token BLEU3",
            "Duration NLL",
            "Duration MAE",
            "F0 NLL",
            "F0 MAE",
            "F0 sum",
            "F0 sum_sq",
            "Dur sum",
            "Dur sum_sq",
        ),
        "teacher_force_everything": ("token_loss", "duration_loss", "f0_loss"),
        "correlation": ("Duration corr", "F0 corr"),
    }
    metric_name = raw_args.metric

    metric = name2metric[metric_name]
    results = metric(raw_args, dataset, model, criterion, tgt_dict, rank, world_size)

    values = None

    if metric_name not in [
        "correlation",
    ]:
        values, normalizers = results
        values = maybe_aggregate_normalize(values, normalizers, world_size)
    elif metric_name == "correlation":
        values = maybe_aggregate_correlations(results, world_size)
    else:
        assert False

    assert values is not None
    summary = dict(zip(name2keys[raw_args.metric], values.tolist()))
    if metric_name == "continuation":
        # std = sqrt(E[x^2] - E[x]^2); unary minus binds looser than ``**``
        summary["F0 Std"] = np.sqrt(-summary["F0 sum"] ** 2 + summary["F0 sum_sq"])
        summary["Dur Std"] = np.sqrt(-summary["Dur sum"] ** 2 + summary["Dur sum_sq"])
        del summary["F0 sum"]
        del summary["F0 sum_sq"]
        del summary["Dur sum"]
        del summary["Dur sum_sq"]

    summary["metric"] = metric_name

    if rank == 0:
        print(summary)
        if raw_args.wandb:
            wandb_results(summary, raw_args)
        print("# finished in ", time.time() - start, "seconds")


def wandb_results(summary, raw_args):
    """Log ``summary`` to a new Weights & Biases run configured from ``raw_args``."""
    import wandb

    run = wandb.init(
        project=raw_args.wandb_project_name, tags=raw_args.wandb_tags.split(",")
    )
    run.config.metric = raw_args.metric
    run.config.model = raw_args.path
    run.config.data = raw_args.data

    if raw_args.wandb_run_name:
        run.name = raw_args.wandb_run_name
        run.save()

    wandb.log(summary)
    wandb.finish()
def maybe_aggregate_normalize(values, normalizers, world_size):
    """Return ``values / normalizers``, summed over all workers first if needed.

    With ``world_size > 1`` both tensors are sum-reduced in place across the
    process group before the division.
    """
    if world_size > 1:
        torch.distributed.barrier()

        # ``all_reduce_multigpu`` is deprecated/removed in recent PyTorch;
        # for one tensor per process ``all_reduce`` is the direct equivalent.
        torch.distributed.all_reduce(values)
        torch.distributed.all_reduce(normalizers)

    return values / normalizers


def maybe_aggregate_correlations(results, world_size):
    """Return Pearson correlations (duration, F0) between prompt and continuation.

    Args:
        results: tuple ``(mean_dur_prefix, mean_dur_cont, mean_f0_prefix,
            mean_f0_cont)`` of CPU tensors produced by ``correlation``.
        world_size: number of workers; with more than one the per-worker
            results are gathered and concatenated first.

    Returns:
        Tensor ``[corr_dur, corr_f0]``.
    """
    # ``import scipy`` alone does not guarantee that ``scipy.stats`` is loaded
    from scipy import stats

    if world_size > 1:
        output = [None for _ in range(world_size)]
        torch.distributed.all_gather_object(output, results)
        mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont = [
            torch.cat([x[i] for x in output]) for i in range(4)
        ]
    else:
        mean_dur_prefix, mean_dur_cont, mean_f0_prefix, mean_f0_cont = results

    corr_dur = stats.pearsonr(mean_dur_prefix.numpy(), mean_dur_cont.numpy())[0]
    corr_f0 = stats.pearsonr(mean_f0_prefix.numpy(), mean_f0_cont.numpy())[0]
    values = torch.tensor([corr_dur, corr_f0])

    return values


def cli_main():
    """Parse the command line and run ``main`` in one or several processes."""
    parser = options.get_interactive_generation_parser()
    parser.add_argument(
        "--prefix-length",
        type=int,
        default=1,
        help="Prompt prefix length (including <s>)",
    )
    parser.add_argument(
        "--duration-scale",
        type=float,
        default=1,
        help="Multiply durations by the given scaler",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Process only the first batch"
    )
    parser.add_argument("--n_hypotheses", type=int, default=1)
    parser.add_argument("--filter-names", type=str, default=None)
    parser.add_argument(
        "--max-length", type=int, default=200, help="Maximal produced length"
    )

    parser.add_argument("--teacher-force-tokens", action="store_true", default=False)
    parser.add_argument("--teacher-force-duration", action="store_true", default=False)
    parser.add_argument("--teacher-force-f0", action="store_true", default=False)

    parser.add_argument("--copy-target", action="store_true", default=False)
    parser.add_argument("--min-length", type=int, default=None)
    parser.add_argument("--f0-discretization-bounds", type=str, default=None)
    parser.add_argument("--dequantize-prosody", action="store_true")
    parser.add_argument("--batch-explosion-rate", type=int, default=1)

    parser.add_argument(
        "--metric",
        choices=["continuation", "teacher_force_everything", "correlation"],
        required=True,
    )

    parser.add_argument("--wandb", action="store_true")
    parser.add_argument("--wandb-project-name", type=str, default="eslm")
    parser.add_argument("--wandb-tags", type=str, default="")
    parser.add_argument("--wandb-run-name", type=str, default="")

    parser.add_argument("--T-token", type=float, default=1.0)
    parser.add_argument("--T-duration", type=float, default=1.0)
    parser.add_argument("--T-f0", type=float, default=1.0)

    parser.add_argument("--n-workers", type=int, default=1)

    parser.add_argument(
        "--eval-subset", type=str, default="valid", choices=["valid", "test"]
    )

    args = options.parse_args_and_arch(parser)

    assert (
        args.prefix_length >= 1
    ), "Prefix length includes bos token <s>, hence the minimum is 1."
    assert args.temperature >= 0.0, "T must be non-negative!"

    if args.dequantize_prosody:
        assert args.f0_discretization_bounds

    # --n-workers=0 falls back to one worker per visible GPU
    world_size = args.n_workers or torch.cuda.device_count()
    if world_size > 1:
        import random

        mp.set_start_method("spawn", force=True)
        os.environ["MASTER_ADDR"] = "localhost"
        # random port so that concurrent evaluations on one host do not clash
        os.environ["MASTER_PORT"] = str(random.randint(10_000, 50_000))

        mp.spawn(
            main,
            nprocs=world_size,
            args=(
                world_size,
                args,
            ),
            join=True,
        )
    else:
        main(rank=0, world_size=world_size, args=args)


if __name__ == "__main__":
    cli_main()
import ast
import argparse
import json
import logging
from pathlib import Path

import soundfile as sf
import torch
from tqdm import tqdm

from fairseq import utils
from fairseq.models.text_to_speech.vocoder import CodeHiFiGANVocoder

# A single basicConfig call is enough: a second call is a no-op once the root
# logger already has a handler.
logging.basicConfig(level=logging.INFO)
logging.root.setLevel(logging.INFO)
logger = logging.getLogger(__name__)


def dump_result(args, data, sample_id, pred_wav):
    """Write one synthesized waveform to disk.

    Args:
        args: parsed CLI namespace; ``results_path`` and ``sample_rate`` are read.
        data: the input record (a dict); its optional ``"audio"`` entry names
            the output file.
        sample_id: index of the record, used to build a file name when the
            record has no ``"audio"`` entry.
        pred_wav: tensor returned by the vocoder.

    When ``--results-path`` is given the file is written into that directory
    (keeping only the base name of ``data["audio"]``); otherwise it is written
    to ``data["audio"]`` itself, as the ``--results-path`` help text promises.
    """
    assert "audio" in data or args.results_path is not None

    if args.results_path:
        fname = Path(data["audio"]).name if "audio" in data else f"{sample_id}_pred.wav"
        out_file = Path(args.results_path) / fname
    else:
        # Previously this case left `out_file` unassigned.
        out_file = Path(data["audio"])
        out_file.parent.mkdir(exist_ok=True, parents=True)

    sf.write(
        out_file.as_posix(),
        pred_wav.detach().cpu().numpy(),
        args.sample_rate,
    )


def load_data(in_file):
    """Parse ``in_file`` as one Python/JSON-style literal per line."""
    with open(in_file) as f:
        data = [ast.literal_eval(line.strip()) for line in f]
    return data


def get_f0_upsample_ratio(code_hop_size, f_hop_size):
    """Return the integer factor by which the F0 stream is repeated."""
    ratio = (code_hop_size // 160) // (f_hop_size // 256) * 2
    return ratio


def main(args):
    """Vocode every record of ``args.in_file`` and store the audio."""
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    with open(args.vocoder_cfg) as f:
        vocoder_cfg = json.load(f)
    vocoder = CodeHiFiGANVocoder(args.vocoder, vocoder_cfg)
    if use_cuda:
        vocoder = vocoder.cuda()

    data = load_data(args.in_file)

    if args.results_path:
        Path(args.results_path).mkdir(exist_ok=True, parents=True)

    # The ratio depends only on the vocoder config, so compute it once.
    f0_up_ratio = get_f0_upsample_ratio(
        vocoder_cfg["code_hop_size"], vocoder_cfg["hop_size"]
    )

    for i, d in tqdm(enumerate(data), total=len(data)):
        code_key = "cpc_km100" if "cpc_km100" in d else "hubert"
        code = list(map(int, d[code_key].split()))

        x = {
            "code": torch.LongTensor(code).view(1, -1),
            "f0": torch.Tensor(d["f0"]).view(1, -1),
        }

        if f0_up_ratio > 1:
            bsz = x["f0"].size(0)
            # Repeat every F0 value `f0_up_ratio` times along the time axis.
            x["f0"] = x["f0"].unsqueeze(2).repeat(1, 1, f0_up_ratio).view(bsz, -1)

        x = utils.move_to_cuda(x) if use_cuda else x
        wav = vocoder(x)
        dump_result(args, d, i, wav)


def cli_main():
    """Command-line entry point."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in-file",
        type=str,
        required=True,
        help="Input file following the same format of the output from sample.py ('f0' and 'cpc_km100/hubert' are required fields)",
    )
    parser.add_argument(
        "--vocoder", type=str, required=True, help="path to the vocoder"
    )
    parser.add_argument(
        "--vocoder-cfg",
        type=str,
        required=True,
        help="path to the vocoder config",
    )
    parser.add_argument("--sample-rate", type=int, default=16_000)
    parser.add_argument(
        "--results-path",
        type=str,
        default=None,
        help="Output directory. If not set, the audios will be stored following the 'audio' field specified in the input file.",
    )
    parser.add_argument("--cpu", action="store_true", help="run on CPU")

    args = parser.parse_args()

    main(args)


if __name__ == "__main__":
    cli_main()


# ================================================
# FILE: examples/textless_nlp/pgslm/inference_dataset.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch


class InferenceDataset:
    """Wrap a dataset for prompted generation.

    The wrapper optionally drops short examples, optionally sorts the
    remaining ones by ``dataset.size(index)``, and (when ``only_prefix`` is
    set) truncates every tensor channel whose name contains ``"source"`` to
    the prompt prefix.
    """

    def __init__(
        self,
        dataset,
        prefix,
        only_prefix=True,
        presort_by_length=True,
        filter_short=False,
        min_length=None,
    ):
        self.dataset = dataset
        self.collater = self.dataset.collater
        self.prefix = prefix
        self.only_prefix = only_prefix
        self.filter_short = filter_short

        # Position in this dataset -> index in the wrapped dataset.
        self.remapping = list(range(len(self.dataset)))
        if min_length:
            assert min_length >= prefix + 1

        length_thr = prefix + 1 if not min_length else min_length

        if filter_short:
            self.remapping = list(
                filter(
                    lambda i: self.dataset[i]["dur_source"].sum() > length_thr,
                    self.remapping,
                )
            )
            print(
                f"# the initial dataset of {len(self.dataset)} examples became {len(self.remapping)} after filtering"
                f" examples shorter than {length_thr} (in duration units)"
            )

        if presort_by_length:
            lengths = {index: dataset.size(index) for index in self.remapping}
            self.remapping.sort(key=lambda i: lengths[i])

    @property
    def pads(self):
        return self.dataset.pads

    def __len__(self):
        return len(self.remapping)

    def original_size(self, k):
        """Return the wrapped dataset's size of example ``k``."""
        k = self.remapping[k]
        return self.dataset.size(k)

    def __getitem__(self, k):
        k = self.remapping[k]
        channels = self.dataset[k]

        if self.prefix and self.only_prefix:
            dur_channel = channels["dur_source"]
            assert dur_channel.sum() >= self.prefix

            # Index of the first entry whose cumulative duration reaches `prefix`.
            token_times = dur_channel.cumsum(dim=-1)
            cut_after = torch.searchsorted(token_times, torch.tensor(self.prefix))

            r = {}
            for channel_name, value in channels.items():
                if isinstance(value, torch.Tensor) and "source" in channel_name:
                    # if self.filter_short: assert value.size(0) >= self.prefix
                    r[channel_name] = value[: cut_after + 1]
                else:
                    r[channel_name] = value

            r["prefix"] = cut_after + 1
        else:
            r = channels

        return r


def explode_batch(batch, times):
    """Replicate a batch of size one ``times`` times along the batch axis.

    Tensors are concatenated with themselves, ``ntokens``/``nsentences`` are
    multiplied, ``prefix``/``filename`` are passed through and ``net_input``
    is handled recursively. Any other key raises ``AssertionError``.
    """
    if times == 1:
        return batch

    new_batch = {}

    for key, value in batch.items():
        if isinstance(value, torch.Tensor):
            assert value.size(0) == 1
            new_batch[key] = torch.cat([value] * times)
        elif key in ["ntokens", "nsentences"]:
            new_batch[key] = value * times
        elif key in ["prefix", "filename"]:
            new_batch[key] = value
        elif key == "net_input":
            new_batch[key] = explode_batch(value, times)
        else:
            assert False, key
    return new_batch


# ================================================
# FILE: examples/textless_nlp/pgslm/naive_decoder.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch
import warnings


class Naive_F0_Decoder(torch.nn.Module):
    """Map discrete F0 bin ids back to scalar values through a frozen lookup table.

    The table is built from bin boundaries stored at ``bounds_path``: the first
    boundary, the mid-points of neighbouring boundaries, the last boundary and
    two trailing ``-5.0`` entries.
    """

    def __init__(self, bounds_path, n_units=32):
        """
        Args:
            bounds_path: file readable by ``torch.load`` holding a mapping from
                number of units to a 1-D numpy array of boundaries.
            n_units: key selecting which boundary array to use.
        """
        super().__init__()

        bounds = torch.load(bounds_path)
        bounds = torch.from_numpy(bounds[n_units])
        assert bounds.ndim == 1

        # NOTE(review): the comment names three special symbols but only two
        # entries are appended -- confirm against the F0 dictionary layout.
        pad = torch.tensor([-5.0, -5.0])  # bos, eos, pad are in the dictionary
        centers = torch.cat(
            [bounds[0:1], 0.5 * (bounds[1:] + bounds[:-1]), bounds[-1:], pad[:]]
        )

        self.embedding = torch.nn.Embedding.from_pretrained(
            centers.unsqueeze(-1), freeze=True
        )
        self.max_n = self.embedding.weight.numel()

    def forward(self, discrete_f0: torch.Tensor):
        """Return the table value for every id in ``discrete_f0``.

        Ids outside ``[0, max_n)`` trigger a warning and are looked up as the
        last table entry.
        """
        out_of_range = (discrete_f0 < 0) | (discrete_f0 >= self.max_n)
        if out_of_range.any():
            warnings.warn(
                f"F0 contains some weird outputs: discrete_f0.max().item()={discrete_f0.max().item()} discrete_f0.min().item()={discrete_f0.min().item()}; "
                f"while we have embeddings for {self.max_n} values. "
                "Assuming this is a no-prosody model -- but be careful!"
            )

            # Remap *both* too-large and negative ids; previously negative ids
            # were reported by the warning but still reached the embedding
            # lookup, where they are invalid indices.
            discrete_f0 = discrete_f0.masked_fill(out_of_range, self.max_n - 1)

        return self.embedding(discrete_f0).squeeze(-1)


# ================================================
# FILE: examples/textless_nlp/pgslm/prepare_dataset.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from multiprocessing import Pool

import os
from collections import defaultdict
from itertools import starmap

import torch
from npy_append_array import NpyAppendArray
from tqdm import tqdm

from data_utils import dump_speaker_f0_stat, F0Stat, load_f0
from fairseq.data.codedataset import (
    ExpressiveCodeDataConfig,
    parse_manifest,
    F0_FRAME_SPACE,
    align_f0_to_durations,
)
from fairseq.tasks.speech_ulm_task import UnitDictionary


def load_meta(meta_path, split):
    """Load the data config and parse the manifest of ``split``."""
    config = ExpressiveCodeDataConfig(meta_path)
    manifest_path = config.manifests[split]
    dictionary = UnitDictionary(n_units=config.n_units)
    audio_paths, codes, durs, speakers = parse_manifest(manifest_path, dictionary)
    return config, audio_paths, codes, durs, speakers


def _align_f0(f0, dur, ratio, frm_tol=5):
    """Align one utterance's F0 to its durations; all-zero when ``f0`` is None."""
    if f0 is None:
        seg_f0 = torch.zeros_like(dur, dtype=torch.float)
    else:
        seg_f0 = align_f0_to_durations(f0, dur, ratio, tol=frm_tol * ratio)
    return seg_f0.numpy()  # try a hacky stuff


def _align_f0_packed(packed_args):
    """Single-argument adapter so `_align_f0` can be used with `Pool.imap`."""
    return _align_f0(*packed_args)


def align_f0(path_to_f0, audio_paths, durs, ratio, mp=False):
    """Compute segment-level F0 for every utterance.

    Args:
        path_to_f0: mapping from audio path to its F0 track (or None).
        audio_paths: audio paths, parallel to ``durs``.
        durs: per-utterance duration tensors.
        ratio: passed through to ``align_f0_to_durations``.
        mp: when True, spread the work over a process pool.

    Returns:
        A list of float tensors, one per utterance, in input order.
    """
    chunk_size = 2000
    num_procs = 40
    iterable = ((path_to_f0[p], d, ratio) for p, d in zip(audio_paths, durs))

    if mp:
        with Pool(num_procs) as pool:
            # `Pool` has no `istarmap`; `imap` over packed arguments gives the
            # same lazy, ordered iteration (and so a live progress bar).
            iterator = tqdm(
                pool.imap(_align_f0_packed, iterable, chunk_size),
                desc="align f0",
                total=len(durs),
            )
            # Results must be consumed while the pool is still alive.
            return [torch.from_numpy(seg_f0).float() for seg_f0 in iterator]

    iterator = tqdm(starmap(_align_f0, iterable), desc="align f0", total=len(durs))
    return [torch.from_numpy(seg_f0).float() for seg_f0 in iterator]


def prepare_seg_data(config, audio_paths, codes, durs, speakers, path_to_f0):
    """Bundle codes, durations, aligned F0, speakers and paths into one dict."""
    ratio = config.code_hop_size / (config.sampling_rate * F0_FRAME_SPACE)
    seg_f0s = align_f0(path_to_f0, audio_paths, durs, ratio)
    data = {
        "codes": codes,
        "duration": durs,
        "f0": seg_f0s,
        "speaker": speakers,
        "path": audio_paths,
    }
    return data


def dump_seg_data(data, out_prefix):
    """Write the segment-level data next to ``out_prefix``.

    Produces ``.code.npy``, ``.dur.npy``, ``.f0.npy``, ``.path.txt``,
    ``.leng.txt`` and ``.speaker.txt``; every target must not exist yet.
    """
    key_targs = {
        "codes": f"{out_prefix}.code.npy",
        "duration": f"{out_prefix}.dur.npy",
        "f0": f"{out_prefix}.f0.npy",
    }
    for key, targ in key_targs.items():
        assert not os.path.exists(targ)
        npaa = NpyAppendArray(targ)
        for utt_data in tqdm(data[key], desc=f"dumping {key}"):
            npaa.append(utt_data.numpy())

    assert not os.path.exists(f"{out_prefix}.path.txt")
    with open(f"{out_prefix}.path.txt", "w") as f:
        for x in data["path"]:
            f.write(f"{str(x)}\n")

    assert not os.path.exists(f"{out_prefix}.leng.txt")
    with open(f"{out_prefix}.leng.txt", "w") as f:
        for x in data["codes"]:
            f.write(f"{len(x)}\n")

    assert not os.path.exists(f"{out_prefix}.speaker.txt")
    with open(f"{out_prefix}.speaker.txt", "w") as f:
        for x in data["speaker"]:
            f.write(f"{str(x)}\n")

    print(f"wrote to files with prefix {out_prefix}")


def main(meta_path, f0_dir, splits, nshards_list):
    """Dump segment-level data and speaker F0 statistics for every split."""
    # Statistics accumulate across splits in the order they are processed.
    speaker_to_stat = defaultdict(F0Stat)
    if len(nshards_list) == 1:
        nshards_list = nshards_list * len(splits)
    else:
        assert len(nshards_list) == len(splits)

    for split, nshards in zip(splits, nshards_list):
        config, audio_paths, codes, durs, speakers = load_meta(meta_path, split)
        path_to_f0 = load_f0(f"{f0_dir}/{split}", nshards)

        # segment-level data
        data = prepare_seg_data(config, audio_paths, codes, durs, speakers, path_to_f0)
        dump_seg_data(data, config.manifests[split])

        # speaker f0
        for audio_path, speaker in tqdm(zip(audio_paths, speakers)):
            f0 = path_to_f0[audio_path]
            speaker_to_stat[speaker].update(f0)
        dump_speaker_f0_stat(speaker_to_stat, config.manifests[split])


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("meta_path")
    parser.add_argument("f0_dir", help="out_dir from preprocess_f0")
    parser.add_argument("--splits", nargs="+", default=["train", "valid"])
    parser.add_argument(
        "--nshards_list", type=int, nargs="+", default=[20], help="number of f0 shards"
    )
    args = parser.parse_args()
    print(args)

    main(**vars(args))


# ================================================
# FILE: examples/textless_nlp/pgslm/preprocess_f0.py
# ================================================
# Copyright (c) Facebook, Inc.
# and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import torch
from tqdm import tqdm
from data_utils import load_audio_path
from fairseq.data.codedataset import get_f0_by_filename


def process_one(path, sr):
    """
    Extract the F0 track of one audio file; returns None if extraction fails.

    Args:
        path: audio file path
        sr: sampling rate
    """
    try:
        # YAAPT throws errors in some rare cases
        f0 = get_f0_by_filename(path, sr)
    except Exception as e:
        # Deliberately best-effort: a failed file is reported and stored as None.
        print(
            f"WARNING: error when processing {path}. set f0 to zero. original error message:\n{e}"
        )
        f0 = None
    return f0


def main(file_path, out_dir, nshards, rank, sampling_rate):
    """Extract F0 for one shard of the audio list and save it with ``torch.save``.

    Args:
        file_path: manifest passed to ``load_audio_path``.
        out_dir: output directory (created if missing).
        nshards: total number of shards.
        rank: 1-based index of the shard to process.
        sampling_rate: sampling rate handed to the F0 extractor.

    Raises:
        ValueError: if ``rank`` is not in ``1..nshards``.
    """
    # load data
    audio_paths = load_audio_path(file_path)

    # shard
    assert nshards <= len(audio_paths) and nshards > 0
    if not 1 <= rank <= nshards:
        # Without this check rank=0 produces a negative slice start and the
        # shard is silently empty or wrong.
        raise ValueError(f"rank must be in [1, {nshards}], got {rank}")
    shard_size = len(audio_paths) / nshards
    s = int(round((rank - 1) * shard_size))
    e = int(round(rank * shard_size))
    audio_paths = audio_paths[s:e]

    # process
    path_to_f0 = {}
    for audio_path in tqdm(audio_paths):
        path_to_f0[audio_path] = process_one(audio_path, sampling_rate)
    print(f"finished processing {len(path_to_f0)} utterances ({s}-{e})")

    f0_path = f"{out_dir}/f0_{rank}_{nshards}.pt"
    os.makedirs(out_dir, exist_ok=True)
    torch.save(path_to_f0, f0_path)
    print(f"saved to {f0_path}")


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("file_path")
    parser.add_argument("out_dir")
    parser.add_argument("--nshards", type=int, default=20)
    parser.add_argument("--rank", type=int, default=1)
    parser.add_argument("--sampling_rate", type=int, default=16000)
    args = parser.parse_args()

    main(**vars(args))


# ================================================
# FILE: examples/textless_nlp/pgslm/quantize_f0.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import ast
from collections import defaultdict
from functools import partial

import numpy as np
import torch
from tqdm import tqdm

from data_utils import dump_speaker_f0_stat, F0Stat, load_audio_path, load_f0


def load_speaker(path):
    """Return the ``"speaker"`` field of every line of the manifest at ``path``.

    Each line is parsed as a literal dict. ``ast.literal_eval`` is used rather
    than ``eval`` so that a manifest line can never execute code.
    """
    speakers = []
    with open(path) as f:
        for line in f:
            sample = ast.literal_eval(line.strip())
            assert "speaker" in sample
            speakers.append(sample["speaker"])
    return speakers


def quantize_f0(speaker_to_f0, f0_stats, nbins, normalize, log):
    """Compute quantile-style bin boundaries over the pooled F0 values.

    Args:
        speaker_to_f0: mapping speaker -> object exposing ``raw_data``.
        f0_stats: mapping speaker -> dict with ``f0_mean``/``f0_std`` and
            ``logf0_mean``/``logf0_std``.
        nbins: iterable of bin counts; one boundary array is produced for each.
        normalize: ``"mean"``, ``"meanstd"``, or anything else for no
            normalization.
        log: whether to work in the log domain.

    Returns:
        dict mapping each bin count to a numpy array of ``num_bin - 1`` offsets.
    """
    f0_all = []
    for speaker, f0 in speaker_to_f0.items():
        f0 = f0.raw_data
        if log:
            f0 = f0.log()
        mean = f0_stats[speaker]["logf0_mean"] if log else f0_stats[speaker]["f0_mean"]
        std = f0_stats[speaker]["logf0_std"] if log else f0_stats[speaker]["f0_std"]
        if normalize == "mean":
            f0 = f0 - mean
        elif normalize == "meanstd":
            f0 = (f0 - mean) / std
        f0_all.extend(f0.tolist())

    # Cumulative histogram expressed in percent of all values.
    hist, bin_x = np.histogram(f0_all, 100000)
    cum_hist = np.cumsum(hist) / len(f0_all) * 100

    f0_bin = {}
    for num_bin in nbins:
        bin_offset = []
        bin_size = 100 / num_bin
        threshold = bin_size
        for _ in range(num_bin - 1):
            # Histogram edge whose cumulative mass is closest to the threshold.
            index = (np.abs(cum_hist - threshold)).argmin()
            bin_offset.append(bin_x[index])
            threshold += bin_size
        f0_bin[num_bin] = np.array(bin_offset)

    return f0_bin


def main(file_path, f0_dir, out_dir, out_prefix, nbins, nshards, normalize, log):
    """Dump per-speaker F0 statistics and the derived quantization bins."""
    audio_paths = load_audio_path(file_path)
    path_to_f0 = load_f0(f0_dir, nshards)

    speakers = load_speaker(file_path)
    speaker_to_f0 = defaultdict(partial(F0Stat, True))

    # speaker f0 stats
    for audio_path, speaker in tqdm(zip(audio_paths, speakers)):
        f0 = path_to_f0[audio_path]
        speaker_to_f0[speaker].update(f0)
    f0_stats = dump_speaker_f0_stat(speaker_to_f0, f"{out_dir}/{out_prefix}")

    # quantize
    f0_bin = quantize_f0(speaker_to_f0, f0_stats, nbins, normalize, log)
    log_suffix = "_log" if log else ""
    f0_bin_out_file = f"{out_dir}/{out_prefix}_{normalize}_norm{log_suffix}_f0_bin.th"
    torch.save(f0_bin, f0_bin_out_file)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("file_path")
    parser.add_argument("f0_dir", help="out_dir from preprocess_f0")
    parser.add_argument("out_dir")
    parser.add_argument("out_prefix")
    parser.add_argument("--nbins", nargs="+", type=int, default=[32])
    parser.add_argument("--nshards", type=int, default=20, help="number of f0 shards")
    parser.add_argument(
        "--normalize", type=str, choices=["meanstd", "mean", "none"], default="mean"
    )
    parser.add_argument("--log", action="store_true")
    args = parser.parse_args()
    print(args)

    main(**vars(args))


# ================================================
# FILE: examples/textless_nlp/pgslm/sample/__init__.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


# ================================================
# FILE: examples/textless_nlp/pgslm/sample/sample.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import ast
import os
import torch.multiprocessing as mp
import numpy as np
import json

import torch
from torch.distributions.categorical import Categorical

from fairseq import checkpoint_utils, options, utils
from fairseq.data.codedataset import CodeDataset, ExpressiveCodeDataConfig
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
from torch.utils.data import DataLoader, DistributedSampler
from fairseq.utils import move_to_cuda

import tqdm
import random
import pathlib

import sys

# Make the sibling modules of the parent directory importable.
sys.path.append(str(pathlib.Path(__file__).parent.parent))

from inference_dataset import InferenceDataset, explode_batch
from naive_decoder import Naive_F0_Decoder
from truncated_laplace import truncated_laplace


CODETYPE_TO_FRAMETIME = {"cpc_km100": 0.01, "hubert": 0.02}  # 10ms  # 20ms


class TemperatureDecoder:
    """Turn the model output at the last time step into new token/duration/F0 values.

    A temperature of 0 selects the deterministic choice (argmax for discrete
    streams, the predicted value for continuous ones); a positive temperature
    samples (categorical for discrete streams, truncated Laplace otherwise).
    """

    def __init__(self, Ts, discrete_dur=False, discrete_f0=False):
        self.T_token, self.T_dur, self.T_f0 = Ts
        self.discrete_dur = discrete_dur
        self.discrete_f0 = discrete_f0

    def __call__(self, output):
        def sample_multinomial(key, T):
            logits = output[key][:, -1, :].float()
            return Categorical(logits=logits / T).sample().unsqueeze(-1)

        def sample_laplace(key, T, truncate_at_zero):
            mean = output[key][:, -1, :].float()
            return truncated_laplace(mean=mean, T=T, truncate_by_zero=truncate_at_zero)

        if self.T_token > 0:
            new_tokens = sample_multinomial("token", self.T_token)
        else:
            new_tokens = output["token"][:, -1, :].argmax(dim=-1, keepdim=True)

        if not self.discrete_dur and self.T_dur == 0:
            new_durations = output["duration"][:, -1].round().int()
        elif not self.discrete_dur and self.T_dur > 0:
            new_durations = (
                sample_laplace("duration", self.T_dur, truncate_at_zero=True)
                .round()
                .int()
            )
        elif self.discrete_dur and self.T_dur > 0:
            new_durations = sample_multinomial("duration", self.T_dur)
        elif self.discrete_dur and self.T_dur == 0:
            new_durations = output["duration"][:, -1, :].argmax(dim=-1, keepdim=True)
        else:
            assert False

        if not self.discrete_f0 and self.T_f0 == 0:
            new_f0 = output["f0"][:, -1]
        elif not self.discrete_f0 and self.T_f0 > 0:
            new_f0 = sample_laplace("f0", self.T_f0, truncate_at_zero=False)
        elif self.discrete_f0 and self.T_f0 > 0:
            new_f0 = sample_multinomial("f0", self.T_f0)
        elif self.discrete_f0 and self.T_f0 == 0:
            new_f0 = output["f0"][:, -1, :].argmax(dim=-1, keepdim=True)
        else:
            assert False

        return new_tokens, new_durations, new_f0


class FilterNamesDataset:
    """Restrict a dataset to the examples whose filename is listed in a file.

    Every line of ``fnames_path`` is parsed as a literal dict and its
    ``"audio"`` entry is taken as a filename to keep.
    """

    def __init__(self, dataset, fnames_path):
        self.dataset = dataset

        with open(fnames_path, "r") as fin:
            # literal_eval instead of eval: a listing file must not run code.
            fnames = set((ast.literal_eval(line)["audio"] for line in fin))
        print(f"# will retrict the dataset for {len(fnames)} files")

        self.indexes = []

        for i, datapoint in enumerate(dataset):
            if datapoint["filename"] in fnames:
                self.indexes.append(i)
        assert len(self.indexes) == len(fnames), f"{len(self.indexes)} {len(fnames)}"

        self.collater = self.dataset.collater
        self.discrete_dur = self.dataset.discrete_dur
        self.discrete_f0 = self.dataset.discrete_f0

    def __len__(self):
        return len(self.indexes)

    def __getitem__(self, k):
        k = self.indexes[k]
        return self.dataset[k]

    def size(self, k):
        k = self.indexes[k]
        return self.dataset.size(k)


@torch.no_grad()
def do_sampling(
    model,
    batch,
    eos_token,
    decoder,
    autoregressive_steps=100,
    teacher_force_tokens=False,
    teacher_force_duration=False,
    teacher_force_f0=False,
    match_duration=False,
):
    """Autoregressively extend the three input streams of ``batch``.

    ``batch["net_input"]`` is modified in place: every step appends one entry
    to ``src_tokens``, ``dur_src`` and ``f0_src``. With any teacher-forcing
    flag set, the step count is derived from the target length and the forced
    stream is copied from the targets. ``eos_token`` is accepted but not used.

    Returns:
        ``(tokens, duration, f0, outputs)`` where the first three are the
        extended input streams and ``outputs`` holds, per step, the last-step
        slices of the model output.
    """

    def autoregressive_step_(output, autoregressive_steps):
        new_tokens, new_durations, new_f0 = decoder(output)

        n = output["token"].size(1) if output["token"].ndim == 3 else 1

        if teacher_force_tokens:
            new_tokens = batch["target"][:, n - 1].unsqueeze(-1)
        if teacher_force_duration:
            new_durations = batch["dur_target"][:, n - 1].unsqueeze(-1)
        if teacher_force_f0:
            new_f0 = batch["f0_target"][:, n - 1].unsqueeze(-1)

        batch["net_input"]["src_tokens"] = torch.cat(
            [batch["net_input"]["src_tokens"], new_tokens], dim=1
        )
        batch["net_input"]["dur_src"] = torch.cat(
            [batch["net_input"]["dur_src"], new_durations], dim=1
        )
        batch["net_input"]["f0_src"] = torch.cat(
            [batch["net_input"]["f0_src"], new_f0], dim=1
        )

    outputs = []

    if teacher_force_tokens or teacher_force_duration or teacher_force_f0:
        max_time = batch["target"].size(1)
        prefix_time = batch["net_input"]["src_tokens"].size(1)

        autoregressive_steps = max_time - prefix_time + 1  # should be 0

    for _ in range(autoregressive_steps):
        output = model(**batch["net_input"])

        last_steps = (
            output["token"][:, -1, ...],
            output["duration"][:, -1, ...],
            output["f0"][:, -1, ...],
        )
        outputs.append(last_steps)

        autoregressive_step_(output, autoregressive_steps)

        duration = batch["net_input"]["dur_src"]
        if (
            match_duration
            and (batch["dur_target"].sum(dim=-1) < duration.sum(dim=-1)).all()
        ):
            break

    # Read the streams after the loop so that a non-positive step count returns
    # the unchanged prompt instead of failing on unbound locals.
    net_input = batch["net_input"]
    return net_input["src_tokens"], net_input["dur_src"], net_input["f0_src"], outputs


def unroll_duration(token_stream, duration_stream):
    """Repeat every item by its rounded duration (at least once)."""
    assert len(token_stream) == len(
        duration_stream
    ), f"{len(token_stream)} != {len(duration_stream)}"
    non_positive_durations = sum(d <= 0 for d in duration_stream)
    if non_positive_durations > 0:
        print(
            f"# {non_positive_durations} durations are non-positive, they will be capped to 1"
        )

    result = []

    duration_stream_rounded_capped = [max(1, int(round(x))) for x in duration_stream]
    for t, d in zip(token_stream, duration_stream_rounded_capped):
        result.extend([t] * d)

    return result


def realign_shifted_streams(tokens, durations, F0s, shifts):
    """
    Durations are shifted by 1, F0 by 2
    >>> tokens = ["<s>", "t1", "t2", "t3", "</s>", "x", "x"]
    >>> durations = ["<0>", "<0>", "d1", "d2", "d3", "<0>", "x"]
    >>> F0s = ["<0>", "<0>", "<0>", "f1", "f2", "f3", "<0>"]
    >>> shifts = [1,2]
    >>> realign_shifted_streams(tokens, durations, F0s, shifts)
    (['<s>', 't1', 't2', 't3', '</s>'], ['<0>', 'd1', 'd2', 'd3', '<0>'], ['<0>', 'f1', 'f2', 'f3', '<0>'])
    """
    max_shift = max(shifts)
    if max_shift > 0:
        shift_durations, shift_F0s = shifts

        tokens = tokens[:-max_shift]
        durations = durations[shift_durations:]
        if shift_durations < max_shift:
            durations = durations[: -(max_shift - shift_durations)]

        if F0s is not None:
            F0s = F0s[shift_F0s:]
            if shift_F0s < max_shift:
                F0s = F0s[: -(max_shift - shift_F0s)]

    assert len(tokens) == len(durations), f"{len(tokens)} =! {len(durations)}"
    if F0s is not None:
        assert len(tokens) == len(F0s), f"{len(tokens)} =! {len(F0s)}"

    return tokens, durations, F0s


def maybe_cut_eos(produced_tokens, produced_duration, produced_f0, eos_idx):
    """Drop everything from the first ``eos_idx`` token onwards, if present."""
    if eos_idx in produced_tokens:
        eos_index = produced_tokens.index(eos_idx)
        produced_tokens = produced_tokens[:eos_index]
        produced_duration = produced_duration[:eos_index]
        produced_f0 = produced_f0[:eos_index]
    return produced_tokens, produced_duration, produced_f0


def maybe_filter_pad(produced_tokens, produced_duration, produced_f0, pad_idx):
    """Remove every position whose token equals ``pad_idx`` from all streams."""
    if pad_idx not in produced_tokens:
        return produced_tokens, produced_duration, produced_f0

    assert len(produced_tokens) == len(produced_duration) == len(produced_f0)

    print("<pad> is detected in the output!")
    filtered_tokens, filtered_duration, filtered_f0 = [], [], []

    for t, d, f in zip(produced_tokens, produced_duration, produced_f0):
        if t != pad_idx:
            filtered_tokens.append(t)
            filtered_duration.append(d)
            filtered_f0.append(f)
    return filtered_tokens, filtered_duration, filtered_f0


def match_duration(produced_tokens, produced_duration, produced_f0, target_duration):
    """
    >>> tokens = ['t'] * 4
    >>> F0s = ['f0'] * 4
    >>> produced_duration = [1, 10, 10, 10]
    >>> match_duration(tokens, produced_duration, F0s, target_duration=100)
    (['t', 't', 't', 't'], [1, 10, 10, 10], ['f0', 'f0', 'f0', 'f0'])
    >>> match_duration(tokens, produced_duration, F0s, target_duration=5)
    (['t', 't'], [1, 4], ['f0', 'f0'])
    """
    if sum(produced_duration) <= target_duration:
        return produced_tokens, produced_duration, produced_f0

    running_duration = 0
    filtered_duration = []

    for next_tok_duration in produced_duration:
        if running_duration + next_tok_duration < target_duration:
            filtered_duration.append(next_tok_duration)
            running_duration += next_tok_duration
        else:
            # Shorten the entry that would cross the target, then stop.
            to_add = target_duration - running_duration
            assert to_add <= next_tok_duration
            filtered_duration.append(to_add)
            break

    produced_duration = filtered_duration
    assert sum(produced_duration) == target_duration

    n_tok = len(filtered_duration)

    return produced_tokens[:n_tok], produced_duration, produced_f0[:n_tok]


def main(rank, world_size, args):
    """Sample continuations for one worker and write them as JSON lines.

    With ``world_size > 1`` every worker writes ``<output>_<rank>``; after a
    barrier, rank 0 concatenates the partial files into ``<output>`` and
    removes them.
    """
    if world_size > 1:
        torch.distributed.init_process_group(
            backend="gloo", init_method="env://", world_size=world_size, rank=rank
        )
        torch.cuda.set_device(rank)

    raw_args = args
    args = convert_namespace_to_omegaconf(args)
    if args.common.seed is not None:
        random.seed(args.common.seed)
        np.random.seed(args.common.seed)
        utils.set_torch_seed(args.common.seed)

    models, _model_args, task = checkpoint_utils.load_model_ensemble_and_task(
        [raw_args.path], arg_overrides={"data": args.task.data}
    )
    tgt_dict = task.target_dictionary

    for model in models:
        model.prepare_for_inference_(args)
        model.cuda().eval()
        if raw_args.fp16:
            model = model.half()
    model = models[0]

    config = ExpressiveCodeDataConfig(args.task.data)

    dataset = CodeDataset(
        manifest=config.manifests[raw_args.subset],
        dictionary=task.source_dictionary,
        dur_dictionary=task.source_duration_dictionary,
        f0_dictionary=task.source_f0_dictionary,
        config=config,
        discrete_dur=task.cfg.discrete_duration,
        discrete_f0=task.cfg.discrete_f0,
        log_f0=task.cfg.log_f0,
        normalize_f0_mean=task.cfg.normalize_f0_mean,
        normalize_f0_std=task.cfg.normalize_f0_std,
        interpolate_f0=task.cfg.interpolate_f0,
        shifts=task.cfg.stream_shifts,
        return_filename=True,
        strip_filename=False,
    )
    shifts = dataset.shifts.dur, dataset.shifts.f0
    max_shift = max(shifts)

    out_fname = raw_args.output
    if world_size > 1:
        out_fname += f"_{rank}"

    if raw_args.filter_names:
        dataset = FilterNamesDataset(dataset, raw_args.filter_names)

    dataset = InferenceDataset(dataset, raw_args.prefix_length, filter_short=True)
    print(f"Dataset size {len(dataset)}")
    # A distributed sampler is only meaningful with several workers.
    sampler = (
        DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, shuffle=False
        )
        if world_size > 1
        else None
    )
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        collate_fn=dataset.collater,
        sampler=sampler,
    )

    Ts = raw_args.T_token, raw_args.T_duration, raw_args.T_f0
    decoder = TemperatureDecoder(
        Ts, discrete_dur=task.cfg.discrete_duration, discrete_f0=task.cfg.discrete_f0
    )

    dataset_size = len(dataset)

    f0_decoder = None
    if raw_args.f0_discretization_bounds:
        assert task.cfg.discrete_f0
        f0_decoder = Naive_F0_Decoder(raw_args.f0_discretization_bounds).cuda()

    pbar = (
        tqdm.tqdm(
            total=dataset_size
            if raw_args.max_samples is None
            else min(raw_args.max_samples, dataset_size)
        )
        if world_size == 1
        else None
    )

    samples_produced = 0

    # The context manager guarantees the file is flushed and closed even when
    # sampling fails part-way.
    with open(out_fname, "w") as output_file:
        for batch in dataloader:
            if (
                raw_args.max_samples is not None
                and samples_produced >= raw_args.max_samples
            ):
                break

            prefix = batch["prefix"][0]

            batch = explode_batch(batch, raw_args.batch_explosion_rate)
            batch = move_to_cuda(batch)

            if not raw_args.short_curcuit:
                produced_tokens, produced_durations, produced_f0, _ = do_sampling(
                    models[0],
                    batch,
                    tgt_dict.eos(),
                    decoder,
                    autoregressive_steps=raw_args.max_length - prefix + max_shift,
                    teacher_force_tokens=raw_args.teacher_force_tokens,
                    match_duration=raw_args.match_duration,
                    teacher_force_duration=raw_args.teacher_force_duration,
                    teacher_force_f0=raw_args.teacher_force_f0,
                )

                # stip entries corresponding to <s>
                produced_tokens = produced_tokens[:, 1:]
                produced_durations = produced_durations[:, 1:]
                produced_f0 = produced_f0[:, 1:]

            else:
                max_length = raw_args.max_length + max_shift
                produced_tokens, produced_durations, produced_f0 = (
                    batch["target"][:, :max_length],
                    batch["dur_target"][:, :max_length],
                    batch["f0_target"][:, :max_length],
                )

            if f0_decoder is not None:
                produced_f0 = f0_decoder(produced_f0)

            produced_tokens, produced_durations, produced_f0 = (
                produced_tokens.cpu().tolist(),
                produced_durations.cpu().tolist(),
                produced_f0.cpu().tolist(),
            )

            bsz = batch["target"].size(0)
            assert bsz == raw_args.batch_explosion_rate

            for i in range(bsz):
                if (
                    raw_args.max_samples is not None
                    and samples_produced >= raw_args.max_samples
                ):
                    break

                produced_tokens_i = produced_tokens[i]
                produced_durations_i = produced_durations[i]
                produced_f0_i = produced_f0[i]

                (
                    produced_tokens_i,
                    produced_durations_i,
                    produced_f0_i,
                ) = realign_shifted_streams(
                    produced_tokens_i, produced_durations_i, produced_f0_i, shifts
                )

                produced_tokens_i, produced_durations_i, produced_f0_i = maybe_cut_eos(
                    produced_tokens_i,
                    produced_durations_i,
                    produced_f0_i,
                    tgt_dict.eos(),
                )

                (
                    produced_tokens_i,
                    produced_durations_i,
                    produced_f0_i,
                ) = maybe_filter_pad(
                    produced_tokens_i,
                    produced_durations_i,
                    produced_f0_i,
                    tgt_dict.pad(),
                )

                if raw_args.match_duration:
                    # NB: here we cheat a bit and use that padding has duration 0
                    # so no need to re-align and remove padding
                    dur_target_i = batch["dur_target"][i, :].sum().item()
                    (
                        produced_tokens_i,
                        produced_durations_i,
                        produced_f0_i,
                    ) = match_duration(
                        produced_tokens_i,
                        produced_durations_i,
                        produced_f0_i,
                        dur_target_i,
                    )

                if raw_args.cut_prompt:
                    produced_tokens_i, produced_durations_i, produced_f0_i = (
                        produced_tokens_i[prefix:],
                        produced_durations_i[prefix:],
                        produced_f0_i[prefix:],
                    )

                prompt_fname = batch["filename"][0]
                audio_fname = (
                    str(pathlib.Path(prompt_fname).with_suffix("")) + f"__{i}.wav"
                )

                token_stream = unroll_duration(produced_tokens_i, produced_durations_i)
                f0_stream = unroll_duration(produced_f0_i, produced_durations_i)
                output_line = json.dumps(
                    {
                        "audio": audio_fname,
                        "prompt": prompt_fname,
                        raw_args.code_type: " ".join(map(str, token_stream)),
                        "duration": round(
                            sum(produced_durations_i)
                            * CODETYPE_TO_FRAMETIME[raw_args.code_type],
                            3,
                        ),
                        "raw_duration": produced_durations_i,
                        "raw_f0": produced_f0_i,
                        "f0": [round(f0, 3) for f0 in f0_stream],
                    }
                )
                print(output_line, file=output_file)

                if pbar:
                    pbar.update(1)
                samples_produced += 1

            if raw_args.debug:
                break

    if pbar:
        pbar.close()

    if world_size > 1:
        # important that everything is flushed before aggregating
        torch.distributed.barrier()

    if world_size > 1 and rank == 0:
        with open(raw_args.output, "w") as fout:
            for i in range(world_size):
                f = raw_args.output + f"_{i}"
                with open(f, "r") as fin:
                    fout.write(fin.read())
                os.remove(f)


def cli_main():
    """Parse the command line and launch one worker per visible GPU."""
    parser = options.get_interactive_generation_parser()
    parser.add_argument(
        "--prefix-length",
        type=int,
        default=1,
        help="Prompt prefix length (including <s>)",
    )
    parser.add_argument("--output", type=str, default=None, required=True)
    parser.add_argument(
        "--debug", action="store_true", help="Process only the first batch"
    )
    parser.add_argument(
        "--ignore-durations",
        action="store_true",
        help="If set, the duration stream is ignored",
    )
    parser.add_argument(
        "--max-length", type=int, default=200, help="Maximal produced length"
    )
    parser.add_argument(
        "--code-type", choices=["cpc_km100", "hubert"], default="cpc_km100"
    )
    parser.add_argument("--max-samples", type=int, default=None)
    parser.add_argument("--prompt-duration-scaler", type=float, default=1.0)
    parser.add_argument("--teacher-force-tokens", action="store_true", default=False)
    parser.add_argument("--teacher-force-duration", action="store_true", default=False)
    parser.add_argument("--teacher-force-f0", action="store_true", default=False)
    parser.add_argument("--filter-names", type=str, default=None)
    parser.add_argument(
        "--match-duration",
        action="store_true",
        help="Do not produce sequences longer that ground-truth",
    )
    parser.add_argument(
        "--cut-prompt",
        action="store_true",
        help="Remove prompt from the produced audio",
    )
    parser.add_argument(
        "--short-curcuit", action="store_true", help="Use 'target' as a sample"
    )
    parser.add_argument("--f0-discretization-bounds", type=str, default=None)
    parser.add_argument("--batch-explosion-rate", type=int, default=1)

    parser.add_argument("--T-token", type=float, default=1.0)
    parser.add_argument("--T-duration", type=float, default=1.0)
    parser.add_argument("--T-f0", type=float, default=1.0)

    parser.add_argument(
        "--subset", type=str, default="valid", choices=["test", "valid"]
    )

    args = options.parse_args_and_arch(parser)

    assert (
        args.prefix_length >= 1
    ), "Prefix length includes bos token <s>, hence the minimum is 1."
    assert all(
        t >= 0 for t in [args.T_token, args.T_f0, args.T_duration]
    ), "T must be non-negative!"

    world_size = torch.cuda.device_count()
    if world_size > 1:
        mp.set_start_method("spawn", force=True)
        os.environ["MASTER_ADDR"] = "localhost"
        os.environ["MASTER_PORT"] = str(random.randint(10_000, 50_000))

        print(f"Using {world_size} devices, master port {os.environ['MASTER_PORT']}")

        mp.spawn(
            main,
            nprocs=world_size,
            args=(
                world_size,
                args,
            ),
            join=True,
        )
    else:
        main(rank=0, world_size=world_size, args=args)


if __name__ == "__main__":
    cli_main()


# ================================================
# FILE: examples/textless_nlp/pgslm/scripts/join_units_manifest.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
def main():
    """Join a wav manifest with a units file into a JSON-lines manifest.

    Command-line arguments:
        --manifest: text file whose first line is the audio root directory;
            every following line is ``<relative path> <number of frames>``.
        --units: text file with one ``|``-separated record per line; the
            second field holds the unit sequence. Records are matched to
            manifest entries by position.
        --output: destination file; one JSON object is written per pair.
        --sample_rate: sampling rate used to convert frames into seconds
            (default 16000).

    Each output object has the keys ``audio`` (root joined with the relative
    path), ``duration`` (seconds), ``hubert_km100`` (the stripped unit
    string) and ``speaker`` (name of the grandparent directory of the audio
    file).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--manifest", required=True)
    parser.add_argument("--units", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--sample_rate", type=int, default=16_000)
    args = parser.parse_args()

    sample_rate = float(args.sample_rate)

    with open(args.manifest, "r") as manifest, open(
        args.units, "r"
    ) as units_file, open(args.output, "w") as outp:
        root = pathlib.Path(manifest.readline().strip())

        # Iterate the files lazily instead of materialising them with
        # readlines(); pairing stops at the shorter file, as before.
        for manifest_line, unit_line in zip(manifest, units_file):
            # Split on the LAST whitespace run only, so that audio paths
            # containing spaces are kept intact.
            path, frames = manifest_line.rsplit(maxsplit=1)
            audio_path = root / path

            record = dict(
                audio=str(audio_path),
                duration=int(frames) / sample_rate,
                hubert_km100=unit_line.split("|")[1].strip(),
                speaker=audio_path.parent.parent.name,
            )
            print(json.dumps(record), file=outp)
set -eu

# Prepares a pGSLM data directory: links the three manifests into $out_dir,
# writes $out_dir/data_config.json, extracts F0 for every split and finally
# runs prepare_dataset.py on the result.
#
# Positional arguments:
#   1 train_json    manifest of the training split
#   2 valid_json    manifest of the validation split
#   3 test_json     manifest of the test split
#   4 n_units       written to the config as "n_units"
#   5 hop_size      written to the config as "code_hop_size"
#   6 sr            sampling rate; written to the config and passed to preprocess_f0.py
#   7 f0_quantizer  written to the config under f0_vq_naive_quantizer.log_mean_norm
#   8 out_dir       output directory (created if missing)
#
# NOTE: the manifests are symlinked with `ln -s`, which resolves a relative
# target against the directory of the link; pass absolute manifest paths.
if [ "$#" -lt 8 ]; then
    echo "usage: $0 train_json valid_json test_json n_units hop_size sr f0_quantizer out_dir" >&2
    exit 1
fi

train_json=$1
valid_json=$2
test_json=$3
n_units=$4
hop_size=$5
sr=$6
f0_quantizer=$7
out_dir=$8

meta_path="$out_dir/data_config.json"
f0_dir="$out_dir/f0"

# Every expansion below is quoted so that paths containing spaces survive.
mkdir -p "$out_dir"
ln -sf "$train_json" "$out_dir/train.txt"
ln -sf "$valid_json" "$out_dir/valid.txt"
ln -sf "$test_json" "$out_dir/test.txt"

cat <<EOF >"$meta_path"
{
    "manifests": {
        "train": "$out_dir/train.txt",
        "valid": "$out_dir/valid.txt",
        "test": "$out_dir/test.txt"
    },
    "n_units": $n_units,
    "code_hop_size": $hop_size,
    "sampling_rate": $sr,
    "multispkr": "parent_parent_name",

    "f0_vq_type": "naive",
    "f0_vq_naive_quantizer": {
        "log_mean_norm": "$f0_quantizer"
    },
    "f0_vq_n_units": 32
}
EOF

for split in train valid test; do
    python examples/textless_nlp/pgslm/preprocess_f0.py \
        "$out_dir/$split.txt" "$f0_dir/$split" --nshards=1 --rank=1 --sampling_rate="$sr"

    #NSHARDS=16
    #seq 1 $NSHARDS | parallel -j $NSHARDS python examples/textless_nlp/pgslm/preprocess_f0.py \
    #    $out_dir/$split.txt $f0_dir/$split --nshards=$NSHARDS --sampling_rate=$sr --rank
done

# Please make sure that the number of shards (--nshards_list) is consistent across commands
python examples/textless_nlp/pgslm/prepare_dataset.py \
    "$meta_path" "$f0_dir" --splits test valid train --nshards_list 1
def truncated_laplace(mean, T, truncate_by_zero=False):
    """Sample element-wise from a Laplace distribution via its inverse CDF.

    ``mean`` is a tensor of location parameters and ``T`` is the scale. One
    uniform variate is drawn per element of ``mean`` and pushed through the
    Laplace quantile function.

    With ``truncate_by_zero=True`` the distribution is left-truncated at
    zero: the uniform variates are rescaled into ``[CDF(0), 1)`` before
    inversion. Negative means are not supported in that mode; they trigger a
    warning and are clamped to zero first.

    Background: https://stats.stackexchange.com/a/357598 .
    """
    assert isinstance(mean, torch.Tensor)

    # Probability mass lying below the truncation point (none by default).
    mass_below = 0.0
    if truncate_by_zero:
        has_negative = not (mean >= 0.0).all()
        if has_negative:
            warnings.warn(f"means are supposed to be non-negative, but got {mean}")
            mean = torch.clamp_min(mean, 0.0)

        lower_bound = mean.new_tensor([0.0])
        side = torch.sign(lower_bound - mean)
        tail = 1.0 - torch.exp(-1.0 / T * torch.abs(mean - lower_bound))
        # Laplace CDF evaluated at the lower bound.
        mass_below = 0.5 + 0.5 * side * tail

    uniform = torch.empty_like(mean).uniform_()
    p = uniform * (1.0 - mass_below) + mass_below

    # Quantile function of the Laplace distribution applied to p.
    offset = p - 0.5
    return mean - T * torch.sign(offset) * torch.log(1 - 2 * torch.abs(offset))
## Quick Links - [Paper](https://arxiv.org/pdf/2104.00355.pdf) - [Samples](https://speechbot.github.io/resynthesis/index.html) - [Code](https://github.com/facebookresearch/speech-resynthesis) The codebase for the [Speech Resynthesis from Discrete Disentangled Self-Supervised Representations](https://arxiv.org/abs/2104.00355) paper can be found under the following [repository](https://github.com/facebookresearch/speech-resynthesis). ## Citation ``` @inproceedings{polyak21_interspeech, author={Adam Polyak and Yossi Adi and Jade Copet and Eugene Kharitonov and Kushal Lakhotia and Wei-Ning Hsu and Abdelrahman Mohamed and Emmanuel Dupoux}, title={{Speech Resynthesis from Discrete Disentangled Self-Supervised Representations}}, year=2021, booktitle={Proc. Interspeech 2021}, } ``` ================================================ FILE: examples/translation/README.md ================================================ # Neural Machine Translation This README contains instructions for [using pretrained translation models](#example-usage-torchhub) as well as [training new models](#training-a-new-model). 
## Pre-trained models Model | Description | Dataset | Download ---|---|---|--- `conv.wmt14.en-fr` | Convolutional <br> ([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2) <br> newstest2012/2013: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.ntst1213.tar.bz2) `conv.wmt14.en-de` | Convolutional <br> ([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT14 English-German](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-de.newstest2014.tar.bz2) `conv.wmt17.en-de` | Convolutional <br> ([Gehring et al., 2017](https://arxiv.org/abs/1705.03122)) | [WMT17 English-German](http://statmt.org/wmt17/translation-task.html#Download) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt17.v2.en-de.newstest2014.tar.bz2) `transformer.wmt14.en-fr` | Transformer <br> ([Ott et al., 2018](https://arxiv.org/abs/1806.00187)) | [WMT14 English-French](http://statmt.org/wmt14/translation-task.html#Download) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt14.en-fr.joined-dict.newstest2014.tar.bz2) `transformer.wmt16.en-de` | Transformer <br> ([Ott et al., 
2018](https://arxiv.org/abs/1806.00187)) | [WMT16 English-German](https://drive.google.com/uc?export=download&id=0B_bZck-ksdkpM25jRUN2X2UxMm8) | model: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2) <br> newstest2014: <br> [download (.tar.bz2)](https://dl.fbaipublicfiles.com/fairseq/data/wmt16.en-de.joined-dict.newstest2014.tar.bz2) `transformer.wmt18.en-de` | Transformer <br> ([Edunov et al., 2018](https://arxiv.org/abs/1808.09381)) <br> WMT'18 winner | [WMT'18 English-German](http://www.statmt.org/wmt18/translation-task.html) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz) <br> See NOTE in the archive `transformer.wmt19.en-de` | Transformer <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) <br> WMT'19 winner | [WMT'19 English-German](http://www.statmt.org/wmt19/translation-task.html) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz) `transformer.wmt19.de-en` | Transformer <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) <br> WMT'19 winner | [WMT'19 German-English](http://www.statmt.org/wmt19/translation-task.html) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz) `transformer.wmt19.en-ru` | Transformer <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) <br> WMT'19 winner | [WMT'19 English-Russian](http://www.statmt.org/wmt19/translation-task.html) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz) `transformer.wmt19.ru-en` | Transformer <br> ([Ng et al., 2019](https://arxiv.org/abs/1907.06616)) <br> WMT'19 winner | [WMT'19 Russian-English](http://www.statmt.org/wmt19/translation-task.html) | model: <br> [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz) ## 
Example usage (torch.hub) We require a few additional Python dependencies for preprocessing: ```bash pip install fastBPE sacremoses subword_nmt ``` Interactive translation via PyTorch Hub: ```python import torch # List available models torch.hub.list('pytorch/fairseq') # [..., 'transformer.wmt16.en-de', ... ] # Load a transformer trained on WMT'16 En-De # Note: WMT'19 models use fastBPE instead of subword_nmt, see instructions below en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt16.en-de', tokenizer='moses', bpe='subword_nmt') en2de.eval() # disable dropout # The underlying model is available under the *models* attribute assert isinstance(en2de.models[0], fairseq.models.transformer.TransformerModel) # Move model to GPU for faster translation en2de.cuda() # Translate a sentence en2de.translate('Hello world!') # 'Hallo Welt!' # Batched translation en2de.translate(['Hello world!', 'The cat sat on the mat.']) # ['Hallo Welt!', 'Die Katze saß auf der Matte.'] ``` Loading custom models: ```python from fairseq.models.transformer import TransformerModel zh2en = TransformerModel.from_pretrained( '/path/to/checkpoints', checkpoint_file='checkpoint_best.pt', data_name_or_path='data-bin/wmt17_zh_en_full', bpe='subword_nmt', bpe_codes='data-bin/wmt17_zh_en_full/zh.code' ) zh2en.translate('你好 世界') # 'Hello World' ``` If you are using one of the `transformer.wmt19` models, you will need to set the `bpe` argument to `'fastbpe'` and (optionally) load the 4-model ensemble: ```python en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', tokenizer='moses', bpe='fastbpe') en2de.eval() # disable dropout ``` ## Example usage (CLI tools) Generation with the binarized test sets can be run in batch mode as follows, e.g.
for WMT 2014 English-French on a GTX-1080ti: ```bash mkdir -p data-bin curl https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2 | tar xvjf - -C data-bin curl https://dl.fbaipublicfiles.com/fairseq/data/wmt14.v2.en-fr.newstest2014.tar.bz2 | tar xvjf - -C data-bin fairseq-generate data-bin/wmt14.en-fr.newstest2014 \ --path data-bin/wmt14.en-fr.fconv-py/model.pt \ --beam 5 --batch-size 128 --remove-bpe | tee /tmp/gen.out # ... # | Translated 3003 sentences (96311 tokens) in 166.0s (580.04 tokens/s) # | Generate test with beam=5: BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) # Compute BLEU score grep ^H /tmp/gen.out | cut -f3- > /tmp/gen.out.sys grep ^T /tmp/gen.out | cut -f2- > /tmp/gen.out.ref fairseq-score --sys /tmp/gen.out.sys --ref /tmp/gen.out.ref # BLEU4 = 40.83, 67.5/46.9/34.4/25.5 (BP=1.000, ratio=1.006, syslen=83262, reflen=82787) ``` ## Training a new model ### IWSLT'14 German to English (Transformer) The following instructions can be used to train a Transformer model on the [IWSLT'14 German to English dataset](http://workshop2014.iwslt.org/downloads/proceeding.pdf). First download and preprocess the data: ```bash # Download and prepare the data cd examples/translation/ bash prepare-iwslt14.sh cd ../.. 
# Preprocess/binarize the data TEXT=examples/translation/iwslt14.tokenized.de-en fairseq-preprocess --source-lang de --target-lang en \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --destdir data-bin/iwslt14.tokenized.de-en \ --workers 20 ``` Next we'll train a Transformer translation model over this data: ```bash CUDA_VISIBLE_DEVICES=0 fairseq-train \ data-bin/iwslt14.tokenized.de-en \ --arch transformer_iwslt_de_en --share-decoder-input-output-embed \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr 5e-4 --lr-scheduler inverse_sqrt --warmup-updates 4000 \ --dropout 0.3 --weight-decay 0.0001 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --max-tokens 4096 \ --eval-bleu \ --eval-bleu-args '{"beam": 5, "max_len_a": 1.2, "max_len_b": 10}' \ --eval-bleu-detok moses \ --eval-bleu-remove-bpe \ --eval-bleu-print-samples \ --best-checkpoint-metric bleu --maximize-best-checkpoint-metric ``` Finally we can evaluate our trained model: ```bash fairseq-generate data-bin/iwslt14.tokenized.de-en \ --path checkpoints/checkpoint_best.pt \ --batch-size 128 --beam 5 --remove-bpe ``` ### WMT'14 English to German (Convolutional) The following instructions can be used to train a Convolutional translation model on the WMT English to German dataset. See the [Scaling NMT README](../scaling_nmt/README.md) for instructions to train a Transformer translation model on this data. The WMT English to German dataset can be preprocessed using the `prepare-wmt14en2de.sh` script. By default it will produce a dataset that was modeled after [Attention Is All You Need (Vaswani et al., 2017)](https://arxiv.org/abs/1706.03762), but with additional news-commentary-v12 data from WMT'17. To use only data available in WMT'14 or to replicate results obtained in the original [Convolutional Sequence to Sequence Learning (Gehring et al., 2017)](https://arxiv.org/abs/1705.03122) paper, please use the `--icml17` option. 
```bash # Download and prepare the data cd examples/translation/ # WMT'17 data: bash prepare-wmt14en2de.sh # or to use WMT'14 data: # bash prepare-wmt14en2de.sh --icml17 cd ../.. # Binarize the dataset TEXT=examples/translation/wmt17_en_de fairseq-preprocess \ --source-lang en --target-lang de \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --destdir data-bin/wmt17_en_de --thresholdtgt 0 --thresholdsrc 0 \ --workers 20 # Train the model mkdir -p checkpoints/fconv_wmt_en_de fairseq-train \ data-bin/wmt17_en_de \ --arch fconv_wmt_en_de \ --dropout 0.2 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --optimizer nag --clip-norm 0.1 \ --lr 0.5 --lr-scheduler fixed --force-anneal 50 \ --max-tokens 4000 \ --save-dir checkpoints/fconv_wmt_en_de # Evaluate fairseq-generate data-bin/wmt17_en_de \ --path checkpoints/fconv_wmt_en_de/checkpoint_best.pt \ --beam 5 --remove-bpe ``` ### WMT'14 English to French ```bash # Download and prepare the data cd examples/translation/ bash prepare-wmt14en2fr.sh cd ../.. # Binarize the dataset TEXT=examples/translation/wmt14_en_fr fairseq-preprocess \ --source-lang en --target-lang fr \ --trainpref $TEXT/train --validpref $TEXT/valid --testpref $TEXT/test \ --destdir data-bin/wmt14_en_fr --thresholdtgt 0 --thresholdsrc 0 \ --workers 60 # Train the model mkdir -p checkpoints/fconv_wmt_en_fr fairseq-train \ data-bin/wmt14_en_fr \ --arch fconv_wmt_en_fr \ --dropout 0.1 \ --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \ --optimizer nag --clip-norm 0.1 \ --lr 0.5 --lr-scheduler fixed --force-anneal 50 \ --max-tokens 3000 \ --save-dir checkpoints/fconv_wmt_en_fr # Evaluate fairseq-generate \ data-bin/wmt14_en_fr \ --path checkpoints/fconv_wmt_en_fr/checkpoint_best.pt \ --beam 5 --remove-bpe ``` ## Multilingual Translation We also support training multilingual translation models. In this example we'll train a multilingual `{de,fr}-en` translation model using the IWSLT'17 datasets.
Note that we use slightly different preprocessing here than for the IWSLT'14 En-De data above. In particular we learn a joint BPE code for all three languages and use fairseq-interactive and sacrebleu for scoring the test set. ```bash # First install sacrebleu and sentencepiece pip install sacrebleu sentencepiece # Then download and preprocess the data cd examples/translation/ bash prepare-iwslt17-multilingual.sh cd ../.. # Binarize the de-en dataset TEXT=examples/translation/iwslt17.de_fr.en.bpe16k fairseq-preprocess --source-lang de --target-lang en \ --trainpref $TEXT/train.bpe.de-en \ --validpref $TEXT/valid0.bpe.de-en,$TEXT/valid1.bpe.de-en,$TEXT/valid2.bpe.de-en,$TEXT/valid3.bpe.de-en,$TEXT/valid4.bpe.de-en,$TEXT/valid5.bpe.de-en \ --destdir data-bin/iwslt17.de_fr.en.bpe16k \ --workers 10 # Binarize the fr-en dataset # NOTE: it's important to reuse the en dictionary from the previous step fairseq-preprocess --source-lang fr --target-lang en \ --trainpref $TEXT/train.bpe.fr-en \ --validpref $TEXT/valid0.bpe.fr-en,$TEXT/valid1.bpe.fr-en,$TEXT/valid2.bpe.fr-en,$TEXT/valid3.bpe.fr-en,$TEXT/valid4.bpe.fr-en,$TEXT/valid5.bpe.fr-en \ --tgtdict data-bin/iwslt17.de_fr.en.bpe16k/dict.en.txt \ --destdir data-bin/iwslt17.de_fr.en.bpe16k \ --workers 10 # Train a multilingual transformer model # NOTE: the command below assumes 1 GPU, but accumulates gradients from # 8 fwd/bwd passes to simulate training on 8 GPUs mkdir -p checkpoints/multilingual_transformer CUDA_VISIBLE_DEVICES=0 fairseq-train data-bin/iwslt17.de_fr.en.bpe16k/ \ --max-epoch 50 \ --ddp-backend=legacy_ddp \ --task multilingual_translation --lang-pairs de-en,fr-en \ --arch multilingual_transformer_iwslt_de_en \ --share-decoders --share-decoder-input-output-embed \ --optimizer adam --adam-betas '(0.9, 0.98)' \ --lr 0.0005 --lr-scheduler inverse_sqrt \ --warmup-updates 4000 --warmup-init-lr '1e-07' \ --label-smoothing 0.1 --criterion label_smoothed_cross_entropy \ --dropout 0.3 --weight-decay 0.0001 \ 
--save-dir checkpoints/multilingual_transformer \ --max-tokens 4000 \ --update-freq 8 # Generate and score the test set with sacrebleu SRC=de sacrebleu --test-set iwslt17 --language-pair ${SRC}-en --echo src \ | python scripts/spm_encode.py --model examples/translation/iwslt17.de_fr.en.bpe16k/sentencepiece.bpe.model \ > iwslt17.test.${SRC}-en.${SRC}.bpe cat iwslt17.test.${SRC}-en.${SRC}.bpe \ | fairseq-interactive data-bin/iwslt17.de_fr.en.bpe16k/ \ --task multilingual_translation --lang-pairs de-en,fr-en \ --source-lang ${SRC} --target-lang en \ --path checkpoints/multilingual_transformer/checkpoint_best.pt \ --buffer-size 2000 --batch-size 128 \ --beam 5 --remove-bpe=sentencepiece \ > iwslt17.test.${SRC}-en.en.sys grep ^H iwslt17.test.${SRC}-en.en.sys | cut -f3 \ | sacrebleu --test-set iwslt17 --language-pair ${SRC}-en ``` ##### Argument format during inference During inference it is required to specify a single `--source-lang` and `--target-lang`, which together indicate the inference language direction. `--lang-pairs`, `--encoder-langtok`, `--decoder-langtok` have to be set to the same values as in training. ================================================ FILE: examples/translation/prepare-iwslt14.sh ================================================ #!/usr/bin/env bash # # Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh echo 'Cloning Moses github repository (for tokenization scripts)...' git clone https://github.com/moses-smt/mosesdecoder.git echo 'Cloning Subword NMT repository (for BPE pre-processing)...' git clone https://github.com/rsennrich/subword-nmt.git SCRIPTS=mosesdecoder/scripts TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl LC=$SCRIPTS/tokenizer/lowercase.perl CLEAN=$SCRIPTS/training/clean-corpus-n.perl BPEROOT=subword-nmt/subword_nmt BPE_TOKENS=10000 URL="http://dl.fbaipublicfiles.com/fairseq/data/iwslt14/de-en.tgz" GZ=de-en.tgz if [ ! -d "$SCRIPTS" ]; then echo "Please set SCRIPTS variable correctly to point to Moses scripts."
exit fi src=de tgt=en lang=de-en prep=iwslt14.tokenized.de-en tmp=$prep/tmp orig=orig mkdir -p $orig $tmp $prep echo "Downloading data from ${URL}..." cd $orig wget "$URL" if [ -f $GZ ]; then echo "Data successfully downloaded." else echo "Data not successfully downloaded." exit fi tar zxvf $GZ cd .. echo "pre-processing train data..." for l in $src $tgt; do f=train.tags.$lang.$l tok=train.tags.$lang.tok.$l cat $orig/$lang/$f | \ grep -v '<url>' | \ grep -v '<talkid>' | \ grep -v '<keywords>' | \ sed -e 's/<title>//g' | \ sed -e 's/<\/title>//g' | \ sed -e 's/<description>//g' | \ sed -e 's/<\/description>//g' | \ perl $TOKENIZER -threads 8 -l $l > $tmp/$tok echo "" done perl $CLEAN -ratio 1.5 $tmp/train.tags.$lang.tok $src $tgt $tmp/train.tags.$lang.clean 1 175 for l in $src $tgt; do perl $LC < $tmp/train.tags.$lang.clean.$l > $tmp/train.tags.$lang.$l done echo "pre-processing valid/test data..." for l in $src $tgt; do for o in `ls $orig/$lang/IWSLT14.TED*.$l.xml`; do fname=${o##*/} f=$tmp/${fname%.*} echo $o $f grep '<seg id' $o | \ sed -e 's/<seg id="[0-9]*">\s*//g' | \ sed -e 's/\s*<\/seg>\s*//g' | \ sed -e "s/\’/\'/g" | \ perl $TOKENIZER -threads 8 -l $l | \ perl $LC > $f echo "" done done echo "creating train, valid, test..." for l in $src $tgt; do awk '{if (NR%23 == 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/valid.$l awk '{if (NR%23 != 0) print $0; }' $tmp/train.tags.de-en.$l > $tmp/train.$l cat $tmp/IWSLT14.TED.dev2010.de-en.$l \ $tmp/IWSLT14.TEDX.dev2012.de-en.$l \ $tmp/IWSLT14.TED.tst2010.de-en.$l \ $tmp/IWSLT14.TED.tst2011.de-en.$l \ $tmp/IWSLT14.TED.tst2012.de-en.$l \ > $tmp/test.$l done TRAIN=$tmp/train.en-de BPE_CODE=$prep/code rm -f $TRAIN for l in $src $tgt; do cat $tmp/train.$l >> $TRAIN done echo "learn_bpe.py on ${TRAIN}..." python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE for L in $src $tgt; do for f in train.$L valid.$L test.$L; do echo "apply_bpe.py to ${f}..." 
# Prepares the IWSLT'17 {de,fr}-en data for multilingual training:
# downloads and extracts the archives, strips markup from the training and
# validation files, learns a joint sentencepiece BPE model over all training
# text and encodes train/valid with it. All outputs are written next to this
# script.

SRCS=(
    "de"
    "fr"
)
TGT=en

ROOT=$(dirname "$0")
SCRIPTS=$ROOT/../../scripts
SPM_TRAIN=$SCRIPTS/spm_train.py
SPM_ENCODE=$SCRIPTS/spm_encode.py

BPESIZE=16384
ORIG=$ROOT/iwslt17_orig
DATA=$ROOT/iwslt17.de_fr.en.bpe16k
mkdir -p "$ORIG" "$DATA"

TRAIN_MINLEN=1  # remove sentences with <1 BPE token
TRAIN_MAXLEN=250  # remove sentences with >250 BPE tokens

URLS=(
    "https://wit3.fbk.eu/archive/2017-01-trnted/texts/de/en/de-en.tgz"
    "https://wit3.fbk.eu/archive/2017-01-trnted/texts/fr/en/fr-en.tgz"
)
ARCHIVES=(
    "de-en.tgz"
    "fr-en.tgz"
)
# One space-separated list of validation sets per source language (same order as SRCS).
VALID_SETS=(
    "IWSLT17.TED.dev2010.de-en IWSLT17.TED.tst2010.de-en IWSLT17.TED.tst2011.de-en IWSLT17.TED.tst2012.de-en IWSLT17.TED.tst2013.de-en IWSLT17.TED.tst2014.de-en IWSLT17.TED.tst2015.de-en"
    "IWSLT17.TED.dev2010.fr-en IWSLT17.TED.tst2010.fr-en IWSLT17.TED.tst2011.fr-en IWSLT17.TED.tst2012.fr-en IWSLT17.TED.tst2013.fr-en IWSLT17.TED.tst2014.fr-en IWSLT17.TED.tst2015.fr-en"
)

# download and extract data
for ((i=0;i<${#URLS[@]};++i)); do
    ARCHIVE=$ORIG/${ARCHIVES[i]}
    if [ -f "$ARCHIVE" ]; then
        echo "$ARCHIVE already exists, skipping download"
    else
        URL=${URLS[i]}
        wget -P "$ORIG" "$URL"
        if [ -f "$ARCHIVE" ]; then
            echo "$URL successfully downloaded."
        else
            echo "$URL not successfully downloaded."
            exit 1
        fi
    fi
    # Strip the ".tgz" suffix to get the extraction directory
    # ($ORIG/de-en, $ORIG/fr-en). The previous "${ARCHIVE: -4}" produced the
    # literal string ".tgz", so this check never matched and the archive was
    # re-extracted on every run.
    FILE=${ARCHIVE%.tgz}
    if [ -e "$FILE" ]; then
        echo "$FILE already exists, skipping extraction"
    else
        tar -C "$ORIG" -xzvf "$ARCHIVE"
    fi
done

echo "pre-processing train data..."
# NOTE: the loop variable LANG below is also the name of the locale
# environment variable; it is kept for compatibility with the original script.
for SRC in "${SRCS[@]}"; do
    for LANG in "${SRC}" "${TGT}"; do
        cat "$ORIG/${SRC}-${TGT}/train.tags.${SRC}-${TGT}.${LANG}" \
            | grep -v '<url>' \
            | grep -v '<talkid>' \
            | grep -v '<keywords>' \
            | grep -v '<speaker>' \
            | grep -v '<reviewer' \
            | grep -v '<translator' \
            | grep -v '<doc' \
            | grep -v '</doc>' \
            | sed -e 's/<title>//g' \
            | sed -e 's/<\/title>//g' \
            | sed -e 's/<description>//g' \
            | sed -e 's/<\/description>//g' \
            | sed 's/^\s*//g' \
            | sed 's/\s*$//g' \
            > "$DATA/train.${SRC}-${TGT}.${LANG}"
    done
done

echo "pre-processing valid data..."
for ((i=0;i<${#SRCS[@]};++i)); do
    SRC=${SRCS[i]}
    # Intentionally unquoted: word splitting turns the list into an array.
    VALID_SET=(${VALID_SETS[i]})
    for ((j=0;j<${#VALID_SET[@]};++j)); do
        FILE=${VALID_SET[j]}
        for LANG in "$SRC" "$TGT"; do
            grep '<seg id' "$ORIG/${SRC}-${TGT}/${FILE}.${LANG}.xml" \
                | sed -e 's/<seg id="[0-9]*">\s*//g' \
                | sed -e 's/\s*<\/seg>\s*//g' \
                | sed -e "s/\’/\'/g" \
                > "$DATA/valid${j}.${SRC}-${TGT}.${LANG}"
        done
    done
done

# learn BPE with sentencepiece
TRAIN_FILES=$(for SRC in "${SRCS[@]}"; do echo "$DATA/train.${SRC}-${TGT}.${SRC}"; echo "$DATA/train.${SRC}-${TGT}.${TGT}"; done | tr "\n" ",")
echo "learning joint BPE over ${TRAIN_FILES}..."
python "$SPM_TRAIN" \
    --input="$TRAIN_FILES" \
    --model_prefix="$DATA/sentencepiece.bpe" \
    --vocab_size=$BPESIZE \
    --character_coverage=1.0 \
    --model_type=bpe

# encode train/valid
echo "encoding train with learned BPE..."
for SRC in "${SRCS[@]}"; do
    python "$SPM_ENCODE" \
        --model "$DATA/sentencepiece.bpe.model" \
        --output_format=piece \
        --inputs "$DATA/train.${SRC}-${TGT}.${SRC}" "$DATA/train.${SRC}-${TGT}.${TGT}" \
        --outputs "$DATA/train.bpe.${SRC}-${TGT}.${SRC}" "$DATA/train.bpe.${SRC}-${TGT}.${TGT}" \
        --min-len $TRAIN_MINLEN --max-len $TRAIN_MAXLEN
done

echo "encoding valid with learned BPE..."
for ((i=0;i<${#SRCS[@]};++i)); do
    SRC=${SRCS[i]}
    VALID_SET=(${VALID_SETS[i]})
    for ((j=0;j<${#VALID_SET[@]};++j)); do
        python "$SPM_ENCODE" \
            --model "$DATA/sentencepiece.bpe.model" \
            --output_format=piece \
            --inputs "$DATA/valid${j}.${SRC}-${TGT}.${SRC}" "$DATA/valid${j}.${SRC}-${TGT}.${TGT}" \
            --outputs "$DATA/valid${j}.bpe.${SRC}-${TGT}.${SRC}" "$DATA/valid${j}.bpe.${SRC}-${TGT}.${TGT}"
    done
done
CORPORA[2]="training/news-commentary-v9.de-en" OUTDIR=wmt14_en_de else OUTDIR=wmt17_en_de fi if [ ! -d "$SCRIPTS" ]; then echo "Please set SCRIPTS variable correctly to point to Moses scripts." exit fi src=en tgt=de lang=en-de prep=$OUTDIR tmp=$prep/tmp orig=orig dev=dev/newstest2013 mkdir -p $orig $tmp $prep cd $orig for ((i=0;i<${#URLS[@]};++i)); do file=${FILES[i]} if [ -f $file ]; then echo "$file already exists, skipping download" else url=${URLS[i]} wget "$url" if [ -f $file ]; then echo "$url successfully downloaded." else echo "$url not successfully downloaded." exit -1 fi if [ ${file: -4} == ".tgz" ]; then tar zxvf $file elif [ ${file: -4} == ".tar" ]; then tar xvf $file fi fi done cd .. echo "pre-processing train data..." for l in $src $tgt; do rm $tmp/train.tags.$lang.tok.$l for f in "${CORPORA[@]}"; do cat $orig/$f.$l | \ perl $NORM_PUNC $l | \ perl $REM_NON_PRINT_CHAR | \ perl $TOKENIZER -threads 8 -a -l $l >> $tmp/train.tags.$lang.tok.$l done done echo "pre-processing test data..." for l in $src $tgt; do if [ "$l" == "$src" ]; then t="src" else t="ref" fi grep '<seg id' $orig/test-full/newstest2014-deen-$t.$l.sgm | \ sed -e 's/<seg id="[0-9]*">\s*//g' | \ sed -e 's/\s*<\/seg>\s*//g' | \ sed -e "s/\’/\'/g" | \ perl $TOKENIZER -threads 8 -a -l $l > $tmp/test.$l echo "" done echo "splitting train and valid..." for l in $src $tgt; do awk '{if (NR%100 == 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/valid.$l awk '{if (NR%100 != 0) print $0; }' $tmp/train.tags.$lang.tok.$l > $tmp/train.$l done TRAIN=$tmp/train.de-en BPE_CODE=$prep/code rm -f $TRAIN for l in $src $tgt; do cat $tmp/train.$l >> $TRAIN done echo "learn_bpe.py on ${TRAIN}..." python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < $TRAIN > $BPE_CODE for L in $src $tgt; do for f in train.$L valid.$L test.$L; do echo "apply_bpe.py to ${f}..." 
#!/bin/bash
# Adapted from https://github.com/facebookresearch/MIXER/blob/master/prepareData.sh
#
# Downloads the WMT'14 English-French training corpora and test set, tokenizes
# them with the Moses scripts, learns/applies BPE with subword-nmt and writes
# the cleaned train/valid/test files into wmt14_en_fr/.

# Only clone when the checkout is missing so the script can be re-run.
if [ ! -d mosesdecoder ]; then
    echo 'Cloning Moses github repository (for tokenization scripts)...'
    git clone https://github.com/moses-smt/mosesdecoder.git
fi

if [ ! -d subword-nmt ]; then
    echo 'Cloning Subword NMT repository (for BPE pre-processing)...'
    git clone https://github.com/rsennrich/subword-nmt.git
fi

SCRIPTS=mosesdecoder/scripts
TOKENIZER=$SCRIPTS/tokenizer/tokenizer.perl
CLEAN=$SCRIPTS/training/clean-corpus-n.perl
NORM_PUNC=$SCRIPTS/tokenizer/normalize-punctuation.perl
REM_NON_PRINT_CHAR=$SCRIPTS/tokenizer/remove-non-printing-char.perl
BPEROOT=subword-nmt/subword_nmt
BPE_TOKENS=40000

# URLS[i] is downloaded to FILES[i]; the two arrays must stay aligned.
URLS=(
    "http://statmt.org/wmt13/training-parallel-europarl-v7.tgz"
    "http://statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    "http://statmt.org/wmt13/training-parallel-un.tgz"
    "http://statmt.org/wmt14/training-parallel-nc-v9.tgz"
    "http://statmt.org/wmt10/training-giga-fren.tar"
    "http://statmt.org/wmt14/test-full.tgz"
)
FILES=(
    "training-parallel-europarl-v7.tgz"
    "training-parallel-commoncrawl.tgz"
    "training-parallel-un.tgz"
    "training-parallel-nc-v9.tgz"
    "training-giga-fren.tar"
    "test-full.tgz"
)
CORPORA=(
    "training/europarl-v7.fr-en"
    "commoncrawl.fr-en"
    "un/undoc.2000.fr-en"
    "training/news-commentary-v9.fr-en"
    "giga-fren.release2.fixed"
)

if [ ! -d "$SCRIPTS" ]; then
    echo "Please set SCRIPTS variable correctly to point to Moses scripts."
    # A bare `exit` would report success (status of the echo above).
    exit 1
fi

src=en
tgt=fr
lang=en-fr
prep=wmt14_en_fr
tmp=$prep/tmp
orig=orig

mkdir -p "$orig" "$tmp" "$prep"

cd "$orig" || exit 1

for ((i=0;i<${#URLS[@]};++i)); do
    file=${FILES[i]}
    if [ -f "$file" ]; then
        echo "$file already exists, skipping download"
    else
        url=${URLS[i]}
        wget "$url"
        if [ -f "$file" ]; then
            echo "$url successfully downloaded."
        else
            echo "$url not successfully downloaded."
            # `exit -1` is not a portable exit status; use a plain failure code.
            exit 1
        fi
        if [ "${file: -4}" == ".tgz" ]; then
            tar zxvf "$file"
        elif [ "${file: -4}" == ".tar" ]; then
            tar xvf "$file"
        fi
    fi
done

# On a re-run the archives are already decompressed and the glob matches
# nothing, so only call gunzip when there is something to decompress.
if compgen -G "giga-fren.release2.fixed.*.gz" > /dev/null; then
    gunzip giga-fren.release2.fixed.*.gz
fi
cd ..

echo "pre-processing train data..."
for l in $src $tgt; do
    # -f: the file does not exist on the first run.
    rm -f "$tmp/train.tags.$lang.tok.$l"
    for f in "${CORPORA[@]}"; do
        cat "$orig/$f.$l" | \
            perl $NORM_PUNC $l | \
            perl $REM_NON_PRINT_CHAR | \
            perl $TOKENIZER -threads 8 -a -l $l >> "$tmp/train.tags.$lang.tok.$l"
    done
done

echo "pre-processing test data..."
for l in $src $tgt; do
    if [ "$l" == "$src" ]; then
        t="src"
    else
        t="ref"
    fi
    grep '<seg id' "$orig/test-full/newstest2014-fren-$t.$l.sgm" | \
        sed -e 's/<seg id="[0-9]*">\s*//g' | \
        sed -e 's/\s*<\/seg>\s*//g' | \
        sed -e "s/\’/\'/g" | \
    perl $TOKENIZER -threads 8 -a -l $l > "$tmp/test.$l"
    echo ""
done

echo "splitting train and valid..."
# Every 1333rd training line goes to the validation set, the rest to train.
for l in $src $tgt; do
    awk '{if (NR%1333 == 0)  print $0; }' "$tmp/train.tags.$lang.tok.$l" > "$tmp/valid.$l"
    awk '{if (NR%1333 != 0)  print $0; }' "$tmp/train.tags.$lang.tok.$l" > "$tmp/train.$l"
done

TRAIN=$tmp/train.fr-en
BPE_CODE=$prep/code
rm -f "$TRAIN"
# BPE codes are learned jointly on the concatenation of both languages.
for l in $src $tgt; do
    cat "$tmp/train.$l" >> "$TRAIN"
done

echo "learn_bpe.py on ${TRAIN}..."
python $BPEROOT/learn_bpe.py -s $BPE_TOKENS < "$TRAIN" > "$BPE_CODE"

for L in $src $tgt; do
    for f in train.$L valid.$L test.$L; do
        echo "apply_bpe.py to ${f}..."
        python $BPEROOT/apply_bpe.py -c "$BPE_CODE" < "$tmp/$f" > "$tmp/bpe.$f"
    done
done

perl $CLEAN -ratio 1.5 "$tmp/bpe.train" $src $tgt "$prep/train" 1 250
perl $CLEAN -ratio 1.5 "$tmp/bpe.valid" $src $tgt "$prep/valid" 1 250

for L in $src $tgt; do
    cp "$tmp/bpe.test.$L" "$prep/test.$L"
done
The following command will train a `hMoElp` model with `3` experts: ```bash fairseq-train --ddp-backend='legacy_ddp' \ data-bin/wmt17_en_de \ --max-update 100000 \ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --arch transformer_wmt_en_de --share-all-embeddings \ --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 0.0 \ --lr-scheduler inverse_sqrt --warmup-init-lr 1e-07 --warmup-updates 4000 \ --lr 0.0007 \ --dropout 0.1 --weight-decay 0.0 --criterion cross_entropy \ --max-tokens 3584 ``` ## Translate Once a model is trained, we can generate translations from different experts using the `--gen-expert` option. For example, to generate from expert 0: ```bash fairseq-generate data-bin/wmt17_en_de \ --path checkpoints/checkpoint_best.pt \ --beam 1 --remove-bpe \ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --gen-expert 0 ``` ## Evaluate First download a tokenized version of the WMT'14 En-De test set with multiple references: ```bash wget dl.fbaipublicfiles.com/fairseq/data/wmt14-en-de.extra_refs.tok ``` Next apply BPE on the fly and run generation for each expert: ```bash BPE_CODE=examples/translation/wmt17_en_de/code for EXPERT in $(seq 0 2); do \ cat wmt14-en-de.extra_refs.tok \ | grep ^S | cut -f 2 \ | fairseq-interactive data-bin/wmt17_en_de \ --path checkpoints/checkpoint_best.pt \ --beam 1 \ --bpe subword_nmt --bpe-codes $BPE_CODE \ --buffer-size 500 --max-tokens 6000 \ --task translation_moe --user-dir examples/translation_moe/translation_moe_src \ --method hMoElp --mean-pool-gating-network \ --num-experts 3 \ --gen-expert $EXPERT ; \ done > wmt14-en-de.extra_refs.tok.gen.3experts ``` Finally use `score.py` to compute pairwise BLEU and average oracle BLEU: ```bash python examples/translation_moe/score.py --sys wmt14-en-de.extra_refs.tok.gen.3experts --ref
wmt14-en-de.extra_refs.tok # pairwise BLEU: 48.26 # #refs covered: 2.11 # multi-reference BLEU (leave-one-out): 59.46 ``` This matches row 3 from Table 7 in the paper. ## Citation ```bibtex @article{shen2019mixture, title = {Mixture Models for Diverse Machine Translation: Tricks of the Trade}, author = {Tianxiao Shen and Myle Ott and Michael Auli and Marc'Aurelio Ranzato}, journal = {International Conference on Machine Learning}, year = 2019, } ``` ================================================ FILE: examples/translation_moe/score.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Scoring script for computing pairwise BLEU and multi-ref BLEU over a set of candidate hypotheses. See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade" (Shen et al., 2019) <https://arxiv.org/abs/1902.07816>`_. 
def load_ref(path):
    """Load sources, targets and grouped references from a reference file.

    Lines are dispatched on their prefix:

    * ``S-...<TAB>text`` is appended to the sources,
    * ``T-...<TAB>text`` is appended to the targets,
    * a run of consecutive lines starting with ``R`` forms one group of
      references (the text is the second tab-separated field).

    Any other line (e.g. a blank line or log output) is skipped.  Previously
    such a line was never consumed, so the loop spun forever appending empty
    reference groups.

    Args:
        path: path of the reference file to read.

    Returns:
        A ``(src, tgt, refs)`` tuple where ``src`` and ``tgt`` are lists of
        strings and ``refs`` is a list of lists of strings, one inner list per
        run of reference lines.
    """
    with open(path) as f:
        lines = f.readlines()

    src, tgt, refs = [], [], []
    i = 0
    num_lines = len(lines)
    while i < num_lines:
        line = lines[i]
        if line.startswith("S-"):
            src.append(line.split("\t")[1].rstrip())
            i += 1
        elif line.startswith("T-"):
            tgt.append(line.split("\t")[1].rstrip())
            i += 1
        elif line.startswith("R"):
            # collect the whole run of consecutive reference lines
            group = []
            while i < num_lines and lines[i].startswith("R"):
                group.append(lines[i].split("\t")[1].rstrip())
                i += 1
            refs.append(group)
        else:
            # unrecognized line: always advance so the loop terminates
            i += 1
    return src, tgt, refs
def multi_ref(refs, hypos):
    """Print reference coverage and leave-one-out multi-reference BLEU.

    Args:
        refs: one list of reference strings per sentence.
        hypos: one list of hypothesis strings per sentence, aligned with
            ``refs``.

    For every hypothesis the best-matching reference (by smoothed sentence
    BLEU, ties broken at random) is marked as covered; the average number of
    distinct covered references per sentence is printed.  Then corpus BLEU of
    all hypotheses is computed against every subset of references that leaves
    one reference stream out, and the mean of these scores is printed.

    Raises:
        AssertionError: if ``refs`` and ``hypos`` differ in length.
        ValueError: if no sentences are given.
    """
    assert len(refs) == len(hypos)
    if not refs:
        # the averages below are undefined without at least one sentence
        raise ValueError("multi_ref requires at least one sentence")

    # count number of refs covered
    ref_cnt = 0
    for rs, hs in zip(refs, hypos):
        covered = set()
        for h in hs:
            scores = [sentence_bleu(h, r) for r in rs]
            top = max(scores)
            best = [idx for idx, score in enumerate(scores) if score == top]
            covered.add(random.choice(best))
        ref_cnt += len(covered)
    print("#refs covered: %.2f" % (ref_cnt / len(refs)))

    # transpose refs and hypos: one stream per reference / per hypothesis slot
    refs = list(zip(*refs))
    hypos = list(zip(*hypos))

    # compute multi-ref corpus BLEU (leave-one-out to be comparable to intra_ref)
    k = len(hypos)
    m = len(refs)
    # sentence-major order: all k hypotheses of sentence 0, then sentence 1, ...
    flat_hypos = [hypos[j][i] for i in range(len(hypos[0])) for j in range(k)]
    # repeat every reference k times so it lines up with flat_hypos
    duplicated_refs = [[ref for ref in refs_i for _ in range(k)] for refs_i in refs]
    loo_bleus = []
    for held_out_ref in range(m):
        remaining_refs = (
            duplicated_refs[:held_out_ref] + duplicated_refs[held_out_ref + 1 :]
        )
        assert len(remaining_refs) == m - 1
        loo_bleus.append(corpus_bleu(flat_hypos, remaining_refs))
    print("average multi-reference BLEU (leave-one-out): %.2f" % np.mean(loo_bleus))
class LogSumExpMoE(torch.autograd.Function):
    """LogSumExp whose gradient is replaced by a supplied *posterior*.

    The forward pass is an ordinary ``logsumexp`` of ``logp`` over ``dim``.
    The backward pass ignores the true derivative and instead multiplies the
    incoming gradient by ``posterior``.

    See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade"
    (Shen et al., 2019) <https://arxiv.org/abs/1902.07816>`_.
    """

    @staticmethod
    def forward(ctx, logp, posterior, dim=-1):
        # remember what backward needs: the reduced dim and the posterior
        ctx.dim = dim
        ctx.save_for_backward(posterior)
        return logp.logsumexp(dim=dim)

    @staticmethod
    def backward(ctx, grad_output):
        posterior = ctx.saved_tensors[0]
        # re-insert the reduced dimension so the product broadcasts
        expanded_grad = grad_output.unsqueeze(ctx.dim)
        # gradients are only defined w.r.t. logp; posterior and dim get None
        return expanded_grad * posterior, None, None
class MeanPoolGatingNetwork(torch.nn.Module):
    """Gating network that scores experts from a mean-pooled encoder output.

    The encoder states are averaged over time (ignoring padded positions),
    passed through ``Linear -> tanh -> (dropout) -> Linear`` and normalized
    with a log-softmax, giving log-responsibilities for each expert. The
    input dict is expected to follow the format produced by
    :class:`fairseq.models.transformer.TransformerEncoder`.
    """

    def __init__(self, embed_dim, num_experts, dropout=None):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_experts = num_experts
        self.fc1 = torch.nn.Linear(embed_dim, embed_dim)
        # dropout is optional; None disables it entirely
        if dropout is None:
            self.dropout = None
        else:
            self.dropout = torch.nn.Dropout(dropout)
        self.fc2 = torch.nn.Linear(embed_dim, num_experts)

    def forward(self, encoder_out):
        # reject anything that does not look like a transformer encoder output
        if (
            "encoder_out" not in encoder_out
            or "encoder_padding_mask" not in encoder_out
            or encoder_out["encoder_out"][0].size(2) != self.embed_dim
        ):
            raise ValueError("Unexpected format for encoder_out")

        pad_mask = encoder_out["encoder_padding_mask"][0]  # B x T
        states = encoder_out["encoder_out"][0].transpose(0, 1)  # B x T x C

        # mean pooling over time
        if pad_mask is None:
            pooled = states.mean(dim=1)
        else:
            # zero the padded positions (out of place) and divide by the
            # number of real tokens in each sentence
            states = states.masked_fill(pad_mask.unsqueeze(-1), 0)
            ntokens = (~pad_mask).sum(dim=1, keepdim=True)
            pooled = states.sum(dim=1) / ntokens.type_as(states)

        hidden = torch.tanh(self.fc1(pooled))
        if self.dropout is not None:
            hidden = self.dropout(hidden)
        logits = self.fc2(hidden)
        # softmax in fp32 for stability, then cast back to the input dtype
        return F.log_softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
@register_task("translation_moe", dataclass=TranslationMoEConfig)
class TranslationMoETask(TranslationTask):
    """
    Translation task for Mixture of Experts (MoE) models.

    See `"Mixture Models for Diverse Machine Translation: Tricks of the Trade"
    (Shen et al., 2019) <https://arxiv.org/abs/1902.07816>`_.

    Args:
        src_dict (~fairseq.data.Dictionary): dictionary for the source language
        tgt_dict (~fairseq.data.Dictionary): dictionary for the target language

    .. note::

        The translation task is compatible with :mod:`fairseq-train`,
        :mod:`fairseq-generate` and :mod:`fairseq-interactive`.

    The translation task provides the following additional command-line
    arguments:

    .. argparse::
        :ref: fairseq.tasks.translation_parser
        :prog:
    """

    cfg: TranslationMoEConfig

    def __init__(self, cfg: TranslationMoEConfig, src_dict, tgt_dict):
        """Decode ``cfg.method`` and register one indicator token per expert."""
        if cfg.method == "sMoElp":
            # soft MoE with learned prior
            self.uniform_prior = False
            self.hard_selection = False
        elif cfg.method == "sMoEup":
            # soft MoE with uniform prior
            self.uniform_prior = True
            self.hard_selection = False
        elif cfg.method == "hMoElp":
            # hard MoE with learned prior
            self.uniform_prior = False
            self.hard_selection = True
        elif cfg.method == "hMoEup":
            # hard MoE with uniform prior
            self.uniform_prior = True
            self.hard_selection = True

        # add indicator tokens for each expert
        for i in range(cfg.num_experts):
            # add to both dictionaries in case we're sharing embeddings
            src_dict.add_symbol("<expert_{}>".format(i))
            tgt_dict.add_symbol("<expert_{}>".format(i))

        super().__init__(cfg, src_dict, tgt_dict)

    def build_model(self, cfg, from_checkpoint=False):
        """Build the model and, for learned priors, attach a gating network.

        When the method uses a learned prior and the model has no
        ``gating_network`` attribute, a :class:`MeanPoolGatingNetwork` is
        attached if ``mean_pool_gating_network`` is enabled in the task config.

        Raises:
            ValueError: if the encoder dimension or the dropout for the gating
                network cannot be determined, or if a learned prior is
                requested but no gating network is available.
        """
        from fairseq import models

        model = models.build_model(cfg, self)
        if not self.uniform_prior and not hasattr(model, "gating_network"):
            if self.cfg.mean_pool_gating_network:
                if self.cfg.mean_pool_gating_network_encoder_dim > 0:
                    encoder_dim = self.cfg.mean_pool_gating_network_encoder_dim
                elif getattr(cfg, "encoder_embed_dim", None):
                    # assume that encoder_embed_dim is the encoder's output dimension
                    encoder_dim = cfg.encoder_embed_dim
                else:
                    raise ValueError(
                        "Must specify --mean-pool-gating-network-encoder-dim"
                    )

                # task-level dropout wins; otherwise fall back to the model's
                if self.cfg.mean_pool_gating_network_dropout > 0:
                    dropout = self.cfg.mean_pool_gating_network_dropout
                elif getattr(cfg, "dropout", None):
                    dropout = cfg.dropout
                else:
                    raise ValueError("Must specify task.mean_pool_gating_network_dropout")

                model.gating_network = MeanPoolGatingNetwork(
                    encoder_dim,
                    self.cfg.num_experts,
                    dropout,
                )
            else:
                raise ValueError(
                    "translation_moe task with learned prior requires the model to "
                    "have a gating network; try using --mean-pool-gating-network"
                )
        return model

    def expert_index(self, i):
        """Return the target-dictionary index of the token for expert ``i``.

        ``i`` is added to the index of ``<expert_0>``, so it may be an int or a
        tensor of expert ids.
        """
        return i + self.tgt_dict.index("<expert_0>")

    def _get_loss(self, sample, model, criterion):
        """Compute the MoE loss for one batch.

        The posterior over experts is first computed with the model in eval
        mode and without autograd. With hard selection, the loss is the
        negative joint log-probability of the most probable expert; otherwise
        it is the negative :class:`LogSumExpMoE` over all experts, using the
        posterior in the backward pass.

        Returns:
            A ``(loss, sample_size, logging_output)`` tuple.
        """
        assert hasattr(
            criterion, "compute_loss"
        ), "translation_moe task requires the criterion to implement the compute_loss() method"

        k = self.cfg.num_experts
        bsz = sample["target"].size(0)

        def get_lprob_y(encoder_out, prev_output_tokens_k):
            net_output = model.decoder(
                prev_output_tokens=prev_output_tokens_k,
                encoder_out=encoder_out,
            )
            loss, _ = criterion.compute_loss(model, net_output, sample, reduce=False)
            loss = loss.view(bsz, -1)
            return -loss.sum(dim=1, keepdim=True)  # -> B x 1

        def get_lprob_yz(winners=None):
            encoder_out = model.encoder(
                src_tokens=sample["net_input"]["src_tokens"],
                src_lengths=sample["net_input"]["src_lengths"],
            )

            if winners is None:
                # score the target under every expert by overwriting the first
                # decoder input token with that expert's indicator token
                lprob_y = []
                for i in range(k):
                    prev_output_tokens_k = sample["net_input"][
                        "prev_output_tokens"
                    ].clone()
                    assert not prev_output_tokens_k.requires_grad
                    prev_output_tokens_k[:, 0] = self.expert_index(i)
                    lprob_y.append(get_lprob_y(encoder_out, prev_output_tokens_k))
                lprob_y = torch.cat(lprob_y, dim=1)  # -> B x K
            else:
                # score only the selected expert of each sentence
                prev_output_tokens_k = sample["net_input"]["prev_output_tokens"].clone()
                prev_output_tokens_k[:, 0] = self.expert_index(winners)
                lprob_y = get_lprob_y(encoder_out, prev_output_tokens_k)  # -> B x 1

            if self.uniform_prior:
                lprob_yz = lprob_y
            else:
                lprob_z = model.gating_network(encoder_out)  # B x K
                if winners is not None:
                    lprob_z = lprob_z.gather(dim=1, index=winners.unsqueeze(-1))
                lprob_yz = lprob_y + lprob_z.type_as(lprob_y)  # B x K

            return lprob_yz

        # compute responsibilities without dropout
        with utils.model_eval(model):  # disable dropout
            with torch.no_grad():  # disable autograd
                lprob_yz = get_lprob_yz()  # B x K
                prob_z_xy = torch.nn.functional.softmax(lprob_yz, dim=1)
        assert not prob_z_xy.requires_grad

        # compute loss with dropout
        if self.hard_selection:
            winners = prob_z_xy.max(dim=1)[1]
            loss = -get_lprob_yz(winners)
        else:
            lprob_yz = get_lprob_yz()  # B x K
            loss = -LogSumExpMoE.apply(lprob_yz, prob_z_xy, 1)

        loss = loss.sum()
        sample_size = (
            sample["target"].size(0) if self.cfg.sentence_avg else sample["ntokens"]
        )
        logging_output = {
            "loss": utils.item(loss.data),
            "ntokens": sample["ntokens"],
            "nsentences": bsz,
            "sample_size": sample_size,
            "posterior": prob_z_xy.float().sum(dim=0).cpu(),
        }
        return loss, sample_size, logging_output

    def train_step(
        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
    ):
        """Run one forward/backward pass; zero the loss when ``ignore_grad``."""
        model.train()
        loss, sample_size, logging_output = self._get_loss(sample, model, criterion)
        if ignore_grad:
            loss *= 0
        optimizer.backward(loss)
        return loss, sample_size, logging_output

    def valid_step(self, sample, model, criterion):
        """Compute the loss for a validation batch without gradients."""
        model.eval()
        with torch.no_grad():
            loss, sample_size, logging_output = self._get_loss(sample, model, criterion)
        return loss, sample_size, logging_output

    def inference_step(
        self,
        generator,
        models,
        sample,
        prefix_tokens=None,
        expert=None,
        constraints=None,
    ):
        """Generate with a given expert by using its token as the BOS token.

        Args:
            expert: index of the expert to generate from; defaults to
                ``cfg.gen_expert`` when ``None``.
        """
        # Compare against None explicitly: ``expert or ...`` would discard an
        # explicitly requested expert 0 in favour of the configured default.
        if expert is None:
            expert = self.cfg.gen_expert
        with torch.no_grad():
            return generator.generate(
                models,
                sample,
                prefix_tokens=prefix_tokens,
                constraints=constraints,
                bos_token=self.expert_index(expert),
            )

    def reduce_metrics(self, logging_outputs, criterion):
        """Aggregate logging outputs, additionally summing the posteriors."""
        super().reduce_metrics(logging_outputs, criterion)
        metrics.log_scalar(
            "posterior",
            sum(log["posterior"] for log in logging_outputs if "posterior" in log),
        )
This technique was the basis for the paper: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860), which achieved state-of-the-art language modeling results at the time of publication. It is slightly tricky to implement Truncated BPTT efficiently in fairseq, since we need to iterate over the data sequentially and disable any batch shuffling logic. The code provided in this example illustrates how to implement Truncated BPTT in fairseq by overriding ``FairseqTask::get_batch_iterator`` to iterate over the data sequentially. Crucially, this example supports batching and multi-GPU (data parallel) training. ##### 0. Setup First, see the general [language modeling README](../language_model/README.md) for instructions on preprocessing the WikiText-103 data. ##### 1. Train a Transformer-XL model on WikiText-103 We will train a 16-layer Transformer-XL model following the [hyperparameters used in the original paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_wt103_base.sh). The following command assumes 4 GPUs, so that the total batch size is 60 sequences (15 x 4). Training should take ~24 hours on 4 V100 GPUs: ```bash CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train \ --user-dir examples/truncated_bptt \ data-bin/wikitext-103/ \ --task truncated_bptt_lm --tokens-per-sample 150 \ --batch-size 15 --max-update 200000 \ --arch transformer_xl --n-layer 16 --d-model 410 --n-head 10 \ --d-head 41 --d-inner 2100 --dropout 0.1 --dropatt 0.0 --mem-len 150 \ --optimizer adam --clip-norm 0.25 \ --lr-scheduler cosine --warmup-updates 0 --min-lr 0.0 --lr 0.00025 \ --log-format json --log-interval 25 \ --fp16 ``` If training on a single GPU, set `--update-freq=4` to accumulate 4x gradients and simulate training on 4 GPUs. ##### 2.
Evaluate ```bash fairseq-eval-lm data-bin/wikitext-103/ \ --path checkpoints/checkpoint_best.pt \ --user-dir examples/truncated_bptt/ \ --task truncated_bptt_lm \ --batch-size 1 --required-batch-size-multiple 1 \ --model-overrides '{"mem_len":640,"clamp_len":400,"same_length":True}' \ --tokens-per-sample 64 # ... | INFO | fairseq_cli.eval_lm | num. model params: 151123537 # ... | INFO | fairseq_cli.eval_lm | Evaluated 245569 tokens in 83.1s (2956.82 tokens/s) # ... | INFO | fairseq_cli.eval_lm | Loss (base 2): 4.5668, Perplexity: 23.70 # Compare to 24.0 test perplexity from the paper ``` *Note:* During training the model saw 150 tokens of context (``--tokens-per-sample=150``) and 150 extra memory tokens (``--mem-len=150``). During evaluation we measure perplexity on sequences of 64 tokens (``--tokens-per-sample=64``) and increase the memory length (``--model-overrides='{"mem_len":640}'``). These settings match the evaluation settings from [the original paper](https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/run_wt103_base.sh). ================================================ FILE: examples/truncated_bptt/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import transformer_xl_model, truncated_bptt_lm_task # noqa ================================================ FILE: examples/truncated_bptt/transformer_xl_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class TransformerXLDecoder(FairseqIncrementalDecoder):
    """Decoder that wraps the ``TransfoXLLMHeadModel`` from ``transformers``.

    Outside of incremental decoding the memories returned by the wrapped
    model are kept on the instance (``self._mems``) and fed back on the next
    call; during incremental decoding they live in ``incremental_state``.
    """

    def __init__(self, cfg, task):
        # the module layout differs between transformers releases
        try:
            from transformers.models.transfo_xl import (
                TransfoXLConfig,
                TransfoXLLMHeadModel,
            )
        except ImportError:
            from transformers.configuration_transfo_xl import TransfoXLConfig
            from transformers.modeling_transfo_xl import TransfoXLLMHeadModel

        super().__init__(task.target_dictionary)
        self.cfg = cfg

        vocab_size = len(task.target_dictionary)
        # remove any cutoffs larger than the vocab size
        valid_cutoffs = [c for c in cfg.cutoffs if c < vocab_size]

        hf_config = TransfoXLConfig(
            vocab_size=vocab_size,
            cutoffs=valid_cutoffs,
            d_model=cfg.d_model,
            d_embed=cfg.d_model,
            n_head=cfg.n_head,
            d_head=cfg.d_head,
            d_inner=cfg.d_inner,
            div_val=cfg.div_val,
            n_layer=cfg.n_layer,
            mem_len=cfg.mem_len,
            clamp_len=cfg.clamp_len,
            same_length=cfg.same_length,
            dropout=cfg.dropout,
            dropatt=cfg.dropatt,
        )
        logger.info(hf_config)
        self.model = TransfoXLLMHeadModel(hf_config)

        if cfg.checkpoint_activations or cfg.offload_activations:
            # wrap every transformer layer in place
            layers = self.model.transformer.layers
            for idx in range(len(layers)):
                layers[idx] = checkpoint_wrapper(
                    layers[idx],
                    offload_to_cpu=cfg.offload_activations,
                )
                # TODO: may save mem to wrap(layer.pos_ff.CoreNet[3])

        self._mems = None

    def forward(
        self,
        src_tokens,
        src_lengths=None,  # unused
        incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None,
        encoder_out=None,
    ):
        """Run the wrapped model and return a 1-tuple with its first output."""
        incremental = incremental_state is not None
        if incremental:  # used during inference
            mems = self.get_incremental_state(incremental_state, "mems")
            src_tokens = src_tokens[:, -1:]  # only keep the most recent token
        else:
            mems = self._mems

        output = self.model(
            input_ids=src_tokens,
            mems=mems,
            return_dict=False,
        )

        # a second output, when present, holds the updated memories
        if len(output) >= 2:
            updated_mems = output[1]
            if incremental:
                self.set_incremental_state(incremental_state, "mems", updated_mems)
            else:
                self._mems = updated_mems

        return (output[0],)

    def max_positions(self):
        """Return the configured maximum number of target positions."""
        return self.cfg.max_target_positions

    def reorder_incremental_state(
        self,
        incremental_state: Dict[str, Dict[str, Optional[torch.Tensor]]],
        new_order: torch.Tensor,
    ):
        """Reorder incremental state.

        This will be called when the order of the input has changed from the
        previous time step. A typical use case is beam search, where the input
        order changes between time steps based on the selection of beams.
        """
        mems = self.get_incremental_state(incremental_state, "mems")
        if mems is None:
            return
        # select the new order along dim 1 of every memory tensor
        reordered = [layer_mems.index_select(1, new_order) for layer_mems in mems]
        self.set_incremental_state(incremental_state, "mems", reordered)
@dataclass
class TruncatedBPTTLMConfig(FairseqDataclass):
    """Configuration of the ``truncated_bptt_lm`` task."""

    # "???" marks the value as mandatory: it has no usable default
    data: str = field(default="???", metadata={"help": "path to data directory"})
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sequence"},
    )
    # interpolated from the dataset config; the task passes it to
    # TruncatedBPTTDataset as the per-shard batch size
    batch_size: int = II("dataset.batch_size")
    # Some models use *max_target_positions* to know how many positional
    # embeddings to learn. We use II(...) to make it default to
    # *tokens_per_sample*, but in principle there could be more positional
    # embeddings than tokens in a single batch. This may also be irrelevant for
    # custom model implementations.
    max_target_positions: int = II("task.tokens_per_sample")
    # these will be populated automatically if not provided
    data_parallel_rank: Optional[int] = None
    data_parallel_size: Optional[int] = None
@register_task("truncated_bptt_lm", dataclass=TruncatedBPTTLMConfig)
class TruncatedBPTTLMTask(FairseqTask):
    """Language modeling task in which every dataset item is a whole batch.

    The corpus is cut into fixed-size token blocks and wrapped in a
    :class:`TruncatedBPTTDataset`, so fairseq's own batching is bypassed:
    the batch sampler yields one dataset index per step and
    :meth:`_collate_fn` turns that single item into a fairseq batch.
    """

    def __init__(self, cfg: TruncatedBPTTLMConfig):
        """Fill in the data-parallel rank/size if missing and load ``dict.txt``."""
        super().__init__(cfg)

        if cfg.data_parallel_rank is None or cfg.data_parallel_size is None:
            if torch.distributed.is_initialized():
                cfg.data_parallel_rank = dist_utils.get_data_parallel_rank()
                cfg.data_parallel_size = dist_utils.get_data_parallel_world_size()
            else:
                cfg.data_parallel_rank = 0
                cfg.data_parallel_size = 1

        # load the dictionary
        paths = utils.split_paths(cfg.data)
        assert len(paths) > 0
        self.dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        logger.info("dictionary: {} types".format(len(self.dictionary)))

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)"""

        # support sharded datasets
        paths = utils.split_paths(self.cfg.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        # each element of *data* will be a tensorized line from the original
        # text dataset, similar to ``open(split_path).readlines()``
        data = data_utils.load_indexed_dataset(
            split_path, self.dictionary, combine=combine
        )
        if data is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        # this is similar to ``data.view(-1).split(tokens_per_sample)``
        data = TokenBlockDataset(
            data,
            data.sizes,
            block_size=self.cfg.tokens_per_sample,
            pad=None,  # unused
            eos=None,  # unused
            break_mode="none",
        )

        self.datasets[split] = TruncatedBPTTDataset(
            data=data,
            bsz_per_shard=self.cfg.batch_size,
            shard_id=self.cfg.data_parallel_rank,
            num_shards=self.cfg.data_parallel_size,
        )

    def dataset(self, split):
        """Return the dataset previously loaded for *split*."""
        return self.datasets[split]

    def get_batch_iterator(
        self,
        dataset,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        skip_remainder_batch=False,
        **kwargs
    ):
        """Return an unshuffled iterator that yields one dataset item per batch.

        Any additional keyword arguments are accepted and ignored.
        """
        return iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=self._collate_fn,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
            # we don't use the batching functionality from EpochBatchIterator;
            # instead every item in *dataset* is a whole batch
            batch_sampler=[[i] for i in range(len(dataset))],
            disable_shuffling=True,
            skip_remainder_batch=skip_remainder_batch,
        )

    def _collate_fn(self, items: List[Tuple[int, List[torch.Tensor]]]):
        """Convert the single ``(id, sequences)`` item into a fairseq batch.

        The target is the input shifted left by one position, with a padding
        token appended at the end of every row.
        """
        # we don't use fairseq's batching functionality, so we expect a single
        # item of type Tuple[int, List[torch.Tensor]]
        assert len(items) == 1

        # tokens will have shape B x T (the last batch may have length < T)
        batch_id, sequences = items[0]
        tokens = data_utils.collate_tokens(
            sequences, pad_idx=self.source_dictionary.pad()
        )
        bsz = tokens.size(0)

        # shift item one position over and append a padding token for the target
        target = torch.nn.functional.pad(
            tokens[:, 1:], (0, 1, 0, 0), value=self.target_dictionary.pad()
        )

        # fairseq expects batches to have the following structure
        return {
            "id": torch.tensor([batch_id] * bsz),
            "net_input": {"src_tokens": tokens},
            "target": target,
            "nsentences": bsz,
            "ntokens": tokens.numel(),
        }

    def build_dataset_for_inference(
        self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
    ) -> torch.utils.data.Dataset:
        """Wrap prompts in a dataset of ``(index, [tokens])`` items.

        A trailing end-of-sentence token is stripped from every prompt so the
        prompt can be used as a generation prefix.
        """
        eos = self.source_dictionary.eos()
        dataset = TokenBlockDataset(
            src_tokens,
            src_lengths,
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=eos,
            break_mode="eos",
        )

        class Dataset(torch.utils.data.Dataset):
            def __getitem__(self, i):
                item = dataset[i]
                # the length check keeps an empty prompt from raising on item[-1]
                if len(item) > 0 and item[-1] == eos:
                    # remove eos to support generating with a prefix
                    item = item[:-1]
                return (i, [item])

            def __len__(self):
                return len(dataset)

        return Dataset()

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Generate from *sample*, using its source tokens as the prefix.

        Raises:
            NotImplementedError: if *constraints* are given
        """
        with torch.no_grad():
            if constraints is not None:
                raise NotImplementedError

            # SequenceGenerator doesn't use *src_tokens* directly, we need to
            # pass the *prefix_tokens* argument instead.
            if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
                prefix_tokens = sample["net_input"]["src_tokens"]

            # begin generation with the end-of-sentence token
            bos_token = self.source_dictionary.eos()

            return generator.generate(
                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
            )

    def eval_lm_dataloader(
        self,
        dataset,
        max_tokens: Optional[int] = 36000,
        batch_size: Optional[int] = None,
        max_positions: Optional[int] = None,
        num_shards: int = 1,
        shard_id: int = 0,
        num_workers: int = 1,
        data_buffer_size: int = 10,
        context_window: int = 0,
    ):
        """Return an unshuffled evaluation iterator over *dataset*.

        Raises:
            NotImplementedError: if ``context_window > 0``
        """
        if context_window > 0:
            raise NotImplementedError(
                "Transformer-XL doesn't need --context-window, try "
                "--model-overrides '{\"mem_len\":42}' instead "
            )
        # the batching-related arguments are swallowed by **kwargs of
        # get_batch_iterator; only num_workers and data_buffer_size take effect
        return self.get_batch_iterator(
            dataset=dataset,
            max_tokens=max_tokens,
            max_sentences=batch_size,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            data_buffer_size=data_buffer_size,
        ).next_epoch_itr(shuffle=False)

    @property
    def source_dictionary(self):
        """The dictionary loaded from ``dict.txt``; shared with the target side."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """The dictionary loaded from ``dict.txt``; shared with the source side."""
        return self.dictionary
class TruncatedBPTTDataset(torch.utils.data.Dataset):
    """Dataset whose i-th item is the i-th batch seen by one shard.

    The ordered items of *data* are split into ``bsz_per_shard * num_shards``
    contiguous streams of equal length (items that do not fit are dropped).
    This shard keeps ``bsz_per_shard`` of those streams, and item ``i`` of the
    dataset is the list made of the i-th element of each kept stream.
    """

    def __init__(
        self,
        data: List[torch.Tensor],  # ordered list of items
        bsz_per_shard,  # number of items processed per GPUs per forward
        shard_id,  # current GPU ID
        num_shards,  # number of GPUs
    ):
        """Compute the indices of *data* that belong to shard *shard_id*.

        Raises:
            ValueError: if the global batch size is not positive, or if *data*
                has fewer items than the global batch size
        """
        super().__init__()
        self.data = data

        # total number of sequences processed by all GPUs in each forward pass
        global_batch_size = bsz_per_shard * num_shards
        if global_batch_size <= 0:
            raise ValueError(
                "bsz_per_shard * num_shards must be positive, got {} * {}".format(
                    bsz_per_shard, num_shards
                )
            )
        if len(data) < global_batch_size:
            # without this check the reshape below fails with an opaque error
            raise ValueError(
                "dataset has {} items, but at least {} are needed "
                "(bsz_per_shard={} * num_shards={})".format(
                    len(data), global_batch_size, bsz_per_shard, num_shards
                )
            )

        def batchify(data, bsz):
            # Work out how cleanly we can divide the dataset into bsz parts.
            nbatch = data.size(0) // bsz
            # Trim off any extra elements that wouldn't cleanly fit (remainders).
            data = data.narrow(0, 0, nbatch * bsz)
            # Evenly divide the data across the bsz batches.
            data = data.view(bsz, -1).contiguous()
            return data

        # With a 16 item dataset, bsz_per_shard=2 and num_shards=3,
        # *indices* might look like:
        #
        #     indices = [[0, 1],
        #                [2, 3],
        #                [4, 5],
        #                [6, 7],
        #                [8, 9],
        #                [10, 11]]
        #
        # The size of the TruncatedBPTTDataset instance will be 2,
        # and shard 1 will see items:
        #
        #     [(0, [data[4], data[6]]),
        #      (1, [data[5], data[7]])]
        indices = batchify(torch.arange(len(data)), global_batch_size)
        assert indices.size(0) == global_batch_size

        self.my_indices = indices[
            shard_id * bsz_per_shard : (shard_id + 1) * bsz_per_shard
        ]
        assert self.my_indices.size(0) == bsz_per_shard

    def __len__(self):
        """Number of batches available to this shard."""
        return self.my_indices.size(1)

    def __getitem__(self, i) -> Tuple[int, List[torch.Tensor]]:
        """Return ``(i, items)`` where *items* holds one element per kept stream."""
        # plain ints are used so that *data* only needs to support integer indexing
        return (i, [self.data[idx] for idx in self.my_indices[:, i].tolist()])
## Set up: Given a testset consisting of source sentences and reference translations: * `SRC_LANG`: source language * `TGT_LANG`: target language * `INPUT`: input prefix, such that the file `$INPUT.$SRC_LANG` contains source sentences and `$INPUT.$TGT_LANG` contains the reference sentences * `OUTPUT_DIR`: output path to store results * `MOSES_DECODER`: path to mosesdecoder installation * `BPE_ROOT`: path to subword-nmt installation * `BPE`: path to BPE model * `MODEL_DIR`: directory containing the NMT model `.pt` file as well as the source and target vocabularies. * `TMP`: directory for intermediate temporary files * `GPU`: if translating with GPU, id of the GPU to use for inference * `DROPOUT_N`: number of stochastic forward passes `$DROPOUT_N` is set to 30 in the experiments reported in the paper. However, we observed that increasing it beyond 10 does not bring substantial improvements. ## Translate the data using standard decoding Preprocess the input data: ``` for LANG in $SRC_LANG $TGT_LANG; do perl $MOSES_DECODER/scripts/tokenizer/tokenizer.perl -threads 80 -a -l $LANG < $INPUT.$LANG > $TMP/preprocessed.tok.$LANG python $BPE_ROOT/apply_bpe.py -c ${BPE} < $TMP/preprocessed.tok.$LANG > $TMP/preprocessed.tok.bpe.$LANG done ``` Binarize the data for faster translation: ``` fairseq-preprocess --srcdict $MODEL_DIR/dict.$SRC_LANG.txt --tgtdict $MODEL_DIR/dict.$TGT_LANG.txt --source-lang ${SRC_LANG} --target-lang ${TGT_LANG} --testpref $TMP/preprocessed.tok.bpe --destdir $TMP/bin --workers 4 ``` Translate ``` CUDA_VISIBLE_DEVICES=$GPU fairseq-generate $TMP/bin --path ${MODEL_DIR}/${SRC_LANG}-${TGT_LANG}.pt --beam 5 --source-lang $SRC_LANG --target-lang $TGT_LANG --no-progress-bar --unkpen 5 > $TMP/fairseq.out grep ^H $TMP/fairseq.out | cut -d- -f2- | sort -n | cut -f3- > $TMP/mt.out ``` Post-process ``` sed -r 's/(@@ )| (@@ ?$)//g' < $TMP/mt.out | perl $MOSES_DECODER/scripts/tokenizer/detokenizer.perl -l $TGT_LANG > $OUTPUT_DIR/mt.out ``` ## Produce uncertainty 
estimates ### Scoring Make temporary files to store the translations repeated N times. ``` python ${SCRIPTS}/scripts/uncertainty/repeat_lines.py -i $TMP/preprocessed.tok.bpe.$SRC_LANG -n $DROPOUT_N -o $TMP/repeated.$SRC_LANG python ${SCRIPTS}/scripts/uncertainty/repeat_lines.py -i $TMP/mt.out -n $DROPOUT_N -o $TMP/repeated.$TGT_LANG fairseq-preprocess --srcdict ${MODEL_DIR}/dict.${SRC_LANG}.txt $TGT_DIC --source-lang ${SRC_LANG} --target-lang ${TGT_LANG} --testpref ${TMP}/repeated --destdir ${TMP}/bin-repeated ``` Produce model scores for the generated translations using `--retain-dropout` option to apply dropout at inference time: ``` CUDA_VISIBLE_DEVICES=${GPU} fairseq-generate ${TMP}/bin-repeated --path ${MODEL_DIR}/${LP}.pt --beam 5 --source-lang $SRC_LANG --target-lang $TGT_LANG --no-progress-bar --unkpen 5 --score-reference --retain-dropout --retain-dropout-modules '["TransformerModel","TransformerEncoder","TransformerDecoder","TransformerEncoderLayer"]' TransformerDecoderLayer --seed 46 > $TMP/dropout.scoring.out grep ^H $TMP/dropout.scoring.out | cut -d- -f2- | sort -n | cut -f2 > $TMP/dropout.scores ``` Use `--retain-dropout-modules` to specify the modules. By default, dropout is applied in the same places as for training. 
Compute the mean of the resulting output distribution: ``` python $SCRIPTS/scripts/uncertainty/aggregate_scores.py -i $TMP/dropout.scores -o $OUTPUT_DIR/dropout.scores.mean -n $DROPOUT_N ``` ### Generation Produce multiple translation hypotheses for the same source using `--retain-dropout` option: ``` CUDA_VISIBLE_DEVICES=${GPU} fairseq-generate ${TMP}/bin-repeated --path ${MODEL_DIR}/${LP}.pt --beam 5 --source-lang $SRC_LANG --target-lang $TGT_LANG --no-progress-bar --retain-dropout --unkpen 5 --retain-dropout-modules TransformerModel TransformerEncoder TransformerDecoder TransformerEncoderLayer TransformerDecoderLayer --seed 46 > $TMP/dropout.generation.out grep ^H $TMP/dropout.generation.out | cut -d- -f2- | sort -n | cut -f3- > $TMP/dropout.hypotheses_ sed -r 's/(@@ )| (@@ ?$)//g' < $TMP/dropout.hypotheses_ | perl $MOSES_DECODER/scripts/tokenizer/detokenizer.perl -l $TGT_LANG > $TMP/dropout.hypotheses ``` Compute similarity between multiple hypotheses corresponding to the same source sentence using Meteor evaluation metric: ``` python meteor.py -i $TMP/dropout.hypotheses -m <path_to_meteor_installation> -n $DROPOUT_N -o $OUTPUT_DIR/dropout.gen.sim.meteor ``` ================================================ FILE: examples/unsupervised_quality_estimation/aggregate_scores.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
# maps the value of --func to the function applied to each group of scores
aggregate_funcs = {
    "std": np.std,
    "var": np.var,
    "median": np.median,
    "mean": np.mean,
    "min": np.min,
    "max": np.max,
}


def main():
    """Aggregate every ``repeat_times`` consecutive scores into one value.

    The input file holds one floating point score per line. Each complete
    group of ``repeat_times`` lines is reduced with the selected function and
    written as one line to the output file (or to stdout when no output file
    is given). A trailing incomplete group is not written.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", required=True, type=str)
    parser.add_argument("-n", "--repeat_times", required=True, type=int)
    parser.add_argument("-o", "--output_file", required=False)
    parser.add_argument(
        "-f",
        "--func",
        required=False,
        default="mean",
        # reject unknown names up front instead of failing with a KeyError later
        choices=sorted(aggregate_funcs),
    )
    args = parser.parse_args()

    aggregate = aggregate_funcs[args.func]
    stream = open(args.output_file, "w") if args.output_file else sys.stdout
    try:
        segment_scores = []
        with open(args.input_file) as scores_file:
            for line in scores_file:
                segment_scores.append(float(line.strip()))
                if len(segment_scores) == args.repeat_times:
                    stream.write("{}\n".format(aggregate(segment_scores)))
                    segment_scores = []
        if segment_scores:
            sys.stderr.write(
                "Warning: ignored {} trailing score(s) that do not form a "
                "complete group of {}\n".format(
                    len(segment_scores), args.repeat_times
                )
            )
    finally:
        # never close stdout; only close a file that was opened here
        if stream is not sys.stdout:
            stream.close()


if __name__ == "__main__":
    main()
def read_translations(path, n_repeats):
    """Group the lines of *path* into segments of *n_repeats* translations.

    Whitespace inside every line is normalized to single spaces. Returns a
    dict mapping the 0-based segment index to its list of translations; a
    trailing incomplete group is discarded.
    """
    segment_counter = 0
    segment_translations = []
    translations = defaultdict(list)
    with open(path) as infile:
        for line in infile:
            segment_translations.append(" ".join(line.split()))
            if len(segment_translations) == n_repeats:
                translations[segment_counter] = segment_translations
                segment_translations = []
                segment_counter += 1
    return translations


def generate_input(translations, n_repeats):
    """Write every unordered pair of hypotheses of each segment to temp files.

    For each pair ``(idx1, idx2)`` the first hypothesis goes to the "mt" file
    and the second to the "ref" file, one per line.

    Returns:
        ``(ref_path, mt_path)``; both files are closed when this returns
    """
    # mkstemp returns an already open descriptor; wrap it so it gets closed
    ref_fd, ref_path = tempfile.mkstemp()
    mt_fd, mt_path = tempfile.mkstemp()
    with os.fdopen(ref_fd, "w") as ref_fh, os.fdopen(mt_fd, "w") as mt_fh:
        for segid in sorted(translations.keys()):
            assert len(translations[segid]) == n_repeats
            for idx1, idx2 in combinations(range(n_repeats), 2):
                mt_fh.write(translations[segid][idx1].strip() + "\n")
                ref_fh.write(translations[segid][idx2].strip() + "\n")
    sys.stderr.write("\nSaved translations to %s and %s" % (ref_path, mt_path))
    return ref_path, mt_path


def run_meteor(ref_path, mt_path, metric_path, lang="en"):
    """Run the Meteor jar on the two files and return the path of its output.

    The input files *ref_path* and *mt_path* are deleted afterwards. A
    non-zero exit status of the Java process is reported on stderr.
    """
    out_fd, out_path = tempfile.mkstemp()
    with os.fdopen(out_fd, "w") as out_fh:
        returncode = subprocess.call(
            [
                "java",
                "-Xmx2G",
                "-jar",
                metric_path,
                mt_path,
                ref_path,
                "-p",
                "0.5 0.2 0.6 0.75",  # default parameters, only changed alpha to give equal weight to P and R
                "-norm",
                "-l",
                lang,
            ],
            stdout=out_fh,
        )
    if returncode != 0:
        sys.stderr.write("\nWarning: Meteor exited with status %d" % returncode)
    os.remove(ref_path)
    os.remove(mt_path)
    sys.stderr.write("\nSaved Meteor output to %s" % out_path)
    return out_path


def read_output(meteor_output_path, n_repeats):
    """Average the per-pair Meteor scores of every segment.

    Only lines starting with ``"Segment "`` are read; the score is the second
    tab-separated field. Every ``n_repeats * (n_repeats - 1) / 2`` consecutive
    scores belong to one segment. The output file is deleted after reading.

    Raises:
        ValueError: if *n_repeats* is smaller than 2 (no pairs exist)
    """
    if n_repeats < 2:
        raise ValueError("n_repeats must be at least 2, got {}".format(n_repeats))
    # number of unordered pairs, kept as an exact integer
    n_combinations = n_repeats * (n_repeats - 1) // 2
    raw_scores = []
    average_scores = []
    with open(meteor_output_path) as outfile:
        for line in outfile:
            if not line.startswith("Segment "):
                continue
            score = float(line.strip().split("\t")[1])
            raw_scores.append(score)
            if len(raw_scores) == n_combinations:
                average_scores.append(sum(raw_scores) / n_combinations)
                raw_scores = []
    os.remove(meteor_output_path)
    return average_scores


def main():
    """Compute the mean pairwise Meteor similarity of repeated hypotheses."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--infile", required=True)
    parser.add_argument("-n", "--repeat_times", type=int, required=True)
    parser.add_argument("-m", "--meteor", required=True)
    parser.add_argument("-o", "--output", required=True)
    args = parser.parse_args()

    translations = read_translations(args.infile, args.repeat_times)
    sys.stderr.write("\nGenerating input for Meteor...")
    ref_path, mt_path = generate_input(translations, args.repeat_times)
    sys.stderr.write("\nRunning Meteor...")
    out_path = run_meteor(ref_path, mt_path, args.meteor)
    sys.stderr.write("\nReading output...")
    scores = read_output(out_path, args.repeat_times)
    sys.stderr.write("\nWriting results...")
    with open(args.output, "w") as o:
        for scr in scores:
            o.write("{}\n".format(scr))


if __name__ == "__main__":
    main()
def _normalize_spaces(line):
    """Collapse all whitespace runs in *line* to single spaces and trim the ends."""
    return " ".join(line.split())


def main():
    """Write every line of the input file ``repeat_times`` times in a row.

    Lines are whitespace-normalized before being written to the output file,
    or to stdout when no output file is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input_file", required=True, type=str)
    parser.add_argument("-n", "--repeat_times", required=True, type=int)
    parser.add_argument("-o", "--output_file", required=False, type=str)
    args = parser.parse_args()

    stream = open(args.output_file, "w") if args.output_file else sys.stdout
    try:
        with open(args.input_file) as infile:
            for line in infile:
                # normalize once per input line rather than once per copy
                normalized = _normalize_spaces(line) + "\n"
                for _ in range(args.repeat_times):
                    stream.write(normalized)
    finally:
        # never close stdout; only close a file that was opened here
        if stream is not sys.stdout:
            stream.close()


if __name__ == "__main__":
    main()
## Pre-trained models Model | Finetuning split | Dataset | Model |---|---|---|--- Wav2Vec 2.0 Base | No finetuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small.pt) Wav2Vec 2.0 Base | 10 minutes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_10m.pt) Wav2Vec 2.0 Base | 100 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_100h.pt) Wav2Vec 2.0 Base | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_small_960h.pt) Wav2Vec 2.0 Large | No finetuning | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/libri960_big.pt) Wav2Vec 2.0 Large | 10 minutes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_10m.pt) Wav2Vec 2.0 Large | 100 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_100h.pt) Wav2Vec 2.0 Large | 960 hours | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_big_960h.pt) Wav2Vec 2.0 Large (LV-60)* | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_new.pt) Wav2Vec 2.0 Large conformer - rel_pos (LV-60)* | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_relpos_PT_no_FT) Wav2Vec 2.0 Large conformer - rope (LV-60)* | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_rope_PT_no_FT) Wav2Vec 2.0 Large (LV-60)* | 10 minutes | 
[Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_10m_new.pt) Wav2Vec 2.0 Large (LV-60)* | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_100h_new.pt) Wav2Vec 2.0 Large conformer - rel_pos (LV-60)* | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_relpos_PT_100h_FT.pt) Wav2Vec 2.0 Large conformer - rope (LV-60)* | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_rope_PT_100h_FT.pt) Wav2Vec 2.0 Large (LV-60)* | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec2_vox_960h_new.pt) Wav2Vec 2.0 Large conformer - rel_pos (LV-60)* | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_relpos_PT_960h_FT.pt) Wav2Vec 2.0 Large conformer - rope (LV-60)* | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) | [download](s3://dl.fbaipublicfiles.com/fairseq/conformer/wav2vec2/librilight/LL_rope_PT_960h_FT.pt) Wav2Vec 2.0 Large (LV-60) + Self Training * | 10 minutes | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_10m_pl.pt) Wav2Vec 2.0 Large (LV-60) + Self Training * | 100 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | 
[download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_100h_pl.pt) Wav2Vec 2.0 Large (LV-60) + Self Training * | 960 hours | [Libri-Light](https://github.com/facebookresearch/libri-light) + [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_vox_960h_pl.pt) Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH) ** | No finetuning | [Libri-Light](https://github.com/facebookresearch/libri-light) + [CommonVoice](https://commonvoice.mozilla.org/en/languages) + [Switchboard](https://catalog.ldc.upenn.edu/LDC97S62) + [Fisher](https://catalog.ldc.upenn.edu/LDC2004T19) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv.pt) Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH) ** | 960 hours Librispeech | [Libri-Light](https://github.com/facebookresearch/libri-light) + [CommonVoice](https://commonvoice.mozilla.org/en/languages) + [Switchboard](https://catalog.ldc.upenn.edu/LDC97S62) + [Fisher](https://catalog.ldc.upenn.edu/LDC2004T19) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv_ftls960_updated.pt) Wav2Vec 2.0 Large (LV-60 + CV + SWBD + FSH) ** | 300 hours Switchboard | [Libri-Light](https://github.com/facebookresearch/libri-light) + [CommonVoice](https://commonvoice.mozilla.org/en/languages) + [Switchboard](https://catalog.ldc.upenn.edu/LDC97S62) + [Fisher](https://catalog.ldc.upenn.edu/LDC2004T19) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/w2v_large_lv_fsh_swbd_cv_ftsb300_updated.pt) \* updated (Oct. 24, 2020)\ ** updated (Nov. 
13, 2021) We also release multilingual pre-trained wav2vec 2.0 (XLSR) models: Model | Architecture | Hours | Languages | Datasets | Model |---|---|---|---|---|--- XLSR-53 | Large | 56k | 53 | MLS, CommonVoice, BABEL | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_53_56k.pt) The XLSR model uses the following datasets for multilingual pretraining: * **[MLS: Multilingual LibriSpeech](https://indico2.conference4me.psnc.pl/event/35/contributions/3585/attachments/1060/1101/Wed-2-6-10.pdf)** (8 languages, 50.7k hours): *Dutch, English, French, German, Italian, Polish, Portuguese, Spanish* * **[CommonVoice](https://commonvoice.mozilla.org/en/languages)** (36 languages, 3.6k hours): *Arabic, Basque, Breton, Chinese (CN), Chinese (HK), Chinese (TW), Chuvash, Dhivehi, Dutch, English, Esperanto, Estonian, French, German, Hakh-Chin, Indonesian, Interlingua, Irish, Italian, Japanese, Kabyle, Kinyarwanda, Kyrgyz, Latvian, Mongolian, Persian, Portuguese, Russian, Sakha, Slovenian, Spanish, Swedish, Tamil, Tatar, Turkish, Welsh* (see also [finetuning splits](https://dl.fbaipublicfiles.com/cpc_audio/common_voices_splits.tar.gz) from [this paper](https://arxiv.org/abs/2002.02848)). * **[Babel](https://catalog.ldc.upenn.edu/byyear)** (17 languages, 1.7k hours): *Assamese, Bengali, Cantonese, Cebuano, Georgian, Haitian, Kazakh, Kurmanji, Lao, Pashto, Swahili, Tagalog, Tamil, Tok, Turkish, Vietnamese, Zulu* We also finetuned several models on languages from [CommonVoice](https://commonvoice.mozilla.org/en/languages) (version 6.1) and [Babel](https://catalog.ldc.upenn.edu/byyear). Please refer to [our paper](https://arxiv.org/abs/2109.11680) for details about which languages are used.
Pretrained Model | Finetune Dataset | # Languages | Phonemizer | Model | Dictionary |---|---|---|---|---|--- LV-60 | CommonVoice | 26 | [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_en_26lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_dict.txt) XLSR-53 | CommonVoice | 26 | [Espeak](https://github.com/espeak-ng/espeak-ng/blob/master/docs/languages.md) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_26lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/espeak_dict.txt) XLSR-53 | CommonVoice | 21 | [Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_21lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_dict.txt) XLSR-53 | CommonVoice, BABEL | 21, 19 | [Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_40lang_m10.pt) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/zero_shot/phonetisaurus_40lang.dict.txt) We release 2 models that are finetuned on data from 2 different phonemizers. Although the phonemes are all [IPA](https://en.wikipedia.org/wiki/International_Phonetic_Alphabet) symbols, there are still subtle differences between the phonemized transcriptions from the 2 phonemizers. Thus, it's better to use the corresponding model, if your data is phonemized by either phonemizer above.
## Training a new model with the CLI tools Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files of 10 to 30 seconds in length): ### Prepare training data manifest First, install the `soundfile` library: ```shell script pip install soundfile ``` Next, run: ```shell script python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext $ext --valid-percent $valid ``` $ext should be set to flac, wav, or whatever format your dataset happens to use that soundfile can read. $valid should be set to some reasonable percentage (like 0.01) of training data to use for validation. To use a pre-defined validation set (like dev-other from librispeech), set it to 0 and then overwrite valid.tsv with a separately pre-processed manifest file. ### Train a wav2vec 2.0 base model This configuration was used for the base model trained on the Librispeech dataset in the wav2vec 2.0 paper. Note that the input is expected to be single channel, sampled at 16 kHz. ```shell script $ fairseq-hydra-train \ task.data=/path/to/data \ --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ --config-name wav2vec2_base_librispeech ``` Note: you can simulate 64 GPUs by using k GPUs and adding command line parameters (before `--config-dir`) `distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 64/k ### Train a wav2vec 2.0 large model This configuration was used for the large model trained on the Libri-light dataset in the wav2vec 2.0 paper. ```shell script $ fairseq-hydra-train \ task.data=/path/to/data \ --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ --config-name wav2vec2_large_librivox ``` Note: you can simulate 128 GPUs by using k GPUs and adding command line parameters (before `--config-dir`) `distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 128/k ### Train a wav2vec 2.0 model with conformer backbone
To replace the transformer layers in the encoder with the conformer layers, set `--layer-type conformer --attn-type espnet --pos-enc-type ${POS_ENC_TYPE}`. `POS_ENC_TYPE` refers to positional encoding to be used in the conformer encoder. Set it to `abs`, `rope` or `rel_pos` to use the absolute positional encoding, rotary positional encoding or relative positional encoding in the conformer layer respectively. To train a base model with conformer: ```shell script $ fairseq-hydra-train \ task.data=/path/to/data \ --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ --config-name wav2vec2_conformer_base_librispeech \ --attn-type espnet --pos-enc-type ${POS_ENC_TYPE} ``` To train a large model with conformer: ```shell script $ fairseq-hydra-train \ task.data=/path/to/data \ --config-dir /path/to/fairseq-py/examples/wav2vec/config/pretraining \ --config-name wav2vec2_conformer_large_librivox \ --attn-type espnet --pos-enc-type ${POS_ENC_TYPE} ``` ### Fine-tune a pre-trained model with CTC Fine-tuning a model requires parallel audio and labels file, as well as a vocabulary file in fairseq format. A letter vocabulary can be downloaded [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt). An example [script](libri_labels.py) that generates labels for the Librispeech dataset from the tsv file produced by wav2vec_manifest.py can be used as follows: ```shell script split=train $ python libri_labels.py /path/to/tsv --output-dir /output/dir --output-name $split ``` Fine-tuning on 100h of Librispeech with letter targets: ```shell script $ fairseq-hydra-train \ distributed_training.distributed_port=$PORT \ task.data=/path/to/data \ model.w2v_path=/path/to/model.pt \ --config-dir /path/to/fairseq-py/examples/wav2vec/config/finetuning \ --config-name base_100h ``` There are other config files in the config/finetuning directory that can be used to fine-tune on other splits. You can specify the right config via the `--config-name` parameter.
Note: you can simulate 24 GPUs by using k GPUs and adding command line parameters (before `--config-dir`) `distributed_training.distributed_world_size=k` `+optimization.update_freq='[x]'` where x = 24/k Decoding with a language model during training requires flashlight [python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)). If you want to use a language model, add `+criterion.wer_args='[/path/to/kenlm, /path/to/lexicon, 2, -1]'` to the command line. ### Evaluating a CTC model Evaluating a CTC model with a language model requires [flashlight python bindings](https://github.com/facebookresearch/flashlight/tree/master/bindings/python) (previously called [wav2letter](https://github.com/facebookresearch/wav2letter)) to be installed. Fairseq transformer language model used in the wav2vec 2.0 paper can be obtained from the [wav2letter model repository](https://github.com/facebookresearch/wav2letter/tree/master/recipes/sota/2019). Be sure to upper-case the language model vocab after downloading it. Letter dictionary for pre-trained models can be found [here](https://dl.fbaipublicfiles.com/fairseq/wav2vec/dict.ltr.txt). Next, run the evaluation command: ```shell script subset=dev_other python examples/speech_recognition/infer.py /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw --task audio_finetuning \ --nbest 1 --path /path/to/model --gen-subset $subset --results-path /path/to/save/results/for/sclite --w2l-decoder kenlm \ --lm-model /path/to/kenlm.bin --lm-weight 2 --word-score -1 --sil-weight 0 --criterion ctc --labels ltr --max-tokens 4000000 \ --post-process letter ``` To get raw numbers, use --w2l-decoder viterbi and omit the lexicon. To use the transformer language model, use --w2l-decoder fairseqlm.
## Use wav2vec 2.0 with 🤗Transformers Wav2Vec2 is also available in the [🤗Transformers library](https://github.com/huggingface/transformers) since version 4.4. Pretrained Models can be found on the [hub](https://huggingface.co/models?filter=wav2vec2) and documentation can be found [here](https://huggingface.co/transformers/master/model_doc/wav2vec2.html). Usage example: ```python # !pip install transformers # !pip install datasets import soundfile as sf import torch from datasets import load_dataset from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor # load pretrained model processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") librispeech_samples_ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") # load audio audio_input, sample_rate = sf.read(librispeech_samples_ds[0]["file"]) # pad input values and return pt tensor input_values = processor(audio_input, sampling_rate=sample_rate, return_tensors="pt").input_values # INFERENCE # retrieve logits & take argmax logits = model(input_values).logits predicted_ids = torch.argmax(logits, dim=-1) # transcribe transcription = processor.decode(predicted_ids[0]) # FINE-TUNE target_transcription = "A MAN SAID TO THE UNIVERSE I EXIST" # encode labels with processor.as_target_processor(): labels = processor(target_transcription, return_tensors="pt").input_ids # compute loss by passing labels loss = model(input_values, labels=labels).loss loss.backward() ``` # wav2vec Example to train a wav2vec model as described in [wav2vec: Unsupervised Pre-training for Speech Recognition (Schneider et al., 2019)](https://arxiv.org/abs/1904.05862). 
## Pre-trained models Description | Dataset | Model ---|---|--- Wav2Vec large | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/wav2vec_large.pt) #### Example usage ```python import torch import fairseq cp_path = '/path/to/wav2vec.pt' model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path]) model = model[0] model.eval() wav_input_16khz = torch.randn(1,10000) z = model.feature_extractor(wav_input_16khz) c = model.feature_aggregator(z) ``` ## Training a new model with the CLI tools Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length) ### Prepare training data manifest ``` python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav ``` ### Train a wav2vec model ``` $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \ --arch wav2vec --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --lr 0.005 --lr-scheduler cosine \ --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \ --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ --skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \ --max-sample-size 150000 --max-tokens 1500000 --skip-invalid-size-inputs-valid-test ``` ### Run wav2vec2 pre-training on Google Cloud TPUs Wav2Vec2 is now supported on TPUs! It's currently pre-training only. 
#### Using hydra on a v3-8 ``` $ OMP_NUM_THREADS=1 fairseq-hydra-train \ task.data=/manifest/path \ --config-dir /PATH/TO/FAIRSEQ/examples/wav2vec/config/pretraining \ --config-name wav2vec2_large_librivox_tpu.yaml ``` #### Using command line arguments on a v3-8 Note: Commandline arguments way of execution has a [known-problem](https://github.com/pytorch/fairseq/issues/3741) currently. ``` $ OMP_NUM_THREADS=1 python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \ --arch wav2vec2 --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --lr 0.005 --lr-scheduler cosine \ --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \ --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ --skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \ --max-sample-size 150000 --max-tokens 1500000 --skip-invalid-size-inputs-valid-test \ --tpu --distributed-world-size 8 --num-batch-buckets 3 --enable-padding \ --encoder-layerdrop 0 --mask-channel-prob 0.1 ``` #### Using hydra on a pod slice (v3-N with N > 8) ``` $ OMP_NUM_THREADS=1 fairseq-hydra-train \ task.data=/manifest/path \ --config-dir /PATH/TO/FAIRSEQ/examples/wav2vec/config/pretraining \ --config-name wav2vec2_large_librivox_tpu-pod.yaml # edit distributed-world-size accordingly ``` #### Using command line arguments on a pod slice (v3-N with N > 8) Note: Commandline arguments way of execution has a [known-problem](https://github.com/pytorch/fairseq/issues/3741) currently. 
``` $ python -m torch_xla.distributed.xla_dist \ --tpu ${TPUNAME} --conda-env=torch-xla-${TORCH_XLA_VERSION} --env OMP_NUM_THREADS=1 \ -- \ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 --save-interval 1 --no-epoch-checkpoints \ --arch wav2vec2 --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 --optimizer adam --lr 0.005 --lr-scheduler cosine \ --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)] \ --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ --skip-connections-agg --residual-scale 0.5 --log-compression --warmup-updates 500 --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 \ --max-sample-size 150000 --max-tokens 1500000 --skip-invalid-size-inputs-valid-test \ --tpu --distributed-world-size ${WORLD_SIZE} --num-batch-buckets 3 --enable-padding \ --encoder-layerdrop 0 --mask-channel-prob 0.1 ``` ### Extract embeddings from the downstream task data ``` $ PYTHONPATH=/path/to/fairseq python examples/wav2vec/wav2vec_featurize.py --input /path/to/task/waves --output /path/to/output \ --model /model/path/checkpoint_best.pt --split train valid test ``` # vq-wav2vec Example to train a vq-wav2vec model as described in [vq-wav2vec: Self-Supervised Learning of Discrete Speech Representations (Baevski et al., 2019)](https://arxiv.org/abs/1910.05453). These models are also used in [Effectiveness of self-supervised pre-training for speech recognition (Baevski et al., 2019)](https://arxiv.org/abs/1911.03912). 
## Pre-trained models Description | Dataset | Model ---|---|--- vq-wav2vec Gumbel | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/vq-wav2vec.pt) vq-wav2vec K-means | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/vq-wav2vec_kmeans.pt) Roberta on K-means codes | [Librispeech](http://www.openslr.org/12) | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/bert_kmeans.tar) #### Example usage ```python import torch import fairseq cp_path = '/path/to/vq-wav2vec.pt' model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([cp_path]) model = model[0] model.eval() wav_input_16khz = torch.randn(1,10000) z = model.feature_extractor(wav_input_16khz) _, idxs = model.vector_quantizer.forward_idx(z) print(idxs.shape) # output: torch.Size([1, 60, 2]), 60 timesteps with 2 indexes corresponding to 2 groups in the model ``` ## Training a new model with the CLI tools Given a directory containing wav files to be used for pretraining (we recommend splitting each file into separate files 10 to 30 seconds in length) ### Prepare training data manifest ``` python examples/wav2vec/wav2vec_manifest.py /path/to/waves --dest /manifest/path --ext wav ``` ### Train a gumbel vq-wav2vec model ``` $ python train.py /manifest/path --save-dir /model/path --num-workers 6 --fp16 --max-update 400000 \ --save-interval 1 --no-epoch-checkpoints --arch wav2vec --task audio_pretraining --min-lr 1e-06 --stop-min-lr 1e-09 \ --optimizer adam --lr 1e-05 --lr-scheduler cosine \ --conv-feature-layers [(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)] \ --conv-aggregator-layers [(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)] \ --activation gelu --offset auto --skip-connections-agg --residual-scale 0.5 \ --log-keys
["prob_perplexity","code_perplexity","temp"] --vq-type gumbel --vq-groups 2 --vq-depth 2 \ --combine-groups --vq-vars 320 --vq-temp (2,0.5,0.999995) --prediction-steps 12 --warmup-updates 1000 \ --warmup-init-lr 1e-07 --criterion wav2vec --num-negatives 10 --max-sample-size 150000 \ --max-tokens 300000 --cross-sample-negatives 0 --update-freq 1 --seed 2 --skip-invalid-size-inputs-valid-test ``` for k-means training, set vq-type with "kmeans" and add --loss-weights [1] argument. Pre-trained models were trained on 16 GPUs. ### Tokenize audio data (e.g. for BERT training) ``` $ PYTHONPATH=/path/to/fairseq python examples/wav2vec/vq-wav2vec_featurize.py --data-dir /manifest/path --output-dir /path/to/output \ --checkpoint /model/path/checkpoint_best.pt --split train valid test --extension tsv ``` ================================================ FILE: examples/wav2vec/__init__.py ================================================ ================================================ FILE: examples/wav2vec/config/finetuning/base_100h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: false labels: ltr dataset: num_workers: 6 max_tokens: 3200000 skip_invalid_size_inputs_valid_test: true valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 2 criterion: _name: ctc zero_infinity: true optimization: max_update: 80000 lr: [0.00003] sentence_avg: true update_freq: [4] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.65 mask_channel_prob: 0.5 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 0 ================================================ FILE: examples/wav2vec/config/finetuning/base_10h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval: 50 save_interval_updates: 10000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: false labels: ltr dataset: num_workers: 6 max_tokens: 3200000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 50 valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 2 criterion: _name: ctc zero_infinity: true optimization: max_update: 20000 lr: [0.00005] sentence_avg: true update_freq: [4] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.65 mask_channel_prob: 0.5 mask_channel_length: 64 layerdrop: 0.05 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/base_10m.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval: 1000 save_interval_updates: 50 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? 
normalize: false labels: ltr dataset: num_workers: 6 max_tokens: 3200000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 1000 valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 2 criterion: _name: ctc zero_infinity: true optimization: max_update: 13000 lr: [0.00005] sentence_avg: true update_freq: [4] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.65 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/base_1h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval: 50 save_interval_updates: 1000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: false labels: ltr dataset: num_workers: 6 max_tokens: 3200000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 1000 valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 2 criterion: _name: ctc zero_infinity: true optimization: max_update: 13000 lr: [0.00005] sentence_avg: true update_freq: [4] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.65 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/base_960h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: false labels: ltr dataset: num_workers: 6 max_tokens: 3200000 skip_invalid_size_inputs_valid_test: true valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true optimization: max_update: 320000 lr: [0.0001] sentence_avg: true optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.5 mask_channel_prob: 0.1 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 0 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_1.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_16.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 16 name: ${env:PREFIX}_${hydra.job.config_name} partition: learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 exclude: learnfair1381,learnfair5192,learnfair2304 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_1_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - 
distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.local_cache_path - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_1_old.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 450 nodes: 1 name: ${env:PREFIX}_wav2vec3_small_librispeech partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 exclude: learnfair1381 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_2.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} 
launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 exclude: learnfair7491,learnfair7477,learnfair7487 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_2_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.local_cache_path - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 8 tasks_per_node: 1 mem_gb: 0 nodes: 2 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_2g.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 2 tasks_per_node: 2 mem_gb: 200 nodes: 1 name: 
${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_3.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 450 nodes: 3 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 exclude: learnfair7491,learnfair7477,learnfair7487 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_4g.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 200 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_4g_aws.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '/' exclude_keys: - run_config - distributed_training.distributed_port - 
distributed_training.distributed_world_size - model.pretrained_model_path - model.target_network_path - next_script - task.cache_in_scratch - task.local_cache_path - task.data - checkpoint.save_interval_updates - checkpoint.keep_interval_updates - checkpoint.save_on_overflow - common.log_interval - common.user_dir sweep: dir: /fsx-wav2vec/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: '' launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 80 gpus_per_node: 4 tasks_per_node: 1 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab,learnfair max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/run_config/slurm_8.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 4320 cpus_per_task: 10 gpus_per_node: 8 tasks_per_node: 8 mem_gb: 400 nodes: 8 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_100h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? 
normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true optimization: max_update: 80000 lr: [0.00003] sentence_avg: true update_freq: [5] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.5 mask_channel_prob: 0.5 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/vox_100h_2.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 1 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: 0 wer_sil_weight: -2 optimization: max_update: 100000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 
hold_steps: 0 decay_steps: 72000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.4 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_100h_2_aws.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/libri/100h/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 1 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: 0 wer_sil_weight: -2 
optimization: max_update: 100000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 hold_steps: 0 decay_steps: 82000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.4 mask_length: 7 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 ================================================ FILE: examples/wav2vec/config/finetuning/vox_100h_3.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 1 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: -1.0 optimization: max_update: 100000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 8000 model: _name: wav2vec_ctc 
w2v_path: ??? apply_mask: true mask_prob: 0.4 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval: 50 save_interval_updates: 10000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 50 valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true optimization: max_update: 20000 lr: [0.0001] sentence_avg: true update_freq: [5] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.75 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10h_2.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 10 no_epoch_checkpoints: true best_checkpoint_metric: wer keep_interval_updates: 1 task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 10 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: -1.0 optimization: max_update: 60000 lr: [2e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 8000 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.5 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10h_2_aws.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 10 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 10 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: 4 wer_sil_weight: -5 optimization: max_update: 60000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true 
update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 hold_steps: 0 decay_steps: 72000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.75 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10h_aws.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 10 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 10 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter # wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin # wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst # wer_lm_weight: 2.0 # wer_word_score: -1.0 optimization: max_update: 60000 lr: [2e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 hold_steps: 0 decay_steps: 72000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.4 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: wav2vec,learnlab max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10h_aws_v100.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 # tensorboard_logdir: tb checkpoint: save_interval: 10 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx/abaevski/data/libri/10h/wav2vec/raw labels: ltr cache_in_scratch: true dataset: num_workers: 10 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 10 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_lexicon: /fsx/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: -1.0 optimization: max_update: 60000 lr: [2e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 hold_steps: 0 
decay_steps: 72000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.6 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /fsx/${env:USER}/w2v_ft/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 0 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: learnfair max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10m.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval: 1000 save_interval_updates: 50 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 1000 valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true optimization: max_update: 13000 lr: [0.0001] sentence_avg: true update_freq: [5] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.65 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10m_2.yaml ================================================ # @package _group_ common: fp16: true fp16_no_flatten_grads: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 500 save_interval_updates: 500 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/10m/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 500 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 5 wer_word_score: 2 wer_sil_weight: -2 optimization: max_update: 10000 lr: [2e-6] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [4] # base 10h we -> 2/4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 2e-6 optimizer: _name: adam adam_betas: [0.9,0.95] lr_scheduler: _name: cosine warmup_updates: 1000 lr_scheduler: pass_through model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.4 mask_length: 3 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer freeze_finetune_updates: 100 zero_mask: true feature_grad_mult: 0.0 activation_dropout: 0.1 dropout: 0 final_dropout: 0 attention_dropout: 0 update_alibi: false #hydra: # job: # config: # override_dirname: # kv_sep: ':' # item_sep: '__' # exclude_keys: # - run_config # - distributed_training.distributed_port # sweep: # dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} # subdir: ${hydra.job.num} # launcher: # submitit_folder: ${hydra.sweep.dir} # timeout_min: 3000 # cpus_per_task: 10 # gpus_per_node: 4 # tasks_per_node: 4 # mem_gb: 250 # nodes: 1 # name: ${env:PREFIX}_${hydra.job.config_name} # partition: devlab,learnlab,learnfair,scavenge # constraint: volta32gb # max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10m_2_aws.yaml ================================================ # @package _group_ common: fp16: true fp16_no_flatten_grads: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 500 save_interval_updates: 500 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/libri/10m/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 500 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin wer_lexicon: 
/fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 5 wer_word_score: 2 wer_sil_weight: -2 optimization: max_update: 10000 lr: [2e-6] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [4] # base 10h we -> 2/4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 2e-6 optimizer: _name: adam adam_betas: [0.9,0.95] lr_scheduler: _name: cosine warmup_updates: 1000 lr_scheduler: pass_through model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.4 mask_length: 3 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer freeze_finetune_updates: 100 zero_mask: true feature_grad_mult: 0.0 activation_dropout: 0.1 dropout: 0 final_dropout: 0 attention_dropout: 0 update_alibi: false #hydra: # job: # config: # override_dirname: # kv_sep: ':' # item_sep: '__' # exclude_keys: # - run_config # - distributed_training.distributed_port # sweep: # dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} # subdir: ${hydra.job.num} # launcher: # submitit_folder: ${hydra.sweep.dir} # timeout_min: 3000 # cpus_per_task: 10 # gpus_per_node: 4 # tasks_per_node: 4 # mem_gb: 250 # nodes: 1 # name: ${env:PREFIX}_${hydra.job.config_name} # partition: devlab,learnlab,learnfair,scavenge # constraint: volta32gb # max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_10m_3.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1000 save_interval_updates: 100 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/10m/wav2vec/raw labels: 
ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 500 valid_subset: dev_other required_batch_size_multiple: 8 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 8 wer_word_score: 5.8 wer_sil_weight: -8 optimization: max_update: 13000 lr: [2e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [5] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.65 mask_length: 10 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_1h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval: 
1000 save_interval_updates: 50 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 1000 valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true optimization: max_update: 13000 lr: [0.0003] sentence_avg: true update_freq: [5] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.75 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/vox_1h_2.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 100 save_interval_updates: 500 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 100 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 6 wer_word_score: -0.1 wer_sil_weight: 
-4.7 optimization: max_update: 60000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 4000 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.65 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_1h_2_aws.yaml ================================================ # @package _group_ common: fp16: true fp16_no_flatten_grads: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 100 save_interval_updates: 500 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 500 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp 
distributed_world_size: 4 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 5 wer_word_score: 0 wer_sil_weight: -4 optimization: max_update: 10000 lr: [2e-6] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [4] # base 10h we -> 2/4 optimizer: _name: composite dynamic_groups: true groups: default: lr_float: 2e-6 optimizer: _name: adam adam_betas: [0.9,0.95] lr_scheduler: _name: cosine warmup_updates: 1000 lr_scheduler: pass_through model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.4 mask_length: 3 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer freeze_finetune_updates: 100 zero_mask: true feature_grad_mult: 0.0 activation_dropout: 0.1 dropout: 0 final_dropout: 0 attention_dropout: 0 update_alibi: false #hydra: # job: # config: # override_dirname: # kv_sep: ':' # item_sep: '__' # exclude_keys: # - run_config # - distributed_training.distributed_port # sweep: # dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} # subdir: ${hydra.job.num} # launcher: # submitit_folder: ${hydra.sweep.dir} # timeout_min: 3000 # cpus_per_task: 10 # gpus_per_node: 4 # tasks_per_node: 4 # mem_gb: 250 # nodes: 1 # name: ${env:PREFIX}_${hydra.job.config_name} # partition: devlab,learnlab,learnfair,scavenge # constraint: volta32gb # max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_1h_3.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 100 save_interval_updates: 500 
keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 640000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 100 valid_subset: dev_other required_batch_size_multiple: 8 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 6 wer_word_score: -0.1 wer_sil_weight: -4.7 optimization: max_update: 13000 lr: [6e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [5] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 4000 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.3 mask_length: 3 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_1h_4.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 100 save_interval_updates: 1000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 640000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 100 valid_subset: dev_other required_batch_size_multiple: 8 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: -1.0 optimization: max_update: 13000 
lr: [6e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [5] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.65 mask_length: 10 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_1h_aws.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 100 save_interval_updates: 500 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/libri/10m/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval: 100 valid_subset: dev_other required_batch_size_multiple: 8 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 criterion: 
_name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 5 wer_word_score: -0.1 wer_sil_weight: -4.7 optimization: max_update: 13000 lr: [6e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [5] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 4000 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.3 mask_length: 3 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.25 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 dropout: 0 final_dropout: 0 attention_dropout: 0 update_alibi: false ================================================ FILE: examples/wav2vec/config/finetuning/vox_960h.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true valid_subset: dev_other distributed_training: ddp_backend: legacy_ddp distributed_world_size: 24 criterion: _name: ctc zero_infinity: true optimization: max_update: 320000 lr: [0.00003] sentence_avg: true optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.5 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 ================================================ FILE: examples/wav2vec/config/finetuning/vox_960h_2.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/960h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 1 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 16 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: -1.0 optimization: max_update: 200000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 hold_steps: 0 decay_steps: 200000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.4 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/finetuning/vox_960h_2_aws.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /data/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /fsx-wav2vec/abaevski/data/librispeech labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 1 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 16 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /fsx-wav2vec/abaevski/data/libri/4-gram.bin wer_lexicon: /fsx-wav2vec/abaevski/data/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 1.5 wer_word_score: 0 wer_sil_weight: -1 optimization: max_update: 200000 lr: [2e-5] # lr: [1e-5] # base 10h wer sentence_avg: true 
update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: null warmup_steps: 8000 hold_steps: 0 decay_steps: 192000 final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.3 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 ================================================ FILE: examples/wav2vec/config/finetuning/vox_960h_3.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 user_dir: /private/home/abaevski/fairseq-py/examples/data2vec # tensorboard_logdir: tb checkpoint: save_interval: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: /checkpoint/abaevski/data/speech/libri/1h/wav2vec/raw labels: ltr normalize: true dataset: num_workers: 6 max_tokens: 1000000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 100 validate_interval: 1 valid_subset: dev_other required_batch_size_multiple: 1 distributed_training: ddp_backend: legacy_ddp distributed_world_size: 16 criterion: _name: ctc zero_infinity: true post_process: letter wer_kenlm_model: /checkpoint/abaevski/data/speech/libri/4-gram.bin wer_lexicon: /checkpoint/abaevski/data/speech/libri/10h/wav2vec/raw/lexicon_ltr2.lst wer_lm_weight: 2.0 wer_word_score: -1.0 optimization: max_update: 200000 lr: [1e-5] # lr: [1e-5] # base 10h wer sentence_avg: true update_freq: [1] # base 10h we -> 2/4 optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: cosine warmup_updates: 8000 model: _name: wav2vec_ctc w2v_path: ??? 
apply_mask: true mask_prob: 0.4 mask_length: 5 # mask_prob: 0.65 # base 10h wer mask_channel_prob: 0.1 # mask_channel_prob: 0.6 # base 10h wer mask_channel_length: 64 layerdrop: 0.1 # layerdrop: 0.05 # base 10h wer activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 100 dropout: 0 final_dropout: 0 attention_dropout: 0 hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 4 tasks_per_node: 4 mem_gb: 250 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/config/pretraining/wav2vec2_base_librispeech.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 250000 min_sample_size: 32000 normalize: false dataset: num_workers: 6 max_tokens: 1400000 skip_invalid_size_inputs_valid_test: true distributed_training: distributed_world_size: 64 ddp_backend: legacy_ddp criterion: _name: wav2vec infonce: true log_keys: ["prob_perplexity","code_perplexity","temp"] loss_weights: [0.1, 10] optimization: max_update: 400000 lr: [0.0005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: wav2vec2 quantize_targets: true final_dim: 256 encoder_layerdrop: 0.05 dropout_input: 0.1 dropout_features: 0.1 feature_grad_mult: 0.1 encoder_embed_dim: 768 ================================================ FILE: examples/wav2vec/config/pretraining/wav2vec2_conformer_base_librispeech.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 250000 min_sample_size: 32000 normalize: false dataset: num_workers: 6 max_tokens: 1400000 skip_invalid_size_inputs_valid_test: true distributed_training: distributed_world_size: 64 ddp_backend: legacy_ddp criterion: _name: wav2vec infonce: true log_keys: ["prob_perplexity","code_perplexity","temp"] loss_weights: [0.1, 10] optimization: max_update: 400000 lr: [0.0005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: wav2vec2 quantize_targets: true final_dim: 256 encoder_layerdrop: 0.05 dropout_input: 0.1 dropout_features: 0.1 feature_grad_mult: 0.1 encoder_embed_dim: 768 layer_type: conformer attn_type: espnet pos_enc_type: rel_pos ================================================ FILE: examples/wav2vec/config/pretraining/wav2vec2_conformer_large_librivox.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 320000 min_sample_size: 32000 normalize: true dataset: num_workers: 6 max_tokens: 1200000 skip_invalid_size_inputs_valid_test: true distributed_training: distributed_world_size: 128 ddp_backend: legacy_ddp criterion: _name: wav2vec infonce: true log_keys: ["prob_perplexity","code_perplexity","temp"] loss_weights: [0.1, 0] optimization: max_update: 1000000 lr: [0.005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: wav2vec2 quantize_targets: true extractor_mode: layer_norm layer_norm_first: true final_dim: 768 latent_temp: [2.0,0.1,0.999995] encoder_layerdrop: 0.00 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 conv_bias: true encoder_layers: 24 encoder_embed_dim: 1024 encoder_ffn_embed_dim: 4096 encoder_attention_heads: 16 feature_grad_mult: 1.0 layer_type: conformer attn_type: espnet pos_enc_type: rel_pos ================================================ FILE: examples/wav2vec/config/pretraining/wav2vec2_large_librivox.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 320000 min_sample_size: 32000 normalize: true dataset: batch_size: 4 num_workers: 6 max_tokens: 1200000 skip_invalid_size_inputs_valid_test: true distributed_training: distributed_world_size: 128 ddp_backend: legacy_ddp criterion: _name: wav2vec infonce: true log_keys: ["prob_perplexity","code_perplexity","temp"] loss_weights: [0.1, 0] optimization: max_update: 1000000 lr: [0.005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: wav2vec2 quantize_targets: true extractor_mode: layer_norm layer_norm_first: true final_dim: 768 latent_temp: [2.0,0.1,0.999995] encoder_layerdrop: 0.00 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 conv_bias: true encoder_layers: 24 encoder_embed_dim: 1024 encoder_ffn_embed_dim: 4096 encoder_attention_heads: 16 feature_grad_mult: 1.0 ================================================ FILE: examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu-pod.yaml ================================================ # @package _group_ common: tpu: true fp16: false log_format: json log_interval: 10 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 250000 min_sample_size: 32000 normalize: true num_batch_buckets: 3 precompute_mask_indices: true enable_padding: true dataset: num_workers: 6 max_tokens: 1200000 skip_invalid_size_inputs_valid_test: true distributed_training: distributed_world_size: 128 ddp_backend: legacy_ddp criterion: _name: wav2vec infonce: true log_keys: ["prob_perplexity","code_perplexity","temp"] loss_weights: [0.1, 0] optimization: max_update: 1000000 lr: [0.005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: wav2vec2 quantize_targets: true extractor_mode: layer_norm layer_norm_first: true final_dim: 768 latent_temp: [2.0,0.1,0.999995] encoder_layerdrop: 0.00 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 conv_bias: true encoder_layers: 24 encoder_embed_dim: 1024 encoder_ffn_embed_dim: 4096 encoder_attention_heads: 16 feature_grad_mult: 1.0 ================================================ FILE: examples/wav2vec/config/pretraining/wav2vec2_large_librivox_tpu.yaml ================================================ # @package _group_ common: tpu: true fp16: false log_format: json log_interval: 10 checkpoint: save_interval_updates: 25000 keep_interval_updates: 1 no_epoch_checkpoints: true task: _name: audio_pretraining data: ??? 
max_sample_size: 250000 min_sample_size: 32000 normalize: true num_batch_buckets: 3 precompute_mask_indices: true enable_padding: true inferred_w2v_config: mask_prob: 0.65 mask_selection: 'static' mask_other: 0 mask_channel_prob: 0.1 dataset: num_workers: 6 max_tokens: 1200000 skip_invalid_size_inputs_valid_test: true distributed_training: distributed_world_size: 8 ddp_backend: legacy_ddp criterion: _name: wav2vec infonce: true log_keys: ["prob_perplexity","code_perplexity","temp"] loss_weights: [0.1, 0] optimization: max_update: 1000000 lr: [0.005] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-06 weight_decay: 0.01 lr_scheduler: _name: polynomial_decay warmup_updates: 32000 model: _name: wav2vec2 quantize_targets: true extractor_mode: layer_norm layer_norm_first: true final_dim: 768 latent_temp: [2.0,0.1,0.999995] encoder_layerdrop: 0.00 dropout_input: 0.0 dropout_features: 0.0 dropout: 0.0 attention_dropout: 0.0 conv_bias: true encoder_layers: 24 encoder_embed_dim: 1024 encoder_ffn_embed_dim: 4096 encoder_attention_heads: 16 feature_grad_mult: 1.0 ================================================ FILE: examples/wav2vec/libri_labels.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
""" Helper script to generate letter (.ltr) and word (.wrd) label files for the utterances listed in a tsv manifest, using the *.trans.txt transcripts stored next to the audio files """ import argparse import os def main(): parser = argparse.ArgumentParser() parser.add_argument("tsv") parser.add_argument("--output-dir", required=True) parser.add_argument("--output-name", required=True) args = parser.parse_args() os.makedirs(args.output_dir, exist_ok=True) transcriptions = {} with open(args.tsv, "r") as tsv, open( os.path.join(args.output_dir, args.output_name + ".ltr"), "w" ) as ltr_out, open( os.path.join(args.output_dir, args.output_name + ".wrd"), "w" ) as wrd_out: root = next(tsv).strip() for line in tsv: line = line.strip() dir = os.path.dirname(line) if dir not in transcriptions: parts = dir.split(os.path.sep) trans_path = f"{parts[-2]}-{parts[-1]}.trans.txt" path = os.path.join(root, dir, trans_path) assert os.path.exists(path) texts = {} with open(path, "r") as trans_f: for tline in trans_f: items = tline.strip().split() texts[items[0]] = " ".join(items[1:]) transcriptions[dir] = texts part = os.path.basename(line).split(".")[0] assert part in transcriptions[dir] print(transcriptions[dir][part], file=wrd_out) print( " ".join(list(transcriptions[dir][part].replace(" ", "|"))) + " |", file=ltr_out, ) if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/scripts/binarize_manifest.sh ================================================ #!/usr/bin/env bash # usage: bash binarize_manifest.sh <dest_dir> <train_split> <valid_split> <fairseq_root> DEST_DIR=$1 TRAIN_SPLIT=$2 VALID_SPLIT=$3 FAIRSEQ_ROOT=$4 mkdir -p $DEST_DIR # split file path and lengths into separate files cut -f1 $TRAIN_SPLIT.tsv > $DEST_DIR/train_fnames.txt cut -f1 $VALID_SPLIT.tsv > $DEST_DIR/valid_fnames.txt cut -f2 $TRAIN_SPLIT.tsv > $DEST_DIR/train.lengths cut -f2 $VALID_SPLIT.tsv > $DEST_DIR/valid.lengths # copy root directory head -1 $TRAIN_SPLIT.tsv > $DEST_DIR/train.root head -1 $VALID_SPLIT.tsv > $DEST_DIR/valid.root # remove root
directory sed -i '1d' $DEST_DIR/train_fnames.txt sed -i '1d' $DEST_DIR/valid_fnames.txt sed -i '1d' $DEST_DIR/train.lengths sed -i '1d' $DEST_DIR/valid.lengths # insert spaces between characters sed -i -e 's/\(.\)/\1 /g' $DEST_DIR/train_fnames.txt sed -i -e 's/\(.\)/\1 /g' $DEST_DIR/valid_fnames.txt # run preprocessor PYTHONPATH=$FAIRSEQ_ROOT python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $DEST_DIR/train_fnames.txt --validpref $DEST_DIR/valid_fnames.txt --workers 60 --only-source --destdir $DEST_DIR ================================================ FILE: examples/wav2vec/unsupervised/README.md ================================================ # wav2vec Unsupervised (wav2vec-U) Wav2vec Unsupervised (wav2vec-U) and the 2.0 version are frameworks for building speech recognition systems without any labeled training data as described in [Unsupervised Speech Recognition (Baevski et al., 2021)](https://ai.facebook.com/research/publications/unsupervised-speech-recognition) and [Towards End-to-end Unsupervised Speech Recognition (Liu, et al., 2022)](https://arxiv.org/abs/2204.02492). The model takes as input wav2vec 2.0 or XLSR representations (see [pretrained models](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec)) as well as unlabeled speech and text data. The training procedure consists of three consecutive main steps: * Preparation of speech representations and text data * Generative adversarial training (GAN) * Iterative self-training + Kaldi LM-decoding ## Preparation of speech and text data Similar to [wav2vec 2.0](https://github.com/pytorch/fairseq/blob/main/examples/wav2vec/README.md), data folders contain {train,valid,test}.{tsv,wrd,phn} files, where audio paths are stored in tsv files, and word, letter or phoneme transcriptions are stored in .{wrd,ltr,phn}. In **/path/to/data/with_silence** you need a *train.tsv* file as well as (optionally) *{valid,test}.{tsv,wrd,phn}*. 
It is nice to have *10h.{tsv,phn}* files there too for reproducing the ablation study on layer selection. In **/path/to/data/without_silence** you have the same files, except *.tsv* files contain audios with silences removed using rVAD. Pre-requisites: * set FAIRSEQ_ROOT environmental variable to your fairseq installation * set RVAD_ROOT environmental variable to a checkout of [rVADfast](https://github.com/zhenghuatan/rVADfast) * set KENLM_ROOT environmental variable to the location of [KenLM](https://github.com/kpu/kenlm) binaries * install [PyKaldi](https://github.com/pykaldi/pykaldi) and set KALDI_ROOT environmental variable to the location of your kaldi installation. To use the version bundled with PyKaldi, you can use /path/to/pykaldi/tools/kaldi Create new audio files without silences: ```shell # create a manifest file for the original set of audio files python $FAIRSEQ_ROOT/examples/wav2vec/wav2vec_manifest.py /dir/to/save/audio/files --ext wav --dest /path/to/new/train.tsv --valid-percent 0 python scripts/vads.py -r $RVAD_ROOT < /path/to/train.tsv > train.vads python scripts/remove_silence.py --tsv /path/to/train.tsv --vads train.vads --out /dir/to/save/audio/files python $FAIRSEQ_ROOT/examples/wav2vec/wav2vec_manifest.py /dir/to/save/audio/files --ext wav --dest /path/to/new/train.tsv --valid-percent 0.01 ``` Next, we need to preprocess the audio data to better match phonemized text data: ```shell # wav2vec-U zsh scripts/prepare_audio.sh /dir/with/{train,test,valid}.tsv /output/dir /path/to/wav2vec2/model.pt 512 14 # wav2vec-U 2.0 zsh scripts/prepare_audio_v2.sh /dir/with/{train,test,valid}.tsv /output/dir /path/to/wav2vec2/model.pt 64 14 ``` Note that if you have splits different than train/valid/test, you will need to modify this script. The fourth argument is the PCA dimensionality for wav2vec-U and the number of MFCC clusters for wav2vec-U 2.0. The last argument is the 0-based index of the layer from which to extract representations.
Now we need to prepare text data: ```shell zsh scripts/prepare_text.sh language /path/to/text/file /output/dir 1000 espeak /path/to/fasttext/lid/model sil_prob ``` The fourth argument is the minimum number of observations of phones to keep. If your text corpus is small, you might want to reduce this number. The fifth argument is which phonemizer to use. Supported values are [espeak](http://espeak.sourceforge.net/), [espeak-ng](https://github.com/espeak-ng/espeak-ng), and [G2P](https://github.com/Kyubyong/g2p) (English only). Pre-trained fasttext LID models can be downloaded [here](https://fasttext.cc/docs/en/language-identification.html). The last argument is the probability of introducing silence (`<SIL>`) between words. We found that the values `0.25`/`0.5` work well in general for wav2vec-U and the 2.0 version respectively, but you might want to vary this value for languages that have not been tested. ### Prepare TIMIT data TIMIT transcripts include silence. Therefore VAD is not used for audio preprocessing, and we do not wrap transcripts with silences or insert random silence in between words. To prepare TIMIT data for both the matched and unmatched setup: ```shell bash scripts/prepare_timit.sh /dir/to/timit/raw/data /output/dir /path/to/wav2vec2/model.pt ``` Note that we assume the TIMIT distribution with capitalized directories and filenames is used (e.g., `TRAIN/DR1/FCJF0/SA1.PHN`). ## Generative adversarial training (GAN) We then use a GAN model to build a first unsupervised ASR model. The data preparation above of both speech features and text data is a necessary procedure that enables the generator to match speech to text in an unsupervised way.
Launching GAN training on top of preprocessed features, with default hyperparameters, can be done with: ``` PREFIX=w2v_unsup_gan_xp # For wav2vec-U, audio features are pre-segmented CONFIG_NAME=w2vu TASK_DATA=/path/to/features/precompute_unfiltered_pca512_cls128_mean_pooled # For wav2vec-U 2.0, use raw audio features CONFIG_NAME=w2vu2 TASK_DATA=/path/to/features/ # Unpaired text input TEXT_DATA=/path/to/data/phones # path to fairseq-preprocessed GAN data (phones dir) KENLM_PATH=/path/to/data/phones/kenlm.phn.o4.bin # KenLM 4-gram phoneme language model (LM data = GAN data here) PYTHONPATH=$FAIRSEQ_ROOT PREFIX=$PREFIX fairseq-hydra-train \ -m --config-dir config/gan \ --config-name $CONFIG_NAME \ task.data=${TASK_DATA} \ task.text_data=${TEXT_DATA} \ task.kenlm_path=${KENLM_PATH} \ common.user_dir=${FAIRSEQ_ROOT}/examples/wav2vec/unsupervised \ model.code_penalty=2,4 model.gradient_penalty=1.5,2.0 \ model.smoothness_weight=0.5,0.75,1.0 'common.seed=range(0,5)' ``` Once we find the best checkpoint (chosen using an unsupervised metric that combines language model perplexity and vocabulary usage), we can use it to generate phone labels (or word labels with an appropriate kaldi WFST): ```shell python w2vu_generate.py --config-dir config/generate --config-name viterbi \ fairseq.common.user_dir=${FAIRSEQ_ROOT}/examples/wav2vec/unsupervised \ fairseq.task.data=/path/to/dir/with/features \ fairseq.common_eval.path=/path/to/gan/checkpoint \ fairseq.dataset.gen_subset=valid results_path=/where/to/save/transcriptions ``` The decoding without LM works best on the same adjacent-mean-pooled features that the GAN was trained on, while decoding with LM works better on features before the adjacent timestep mean-pooling step (without the "_pooled" suffix). While the generator of wav2vec-U 2.0 is trained with an output frequency of 16 Hz, we found decoding at a higher frequency produces better results. This can be done by adding `decode_stride=1` or `2` to the arguments.
## Iterative self-training + Kaldi LM-decoding After the GAN training provides a first unsupervised model, we can then progressively refine the quality of transcriptions using several iterations of semi-supervised learning. We perform two iterations: first, we pseudo-label the training data with the unsupervised GAN model and train an HMM on the pseudo-labels. Second, we relabel the training data with the HMM and then fine-tune the original wav2vec 2.0 model using the HMM pseudo-labels with a CTC loss. Note that HMM models use phonemes as output, while wav2vec 2.0 uses letters. Both are decoded using WFST decoders into words. Please see [this README](kaldi_self_train/README.md) for more instructions on how to do iterative self-training + Kaldi LM-decoding. *** Note: these instructions are a work in progress and will be updated over the next few days ================================================ FILE: examples/wav2vec/unsupervised/__init__.py ================================================ ================================================ FILE: examples/wav2vec/unsupervised/config/finetuning/w2v_finetune.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: no_epoch_checkpoints: true save_interval_updates: 20000 task: _name: audio_finetuning data: ??? normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 800000 skip_invalid_size_inputs_valid_test: true train_subset: train valid_subset: valid distributed_training: ddp_backend: legacy_ddp distributed_world_size: 8 find_unused_parameters: True criterion: _name: ctc zero_infinity: true post_process: letter optimization: max_update: 80000 lr: [0.00003] sentence_avg: true update_freq: [1] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ???
apply_mask: true mask_prob: 0.25 mask_channel_prob: 0.1 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 0 ================================================ FILE: examples/wav2vec/unsupervised/config/gan/w2vu.yaml ================================================ # @package _group_ common: fp16: false fp16_no_flatten_grads: true log_format: json log_interval: 100 tensorboard_logdir: tb reset_logging: false suppress_crashes: false checkpoint: save_interval: 1000 save_interval_updates: 1000 no_epoch_checkpoints: true best_checkpoint_metric: weighted_lm_ppl save_dir: . distributed_training: distributed_world_size: 1 task: _name: unpaired_audio_text data: ??? text_data: ??? labels: phn sort_by_length: false unfiltered: false max_length: null append_eos: false kenlm_path: ??? dataset: num_workers: 6 batch_size: 160 skip_invalid_size_inputs_valid_test: true valid_subset: valid validate_interval: 1000 validate_interval_updates: 1000 criterion: _name: model log_keys: - accuracy_dense - accuracy_token - temp - code_ppl optimization: max_update: 150000 clip_norm: 5.0 lr: [0] optimizer: _name: composite groups: generator: lr: [0.0004] lr_float: null optimizer: _name: adam adam_betas: [0.5,0.98] adam_eps: 1e-06 weight_decay: 0 amsgrad: false lr_scheduler: _name: fixed warmup_updates: 0 discriminator: lr: [ 0.0005 ] lr_float: null optimizer: _name: adam adam_betas: [0.5,0.98] adam_eps: 1e-06 weight_decay: 0.0001 amsgrad: false lr_scheduler: _name: fixed warmup_updates: 0 lr_scheduler: pass_through model: _name: wav2vec_u discriminator_dim: 384 discriminator_depth: 2 discriminator_kernel: 6 discriminator_linear_emb: false discriminator_causal: true discriminator_max_pool: false discriminator_act_after_linear: false discriminator_dropout: 0.0 discriminator_weight_norm: false generator_stride: 1 generator_kernel: 4 generator_bias: false generator_dropout: 0.1 smoothness_weight: 0.5 smoothing: 0 smoothing_one_sided: false 
gumbel: false hard_gumbel: false gradient_penalty: 1.5 code_penalty: 4.0 temp: [ 2,0.1,0.99995 ] input_dim: 512 segmentation: type: JOIN mean_pool_join: false remove_zeros: false ================================================ FILE: examples/wav2vec/unsupervised/config/gan/w2vu2.yaml ================================================ # @package _group_ common: fp16: false fp16_no_flatten_grads: true log_format: json log_interval: 100 tensorboard_logdir: tb reset_logging: false suppress_crashes: false checkpoint: save_interval: 1000 save_interval_updates: 1000 no_epoch_checkpoints: true best_checkpoint_metric: weighted_lm_ppl save_dir: . distributed_training: distributed_world_size: 1 task: _name: unpaired_audio_text data: ??? text_data: ??? labels: phn sort_by_length: false unfiltered: false max_length: null append_eos: false kenlm_path: ??? aux_target_postfix: km dataset: num_workers: 6 batch_size: 160 skip_invalid_size_inputs_valid_test: true valid_subset: valid validate_interval: 1000 validate_interval_updates: 1000 criterion: _name: model log_keys: - accuracy_dense - accuracy_token - temp - code_ppl optimization: max_update: 150000 clip_norm: 5.0 lr: [0] optimizer: _name: composite groups: generator: lr: [0.00005] lr_float: null optimizer: _name: adam adam_betas: [0.5,0.98] adam_eps: 1e-06 weight_decay: 0 amsgrad: false lr_scheduler: _name: fixed warmup_updates: 0 discriminator: lr: [ 0.0003 ] lr_float: null optimizer: _name: adam adam_betas: [0.5,0.98] adam_eps: 1e-06 weight_decay: 0.0001 amsgrad: false lr_scheduler: _name: fixed warmup_updates: 0 lr_scheduler: pass_through model: _name: wav2vec_u discriminator_dim: 384 discriminator_depth: 2 discriminator_kernel: 8 discriminator_linear_emb: false discriminator_causal: true discriminator_max_pool: false discriminator_act_after_linear: false discriminator_dropout: 0.0 discriminator_weight_norm: false generator_stride: 3 generator_kernel: 9 generator_bias: false generator_dropout: 0.1 generator_batch_norm: 30 
generator_residual: true smoothness_weight: 1.5 smoothing: 0 smoothing_one_sided: false gumbel: false hard_gumbel: false gradient_penalty: 1.0 code_penalty: 3.0 temp: [ 2,0.1,0.99995 ] input_dim: 1024 mmi_weight: 0.5 target_dim: 64 segmentation: type: JOIN mean_pool_join: false remove_zeros: false hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - run_config - distributed_training.distributed_port - common.user_dir - task.data - task.kenlm_path - task.text_data - model.generator_layers - task.labels - task.force_model_seed sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}/${hydra.job.override_dirname} subdir: ${hydra.job.num} launcher: submitit_folder: ${hydra.sweep.dir} timeout_min: 3000 cpus_per_task: 10 gpus_per_node: 1 tasks_per_node: 1 mem_gb: 120 nodes: 1 name: ${env:PREFIX}_${hydra.job.config_name} partition: devlab,learnlab,learnfair,scavenge comment: intern_endding_soon constraint: volta32gb max_num_timeout: 30 ================================================ FILE: examples/wav2vec/unsupervised/config/generate/viterbi.yaml ================================================ # @package _group_ fairseq: task: _name: unpaired_audio_text labels: phn data: ??? sort_by_length: false shuffle: false text_data: '' common_eval: path: ??? 
quiet: true dataset: gen_subset: valid batch_size: 1 w2l_decoder: VITERBI post_process: silence ================================================ FILE: examples/wav2vec/unsupervised/config/timit_matched/test.uid ================================================ FDHC0_SI1559 FDHC0_SI2189 FDHC0_SI929 FDHC0_SX119 FDHC0_SX209 FDHC0_SX29 FDHC0_SX299 FDHC0_SX389 FELC0_SI1386 FELC0_SI2016 FELC0_SI756 FELC0_SX126 FELC0_SX216 FELC0_SX306 FELC0_SX36 FELC0_SX396 FJLM0_SI1043 FJLM0_SI1673 FJLM0_SI2303 FJLM0_SX143 FJLM0_SX233 FJLM0_SX323 FJLM0_SX413 FJLM0_SX53 FMGD0_SI1564 FMGD0_SI2194 FMGD0_SI934 FMGD0_SX124 FMGD0_SX214 FMGD0_SX304 FMGD0_SX34 FMGD0_SX394 FMLD0_SI2185 FMLD0_SI822 FMLD0_SI925 FMLD0_SX115 FMLD0_SX205 FMLD0_SX25 FMLD0_SX295 FMLD0_SX385 FNLP0_SI1308 FNLP0_SI1938 FNLP0_SI678 FNLP0_SX138 FNLP0_SX228 FNLP0_SX318 FNLP0_SX408 FNLP0_SX48 FPAS0_SI1272 FPAS0_SI2204 FPAS0_SI944 FPAS0_SX134 FPAS0_SX224 FPAS0_SX314 FPAS0_SX404 FPAS0_SX44 FPKT0_SI1538 FPKT0_SI2168 FPKT0_SI908 FPKT0_SX188 FPKT0_SX278 FPKT0_SX368 FPKT0_SX8 FPKT0_SX98 MBPM0_SI1577 MBPM0_SI1584 MBPM0_SI947 MBPM0_SX137 MBPM0_SX227 MBPM0_SX317 MBPM0_SX407 MBPM0_SX47 MCMJ0_SI1094 MCMJ0_SI464 MCMJ0_SI602 MCMJ0_SX104 MCMJ0_SX14 MCMJ0_SX194 MCMJ0_SX284 MCMJ0_SX374 MDAB0_SI1039 MDAB0_SI1669 MDAB0_SI2299 MDAB0_SX139 MDAB0_SX229 MDAB0_SX319 MDAB0_SX409 MDAB0_SX49 MGRT0_SI1450 MGRT0_SI2080 MGRT0_SI820 MGRT0_SX10 MGRT0_SX100 MGRT0_SX190 MGRT0_SX280 MGRT0_SX370 MJDH0_SI1354 MJDH0_SI1984 MJDH0_SI724 MJDH0_SX184 MJDH0_SX274 MJDH0_SX364 MJDH0_SX4 MJDH0_SX94 MJLN0_SI1449 MJLN0_SI2079 MJLN0_SI819 MJLN0_SX189 MJLN0_SX279 MJLN0_SX369 MJLN0_SX9 MJLN0_SX99 MJMP0_SI1535 MJMP0_SI1791 MJMP0_SI905 MJMP0_SX185 MJMP0_SX275 MJMP0_SX365 MJMP0_SX5 MJMP0_SX95 MKLT0_SI1213 MKLT0_SI1843 MKLT0_SI583 MKLT0_SX133 MKLT0_SX223 MKLT0_SX313 MKLT0_SX403 MKLT0_SX43 MLLL0_SI1363 MLLL0_SI1993 MLLL0_SI733 MLLL0_SX103 MLLL0_SX13 MLLL0_SX193 MLLL0_SX283 MLLL0_SX373 MLNT0_SI1574 MLNT0_SI1902 MLNT0_SI642 MLNT0_SX102 MLNT0_SX12 MLNT0_SX192 MLNT0_SX282 MLNT0_SX372 
MNJM0_SI1580 MNJM0_SI2210 MNJM0_SI950 MNJM0_SX140 MNJM0_SX230 MNJM0_SX320 MNJM0_SX410 MNJM0_SX50 MPAM0_SI1189 MPAM0_SI1819 MPAM0_SI1961 MPAM0_SX109 MPAM0_SX19 MPAM0_SX199 MPAM0_SX289 MPAM0_SX379 MTAS1_SI1473 MTAS1_SI2098 MTAS1_SI838 MTAS1_SX118 MTAS1_SX208 MTAS1_SX28 MTAS1_SX298 MTAS1_SX388 MTLS0_SI1370 MTLS0_SI2000 MTLS0_SI740 MTLS0_SX110 MTLS0_SX20 MTLS0_SX200 MTLS0_SX290 MTLS0_SX380 MWBT0_SI1553 MWBT0_SI2183 MWBT0_SI923 MWBT0_SX113 MWBT0_SX203 MWBT0_SX23 MWBT0_SX293 MWBT0_SX383 MWEW0_SI1361 MWEW0_SI1991 MWEW0_SI731 MWEW0_SX101 MWEW0_SX11 MWEW0_SX191 MWEW0_SX281 MWEW0_SX371 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_matched/train.uid ================================================ FAEM0_SI1392 FAEM0_SI2022 FAEM0_SI762 FAEM0_SX132 FAEM0_SX222 FAEM0_SX312 FAEM0_SX402 FAEM0_SX42 FAJW0_SI1263 FAJW0_SI1893 FAJW0_SI633 FAJW0_SX183 FAJW0_SX273 FAJW0_SX3 FAJW0_SX363 FAJW0_SX93 FALK0_SI1086 FALK0_SI456 FALK0_SI658 FALK0_SX186 FALK0_SX276 FALK0_SX366 FALK0_SX6 FALK0_SX96 FALR0_SI1325 FALR0_SI1955 FALR0_SI695 FALR0_SX155 FALR0_SX245 FALR0_SX335 FALR0_SX425 FALR0_SX65 FAPB0_SI1063 FAPB0_SI1693 FAPB0_SI2323 FAPB0_SX163 FAPB0_SX253 FAPB0_SX343 FAPB0_SX433 FAPB0_SX73 FBAS0_SI1387 FBAS0_SI1472 FBAS0_SI2066 FBAS0_SX127 FBAS0_SX217 FBAS0_SX307 FBAS0_SX37 FBAS0_SX397 FBCG1_SI1612 FBCG1_SI2242 FBCG1_SI982 FBCG1_SX172 FBCG1_SX262 FBCG1_SX352 FBCG1_SX442 FBCG1_SX82 FBCH0_SI1586 FBCH0_SI956 FBCH0_SI959 FBCH0_SX146 FBCH0_SX236 FBCH0_SX326 FBCH0_SX416 FBCH0_SX56 FBJL0_SI1552 FBJL0_SI2182 FBJL0_SI922 FBJL0_SX112 FBJL0_SX202 FBJL0_SX22 FBJL0_SX292 FBJL0_SX382 FBLV0_SI1058 FBLV0_SI1688 FBLV0_SI2318 FBLV0_SX158 FBLV0_SX248 FBLV0_SX338 FBLV0_SX428 FBLV0_SX68 FBMH0_SI1136 FBMH0_SI1766 FBMH0_SI970 FBMH0_SX146 FBMH0_SX236 FBMH0_SX326 FBMH0_SX416 FBMH0_SX56 FBMJ0_SI1776 FBMJ0_SI516 FBMJ0_SI815 FBMJ0_SX156 FBMJ0_SX246 FBMJ0_SX336 FBMJ0_SX426 FBMJ0_SX66 FCAG0_SI1503 FCAG0_SI1641 FCAG0_SI2133 FCAG0_SX153 FCAG0_SX243 FCAG0_SX333 FCAG0_SX423 
FCAG0_SX63 FCAJ0_SI1479 FCAJ0_SI1804 FCAJ0_SI849 FCAJ0_SX129 FCAJ0_SX219 FCAJ0_SX309 FCAJ0_SX39 FCAJ0_SX399 FCDR1_SI1186 FCDR1_SI1816 FCDR1_SI556 FCDR1_SX106 FCDR1_SX16 FCDR1_SX196 FCDR1_SX286 FCDR1_SX376 FCEG0_SI1248 FCEG0_SI1878 FCEG0_SI618 FCEG0_SX168 FCEG0_SX258 FCEG0_SX348 FCEG0_SX438 FCEG0_SX78 FCJF0_SI1027 FCJF0_SI1657 FCJF0_SI648 FCJF0_SX127 FCJF0_SX217 FCJF0_SX307 FCJF0_SX37 FCJF0_SX397 FCJS0_SI1607 FCJS0_SI2237 FCJS0_SI977 FCJS0_SX167 FCJS0_SX257 FCJS0_SX347 FCJS0_SX437 FCJS0_SX77 FCKE0_SI1111 FCKE0_SI1741 FCKE0_SI481 FCKE0_SX121 FCKE0_SX211 FCKE0_SX301 FCKE0_SX31 FCKE0_SX391 FCLT0_SI1438 FCLT0_SI2068 FCLT0_SI808 FCLT0_SX178 FCLT0_SX268 FCLT0_SX358 FCLT0_SX448 FCLT0_SX88 FCMG0_SI1142 FCMG0_SI1242 FCMG0_SI1872 FCMG0_SX162 FCMG0_SX252 FCMG0_SX342 FCMG0_SX432 FCMG0_SX72 FCMM0_SI1083 FCMM0_SI1957 FCMM0_SI453 FCMM0_SX183 FCMM0_SX273 FCMM0_SX363 FCMM0_SX420 FCMM0_SX93 FCRZ0_SI1913 FCRZ0_SI2053 FCRZ0_SI793 FCRZ0_SX163 FCRZ0_SX253 FCRZ0_SX343 FCRZ0_SX433 FCRZ0_SX73 FCYL0_SI1297 FCYL0_SI1927 FCYL0_SI667 FCYL0_SX127 FCYL0_SX217 FCYL0_SX349 FCYL0_SX37 FCYL0_SX397 FDAS1_SI1461 FDAS1_SI2091 FDAS1_SI831 FDAS1_SX111 FDAS1_SX201 FDAS1_SX21 FDAS1_SX291 FDAS1_SX381 FDAW0_SI1271 FDAW0_SI1406 FDAW0_SI2036 FDAW0_SX146 FDAW0_SX236 FDAW0_SX326 FDAW0_SX416 FDAW0_SX56 FDFB0_SI1318 FDFB0_SI1948 FDFB0_SI2010 FDFB0_SX148 FDFB0_SX238 FDFB0_SX328 FDFB0_SX418 FDFB0_SX58 FDJH0_SI1565 FDJH0_SI2195 FDJH0_SI935 FDJH0_SX125 FDJH0_SX215 FDJH0_SX305 FDJH0_SX35 FDJH0_SX395 FDKN0_SI1081 FDKN0_SI1202 FDKN0_SI1711 FDKN0_SX181 FDKN0_SX271 FDKN0_SX361 FDKN0_SX451 FDKN0_SX91 FDML0_SI1149 FDML0_SI1779 FDML0_SI2075 FDML0_SX159 FDML0_SX249 FDML0_SX339 FDML0_SX429 FDML0_SX69 FDMY0_SI1197 FDMY0_SI567 FDMY0_SI714 FDMY0_SX117 FDMY0_SX207 FDMY0_SX27 FDMY0_SX297 FDMY0_SX387 FDNC0_SI1278 FDNC0_SI1908 FDNC0_SI2287 FDNC0_SX108 FDNC0_SX18 FDNC0_SX198 FDNC0_SX288 FDNC0_SX378 FDTD0_SI1561 FDTD0_SI2191 FDTD0_SI931 FDTD0_SX121 FDTD0_SX211 FDTD0_SX301 FDTD0_SX321 FDTD0_SX391 FDXW0_SI1511 FDXW0_SI2141 FDXW0_SI881 
FDXW0_SX161 FDXW0_SX251 FDXW0_SX341 FDXW0_SX431 FDXW0_SX71 FEAC0_SI1245 FEAC0_SI1875 FEAC0_SI615 FEAC0_SX165 FEAC0_SX255 FEAC0_SX345 FEAC0_SX435 FEAC0_SX75 FEAR0_SI1252 FEAR0_SI1882 FEAR0_SI622 FEAR0_SX172 FEAR0_SX262 FEAR0_SX352 FEAR0_SX442 FEAR0_SX82 FECD0_SI1418 FECD0_SI2048 FECD0_SI788 FECD0_SX158 FECD0_SX248 FECD0_SX338 FECD0_SX428 FECD0_SX68 FEEH0_SI1112 FEEH0_SI1742 FEEH0_SI471 FEEH0_SX122 FEEH0_SX212 FEEH0_SX302 FEEH0_SX32 FEEH0_SX392 FEME0_SI1505 FEME0_SI2135 FEME0_SI875 FEME0_SX155 FEME0_SX245 FEME0_SX335 FEME0_SX425 FEME0_SX65 FETB0_SI1148 FETB0_SI1778 FETB0_SI518 FETB0_SX158 FETB0_SX248 FETB0_SX338 FETB0_SX428 FETB0_SX68 FEXM0_SI1101 FEXM0_SI1731 FEXM0_SI482 FEXM0_SX111 FEXM0_SX201 FEXM0_SX291 FEXM0_SX366 FEXM0_SX381 FGCS0_SI1486 FGCS0_SI2116 FGCS0_SI856 FGCS0_SX136 FGCS0_SX226 FGCS0_SX316 FGCS0_SX406 FGCS0_SX46 FGDP0_SI1618 FGDP0_SI2248 FGDP0_SI988 FGDP0_SX178 FGDP0_SX268 FGDP0_SX358 FGDP0_SX448 FGDP0_SX88 FGMB0_SI1145 FGMB0_SI1775 FGMB0_SI515 FGMB0_SX155 FGMB0_SX245 FGMB0_SX335 FGMB0_SX425 FGMB0_SX65 FGRW0_SI1152 FGRW0_SI1782 FGRW0_SI1990 FGRW0_SX162 FGRW0_SX252 FGRW0_SX342 FGRW0_SX432 FGRW0_SX72 FHLM0_SI1560 FHLM0_SI2190 FHLM0_SI930 FHLM0_SX120 FHLM0_SX210 FHLM0_SX300 FHLM0_SX349 FHLM0_SX390 FHXS0_SI1075 FHXS0_SI2302 FHXS0_SI2335 FHXS0_SX175 FHXS0_SX265 FHXS0_SX355 FHXS0_SX445 FHXS0_SX85 FJDM2_SI1582 FJDM2_SI1964 FJDM2_SI2212 FJDM2_SX142 FJDM2_SX232 FJDM2_SX322 FJDM2_SX412 FJDM2_SX52 FJEN0_SI1047 FJEN0_SI1677 FJEN0_SI2307 FJEN0_SX147 FJEN0_SX237 FJEN0_SX327 FJEN0_SX417 FJEN0_SX57 FJHK0_SI1022 FJHK0_SI1652 FJHK0_SI2282 FJHK0_SX122 FJHK0_SX212 FJHK0_SX302 FJHK0_SX32 FJHK0_SX392 FJKL0_SI1562 FJKL0_SI2192 FJKL0_SI932 FJKL0_SX122 FJKL0_SX212 FJKL0_SX302 FJKL0_SX32 FJKL0_SX392 FJLG0_SI1506 FJLG0_SI1889 FJLG0_SI2306 FJLG0_SX179 FJLG0_SX269 FJLG0_SX359 FJLG0_SX449 FJLG0_SX89 FJLR0_SI1231 FJLR0_SI1861 FJLR0_SI601 FJLR0_SX151 FJLR0_SX241 FJLR0_SX331 FJLR0_SX421 FJLR0_SX61 FJRB0_SI1302 FJRB0_SI1932 FJRB0_SI672 FJRB0_SX132 FJRB0_SX222 FJRB0_SX312 FJRB0_SX402 
FJRB0_SX42 FJRP1_SI1432 FJRP1_SI2062 FJRP1_SI802 FJRP1_SX172 FJRP1_SX262 FJRP1_SX352 FJRP1_SX442 FJRP1_SX82 FJSK0_SI1052 FJSK0_SI1682 FJSK0_SI2312 FJSK0_SX152 FJSK0_SX242 FJSK0_SX332 FJSK0_SX422 FJSK0_SX62 FJSP0_SI1434 FJSP0_SI1763 FJSP0_SI804 FJSP0_SX174 FJSP0_SX264 FJSP0_SX354 FJSP0_SX444 FJSP0_SX84 FJWB1_SI2055 FJWB1_SI748 FJWB1_SI795 FJWB1_SX165 FJWB1_SX255 FJWB1_SX345 FJWB1_SX435 FJWB1_SX75 FJXM0_SI1211 FJXM0_SI1971 FJXM0_SI581 FJXM0_SX131 FJXM0_SX221 FJXM0_SX311 FJXM0_SX401 FJXM0_SX41 FJXP0_SI1122 FJXP0_SI1752 FJXP0_SI492 FJXP0_SX132 FJXP0_SX222 FJXP0_SX312 FJXP0_SX402 FJXP0_SX42 FKAA0_SI1208 FKAA0_SI1838 FKAA0_SI578 FKAA0_SX128 FKAA0_SX218 FKAA0_SX308 FKAA0_SX38 FKAA0_SX398 FKDE0_SI1141 FKDE0_SI1771 FKDE0_SI2221 FKDE0_SX151 FKDE0_SX241 FKDE0_SX331 FKDE0_SX421 FKDE0_SX61 FKDW0_SI1207 FKDW0_SI1891 FKDW0_SI577 FKDW0_SX127 FKDW0_SX217 FKDW0_SX307 FKDW0_SX37 FKDW0_SX397 FKFB0_SI1608 FKFB0_SI2238 FKFB0_SI978 FKFB0_SX168 FKFB0_SX258 FKFB0_SX348 FKFB0_SX438 FKFB0_SX78 FKKH0_SI1290 FKKH0_SI1920 FKKH0_SI660 FKKH0_SX120 FKKH0_SX210 FKKH0_SX30 FKKH0_SX300 FKKH0_SX390 FKLC0_SI1615 FKLC0_SI2245 FKLC0_SI985 FKLC0_SX175 FKLC0_SX265 FKLC0_SX355 FKLC0_SX445 FKLC0_SX85 FKLC1_SI1048 FKLC1_SI1678 FKLC1_SI2308 FKLC1_SX148 FKLC1_SX238 FKLC1_SX328 FKLC1_SX418 FKLC1_SX58 FKLH0_SI1257 FKLH0_SI1887 FKLH0_SI627 FKLH0_SX177 FKLH0_SX267 FKLH0_SX357 FKLH0_SX447 FKLH0_SX87 FKSR0_SI1117 FKSR0_SI1747 FKSR0_SI487 FKSR0_SX161 FKSR0_SX217 FKSR0_SX366 FKSR0_SX37 FKSR0_SX397 FLAC0_SI1339 FLAC0_SI2161 FLAC0_SI901 FLAC0_SX181 FLAC0_SX271 FLAC0_SX361 FLAC0_SX451 FLAC0_SX91 FLAG0_SI1464 FLAG0_SI2094 FLAG0_SI834 FLAG0_SX114 FLAG0_SX204 FLAG0_SX24 FLAG0_SX294 FLAG0_SX384 FLEH0_SI1051 FLEH0_SI1681 FLEH0_SI2311 FLEH0_SX151 FLEH0_SX241 FLEH0_SX331 FLEH0_SX421 FLEH0_SX61 FLET0_SI1137 FLET0_SI1767 FLET0_SI507 FLET0_SX147 FLET0_SX237 FLET0_SX277 FLET0_SX417 FLET0_SX57 FLHD0_SI1344 FLHD0_SI1827 FLHD0_SI1974 FLHD0_SX174 FLHD0_SX264 FLHD0_SX354 FLHD0_SX444 FLHD0_SX84 FLJA0_SI1078 FLJA0_SI1708 FLJA0_SI2338 
FLJA0_SX178 FLJA0_SX268 FLJA0_SX358 FLJA0_SX448 FLJA0_SX88 FLJD0_SI1516 FLJD0_SI2146 FLJD0_SI886 FLJD0_SX166 FLJD0_SX256 FLJD0_SX346 FLJD0_SX436 FLJD0_SX76 FLJG0_SI1611 FLJG0_SI2241 FLJG0_SI981 FLJG0_SX171 FLJG0_SX261 FLJG0_SX351 FLJG0_SX441 FLJG0_SX81 FLKM0_SI1880 FLKM0_SI620 FLKM0_SI686 FLKM0_SX116 FLKM0_SX260 FLKM0_SX350 FLKM0_SX440 FLKM0_SX80 FLMA0_SI1243 FLMA0_SI1873 FLMA0_SI613 FLMA0_SX163 FLMA0_SX253 FLMA0_SX343 FLMA0_SX433 FLMA0_SX73 FLMC0_SI1372 FLMC0_SI2002 FLMC0_SI742 FLMC0_SX112 FLMC0_SX22 FLMC0_SX292 FLMC0_SX336 FLMC0_SX382 FLMK0_SI1035 FLMK0_SI1229 FLMK0_SI2295 FLMK0_SX135 FLMK0_SX225 FLMK0_SX315 FLMK0_SX405 FLMK0_SX45 FLOD0_SI1287 FLOD0_SI1917 FLOD0_SI657 FLOD0_SX117 FLOD0_SX171 FLOD0_SX207 FLOD0_SX297 FLOD0_SX387 FLTM0_SI1070 FLTM0_SI1700 FLTM0_SI2330 FLTM0_SX170 FLTM0_SX260 FLTM0_SX350 FLTM0_SX440 FLTM0_SX80 FMAH1_SI1509 FMAH1_SI2139 FMAH1_SI879 FMAH1_SX159 FMAH1_SX249 FMAH1_SX339 FMAH1_SX429 FMAH1_SX69 FMBG0_SI1160 FMBG0_SI1790 FMBG0_SI2264 FMBG0_SX260 FMBG0_SX3 FMBG0_SX350 FMBG0_SX440 FMBG0_SX80 FMEM0_SI1377 FMEM0_SI2007 FMEM0_SI747 FMEM0_SX117 FMEM0_SX207 FMEM0_SX297 FMEM0_SX333 FMEM0_SX387 FMJB0_SI1177 FMJB0_SI1807 FMJB0_SI547 FMJB0_SX187 FMJB0_SX277 FMJB0_SX367 FMJB0_SX7 FMJB0_SX97 FMJF0_SI1254 FMJF0_SI1884 FMJF0_SI624 FMJF0_SX174 FMJF0_SX264 FMJF0_SX354 FMJF0_SX444 FMJF0_SX84 FMJU0_SI1389 FMJU0_SI2019 FMJU0_SI759 FMJU0_SX129 FMJU0_SX219 FMJU0_SX309 FMJU0_SX39 FMJU0_SX399 FMKC0_SI1041 FMKC0_SI1072 FMKC0_SI1702 FMKC0_SX172 FMKC0_SX262 FMKC0_SX352 FMKC0_SX442 FMKC0_SX82 FMKF0_SI1018 FMKF0_SI1536 FMKF0_SI906 FMKF0_SX186 FMKF0_SX276 FMKF0_SX366 FMKF0_SX6 FMKF0_SX96 FMMH0_SI1537 FMMH0_SI2167 FMMH0_SI907 FMMH0_SX187 FMMH0_SX367 FMMH0_SX420 FMMH0_SX7 FMMH0_SX97 FMPG0_SI1602 FMPG0_SI2232 FMPG0_SI972 FMPG0_SX162 FMPG0_SX252 FMPG0_SX342 FMPG0_SX432 FMPG0_SX72 FNKL0_SI1522 FNKL0_SI2152 FNKL0_SI892 FNKL0_SX172 FNKL0_SX196 FNKL0_SX262 FNKL0_SX442 FNKL0_SX82 FNTB0_SI1203 FNTB0_SI573 FNTB0_SI679 FNTB0_SX123 FNTB0_SX213 FNTB0_SX303 FNTB0_SX33 FNTB0_SX393 
FPAB1_SI1471 FPAB1_SI2101 FPAB1_SI841 FPAB1_SX121 FPAB1_SX211 FPAB1_SX301 FPAB1_SX31 FPAB1_SX391 FPAC0_SI1921 FPAC0_SI2011 FPAC0_SI661 FPAC0_SX121 FPAC0_SX211 FPAC0_SX301 FPAC0_SX31 FPAC0_SX391 FPAD0_SI1346 FPAD0_SI1976 FPAD0_SI716 FPAD0_SX176 FPAD0_SX266 FPAD0_SX356 FPAD0_SX446 FPAD0_SX86 FPAF0_SI1054 FPAF0_SI1684 FPAF0_SI2314 FPAF0_SX154 FPAF0_SX244 FPAF0_SX334 FPAF0_SX424 FPAF0_SX64 FPAZ0_SI1593 FPAZ0_SI2223 FPAZ0_SI963 FPAZ0_SX153 FPAZ0_SX243 FPAZ0_SX27 FPAZ0_SX423 FPAZ0_SX63 FPJF0_SI1046 FPJF0_SI1259 FPJF0_SI1676 FPJF0_SX146 FPJF0_SX236 FPJF0_SX326 FPJF0_SX352 FPJF0_SX56 FPLS0_SI1590 FPLS0_SI2220 FPLS0_SI960 FPLS0_SX150 FPLS0_SX240 FPLS0_SX3 FPLS0_SX330 FPLS0_SX60 FPMY0_SI1153 FPMY0_SI1783 FPMY0_SI523 FPMY0_SX163 FPMY0_SX196 FPMY0_SX253 FPMY0_SX343 FPMY0_SX73 FREH0_SI1315 FREH0_SI1945 FREH0_SI685 FREH0_SX145 FREH0_SX235 FREH0_SX325 FREH0_SX415 FREH0_SX55 FRJB0_SI1427 FRJB0_SI1470 FRJB0_SI1794 FRJB0_SX167 FRJB0_SX257 FRJB0_SX347 FRJB0_SX437 FRJB0_SX77 FRLL0_SI1514 FRLL0_SI805 FRLL0_SI884 FRLL0_SX164 FRLL0_SX254 FRLL0_SX344 FRLL0_SX434 FRLL0_SX74 FSAG0_SI1323 FSAG0_SI1953 FSAG0_SI693 FSAG0_SX153 FSAG0_SX243 FSAG0_SX333 FSAG0_SX423 FSAG0_SX63 FSAH0_SI1244 FSAH0_SI1874 FSAH0_SI614 FSAH0_SX164 FSAH0_SX327 FSAH0_SX344 FSAH0_SX434 FSAH0_SX74 FSAK0_SI1300 FSAK0_SI1930 FSAK0_SI670 FSAK0_SX130 FSAK0_SX220 FSAK0_SX310 FSAK0_SX40 FSAK0_SX400 FSBK0_SI1069 FSBK0_SI1699 FSBK0_SI2329 FSBK0_SX169 FSBK0_SX259 FSBK0_SX349 FSBK0_SX439 FSBK0_SX79 FSCN0_SI1886 FSCN0_SI626 FSCN0_SI705 FSCN0_SX176 FSCN0_SX266 FSCN0_SX356 FSCN0_SX446 FSCN0_SX86 FSDC0_SI1312 FSDC0_SI1942 FSDC0_SI2234 FSDC0_SX142 FSDC0_SX232 FSDC0_SX322 FSDC0_SX412 FSDC0_SX52 FSDJ0_SI1115 FSDJ0_SI1745 FSDJ0_SI485 FSDJ0_SX125 FSDJ0_SX215 FSDJ0_SX305 FSDJ0_SX35 FSDJ0_SX395 FSGF0_SI1557 FSGF0_SI2187 FSGF0_SI927 FSGF0_SX117 FSGF0_SX207 FSGF0_SX27 FSGF0_SX297 FSGF0_SX387 FSJG0_SI1570 FSJG0_SI2200 FSJG0_SI940 FSJG0_SX130 FSJG0_SX220 FSJG0_SX310 FSJG0_SX40 FSJG0_SX400 FSJK1_SI1025 FSJK1_SI2285 FSJK1_SI696 FSJK1_SX125 
FSJK1_SX215 FSJK1_SX305 FSJK1_SX35 FSJK1_SX395 FSJS0_SI1171 FSJS0_SI1801 FSJS0_SI541 FSJS0_SX181 FSJS0_SX271 FSJS0_SX361 FSJS0_SX451 FSJS0_SX91 FSJW0_SI1333 FSJW0_SI1963 FSJW0_SI703 FSJW0_SX163 FSJW0_SX253 FSJW0_SX343 FSJW0_SX433 FSJW0_SX73 FSKC0_SI1416 FSKC0_SI2046 FSKC0_SI786 FSKC0_SX156 FSKC0_SX246 FSKC0_SX336 FSKC0_SX426 FSKC0_SX66 FSKL0_SI1529 FSKL0_SI2159 FSKL0_SI899 FSKL0_SX179 FSKL0_SX269 FSKL0_SX359 FSKL0_SX449 FSKL0_SX89 FSKP0_SI1098 FSKP0_SI1728 FSKP0_SI468 FSKP0_SX108 FSKP0_SX18 FSKP0_SX198 FSKP0_SX288 FSKP0_SX378 FSLS0_SI1056 FSLS0_SI1686 FSLS0_SI2316 FSLS0_SX156 FSLS0_SX202 FSLS0_SX246 FSLS0_SX426 FSLS0_SX66 FSMA0_SI1621 FSMA0_SI2251 FSMA0_SI991 FSMA0_SX181 FSMA0_SX271 FSMA0_SX361 FSMA0_SX451 FSMA0_SX91 FSMM0_SI1314 FSMM0_SI1944 FSMM0_SI684 FSMM0_SX144 FSMM0_SX234 FSMM0_SX324 FSMM0_SX414 FSMM0_SX54 FSMS1_SI1504 FSMS1_SI2134 FSMS1_SI874 FSMS1_SX154 FSMS1_SX244 FSMS1_SX334 FSMS1_SX347 FSMS1_SX64 FSPM0_SI1241 FSPM0_SI1871 FSPM0_SI611 FSPM0_SX161 FSPM0_SX251 FSPM0_SX341 FSPM0_SX431 FSPM0_SX71 FSRH0_SI1719 FSRH0_SI1931 FSRH0_SI671 FSRH0_SX131 FSRH0_SX221 FSRH0_SX311 FSRH0_SX401 FSRH0_SX41 FSSB0_SI1082 FSSB0_SI1712 FSSB0_SI2342 FSSB0_SX182 FSSB0_SX272 FSSB0_SX362 FSSB0_SX452 FSSB0_SX92 FTAJ0_SI1329 FTAJ0_SI474 FTAJ0_SI699 FTAJ0_SX159 FTAJ0_SX249 FTAJ0_SX339 FTAJ0_SX429 FTAJ0_SX69 FTBR0_SI1402 FTBR0_SI2181 FTBR0_SI921 FTBR0_SX111 FTBR0_SX201 FTBR0_SX21 FTBR0_SX291 FTBR0_SX381 FTBW0_SI1345 FTBW0_SI1975 FTBW0_SI715 FTBW0_SX175 FTBW0_SX265 FTBW0_SX355 FTBW0_SX445 FTBW0_SX85 FTLG0_SI1743 FTLG0_SI483 FTLG0_SI840 FTLG0_SX123 FTLG0_SX213 FTLG0_SX303 FTLG0_SX33 FTLG0_SX393 FTMG0_SI1532 FTMG0_SI2162 FTMG0_SI902 FTMG0_SX182 FTMG0_SX272 FTMG0_SX362 FTMG0_SX452 FTMG0_SX92 FVFB0_SI1032 FVFB0_SI1510 FVFB0_SI2292 FVFB0_SX132 FVFB0_SX222 FVFB0_SX312 FVFB0_SX402 FVFB0_SX42 FVKB0_SI1159 FVKB0_SI1789 FVKB0_SI529 FVKB0_SX169 FVKB0_SX259 FVKB0_SX349 FVKB0_SX439 FVKB0_SX79 FVMH0_SI1466 FVMH0_SI2096 FVMH0_SI836 FVMH0_SX116 FVMH0_SX206 FVMH0_SX26 FVMH0_SX296 FVMH0_SX386 
MABC0_SI1620 MABC0_SI2041 MABC0_SI781 MABC0_SX151 MABC0_SX241 MABC0_SX331 MABC0_SX421 MABC0_SX61 MADC0_SI1367 MADC0_SI1997 MADC0_SI737 MADC0_SX107 MADC0_SX17 MADC0_SX197 MADC0_SX287 MADC0_SX377 MADD0_SI1295 MADD0_SI1798 MADD0_SI538 MADD0_SX178 MADD0_SX268 MADD0_SX358 MADD0_SX448 MADD0_SX88 MAEB0_SI1411 MAEB0_SI2250 MAEB0_SI990 MAEB0_SX180 MAEB0_SX270 MAEB0_SX360 MAEB0_SX450 MAEB0_SX90 MAEO0_SI1326 MAEO0_SI1655 MAEO0_SI1956 MAEO0_SX156 MAEO0_SX246 MAEO0_SX336 MAEO0_SX426 MAEO0_SX66 MAFM0_SI1569 MAFM0_SI2199 MAFM0_SI939 MAFM0_SX129 MAFM0_SX219 MAFM0_SX309 MAFM0_SX39 MAFM0_SX399 MAJP0_SI1074 MAJP0_SI1704 MAJP0_SI2334 MAJP0_SX174 MAJP0_SX264 MAJP0_SX354 MAJP0_SX444 MAJP0_SX84 MAKB0_SI1016 MAKB0_SI1646 MAKB0_SI2276 MAKB0_SX116 MAKB0_SX206 MAKB0_SX26 MAKB0_SX296 MAKB0_SX386 MAKR0_SI1352 MAKR0_SI1982 MAKR0_SI722 MAKR0_SX182 MAKR0_SX272 MAKR0_SX362 MAKR0_SX452 MAKR0_SX92 MAPV0_SI1293 MAPV0_SI1923 MAPV0_SI663 MAPV0_SX123 MAPV0_SX213 MAPV0_SX303 MAPV0_SX33 MAPV0_SX393 MARC0_SI1188 MARC0_SI1818 MARC0_SI558 MARC0_SX108 MARC0_SX18 MARC0_SX198 MARC0_SX288 MARC0_SX378 MARW0_SI1276 MARW0_SI1906 MARW0_SI646 MARW0_SX106 MARW0_SX16 MARW0_SX286 MARW0_SX349 MARW0_SX376 MBAR0_SI1319 MBAR0_SI1949 MBAR0_SI689 MBAR0_SX149 MBAR0_SX239 MBAR0_SX329 MBAR0_SX419 MBAR0_SX59 MBBR0_SI1055 MBBR0_SI1685 MBBR0_SI2315 MBBR0_SX155 MBBR0_SX245 MBBR0_SX335 MBBR0_SX425 MBBR0_SX65 MBCG0_SI2217 MBCG0_SI486 MBCG0_SI957 MBCG0_SX147 MBCG0_SX237 MBCG0_SX327 MBCG0_SX417 MBCG0_SX57 MBEF0_SI1281 MBEF0_SI1911 MBEF0_SI651 MBEF0_SX111 MBEF0_SX201 MBEF0_SX21 MBEF0_SX291 MBEF0_SX381 MBGT0_SI1341 MBGT0_SI1841 MBGT0_SI711 MBGT0_SX171 MBGT0_SX261 MBGT0_SX351 MBGT0_SX441 MBGT0_SX81 MBJV0_SI1247 MBJV0_SI1877 MBJV0_SI617 MBJV0_SX167 MBJV0_SX257 MBJV0_SX347 MBJV0_SX437 MBJV0_SX77 MBMA0_SI1222 MBMA0_SI1852 MBMA0_SI592 MBMA0_SX142 MBMA0_SX232 MBMA0_SX322 MBMA0_SX412 MBMA0_SX52 MBMA1_SI2207 MBMA1_SI2214 MBMA1_SI954 MBMA1_SX144 MBMA1_SX234 MBMA1_SX324 MBMA1_SX414 MBMA1_SX54 MBML0_SI1169 MBML0_SI1799 MBML0_SI539 MBML0_SX179 
MBML0_SX269 MBML0_SX359 MBML0_SX449 MBML0_SX89 MBOM0_SI1014 MBOM0_SI1644 MBOM0_SI2274 MBOM0_SX114 MBOM0_SX204 MBOM0_SX294 MBOM0_SX311 MBOM0_SX384 MBSB0_SI1353 MBSB0_SI1983 MBSB0_SI723 MBSB0_SX183 MBSB0_SX273 MBSB0_SX3 MBSB0_SX363 MBSB0_SX93 MBTH0_SI2102 MBTH0_SI505 MBTH0_SI757 MBTH0_SX122 MBTH0_SX212 MBTH0_SX302 MBTH0_SX32 MBTH0_SX392 MBWP0_SI1531 MBWP0_SI1969 MBWP0_SI709 MBWP0_SX169 MBWP0_SX259 MBWP0_SX349 MBWP0_SX439 MBWP0_SX79 MCAE0_SI1447 MCAE0_SI2077 MCAE0_SI817 MCAE0_SX187 MCAE0_SX277 MCAE0_SX367 MCAE0_SX7 MCAE0_SX97 MCAL0_SI1138 MCAL0_SI1768 MCAL0_SI508 MCAL0_SX148 MCAL0_SX238 MCAL0_SX328 MCAL0_SX418 MCAL0_SX58 MCDC0_SI1292 MCDC0_SI1922 MCDC0_SI662 MCDC0_SX122 MCDC0_SX212 MCDC0_SX302 MCDC0_SX32 MCDC0_SX392 MCDD0_SI1513 MCDD0_SI2143 MCDD0_SI883 MCDD0_SX163 MCDD0_SX253 MCDD0_SX343 MCDD0_SX433 MCDD0_SX73 MCDR0_SI1154 MCDR0_SI1784 MCDR0_SI524 MCDR0_SX164 MCDR0_SX254 MCDR0_SX344 MCDR0_SX434 MCDR0_SX74 MCEF0_SI1135 MCEF0_SI1765 MCEF0_SI842 MCEF0_SX145 MCEF0_SX235 MCEF0_SX325 MCEF0_SX415 MCEF0_SX55 MCEW0_SI1442 MCEW0_SI2072 MCEW0_SI812 MCEW0_SX182 MCEW0_SX272 MCEW0_SX362 MCEW0_SX452 MCEW0_SX92 MCHL0_SI1347 MCHL0_SI1404 MCHL0_SI1977 MCHL0_SX177 MCHL0_SX267 MCHL0_SX357 MCHL0_SX447 MCHL0_SX87 MCLK0_SI1660 MCLK0_SI2290 MCLK0_SI650 MCLK0_SX130 MCLK0_SX220 MCLK0_SX310 MCLK0_SX40 MCLK0_SX400 MCLM0_SI1456 MCLM0_SI2086 MCLM0_SI826 MCLM0_SX106 MCLM0_SX16 MCLM0_SX196 MCLM0_SX286 MCLM0_SX376 MCPM0_SI1194 MCPM0_SI1824 MCPM0_SI564 MCPM0_SX114 MCPM0_SX204 MCPM0_SX24 MCPM0_SX294 MCPM0_SX384 MCRE0_SI1121 MCRE0_SI1725 MCRE0_SI1751 MCRE0_SX131 MCRE0_SX221 MCRE0_SX24 MCRE0_SX401 MCRE0_SX41 MCSS0_SI1380 MCSS0_SI688 MCSS0_SI750 MCSS0_SX120 MCSS0_SX210 MCSS0_SX30 MCSS0_SX300 MCSS0_SX390 MCTH0_SI1209 MCTH0_SI1839 MCTH0_SI579 MCTH0_SX129 MCTH0_SX219 MCTH0_SX309 MCTH0_SX39 MCTH0_SX399 MCTM0_SI1350 MCTM0_SI1980 MCTM0_SI720 MCTM0_SX180 MCTM0_SX270 MCTM0_SX360 MCTM0_SX450 MCTM0_SX90 MCXM0_SI1351 MCXM0_SI1981 MCXM0_SI721 MCXM0_SX181 MCXM0_SX271 MCXM0_SX361 MCXM0_SX451 MCXM0_SX91 MDAC0_SI1261 
MDAC0_SI1837 MDAC0_SI631 MDAC0_SX181 MDAC0_SX271 MDAC0_SX361 MDAC0_SX451 MDAC0_SX91 MDAS0_SI1266 MDAS0_SI1896 MDAS0_SI636 MDAS0_SX186 MDAS0_SX21 MDAS0_SX276 MDAS0_SX6 MDAS0_SX96 MDBB1_SI1006 MDBB1_SI1636 MDBB1_SI2056 MDBB1_SX106 MDBB1_SX16 MDBB1_SX196 MDBB1_SX286 MDBB1_SX376 MDBP0_SI1158 MDBP0_SI1788 MDBP0_SI528 MDBP0_SX168 MDBP0_SX258 MDBP0_SX348 MDBP0_SX438 MDBP0_SX78 MDCD0_SI1415 MDCD0_SI2045 MDCD0_SI785 MDCD0_SX155 MDCD0_SX245 MDCD0_SX335 MDCD0_SX425 MDCD0_SX65 MDCM0_SI1480 MDCM0_SI2110 MDCM0_SI850 MDCM0_SX130 MDCM0_SX220 MDCM0_SX310 MDCM0_SX40 MDCM0_SX400 MDDC0_SI1419 MDDC0_SI2049 MDDC0_SI789 MDDC0_SX159 MDDC0_SX249 MDDC0_SX339 MDDC0_SX429 MDDC0_SX69 MDED0_SI1170 MDED0_SI1800 MDED0_SI540 MDED0_SX180 MDED0_SX270 MDED0_SX360 MDED0_SX450 MDED0_SX90 MDEF0_SI1123 MDEF0_SI1563 MDEF0_SI2193 MDEF0_SX123 MDEF0_SX213 MDEF0_SX303 MDEF0_SX33 MDEF0_SX393 MDEM0_SI1868 MDEM0_SI608 MDEM0_SI800 MDEM0_SX158 MDEM0_SX248 MDEM0_SX338 MDEM0_SX428 MDEM0_SX68 MDHL0_SI1439 MDHL0_SI2069 MDHL0_SI809 MDHL0_SX179 MDHL0_SX269 MDHL0_SX359 MDHL0_SX449 MDHL0_SX89 MDHS0_SI1530 MDHS0_SI2160 MDHS0_SI900 MDHS0_SX180 MDHS0_SX270 MDHS0_SX360 MDHS0_SX450 MDHS0_SX90 MDJM0_SI1455 MDJM0_SI2085 MDJM0_SI825 MDJM0_SX105 MDJM0_SX15 MDJM0_SX195 MDJM0_SX285 MDJM0_SX375 MDKS0_SI1066 MDKS0_SI1696 MDKS0_SI2326 MDKS0_SX166 MDKS0_SX256 MDKS0_SX346 MDKS0_SX436 MDKS0_SX76 MDLB0_SI1306 MDLB0_SI1936 MDLB0_SI676 MDLB0_SX136 MDLB0_SX226 MDLB0_SX316 MDLB0_SX406 MDLB0_SX46 MDLC0_SI1395 MDLC0_SI2025 MDLC0_SI765 MDLC0_SX135 MDLC0_SX225 MDLC0_SX315 MDLC0_SX405 MDLC0_SX45 MDLC1_SI1435 MDLC1_SI2065 MDLC1_SI2144 MDLC1_SX175 MDLC1_SX265 MDLC1_SX355 MDLC1_SX445 MDLC1_SX85 MDLC2_SI1614 MDLC2_SI2244 MDLC2_SI984 MDLC2_SX174 MDLC2_SX264 MDLC2_SX354 MDLC2_SX444 MDLC2_SX84 MDLH0_SI1960 MDLH0_SI574 MDLH0_SI700 MDLH0_SX160 MDLH0_SX250 MDLH0_SX340 MDLH0_SX430 MDLH0_SX70 MDLM0_SI1234 MDLM0_SI1864 MDLM0_SI604 MDLM0_SX154 MDLM0_SX244 MDLM0_SX334 MDLM0_SX424 MDLM0_SX64 MDLR0_SI1233 MDLR0_SI1863 MDLR0_SI603 MDLR0_SX153 MDLR0_SX243 MDLR0_SX333 
MDLR0_SX423 MDLR0_SX63 MDLR1_SI1299 MDLR1_SI1929 MDLR1_SI669 MDLR1_SX129 MDLR1_SX219 MDLR1_SX309 MDLR1_SX39 MDLR1_SX399 MDMA0_SI1238 MDMA0_SI1430 MDMA0_SI2060 MDMA0_SX170 MDMA0_SX260 MDMA0_SX350 MDMA0_SX440 MDMA0_SX80 MDMT0_SI1832 MDMT0_SI2341 MDMT0_SI572 MDMT0_SX122 MDMT0_SX212 MDMT0_SX302 MDMT0_SX32 MDMT0_SX392 MDNS0_SI1011 MDNS0_SI2271 MDNS0_SI873 MDNS0_SX111 MDNS0_SX201 MDNS0_SX21 MDNS0_SX291 MDNS0_SX381 MDPB0_SI1760 MDPB0_SI2126 MDPB0_SI866 MDPB0_SX146 MDPB0_SX236 MDPB0_SX326 MDPB0_SX416 MDPB0_SX56 MDPK0_SI1053 MDPK0_SI1683 MDPK0_SI552 MDPK0_SX153 MDPK0_SX243 MDPK0_SX333 MDPK0_SX423 MDPK0_SX63 MDPS0_SI1651 MDPS0_SI1979 MDPS0_SI719 MDPS0_SX179 MDPS0_SX269 MDPS0_SX359 MDPS0_SX449 MDPS0_SX89 MDRD0_SI1382 MDRD0_SI2012 MDRD0_SI752 MDRD0_SX122 MDRD0_SX212 MDRD0_SX302 MDRD0_SX32 MDRD0_SX392 MDSJ0_SI1462 MDSJ0_SI2092 MDSJ0_SI832 MDSJ0_SX112 MDSJ0_SX22 MDSJ0_SX292 MDSJ0_SX382 MDSJ0_SX438 MDSS0_SI1881 MDSS0_SI2087 MDSS0_SI621 MDSS0_SX171 MDSS0_SX261 MDSS0_SX351 MDSS0_SX441 MDSS0_SX81 MDSS1_SI1327 MDSS1_SI1713 MDSS1_SI697 MDSS1_SX157 MDSS1_SX247 MDSS1_SX337 MDSS1_SX427 MDSS1_SX67 MDTB0_SI1200 MDTB0_SI1830 MDTB0_SI570 MDTB0_SX120 MDTB0_SX210 MDTB0_SX300 MDTB0_SX321 MDTB0_SX390 MDWD0_SI1260 MDWD0_SI1890 MDWD0_SI557 MDWD0_SX180 MDWD0_SX270 MDWD0_SX360 MDWD0_SX450 MDWD0_SX90 MDWH0_SI1168 MDWH0_SI1925 MDWH0_SI665 MDWH0_SX125 MDWH0_SX215 MDWH0_SX305 MDWH0_SX35 MDWH0_SX395 MDWM0_SI1546 MDWM0_SI2176 MDWM0_SI916 MDWM0_SX106 MDWM0_SX16 MDWM0_SX286 MDWM0_SX376 MDWM0_SX433 MEAL0_SI1547 MEAL0_SI2177 MEAL0_SI917 MEAL0_SX107 MEAL0_SX197 MEAL0_SX287 MEAL0_SX347 MEAL0_SX377 MEDR0_SI1374 MEDR0_SI2004 MEDR0_SI744 MEDR0_SX114 MEDR0_SX204 MEDR0_SX24 MEDR0_SX294 MEDR0_SX384 MEFG0_SI465 MEFG0_SI491 MEFG0_SI598 MEFG0_SX105 MEFG0_SX15 MEFG0_SX195 MEFG0_SX285 MEFG0_SX375 MEGJ0_SI1337 MEGJ0_SI1967 MEGJ0_SI707 MEGJ0_SX167 MEGJ0_SX257 MEGJ0_SX3 MEGJ0_SX437 MEGJ0_SX77 MEJL0_SI1592 MEJL0_SI1654 MEJL0_SI962 MEJL0_SX152 MEJL0_SX242 MEJL0_SX332 MEJL0_SX422 MEJL0_SX62 MEJS0_SI1240 MEJS0_SI1870 MEJS0_SI610 
MEJS0_SX160 MEJS0_SX250 MEJS0_SX340 MEJS0_SX430 MEJS0_SX70 MESG0_SI1332 MESG0_SI1962 MESG0_SI702 MESG0_SX162 MESG0_SX252 MESG0_SX342 MESG0_SX432 MESG0_SX72 MESJ0_SI2039 MESJ0_SI2257 MESJ0_SI997 MESJ0_SX187 MESJ0_SX277 MESJ0_SX367 MESJ0_SX7 MESJ0_SX97 MEWM0_SI1348 MEWM0_SI1978 MEWM0_SI718 MEWM0_SX178 MEWM0_SX268 MEWM0_SX358 MEWM0_SX448 MEWM0_SX88 MFER0_SI1492 MFER0_SI2122 MFER0_SI862 MFER0_SX142 MFER0_SX232 MFER0_SX322 MFER0_SX412 MFER0_SX52 MFMC0_SI1132 MFMC0_SI1762 MFMC0_SI502 MFMC0_SX142 MFMC0_SX232 MFMC0_SX322 MFMC0_SX412 MFMC0_SX52 MFRM0_SI1155 MFRM0_SI1717 MFRM0_SI1785 MFRM0_SX165 MFRM0_SX255 MFRM0_SX345 MFRM0_SX435 MFRM0_SX75 MFWK0_SI1249 MFWK0_SI1879 MFWK0_SI619 MFWK0_SX169 MFWK0_SX259 MFWK0_SX349 MFWK0_SX439 MFWK0_SX79 MFXS0_SI1674 MFXS0_SI2225 MFXS0_SI2304 MFXS0_SX144 MFXS0_SX234 MFXS0_SX324 MFXS0_SX414 MFXS0_SX54 MFXV0_SI1005 MFXV0_SI1342 MFXV0_SI1635 MFXV0_SX105 MFXV0_SX15 MFXV0_SX195 MFXV0_SX285 MFXV0_SX375 MGAF0_SI1282 MGAF0_SI1912 MGAF0_SI652 MGAF0_SX112 MGAF0_SX202 MGAF0_SX22 MGAF0_SX292 MGAF0_SX382 MGAG0_SI1321 MGAG0_SI645 MGAG0_SI691 MGAG0_SX151 MGAG0_SX241 MGAG0_SX331 MGAG0_SX421 MGAG0_SX61 MGAK0_SI1036 MGAK0_SI1666 MGAK0_SI2296 MGAK0_SX136 MGAK0_SX226 MGAK0_SX316 MGAK0_SX406 MGAK0_SX46 MGAR0_SI1212 MGAR0_SI1694 MGAR0_SI1842 MGAR0_SX132 MGAR0_SX222 MGAR0_SX312 MGAR0_SX402 MGAR0_SX42 MGAW0_SI1165 MGAW0_SI1802 MGAW0_SI535 MGAW0_SX175 MGAW0_SX265 MGAW0_SX355 MGAW0_SX445 MGAW0_SX85 MGES0_SI1481 MGES0_SI2111 MGES0_SI851 MGES0_SX131 MGES0_SX221 MGES0_SX311 MGES0_SX401 MGES0_SX41 MGJC0_SI1256 MGJC0_SI1335 MGJC0_SI1965 MGJC0_SX165 MGJC0_SX255 MGJC0_SX345 MGJC0_SX435 MGJC0_SX75 MGRL0_SI1497 MGRL0_SI2127 MGRL0_SI867 MGRL0_SX147 MGRL0_SX237 MGRL0_SX327 MGRL0_SX417 MGRL0_SX57 MGRP0_SI1317 MGRP0_SI1947 MGRP0_SI687 MGRP0_SX147 MGRP0_SX237 MGRP0_SX327 MGRP0_SX417 MGRP0_SX57 MGSH0_SI1176 MGSH0_SI1806 MGSH0_SI546 MGSH0_SX127 MGSH0_SX186 MGSH0_SX276 MGSH0_SX6 MGSH0_SX96 MGSL0_SI1164 MGSL0_SI534 MGSL0_SI797 MGSL0_SX174 MGSL0_SX264 MGSL0_SX354 MGSL0_SX444 MGSL0_SX84 
MGXP0_SI1087 MGXP0_SI457 MGXP0_SI525 MGXP0_SX187 MGXP0_SX277 MGXP0_SX367 MGXP0_SX7 MGXP0_SX97 MHBS0_SI1575 MHBS0_SI2205 MHBS0_SI945 MHBS0_SX135 MHBS0_SX225 MHBS0_SX315 MHBS0_SX405 MHBS0_SX45 MHIT0_SI1613 MHIT0_SI2243 MHIT0_SI983 MHIT0_SX173 MHIT0_SX263 MHIT0_SX353 MHIT0_SX443 MHIT0_SX83 MHJB0_SI1017 MHJB0_SI1647 MHJB0_SI2277 MHJB0_SX117 MHJB0_SX207 MHJB0_SX27 MHJB0_SX297 MHJB0_SX387 MHMG0_SI1365 MHMG0_SI1995 MHMG0_SI735 MHMG0_SX105 MHMG0_SX15 MHMG0_SX195 MHMG0_SX285 MHMG0_SX375 MHMR0_SI1119 MHMR0_SI1692 MHMR0_SI489 MHMR0_SX129 MHMR0_SX219 MHMR0_SX309 MHMR0_SX39 MHMR0_SX399 MHRM0_SI1475 MHRM0_SI2218 MHRM0_SI958 MHRM0_SX148 MHRM0_SX238 MHRM0_SX328 MHRM0_SX418 MHRM0_SX58 MHXL0_SI1772 MHXL0_SI512 MHXL0_SI612 MHXL0_SX152 MHXL0_SX242 MHXL0_SX332 MHXL0_SX422 MHXL0_SX62 MILB0_SI2163 MILB0_SI807 MILB0_SI903 MILB0_SX183 MILB0_SX273 MILB0_SX3 MILB0_SX363 MILB0_SX93 MJAC0_SI1331 MJAC0_SI2148 MJAC0_SI701 MJAC0_SX251 MJAC0_SX307 MJAC0_SX341 MJAC0_SX431 MJAC0_SX71 MJAE0_SI1524 MJAE0_SI1999 MJAE0_SI2154 MJAE0_SX174 MJAE0_SX264 MJAE0_SX354 MJAE0_SX444 MJAE0_SX84 MJAI0_SI1604 MJAI0_SI682 MJAI0_SI710 MJAI0_SX164 MJAI0_SX254 MJAI0_SX344 MJAI0_SX434 MJAI0_SX74 MJBG0_SI1232 MJBG0_SI1724 MJBG0_SI1862 MJBG0_SX152 MJBG0_SX242 MJBG0_SX332 MJBG0_SX422 MJBG0_SX62 MJDA0_SI1031 MJDA0_SI1661 MJDA0_SI2291 MJDA0_SX131 MJDA0_SX221 MJDA0_SX311 MJDA0_SX401 MJDA0_SX41 MJDC0_SI1161 MJDC0_SI2165 MJDC0_SI531 MJDC0_SX171 MJDC0_SX261 MJDC0_SX351 MJDC0_SX441 MJDC0_SX81 MJDE0_SI1120 MJDE0_SI463 MJDE0_SI490 MJDE0_SX130 MJDE0_SX220 MJDE0_SX310 MJDE0_SX40 MJDE0_SX400 MJDG0_SI1042 MJDG0_SI1672 MJDG0_SI1705 MJDG0_SX142 MJDG0_SX232 MJDG0_SX322 MJDG0_SX412 MJDG0_SX52 MJDM0_SI1340 MJDM0_SI1937 MJDM0_SI974 MJDM0_SX170 MJDM0_SX260 MJDM0_SX350 MJDM0_SX440 MJDM0_SX80 MJEB0_SI1286 MJEB0_SI1916 MJEB0_SI656 MJEB0_SX170 MJEB0_SX206 MJEB0_SX26 MJEB0_SX296 MJEB0_SX386 MJEB1_SI1467 MJEB1_SI2097 MJEB1_SI837 MJEB1_SX117 MJEB1_SX207 MJEB1_SX27 MJEB1_SX297 MJEB1_SX387 MJEE0_SI1237 MJEE0_SI1867 MJEE0_SI607 MJEE0_SX157 MJEE0_SX247 
MJEE0_SX337 MJEE0_SX427 MJEE0_SX67 MJFH0_SI1107 MJFH0_SI1737 MJFH0_SI477 MJFH0_SX117 MJFH0_SX207 MJFH0_SX27 MJFH0_SX297 MJFH0_SX387 MJFR0_SI1605 MJFR0_SI2235 MJFR0_SI975 MJFR0_SX165 MJFR0_SX255 MJFR0_SX345 MJFR0_SX435 MJFR0_SX75 MJHI0_SI1328 MJHI0_SI555 MJHI0_SI698 MJHI0_SX158 MJHI0_SX248 MJHI0_SX338 MJHI0_SX428 MJHI0_SX68 MJJB0_SI1139 MJJB0_SI1277 MJJB0_SI1769 MJJB0_SX149 MJJB0_SX239 MJJB0_SX329 MJJB0_SX419 MJJB0_SX59 MJJJ0_SI1163 MJJJ0_SI1793 MJJJ0_SI533 MJJJ0_SX173 MJJJ0_SX263 MJJJ0_SX353 MJJJ0_SX443 MJJJ0_SX83 MJJM0_SI1251 MJJM0_SI1457 MJJM0_SI827 MJJM0_SX107 MJJM0_SX17 MJJM0_SX197 MJJM0_SX287 MJJM0_SX377 MJKR0_SI1201 MJKR0_SI1831 MJKR0_SI571 MJKR0_SX121 MJKR0_SX211 MJKR0_SX301 MJKR0_SX31 MJKR0_SX391 MJLB0_SI1616 MJLB0_SI2246 MJLB0_SI986 MJLB0_SX176 MJLB0_SX266 MJLB0_SX356 MJLB0_SX446 MJLB0_SX86 MJLG1_SI1012 MJLG1_SI1642 MJLG1_SI2272 MJLG1_SX112 MJLG1_SX202 MJLG1_SX22 MJLG1_SX292 MJLG1_SX382 MJLS0_SI1096 MJLS0_SI1726 MJLS0_SI466 MJLS0_SX106 MJLS0_SX16 MJLS0_SX196 MJLS0_SX286 MJLS0_SX376 MJMA0_SI1495 MJMA0_SI2125 MJMA0_SI865 MJMA0_SX145 MJMA0_SX235 MJMA0_SX325 MJMA0_SX415 MJMA0_SX55 MJMD0_SI1028 MJMD0_SI1658 MJMD0_SI2288 MJMD0_SX128 MJMD0_SX218 MJMD0_SX308 MJMD0_SX38 MJMD0_SX398 MJMM0_SI1255 MJMM0_SI1885 MJMM0_SI625 MJMM0_SX175 MJMM0_SX265 MJMM0_SX355 MJMM0_SX445 MJMM0_SX85 MJPG0_SI1191 MJPG0_SI1821 MJPG0_SI561 MJPG0_SX111 MJPG0_SX201 MJPG0_SX21 MJPG0_SX291 MJPG0_SX381 MJPM0_SI1368 MJPM0_SI1998 MJPM0_SI738 MJPM0_SX108 MJPM0_SX18 MJPM0_SX198 MJPM0_SX288 MJPM0_SX378 MJPM1_SI1897 MJPM1_SI2280 MJPM1_SI761 MJPM1_SX131 MJPM1_SX221 MJPM1_SX311 MJPM1_SX401 MJPM1_SX41 MJRA0_SI1236 MJRA0_SI1866 MJRA0_SI606 MJRA0_SX156 MJRA0_SX246 MJRA0_SX336 MJRA0_SX426 MJRA0_SX66 MJRG0_SI1366 MJRG0_SI1996 MJRG0_SI736 MJRG0_SX106 MJRG0_SX16 MJRG0_SX286 MJRG0_SX352 MJRG0_SX376 MJRH0_SI1125 MJRH0_SI1755 MJRH0_SI1840 MJRH0_SX135 MJRH0_SX225 MJRH0_SX315 MJRH0_SX405 MJRH0_SX45 MJRH1_SI1558 MJRH1_SI1774 MJRH1_SI514 MJRH1_SX154 MJRH1_SX244 MJRH1_SX334 MJRH1_SX424 MJRH1_SX64 MJRK0_SI1662 
MJRK0_SI2103 MJRK0_SI880 MJRK0_SX160 MJRK0_SX250 MJRK0_SX340 MJRK0_SX430 MJRK0_SX70 MJRP0_SI1835 MJRP0_SI1845 MJRP0_SI585 MJRP0_SX135 MJRP0_SX225 MJRP0_SX315 MJRP0_SX405 MJRP0_SX45 MJSR0_SI1424 MJSR0_SI2054 MJSR0_SI794 MJSR0_SX164 MJSR0_SX254 MJSR0_SX344 MJSR0_SX434 MJSR0_SX74 MJWG0_SI2155 MJWG0_SI813 MJWG0_SI895 MJWG0_SX175 MJWG0_SX265 MJWG0_SX355 MJWG0_SX445 MJWG0_SX85 MJWS0_SI1143 MJWS0_SI1773 MJWS0_SI513 MJWS0_SX153 MJWS0_SX243 MJWS0_SX333 MJWS0_SX423 MJWS0_SX63 MJWT0_SI1291 MJWT0_SI1381 MJWT0_SI751 MJWT0_SX121 MJWT0_SX211 MJWT0_SX301 MJWT0_SX31 MJWT0_SX391 MJXA0_SI1507 MJXA0_SI2137 MJXA0_SI877 MJXA0_SX157 MJXA0_SX247 MJXA0_SX337 MJXA0_SX427 MJXA0_SX67 MJXL0_SI1172 MJXL0_SI1795 MJXL0_SI542 MJXL0_SX182 MJXL0_SX272 MJXL0_SX362 MJXL0_SX452 MJXL0_SX92 MKAG0_SI1609 MKAG0_SI2239 MKAG0_SI979 MKAG0_SX169 MKAG0_SX259 MKAG0_SX30 MKAG0_SX439 MKAG0_SX79 MKAH0_SI1528 MKAH0_SI2158 MKAH0_SI898 MKAH0_SX178 MKAH0_SX268 MKAH0_SX358 MKAH0_SX448 MKAH0_SX88 MKAJ0_SI1414 MKAJ0_SI2044 MKAJ0_SI784 MKAJ0_SX154 MKAJ0_SX244 MKAJ0_SX334 MKAJ0_SX424 MKAJ0_SX64 MKAM0_SI1250 MKAM0_SI1316 MKAM0_SI1465 MKAM0_SX146 MKAM0_SX236 MKAM0_SX326 MKAM0_SX416 MKAM0_SX56 MKDB0_SI2132 MKDB0_SI588 MKDB0_SI872 MKDB0_SX152 MKDB0_SX242 MKDB0_SX332 MKDB0_SX422 MKDB0_SX62 MKDD0_SI1567 MKDD0_SI2197 MKDD0_SI937 MKDD0_SX127 MKDD0_SX217 MKDD0_SX307 MKDD0_SX37 MKDD0_SX397 MKDT0_SI2153 MKDT0_SI814 MKDT0_SI893 MKDT0_SX173 MKDT0_SX263 MKDT0_SX353 MKDT0_SX443 MKDT0_SX83 MKES0_SI1253 MKES0_SI1883 MKES0_SI623 MKES0_SX173 MKES0_SX263 MKES0_SX353 MKES0_SX443 MKES0_SX83 MKJO0_SI1517 MKJO0_SI2147 MKJO0_SI887 MKJO0_SX167 MKJO0_SX257 MKJO0_SX424 MKJO0_SX437 MKJO0_SX77 MKLN0_SI1598 MKLN0_SI2228 MKLN0_SI968 MKLN0_SX158 MKLN0_SX248 MKLN0_SX338 MKLN0_SX428 MKLN0_SX68 MKLR0_SI1059 MKLR0_SI1689 MKLR0_SI2319 MKLR0_SX159 MKLR0_SX249 MKLR0_SX339 MKLR0_SX429 MKLR0_SX69 MKLS0_SI1437 MKLS0_SI1533 MKLS0_SI2067 MKLS0_SX177 MKLS0_SX267 MKLS0_SX357 MKLS0_SX447 MKLS0_SX87 MKLS1_SI1545 MKLS1_SI2175 MKLS1_SI915 MKLS1_SX105 MKLS1_SX15 MKLS1_SX195 
MKLS1_SX285 MKLS1_SX375 MKLW0_SI1571 MKLW0_SI1844 MKLW0_SI2201 MKLW0_SX131 MKLW0_SX221 MKLW0_SX311 MKLW0_SX401 MKLW0_SX41 MKRG0_SI1491 MKRG0_SI2121 MKRG0_SI861 MKRG0_SX141 MKRG0_SX231 MKRG0_SX31 MKRG0_SX411 MKRG0_SX51 MKXL0_SI1185 MKXL0_SI1815 MKXL0_SI1958 MKXL0_SX105 MKXL0_SX15 MKXL0_SX195 MKXL0_SX285 MKXL0_SX375 MLBC0_SI1239 MLBC0_SI1869 MLBC0_SI609 MLBC0_SX159 MLBC0_SX249 MLBC0_SX339 MLBC0_SX429 MLBC0_SX69 MLEL0_SI1246 MLEL0_SI1876 MLEL0_SI616 MLEL0_SX166 MLEL0_SX256 MLEL0_SX346 MLEL0_SX436 MLEL0_SX76 MLJC0_SI1225 MLJC0_SI1855 MLJC0_SI595 MLJC0_SX145 MLJC0_SX235 MLJC0_SX325 MLJC0_SX415 MLJC0_SX55 MLJH0_SI1324 MLJH0_SI1422 MLJH0_SI694 MLJH0_SX154 MLJH0_SX244 MLJH0_SX334 MLJH0_SX424 MLJH0_SX64 MLNS0_SI1407 MLNS0_SI2037 MLNS0_SI777 MLNS0_SX147 MLNS0_SX237 MLNS0_SX327 MLNS0_SX417 MLNS0_SX57 MLSH0_SI1417 MLSH0_SI2047 MLSH0_SI787 MLSH0_SX157 MLSH0_SX247 MLSH0_SX337 MLSH0_SX427 MLSH0_SX67 MMAA0_SI1588 MMAA0_SI2105 MMAA0_SI845 MMAA0_SX125 MMAA0_SX215 MMAA0_SX305 MMAA0_SX35 MMAA0_SX395 MMAB1_SI1494 MMAB1_SI2124 MMAB1_SI864 MMAB1_SX144 MMAB1_SX234 MMAB1_SX324 MMAB1_SX414 MMAB1_SX54 MMAG0_SI1126 MMAG0_SI1756 MMAG0_SI496 MMAG0_SX136 MMAG0_SX226 MMAG0_SX316 MMAG0_SX406 MMAG0_SX46 MMAM0_SI1597 MMAM0_SI1668 MMAM0_SI2227 MMAM0_SX157 MMAM0_SX247 MMAM0_SX337 MMAM0_SX427 MMAM0_SX67 MMAR0_SI1336 MMAR0_SI1966 MMAR0_SI706 MMAR0_SX166 MMAR0_SX256 MMAR0_SX346 MMAR0_SX436 MMAR0_SX76 MMBS0_SI1151 MMBS0_SI1781 MMBS0_SI521 MMBS0_SX161 MMBS0_SX251 MMBS0_SX341 MMBS0_SX431 MMBS0_SX71 MMCC0_SI1338 MMCC0_SI1968 MMCC0_SI708 MMCC0_SX168 MMCC0_SX258 MMCC0_SX348 MMCC0_SX438 MMCC0_SX78 MMDB0_SI1358 MMDB0_SI1617 MMDB0_SI987 MMDB0_SX177 MMDB0_SX267 MMDB0_SX357 MMDB0_SX447 MMDB0_SX87 MMDG0_SI1780 MMDG0_SI2035 MMDG0_SI520 MMDG0_SX160 MMDG0_SX250 MMDG0_SX340 MMDG0_SX430 MMDG0_SX70 MMDM0_SI1311 MMDM0_SI1941 MMDM0_SI681 MMDM0_SX141 MMDM0_SX231 MMDM0_SX321 MMDM0_SX411 MMDM0_SX51 MMDM1_SI1650 MMDM1_SI2043 MMDM1_SI783 MMDM1_SX153 MMDM1_SX243 MMDM1_SX333 MMDM1_SX423 MMDM1_SX63 MMDS0_SI1343 MMDS0_SI1973 
MMDS0_SI713 MMDS0_SX173 MMDS0_SX263 MMDS0_SX353 MMDS0_SX443 MMDS0_SX83 MMEA0_SI1388 MMEA0_SI2018 MMEA0_SI758 MMEA0_SX128 MMEA0_SX218 MMEA0_SX308 MMEA0_SX38 MMEA0_SX398 MMEB0_SI1357 MMEB0_SI1987 MMEB0_SI727 MMEB0_SX187 MMEB0_SX327 MMEB0_SX367 MMEB0_SX7 MMEB0_SX97 MMGC0_SI1305 MMGC0_SI1935 MMGC0_SI2184 MMGC0_SX135 MMGC0_SX225 MMGC0_SX315 MMGC0_SX405 MMGC0_SX45 MMGG0_SI1079 MMGG0_SI1709 MMGG0_SI2339 MMGG0_SX179 MMGG0_SX269 MMGG0_SX359 MMGG0_SX449 MMGG0_SX89 MMGK0_SI1322 MMGK0_SI1952 MMGK0_SI692 MMGK0_SX152 MMGK0_SX242 MMGK0_SX332 MMGK0_SX422 MMGK0_SX62 MMJB1_SI1408 MMJB1_SI2038 MMJB1_SI778 MMJB1_SX148 MMJB1_SX238 MMJB1_SX328 MMJB1_SX418 MMJB1_SX58 MMLM0_SI1527 MMLM0_SI2150 MMLM0_SI897 MMLM0_SX177 MMLM0_SX267 MMLM0_SX357 MMLM0_SX447 MMLM0_SX87 MMPM0_SI1061 MMPM0_SI1691 MMPM0_SI2321 MMPM0_SX161 MMPM0_SX251 MMPM0_SX341 MMPM0_SX431 MMPM0_SX71 MMRP0_SI2034 MMRP0_SI717 MMRP0_SI774 MMRP0_SX144 MMRP0_SX234 MMRP0_SX324 MMRP0_SX414 MMRP0_SX54 MMSM0_SI1106 MMSM0_SI1736 MMSM0_SI476 MMSM0_SX116 MMSM0_SX206 MMSM0_SX26 MMSM0_SX296 MMSM0_SX386 MMVP0_SI1284 MMVP0_SI1914 MMVP0_SI654 MMVP0_SX114 MMVP0_SX204 MMVP0_SX294 MMVP0_SX347 MMVP0_SX384 MMWB0_SI1619 MMWB0_SI2249 MMWB0_SI989 MMWB0_SX179 MMWB0_SX269 MMWB0_SX359 MMWB0_SX449 MMWB0_SX89 MMWS0_SI1518 MMWS0_SI559 MMWS0_SI888 MMWS0_SX168 MMWS0_SX258 MMWS0_SX348 MMWS0_SX438 MMWS0_SX78 MMWS1_SI1071 MMWS1_SI1701 MMWS1_SI2331 MMWS1_SX261 MMWS1_SX27 MMWS1_SX351 MMWS1_SX441 MMWS1_SX81 MMXS0_SI2136 MMXS0_SI629 MMXS0_SI876 MMXS0_SX156 MMXS0_SX246 MMXS0_SX336 MMXS0_SX426 MMXS0_SX66 MNET0_SI1446 MNET0_SI2076 MNET0_SI816 MNET0_SX186 MNET0_SX276 MNET0_SX366 MNET0_SX6 MNET0_SX96 MNTW0_SI1068 MNTW0_SI1698 MNTW0_SI2328 MNTW0_SX168 MNTW0_SX202 MNTW0_SX258 MNTW0_SX348 MNTW0_SX78 MPAR0_SI1576 MPAR0_SI2206 MPAR0_SI946 MPAR0_SX136 MPAR0_SX226 MPAR0_SX316 MPAR0_SX406 MPAR0_SX46 MPEB0_SI1034 MPEB0_SI1860 MPEB0_SI600 MPEB0_SX150 MPEB0_SX240 MPEB0_SX330 MPEB0_SX420 MPEB0_SX60 MPFU0_SI1258 MPFU0_SI1888 MPFU0_SI628 MPFU0_SX178 MPFU0_SX268 MPFU0_SX358 MPFU0_SX448 
MPFU0_SX88 MPGH0_SI1554 MPGH0_SI675 MPGH0_SI924 MPGH0_SX114 MPGH0_SX204 MPGH0_SX24 MPGH0_SX294 MPGH0_SX384 MPGR0_SI1410 MPGR0_SI2040 MPGR0_SI780 MPGR0_SX150 MPGR0_SX240 MPGR0_SX330 MPGR0_SX420 MPGR0_SX60 MPGR1_SI1269 MPGR1_SI1499 MPGR1_SI2129 MPGR1_SX149 MPGR1_SX239 MPGR1_SX329 MPGR1_SX419 MPGR1_SX59 MPMB0_SI1501 MPMB0_SI2131 MPMB0_SI871 MPMB0_SX151 MPMB0_SX241 MPMB0_SX331 MPMB0_SX421 MPMB0_SX61 MPPC0_SI1412 MPPC0_SI2042 MPPC0_SI782 MPPC0_SX152 MPPC0_SX242 MPPC0_SX332 MPPC0_SX422 MPPC0_SX62 MPRB0_SI1205 MPRB0_SI1215 MPRB0_SI575 MPRB0_SX125 MPRB0_SX215 MPRB0_SX305 MPRB0_SX35 MPRB0_SX395 MPRD0_SI1431 MPRD0_SI2061 MPRD0_SI801 MPRD0_SX171 MPRD0_SX261 MPRD0_SX351 MPRD0_SX441 MPRD0_SX81 MPRK0_SI1097 MPRK0_SI1727 MPRK0_SI467 MPRK0_SX107 MPRK0_SX17 MPRK0_SX197 MPRK0_SX287 MPRK0_SX377 MPRT0_SI1210 MPRT0_SI495 MPRT0_SI580 MPRT0_SX130 MPRT0_SX220 MPRT0_SX310 MPRT0_SX40 MPRT0_SX400 MPSW0_SI1067 MPSW0_SI1697 MPSW0_SI2327 MPSW0_SX167 MPSW0_SX24 MPSW0_SX257 MPSW0_SX437 MPSW0_SX77 MRAB0_SI1224 MRAB0_SI1854 MRAB0_SI594 MRAB0_SX144 MRAB0_SX234 MRAB0_SX324 MRAB0_SX414 MRAB0_SX54 MRAB1_SI1478 MRAB1_SI2108 MRAB1_SI848 MRAB1_SX128 MRAB1_SX218 MRAB1_SX308 MRAB1_SX38 MRAB1_SX398 MRAI0_SI1954 MRAI0_SI2052 MRAI0_SI792 MRAI0_SX162 MRAI0_SX252 MRAI0_SX342 MRAI0_SX432 MRAI0_SX72 MRAM0_SI1275 MRAM0_SI1905 MRAM0_SI1951 MRAM0_SX105 MRAM0_SX15 MRAM0_SX195 MRAM0_SX285 MRAM0_SX375 MRAV0_SI1008 MRAV0_SI1638 MRAV0_SI2268 MRAV0_SX108 MRAV0_SX18 MRAV0_SX198 MRAV0_SX288 MRAV0_SX378 MRBC0_SI1665 MRBC0_SI1859 MRBC0_SI599 MRBC0_SX149 MRBC0_SX239 MRBC0_SX329 MRBC0_SX419 MRBC0_SX59 MRCG0_SI1428 MRCG0_SI2058 MRCG0_SI798 MRCG0_SX168 MRCG0_SX258 MRCG0_SX348 MRCG0_SX438 MRCG0_SX78 MRCW0_SI1371 MRCW0_SI2001 MRCW0_SI741 MRCW0_SX111 MRCW0_SX201 MRCW0_SX21 MRCW0_SX291 MRCW0_SX381 MRDD0_SI1050 MRDD0_SI1680 MRDD0_SI2310 MRDD0_SX150 MRDD0_SX240 MRDD0_SX277 MRDD0_SX330 MRDD0_SX60 MRDM0_SI1044 MRDM0_SI1595 MRDM0_SI965 MRDM0_SX155 MRDM0_SX245 MRDM0_SX335 MRDM0_SX425 MRDM0_SX65 MRDS0_SI1167 MRDS0_SI1797 MRDS0_SI537 
MRDS0_SX177 MRDS0_SX267 MRDS0_SX357 MRDS0_SX447 MRDS0_SX87 MREE0_SI1104 MREE0_SI1734 MREE0_SI1959 MREE0_SX114 MREE0_SX204 MREE0_SX24 MREE0_SX294 MREE0_SX384 MREH1_SI1599 MREH1_SI2229 MREH1_SI969 MREH1_SX159 MREH1_SX249 MREH1_SX339 MREH1_SX429 MREH1_SX69 MREM0_SI1591 MREM0_SI511 MREM0_SI961 MREM0_SX151 MREM0_SX241 MREM0_SX331 MREM0_SX421 MREM0_SX61 MREW1_SI1500 MREW1_SI2130 MREW1_SI870 MREW1_SX150 MREW1_SX240 MREW1_SX330 MREW1_SX420 MREW1_SX60 MRFK0_SI1076 MRFK0_SI1706 MRFK0_SI2336 MRFK0_SX176 MRFK0_SX266 MRFK0_SX356 MRFK0_SX446 MRFK0_SX86 MRFL0_SI1156 MRFL0_SI1786 MRFL0_SI526 MRFL0_SX166 MRFL0_SX256 MRFL0_SX346 MRFL0_SX436 MRFL0_SX76 MRGM0_SI1162 MRGM0_SI1792 MRGM0_SI532 MRGM0_SX172 MRGM0_SX262 MRGM0_SX416 MRGM0_SX442 MRGM0_SX82 MRGS0_SI1356 MRGS0_SI1986 MRGS0_SI726 MRGS0_SX186 MRGS0_SX276 MRGS0_SX366 MRGS0_SX6 MRGS0_SX96 MRHL0_SI1515 MRHL0_SI2145 MRHL0_SI885 MRHL0_SX165 MRHL0_SX255 MRHL0_SX345 MRHL0_SX435 MRHL0_SX75 MRJB1_SI1020 MRJB1_SI1413 MRJB1_SI2021 MRJB1_SX120 MRJB1_SX210 MRJB1_SX30 MRJB1_SX300 MRJB1_SX390 MRJH0_SI1519 MRJH0_SI889 MRJH0_SI914 MRJH0_SX169 MRJH0_SX259 MRJH0_SX307 MRJH0_SX439 MRJH0_SX79 MRJM0_SI1095 MRJM0_SI1228 MRJM0_SI1858 MRJM0_SX148 MRJM0_SX238 MRJM0_SX328 MRJM0_SX418 MRJM0_SX58 MRJM1_SI1298 MRJM1_SI1928 MRJM1_SI668 MRJM1_SX128 MRJM1_SX218 MRJM1_SX308 MRJM1_SX38 MRJM1_SX398 MRJT0_SI1498 MRJT0_SI1805 MRJT0_SI868 MRJT0_SX148 MRJT0_SX238 MRJT0_SX328 MRJT0_SX418 MRJT0_SX58 MRKM0_SI1267 MRKM0_SI1391 MRKM0_SI637 MRKM0_SX187 MRKM0_SX277 MRKM0_SX367 MRKM0_SX7 MRKM0_SX97 MRLD0_SI1594 MRLD0_SI2224 MRLD0_SI964 MRLD0_SX154 MRLD0_SX244 MRLD0_SX334 MRLD0_SX424 MRLD0_SX64 MRLJ0_SI1420 MRLJ0_SI2050 MRLJ0_SI790 MRLJ0_SX160 MRLJ0_SX250 MRLJ0_SX340 MRLJ0_SX430 MRLJ0_SX70 MRLJ1_SI1671 MRLJ1_SI2301 MRLJ1_SI2332 MRLJ1_SX141 MRLJ1_SX231 MRLJ1_SX321 MRLJ1_SX411 MRLJ1_SX51 MRLK0_SI1468 MRLK0_SI2140 MRLK0_SI843 MRLK0_SX123 MRLK0_SX213 MRLK0_SX303 MRLK0_SX33 MRLK0_SX393 MRLR0_SI1196 MRLR0_SI1826 MRLR0_SI566 MRLR0_SX116 MRLR0_SX206 MRLR0_SX26 MRLR0_SX296 MRLR0_SX386 
MRMB0_SI1581 MRMB0_SI2211 MRMB0_SI951 MRMB0_SX141 MRMB0_SX231 MRMB0_SX321 MRMB0_SX411 MRMB0_SX51 MRMG0_SI1080 MRMG0_SI1710 MRMG0_SI2340 MRMG0_SX180 MRMG0_SX270 MRMG0_SX360 MRMG0_SX450 MRMG0_SX90 MRMH0_SI1021 MRMH0_SI1349 MRMH0_SI2281 MRMH0_SX121 MRMH0_SX211 MRMH0_SX301 MRMH0_SX31 MRMH0_SX391 MRML0_SI1421 MRML0_SI2051 MRML0_SI791 MRML0_SX161 MRML0_SX251 MRML0_SX341 MRML0_SX431 MRML0_SX71 MRMS0_SI1113 MRMS0_SI2057 MRMS0_SI2100 MRMS0_SX120 MRMS0_SX210 MRMS0_SX30 MRMS0_SX300 MRMS0_SX390 MRPC1_SI1482 MRPC1_SI2026 MRPC1_SI2112 MRPC1_SX132 MRPC1_SX222 MRPC1_SX312 MRPC1_SX402 MRPC1_SX42 MRRE0_SI1334 MRRE0_SI704 MRRE0_SI952 MRRE0_SX164 MRRE0_SX254 MRRE0_SX344 MRRE0_SX434 MRRE0_SX74 MRSO0_SI1206 MRSO0_SI1659 MRSO0_SI2289 MRSO0_SX129 MRSO0_SX219 MRSO0_SX309 MRSO0_SX39 MRSO0_SX399 MRSP0_SI1429 MRSP0_SI2059 MRSP0_SI799 MRSP0_SX169 MRSP0_SX196 MRSP0_SX259 MRSP0_SX439 MRSP0_SX79 MRTC0_SI1458 MRTC0_SI2088 MRTC0_SI828 MRTC0_SX108 MRTC0_SX18 MRTC0_SX198 MRTC0_SX288 MRTC0_SX378 MRTJ0_SI1551 MRTJ0_SI2032 MRTJ0_SI772 MRTJ0_SX142 MRTJ0_SX232 MRTJ0_SX322 MRTJ0_SX412 MRTJ0_SX52 MRVG0_SI1140 MRVG0_SI1770 MRVG0_SI510 MRVG0_SX150 MRVG0_SX240 MRVG0_SX330 MRVG0_SX420 MRVG0_SX60 MRWA0_SI1603 MRWA0_SI2233 MRWA0_SI973 MRWA0_SX163 MRWA0_SX253 MRWA0_SX343 MRWA0_SX433 MRWA0_SX73 MRWS0_SI1102 MRWS0_SI1732 MRWS0_SI472 MRWS0_SX112 MRWS0_SX202 MRWS0_SX22 MRWS0_SX292 MRWS0_SX382 MRXB0_SI1585 MRXB0_SI2215 MRXB0_SI955 MRXB0_SX145 MRXB0_SX235 MRXB0_SX325 MRXB0_SX415 MRXB0_SX55 MSAH1_SI1049 MSAH1_SI1679 MSAH1_SI2309 MSAH1_SX149 MSAH1_SX239 MSAH1_SX329 MSAH1_SX419 MSAH1_SX59 MSAS0_SI1376 MSAS0_SI2006 MSAS0_SI746 MSAS0_SX116 MSAS0_SX206 MSAS0_SX26 MSAS0_SX296 MSAS0_SX386 MSAT0_SI1526 MSAT0_SI2156 MSAT0_SI896 MSAT0_SX176 MSAT0_SX266 MSAT0_SX356 MSAT0_SX446 MSAT0_SX86 MSAT1_SI1073 MSAT1_SI1703 MSAT1_SI2333 MSAT1_SX173 MSAT1_SX263 MSAT1_SX353 MSAT1_SX443 MSAT1_SX83 MSDB0_SI1007 MSDB0_SI1637 MSDB0_SI2267 MSDB0_SX107 MSDB0_SX17 MSDB0_SX197 MSDB0_SX287 MSDB0_SX377 MSDH0_SI2113 MSDH0_SI2240 MSDH0_SI980 MSDH0_SX170 
MSDH0_SX260 MSDH0_SX350 MSDH0_SX440 MSDH0_SX80 MSDS0_SI1077 MSDS0_SI1707 MSDS0_SI2337 MSDS0_SX177 MSDS0_SX267 MSDS0_SX357 MSDS0_SX447 MSDS0_SX87 MSEM1_SI1440 MSEM1_SI2070 MSEM1_SI810 MSEM1_SX180 MSEM1_SX270 MSEM1_SX360 MSEM1_SX450 MSEM1_SX90 MSES0_SI1589 MSES0_SI2216 MSES0_SI2219 MSES0_SX149 MSES0_SX239 MSES0_SX329 MSES0_SX419 MSES0_SX59 MSFH0_SI1216 MSFH0_SI1738 MSFH0_SI586 MSFH0_SX136 MSFH0_SX226 MSFH0_SX316 MSFH0_SX406 MSFH0_SX46 MSFV0_SI1262 MSFV0_SI1892 MSFV0_SI632 MSFV0_SX182 MSFV0_SX272 MSFV0_SX362 MSFV0_SX452 MSFV0_SX92 MSJK0_SI1596 MSJK0_SI2226 MSJK0_SI966 MSJK0_SX156 MSJK0_SX246 MSJK0_SX336 MSJK0_SX426 MSJK0_SX66 MSMC0_SI1907 MSMC0_SI509 MSMC0_SI647 MSMC0_SX107 MSMC0_SX17 MSMC0_SX197 MSMC0_SX287 MSMC0_SX377 MSMR0_SI1150 MSMR0_SI1405 MSMR0_SI775 MSMR0_SX145 MSMR0_SX235 MSMR0_SX325 MSMR0_SX415 MSMR0_SX55 MSMS0_SI1433 MSMS0_SI2063 MSMS0_SI803 MSMS0_SX173 MSMS0_SX263 MSMS0_SX353 MSMS0_SX443 MSMS0_SX83 MSRG0_SI1221 MSRG0_SI1851 MSRG0_SI591 MSRG0_SX141 MSRG0_SX231 MSRG0_SX321 MSRG0_SX411 MSRG0_SX51 MSRR0_SI1131 MSRR0_SI1761 MSRR0_SI501 MSRR0_SX141 MSRR0_SX231 MSRR0_SX30 MSRR0_SX411 MSRR0_SX51 MSTF0_SI1396 MSTF0_SI766 MSTF0_SI852 MSTF0_SX136 MSTF0_SX226 MSTF0_SX316 MSTF0_SX406 MSTF0_SX46 MSVS0_SI1568 MSVS0_SI2198 MSVS0_SI938 MSVS0_SX128 MSVS0_SX218 MSVS0_SX308 MSVS0_SX38 MSVS0_SX398 MTAB0_SI1572 MTAB0_SI2202 MTAB0_SI942 MTAB0_SX132 MTAB0_SX222 MTAB0_SX312 MTAB0_SX402 MTAB0_SX42 MTAS0_SI1385 MTAS0_SI2015 MTAS0_SI755 MTAS0_SX125 MTAS0_SX215 MTAS0_SX305 MTAS0_SX35 MTAS0_SX395 MTAT0_SI1110 MTAT0_SI1740 MTAT0_SI811 MTAT0_SX120 MTAT0_SX210 MTAT0_SX30 MTAT0_SX300 MTAT0_SX390 MTAT1_SI1409 MTAT1_SI1627 MTAT1_SI779 MTAT1_SX149 MTAT1_SX239 MTAT1_SX329 MTAT1_SX419 MTAT1_SX59 MTBC0_SI1173 MTBC0_SI1803 MTBC0_SI543 MTBC0_SX183 MTBC0_SX273 MTBC0_SX347 MTBC0_SX363 MTBC0_SX93 MTCS0_SI1972 MTCS0_SI2265 MTCS0_SI712 MTCS0_SX172 MTCS0_SX262 MTCS0_SX352 MTCS0_SX442 MTCS0_SX82 MTDB0_SI1401 MTDB0_SI2031 MTDB0_SI771 MTDB0_SX141 MTDB0_SX231 MTDB0_SX321 MTDB0_SX411 MTDB0_SX51 MTDP0_SI1274 
MTDP0_SI1521 MTDP0_SI2151 MTDP0_SX171 MTDP0_SX261 MTDP0_SX351 MTDP0_SX441 MTDP0_SX81 MTER0_SI1157 MTER0_SI1787 MTER0_SI527 MTER0_SX167 MTER0_SX17 MTER0_SX257 MTER0_SX437 MTER0_SX77 MTJG0_SI1520 MTJG0_SI2157 MTJG0_SI890 MTJG0_SX170 MTJG0_SX260 MTJG0_SX350 MTJG0_SX440 MTJG0_SX80 MTJM0_SI1226 MTJM0_SI1856 MTJM0_SI655 MTJM0_SX146 MTJM0_SX236 MTJM0_SX326 MTJM0_SX416 MTJM0_SX56 MTJS0_SI1192 MTJS0_SI1822 MTJS0_SI562 MTJS0_SX112 MTJS0_SX202 MTJS0_SX22 MTJS0_SX292 MTJS0_SX382 MTJU0_SI2020 MTJU0_SI2269 MTJU0_SI760 MTJU0_SX130 MTJU0_SX220 MTJU0_SX310 MTJU0_SX40 MTJU0_SX400 MTKD0_SI1187 MTKD0_SI1817 MTKD0_SI630 MTKD0_SX107 MTKD0_SX17 MTKD0_SX197 MTKD0_SX287 MTKD0_SX377 MTKP0_SI1023 MTKP0_SI2283 MTKP0_SI454 MTKP0_SX123 MTKP0_SX213 MTKP0_SX303 MTKP0_SX33 MTKP0_SX393 MTLB0_SI1134 MTLB0_SI1764 MTLB0_SI504 MTLB0_SX144 MTLB0_SX234 MTLB0_SX324 MTLB0_SX414 MTLB0_SX54 MTLC0_SI1313 MTLC0_SI1477 MTLC0_SI847 MTLC0_SX127 MTLC0_SX217 MTLC0_SX307 MTLC0_SX37 MTLC0_SX397 MTML0_SI1065 MTML0_SI1695 MTML0_SI2325 MTML0_SX165 MTML0_SX255 MTML0_SX345 MTML0_SX435 MTML0_SX75 MTMN0_SI1064 MTMN0_SI2324 MTMN0_SI582 MTMN0_SX164 MTMN0_SX254 MTMN0_SX344 MTMN0_SX434 MTMN0_SX74 MTMT0_SI1118 MTMT0_SI1748 MTMT0_SI488 MTMT0_SX128 MTMT0_SX218 MTMT0_SX308 MTMT0_SX38 MTMT0_SX398 MTPF0_SI1235 MTPF0_SI1865 MTPF0_SI605 MTPF0_SX155 MTPF0_SX245 MTPF0_SX335 MTPF0_SX425 MTPF0_SX65 MTPG0_SI1383 MTPG0_SI2013 MTPG0_SI753 MTPG0_SX123 MTPG0_SX213 MTPG0_SX303 MTPG0_SX33 MTPG0_SX393 MTPP0_SI1508 MTPP0_SI2138 MTPP0_SI878 MTPP0_SX158 MTPP0_SX248 MTPP0_SX338 MTPP0_SX428 MTPP0_SX68 MTPR0_SI1600 MTPR0_SI2230 MTPR0_SI506 MTPR0_SX160 MTPR0_SX250 MTPR0_SX340 MTPR0_SX430 MTPR0_SX70 MTQC0_SI1441 MTQC0_SI2071 MTQC0_SI480 MTQC0_SX181 MTQC0_SX271 MTQC0_SX361 MTQC0_SX451 MTQC0_SX91 MTRC0_SI1623 MTRC0_SI589 MTRC0_SI993 MTRC0_SX170 MTRC0_SX183 MTRC0_SX273 MTRC0_SX363 MTRC0_SX93 MTRR0_SI1548 MTRR0_SI2178 MTRR0_SI918 MTRR0_SX108 MTRR0_SX18 MTRR0_SX198 MTRR0_SX288 MTRR0_SX378 MTRT0_SI1227 MTRT0_SI1857 MTRT0_SI597 MTRT0_SX147 MTRT0_SX237 
MTRT0_SX254 MTRT0_SX417 MTRT0_SX57 MTWH1_SI1512 MTWH1_SI2142 MTWH1_SI882 MTWH1_SX162 MTWH1_SX252 MTWH1_SX342 MTWH1_SX432 MTWH1_SX72 MTXS0_SI1060 MTXS0_SI1690 MTXS0_SI2320 MTXS0_SX160 MTXS0_SX250 MTXS0_SX340 MTXS0_SX430 MTXS0_SX70 MVJH0_SI1556 MVJH0_SI2186 MVJH0_SI926 MVJH0_SX116 MVJH0_SX206 MVJH0_SX26 MVJH0_SX296 MVJH0_SX386 MVLO0_SI1147 MVLO0_SI1777 MVLO0_SI517 MVLO0_SX157 MVLO0_SX247 MVLO0_SX337 MVLO0_SX427 MVLO0_SX67 MVRW0_SI1485 MVRW0_SI2115 MVRW0_SI855 MVRW0_SX135 MVRW0_SX225 MVRW0_SX315 MVRW0_SX405 MVRW0_SX45 MWAC0_SI1601 MWAC0_SI2231 MWAC0_SI971 MWAC0_SX161 MWAC0_SX251 MWAC0_SX341 MWAC0_SX431 MWAC0_SX71 MWAD0_SI1062 MWAD0_SI1749 MWAD0_SI2322 MWAD0_SX162 MWAD0_SX252 MWAD0_SX342 MWAD0_SX432 MWAD0_SX72 MWAR0_SI1045 MWAR0_SI1675 MWAR0_SI2305 MWAR0_SX145 MWAR0_SX235 MWAR0_SX325 MWAR0_SX415 MWAR0_SX55 MWCH0_SI1622 MWCH0_SI1895 MWCH0_SI2252 MWCH0_SX182 MWCH0_SX272 MWCH0_SX362 MWCH0_SX452 MWCH0_SX92 MWDK0_SI1436 MWDK0_SI2017 MWDK0_SI806 MWDK0_SX176 MWDK0_SX266 MWDK0_SX356 MWDK0_SX446 MWDK0_SX86 MWEM0_SI1320 MWEM0_SI1393 MWEM0_SI1950 MWEM0_SX150 MWEM0_SX240 MWEM0_SX330 MWEM0_SX420 MWEM0_SX60 MWGR0_SI1606 MWGR0_SI2236 MWGR0_SI976 MWGR0_SX166 MWGR0_SX256 MWGR0_SX346 MWGR0_SX436 MWGR0_SX76 MWRE0_SI1057 MWRE0_SI1687 MWRE0_SI2317 MWRE0_SX157 MWRE0_SX247 MWRE0_SX337 MWRE0_SX427 MWRE0_SX67 MWRP0_SI1443 MWRP0_SI1525 MWRP0_SI2073 MWRP0_SX183 MWRP0_SX273 MWRP0_SX3 MWRP0_SX363 MWRP0_SX93 MWSB0_SI1626 MWSB0_SI2256 MWSB0_SI996 MWSB0_SX186 MWSB0_SX276 MWSB0_SX366 MWSB0_SX6 MWSB0_SX96 MWSH0_SI1426 MWSH0_SI2266 MWSH0_SI796 MWSH0_SX166 MWSH0_SX256 MWSH0_SX346 MWSH0_SX436 MWSH0_SX76 MZMB0_SI1166 MZMB0_SI1796 MZMB0_SI536 MZMB0_SX176 MZMB0_SX266 MZMB0_SX356 MZMB0_SX446 MZMB0_SX86 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_matched/train_text.uid ================================================ FAEM0_SI1392 FAEM0_SI2022 FAEM0_SI762 FAEM0_SX132 FAEM0_SX222 FAEM0_SX312 FAEM0_SX402 FAEM0_SX42 FAJW0_SI1263 FAJW0_SI1893 FAJW0_SI633 
FAJW0_SX183 FAJW0_SX273 FAJW0_SX3 FAJW0_SX363 FAJW0_SX93 FALK0_SI1086 FALK0_SI456 FALK0_SI658 FALK0_SX186 FALK0_SX276 FALK0_SX366 FALK0_SX6 FALK0_SX96 FALR0_SI1325 FALR0_SI1955 FALR0_SI695 FALR0_SX155 FALR0_SX245 FALR0_SX335 FALR0_SX425 FALR0_SX65 FAPB0_SI1063 FAPB0_SI1693 FAPB0_SI2323 FAPB0_SX163 FAPB0_SX253 FAPB0_SX343 FAPB0_SX433 FAPB0_SX73 FBAS0_SI1387 FBAS0_SI1472 FBAS0_SI2066 FBAS0_SX127 FBAS0_SX217 FBAS0_SX307 FBAS0_SX37 FBAS0_SX397 FBCG1_SI1612 FBCG1_SI2242 FBCG1_SI982 FBCG1_SX172 FBCG1_SX262 FBCG1_SX352 FBCG1_SX442 FBCG1_SX82 FBCH0_SI1586 FBCH0_SI956 FBCH0_SI959 FBCH0_SX146 FBCH0_SX236 FBCH0_SX326 FBCH0_SX416 FBCH0_SX56 FBJL0_SI1552 FBJL0_SI2182 FBJL0_SI922 FBJL0_SX112 FBJL0_SX202 FBJL0_SX22 FBJL0_SX292 FBJL0_SX382 FBLV0_SI1058 FBLV0_SI1688 FBLV0_SI2318 FBLV0_SX158 FBLV0_SX248 FBLV0_SX338 FBLV0_SX428 FBLV0_SX68 FBMH0_SI1136 FBMH0_SI1766 FBMH0_SI970 FBMH0_SX146 FBMH0_SX236 FBMH0_SX326 FBMH0_SX416 FBMH0_SX56 FBMJ0_SI1776 FBMJ0_SI516 FBMJ0_SI815 FBMJ0_SX156 FBMJ0_SX246 FBMJ0_SX336 FBMJ0_SX426 FBMJ0_SX66 FCAG0_SI1503 FCAG0_SI1641 FCAG0_SI2133 FCAG0_SX153 FCAG0_SX243 FCAG0_SX333 FCAG0_SX423 FCAG0_SX63 FCAJ0_SI1479 FCAJ0_SI1804 FCAJ0_SI849 FCAJ0_SX129 FCAJ0_SX219 FCAJ0_SX309 FCAJ0_SX39 FCAJ0_SX399 FCDR1_SI1186 FCDR1_SI1816 FCDR1_SI556 FCDR1_SX106 FCDR1_SX16 FCDR1_SX196 FCDR1_SX286 FCDR1_SX376 FCEG0_SI1248 FCEG0_SI1878 FCEG0_SI618 FCEG0_SX168 FCEG0_SX258 FCEG0_SX348 FCEG0_SX438 FCEG0_SX78 FCJF0_SI1027 FCJF0_SI1657 FCJF0_SI648 FCJF0_SX127 FCJF0_SX217 FCJF0_SX307 FCJF0_SX37 FCJF0_SX397 FCJS0_SI1607 FCJS0_SI2237 FCJS0_SI977 FCJS0_SX167 FCJS0_SX257 FCJS0_SX347 FCJS0_SX437 FCJS0_SX77 FCKE0_SI1111 FCKE0_SI1741 FCKE0_SI481 FCKE0_SX121 FCKE0_SX211 FCKE0_SX301 FCKE0_SX31 FCKE0_SX391 FCLT0_SI1438 FCLT0_SI2068 FCLT0_SI808 FCLT0_SX178 FCLT0_SX268 FCLT0_SX358 FCLT0_SX448 FCLT0_SX88 FCMG0_SI1142 FCMG0_SI1242 FCMG0_SI1872 FCMG0_SX162 FCMG0_SX252 FCMG0_SX342 FCMG0_SX432 FCMG0_SX72 FCMM0_SI1083 FCMM0_SI1957 FCMM0_SI453 FCMM0_SX183 FCMM0_SX273 FCMM0_SX363 FCMM0_SX420 FCMM0_SX93 
FCRZ0_SI1913 FCRZ0_SI2053 FCRZ0_SI793 FCRZ0_SX163 FCRZ0_SX253 FCRZ0_SX343 FCRZ0_SX433 FCRZ0_SX73 FCYL0_SI1297 FCYL0_SI1927 FCYL0_SI667 FCYL0_SX127 FCYL0_SX217 FCYL0_SX349 FCYL0_SX37 FCYL0_SX397 FDAS1_SI1461 FDAS1_SI2091 FDAS1_SI831 FDAS1_SX111 FDAS1_SX201 FDAS1_SX21 FDAS1_SX291 FDAS1_SX381 FDAW0_SI1271 FDAW0_SI1406 FDAW0_SI2036 FDAW0_SX146 FDAW0_SX236 FDAW0_SX326 FDAW0_SX416 FDAW0_SX56 FDFB0_SI1318 FDFB0_SI1948 FDFB0_SI2010 FDFB0_SX148 FDFB0_SX238 FDFB0_SX328 FDFB0_SX418 FDFB0_SX58 FDJH0_SI1565 FDJH0_SI2195 FDJH0_SI935 FDJH0_SX125 FDJH0_SX215 FDJH0_SX305 FDJH0_SX35 FDJH0_SX395 FDKN0_SI1081 FDKN0_SI1202 FDKN0_SI1711 FDKN0_SX181 FDKN0_SX271 FDKN0_SX361 FDKN0_SX451 FDKN0_SX91 FDML0_SI1149 FDML0_SI1779 FDML0_SI2075 FDML0_SX159 FDML0_SX249 FDML0_SX339 FDML0_SX429 FDML0_SX69 FDMY0_SI1197 FDMY0_SI567 FDMY0_SI714 FDMY0_SX117 FDMY0_SX207 FDMY0_SX27 FDMY0_SX297 FDMY0_SX387 FDNC0_SI1278 FDNC0_SI1908 FDNC0_SI2287 FDNC0_SX108 FDNC0_SX18 FDNC0_SX198 FDNC0_SX288 FDNC0_SX378 FDTD0_SI1561 FDTD0_SI2191 FDTD0_SI931 FDTD0_SX121 FDTD0_SX211 FDTD0_SX301 FDTD0_SX321 FDTD0_SX391 FDXW0_SI1511 FDXW0_SI2141 FDXW0_SI881 FDXW0_SX161 FDXW0_SX251 FDXW0_SX341 FDXW0_SX431 FDXW0_SX71 FEAC0_SI1245 FEAC0_SI1875 FEAC0_SI615 FEAC0_SX165 FEAC0_SX255 FEAC0_SX345 FEAC0_SX435 FEAC0_SX75 FEAR0_SI1252 FEAR0_SI1882 FEAR0_SI622 FEAR0_SX172 FEAR0_SX262 FEAR0_SX352 FEAR0_SX442 FEAR0_SX82 FECD0_SI1418 FECD0_SI2048 FECD0_SI788 FECD0_SX158 FECD0_SX248 FECD0_SX338 FECD0_SX428 FECD0_SX68 FEEH0_SI1112 FEEH0_SI1742 FEEH0_SI471 FEEH0_SX122 FEEH0_SX212 FEEH0_SX302 FEEH0_SX32 FEEH0_SX392 FEME0_SI1505 FEME0_SI2135 FEME0_SI875 FEME0_SX155 FEME0_SX245 FEME0_SX335 FEME0_SX425 FEME0_SX65 FETB0_SI1148 FETB0_SI1778 FETB0_SI518 FETB0_SX158 FETB0_SX248 FETB0_SX338 FETB0_SX428 FETB0_SX68 FEXM0_SI1101 FEXM0_SI1731 FEXM0_SI482 FEXM0_SX111 FEXM0_SX201 FEXM0_SX291 FEXM0_SX366 FEXM0_SX381 FGCS0_SI1486 FGCS0_SI2116 FGCS0_SI856 FGCS0_SX136 FGCS0_SX226 FGCS0_SX316 FGCS0_SX406 FGCS0_SX46 FGDP0_SI1618 FGDP0_SI2248 FGDP0_SI988 FGDP0_SX178 
FGDP0_SX268 FGDP0_SX358 FGDP0_SX448 FGDP0_SX88 FGMB0_SI1145 FGMB0_SI1775 FGMB0_SI515 FGMB0_SX155 FGMB0_SX245 FGMB0_SX335 FGMB0_SX425 FGMB0_SX65 FGRW0_SI1152 FGRW0_SI1782 FGRW0_SI1990 FGRW0_SX162 FGRW0_SX252 FGRW0_SX342 FGRW0_SX432 FGRW0_SX72 FHLM0_SI1560 FHLM0_SI2190 FHLM0_SI930 FHLM0_SX120 FHLM0_SX210 FHLM0_SX300 FHLM0_SX349 FHLM0_SX390 FHXS0_SI1075 FHXS0_SI2302 FHXS0_SI2335 FHXS0_SX175 FHXS0_SX265 FHXS0_SX355 FHXS0_SX445 FHXS0_SX85 FJDM2_SI1582 FJDM2_SI1964 FJDM2_SI2212 FJDM2_SX142 FJDM2_SX232 FJDM2_SX322 FJDM2_SX412 FJDM2_SX52 FJEN0_SI1047 FJEN0_SI1677 FJEN0_SI2307 FJEN0_SX147 FJEN0_SX237 FJEN0_SX327 FJEN0_SX417 FJEN0_SX57 FJHK0_SI1022 FJHK0_SI1652 FJHK0_SI2282 FJHK0_SX122 FJHK0_SX212 FJHK0_SX302 FJHK0_SX32 FJHK0_SX392 FJKL0_SI1562 FJKL0_SI2192 FJKL0_SI932 FJKL0_SX122 FJKL0_SX212 FJKL0_SX302 FJKL0_SX32 FJKL0_SX392 FJLG0_SI1506 FJLG0_SI1889 FJLG0_SI2306 FJLG0_SX179 FJLG0_SX269 FJLG0_SX359 FJLG0_SX449 FJLG0_SX89 FJLR0_SI1231 FJLR0_SI1861 FJLR0_SI601 FJLR0_SX151 FJLR0_SX241 FJLR0_SX331 FJLR0_SX421 FJLR0_SX61 FJRB0_SI1302 FJRB0_SI1932 FJRB0_SI672 FJRB0_SX132 FJRB0_SX222 FJRB0_SX312 FJRB0_SX402 FJRB0_SX42 FJRP1_SI1432 FJRP1_SI2062 FJRP1_SI802 FJRP1_SX172 FJRP1_SX262 FJRP1_SX352 FJRP1_SX442 FJRP1_SX82 FJSK0_SI1052 FJSK0_SI1682 FJSK0_SI2312 FJSK0_SX152 FJSK0_SX242 FJSK0_SX332 FJSK0_SX422 FJSK0_SX62 FJSP0_SI1434 FJSP0_SI1763 FJSP0_SI804 FJSP0_SX174 FJSP0_SX264 FJSP0_SX354 FJSP0_SX444 FJSP0_SX84 FJWB1_SI2055 FJWB1_SI748 FJWB1_SI795 FJWB1_SX165 FJWB1_SX255 FJWB1_SX345 FJWB1_SX435 FJWB1_SX75 FJXM0_SI1211 FJXM0_SI1971 FJXM0_SI581 FJXM0_SX131 FJXM0_SX221 FJXM0_SX311 FJXM0_SX401 FJXM0_SX41 FJXP0_SI1122 FJXP0_SI1752 FJXP0_SI492 FJXP0_SX132 FJXP0_SX222 FJXP0_SX312 FJXP0_SX402 FJXP0_SX42 FKAA0_SI1208 FKAA0_SI1838 FKAA0_SI578 FKAA0_SX128 FKAA0_SX218 FKAA0_SX308 FKAA0_SX38 FKAA0_SX398 FKDE0_SI1141 FKDE0_SI1771 FKDE0_SI2221 FKDE0_SX151 FKDE0_SX241 FKDE0_SX331 FKDE0_SX421 FKDE0_SX61 FKDW0_SI1207 FKDW0_SI1891 FKDW0_SI577 FKDW0_SX127 FKDW0_SX217 FKDW0_SX307 FKDW0_SX37 FKDW0_SX397 
FKFB0_SI1608 FKFB0_SI2238 FKFB0_SI978 FKFB0_SX168 FKFB0_SX258 FKFB0_SX348 FKFB0_SX438 FKFB0_SX78 FKKH0_SI1290 FKKH0_SI1920 FKKH0_SI660 FKKH0_SX120 FKKH0_SX210 FKKH0_SX30 FKKH0_SX300 FKKH0_SX390 FKLC0_SI1615 FKLC0_SI2245 FKLC0_SI985 FKLC0_SX175 FKLC0_SX265 FKLC0_SX355 FKLC0_SX445 FKLC0_SX85 FKLC1_SI1048 FKLC1_SI1678 FKLC1_SI2308 FKLC1_SX148 FKLC1_SX238 FKLC1_SX328 FKLC1_SX418 FKLC1_SX58 FKLH0_SI1257 FKLH0_SI1887 FKLH0_SI627 FKLH0_SX177 FKLH0_SX267 FKLH0_SX357 FKLH0_SX447 FKLH0_SX87 FKSR0_SI1117 FKSR0_SI1747 FKSR0_SI487 FKSR0_SX161 FKSR0_SX217 FKSR0_SX366 FKSR0_SX37 FKSR0_SX397 FLAC0_SI1339 FLAC0_SI2161 FLAC0_SI901 FLAC0_SX181 FLAC0_SX271 FLAC0_SX361 FLAC0_SX451 FLAC0_SX91 FLAG0_SI1464 FLAG0_SI2094 FLAG0_SI834 FLAG0_SX114 FLAG0_SX204 FLAG0_SX24 FLAG0_SX294 FLAG0_SX384 FLEH0_SI1051 FLEH0_SI1681 FLEH0_SI2311 FLEH0_SX151 FLEH0_SX241 FLEH0_SX331 FLEH0_SX421 FLEH0_SX61 FLET0_SI1137 FLET0_SI1767 FLET0_SI507 FLET0_SX147 FLET0_SX237 FLET0_SX277 FLET0_SX417 FLET0_SX57 FLHD0_SI1344 FLHD0_SI1827 FLHD0_SI1974 FLHD0_SX174 FLHD0_SX264 FLHD0_SX354 FLHD0_SX444 FLHD0_SX84 FLJA0_SI1078 FLJA0_SI1708 FLJA0_SI2338 FLJA0_SX178 FLJA0_SX268 FLJA0_SX358 FLJA0_SX448 FLJA0_SX88 FLJD0_SI1516 FLJD0_SI2146 FLJD0_SI886 FLJD0_SX166 FLJD0_SX256 FLJD0_SX346 FLJD0_SX436 FLJD0_SX76 FLJG0_SI1611 FLJG0_SI2241 FLJG0_SI981 FLJG0_SX171 FLJG0_SX261 FLJG0_SX351 FLJG0_SX441 FLJG0_SX81 FLKM0_SI1880 FLKM0_SI620 FLKM0_SI686 FLKM0_SX116 FLKM0_SX260 FLKM0_SX350 FLKM0_SX440 FLKM0_SX80 FLMA0_SI1243 FLMA0_SI1873 FLMA0_SI613 FLMA0_SX163 FLMA0_SX253 FLMA0_SX343 FLMA0_SX433 FLMA0_SX73 FLMC0_SI1372 FLMC0_SI2002 FLMC0_SI742 FLMC0_SX112 FLMC0_SX22 FLMC0_SX292 FLMC0_SX336 FLMC0_SX382 FLMK0_SI1035 FLMK0_SI1229 FLMK0_SI2295 FLMK0_SX135 FLMK0_SX225 FLMK0_SX315 FLMK0_SX405 FLMK0_SX45 FLOD0_SI1287 FLOD0_SI1917 FLOD0_SI657 FLOD0_SX117 FLOD0_SX171 FLOD0_SX207 FLOD0_SX297 FLOD0_SX387 FLTM0_SI1070 FLTM0_SI1700 FLTM0_SI2330 FLTM0_SX170 FLTM0_SX260 FLTM0_SX350 FLTM0_SX440 FLTM0_SX80 FMAH1_SI1509 FMAH1_SI2139 FMAH1_SI879 FMAH1_SX159 
FMAH1_SX249 FMAH1_SX339 FMAH1_SX429 FMAH1_SX69 FMBG0_SI1160 FMBG0_SI1790 FMBG0_SI2264 FMBG0_SX260 FMBG0_SX3 FMBG0_SX350 FMBG0_SX440 FMBG0_SX80 FMEM0_SI1377 FMEM0_SI2007 FMEM0_SI747 FMEM0_SX117 FMEM0_SX207 FMEM0_SX297 FMEM0_SX333 FMEM0_SX387 FMJB0_SI1177 FMJB0_SI1807 FMJB0_SI547 FMJB0_SX187 FMJB0_SX277 FMJB0_SX367 FMJB0_SX7 FMJB0_SX97 FMJF0_SI1254 FMJF0_SI1884 FMJF0_SI624 FMJF0_SX174 FMJF0_SX264 FMJF0_SX354 FMJF0_SX444 FMJF0_SX84 FMJU0_SI1389 FMJU0_SI2019 FMJU0_SI759 FMJU0_SX129 FMJU0_SX219 FMJU0_SX309 FMJU0_SX39 FMJU0_SX399 FMKC0_SI1041 FMKC0_SI1072 FMKC0_SI1702 FMKC0_SX172 FMKC0_SX262 FMKC0_SX352 FMKC0_SX442 FMKC0_SX82 FMKF0_SI1018 FMKF0_SI1536 FMKF0_SI906 FMKF0_SX186 FMKF0_SX276 FMKF0_SX366 FMKF0_SX6 FMKF0_SX96 FMMH0_SI1537 FMMH0_SI2167 FMMH0_SI907 FMMH0_SX187 FMMH0_SX367 FMMH0_SX420 FMMH0_SX7 FMMH0_SX97 FMPG0_SI1602 FMPG0_SI2232 FMPG0_SI972 FMPG0_SX162 FMPG0_SX252 FMPG0_SX342 FMPG0_SX432 FMPG0_SX72 FNKL0_SI1522 FNKL0_SI2152 FNKL0_SI892 FNKL0_SX172 FNKL0_SX196 FNKL0_SX262 FNKL0_SX442 FNKL0_SX82 FNTB0_SI1203 FNTB0_SI573 FNTB0_SI679 FNTB0_SX123 FNTB0_SX213 FNTB0_SX303 FNTB0_SX33 FNTB0_SX393 FPAB1_SI1471 FPAB1_SI2101 FPAB1_SI841 FPAB1_SX121 FPAB1_SX211 FPAB1_SX301 FPAB1_SX31 FPAB1_SX391 FPAC0_SI1921 FPAC0_SI2011 FPAC0_SI661 FPAC0_SX121 FPAC0_SX211 FPAC0_SX301 FPAC0_SX31 FPAC0_SX391 FPAD0_SI1346 FPAD0_SI1976 FPAD0_SI716 FPAD0_SX176 FPAD0_SX266 FPAD0_SX356 FPAD0_SX446 FPAD0_SX86 FPAF0_SI1054 FPAF0_SI1684 FPAF0_SI2314 FPAF0_SX154 FPAF0_SX244 FPAF0_SX334 FPAF0_SX424 FPAF0_SX64 FPAZ0_SI1593 FPAZ0_SI2223 FPAZ0_SI963 FPAZ0_SX153 FPAZ0_SX243 FPAZ0_SX27 FPAZ0_SX423 FPAZ0_SX63 FPJF0_SI1046 FPJF0_SI1259 FPJF0_SI1676 FPJF0_SX146 FPJF0_SX236 FPJF0_SX326 FPJF0_SX352 FPJF0_SX56 FPLS0_SI1590 FPLS0_SI2220 FPLS0_SI960 FPLS0_SX150 FPLS0_SX240 FPLS0_SX3 FPLS0_SX330 FPLS0_SX60 FPMY0_SI1153 FPMY0_SI1783 FPMY0_SI523 FPMY0_SX163 FPMY0_SX196 FPMY0_SX253 FPMY0_SX343 FPMY0_SX73 FREH0_SI1315 FREH0_SI1945 FREH0_SI685 FREH0_SX145 FREH0_SX235 FREH0_SX325 FREH0_SX415 FREH0_SX55 FRJB0_SI1427 
FRJB0_SI1470 FRJB0_SI1794 FRJB0_SX167 FRJB0_SX257 FRJB0_SX347 FRJB0_SX437 FRJB0_SX77 FRLL0_SI1514 FRLL0_SI805 FRLL0_SI884 FRLL0_SX164 FRLL0_SX254 FRLL0_SX344 FRLL0_SX434 FRLL0_SX74 FSAG0_SI1323 FSAG0_SI1953 FSAG0_SI693 FSAG0_SX153 FSAG0_SX243 FSAG0_SX333 FSAG0_SX423 FSAG0_SX63 FSAH0_SI1244 FSAH0_SI1874 FSAH0_SI614 FSAH0_SX164 FSAH0_SX327 FSAH0_SX344 FSAH0_SX434 FSAH0_SX74 FSAK0_SI1300 FSAK0_SI1930 FSAK0_SI670 FSAK0_SX130 FSAK0_SX220 FSAK0_SX310 FSAK0_SX40 FSAK0_SX400 FSBK0_SI1069 FSBK0_SI1699 FSBK0_SI2329 FSBK0_SX169 FSBK0_SX259 FSBK0_SX349 FSBK0_SX439 FSBK0_SX79 FSCN0_SI1886 FSCN0_SI626 FSCN0_SI705 FSCN0_SX176 FSCN0_SX266 FSCN0_SX356 FSCN0_SX446 FSCN0_SX86 FSDC0_SI1312 FSDC0_SI1942 FSDC0_SI2234 FSDC0_SX142 FSDC0_SX232 FSDC0_SX322 FSDC0_SX412 FSDC0_SX52 FSDJ0_SI1115 FSDJ0_SI1745 FSDJ0_SI485 FSDJ0_SX125 FSDJ0_SX215 FSDJ0_SX305 FSDJ0_SX35 FSDJ0_SX395 FSGF0_SI1557 FSGF0_SI2187 FSGF0_SI927 FSGF0_SX117 FSGF0_SX207 FSGF0_SX27 FSGF0_SX297 FSGF0_SX387 FSJG0_SI1570 FSJG0_SI2200 FSJG0_SI940 FSJG0_SX130 FSJG0_SX220 FSJG0_SX310 FSJG0_SX40 FSJG0_SX400 FSJK1_SI1025 FSJK1_SI2285 FSJK1_SI696 FSJK1_SX125 FSJK1_SX215 FSJK1_SX305 FSJK1_SX35 FSJK1_SX395 FSJS0_SI1171 FSJS0_SI1801 FSJS0_SI541 FSJS0_SX181 FSJS0_SX271 FSJS0_SX361 FSJS0_SX451 FSJS0_SX91 FSJW0_SI1333 FSJW0_SI1963 FSJW0_SI703 FSJW0_SX163 FSJW0_SX253 FSJW0_SX343 FSJW0_SX433 FSJW0_SX73 FSKC0_SI1416 FSKC0_SI2046 FSKC0_SI786 FSKC0_SX156 FSKC0_SX246 FSKC0_SX336 FSKC0_SX426 FSKC0_SX66 FSKL0_SI1529 FSKL0_SI2159 FSKL0_SI899 FSKL0_SX179 FSKL0_SX269 FSKL0_SX359 FSKL0_SX449 FSKL0_SX89 FSKP0_SI1098 FSKP0_SI1728 FSKP0_SI468 FSKP0_SX108 FSKP0_SX18 FSKP0_SX198 FSKP0_SX288 FSKP0_SX378 FSLS0_SI1056 FSLS0_SI1686 FSLS0_SI2316 FSLS0_SX156 FSLS0_SX202 FSLS0_SX246 FSLS0_SX426 FSLS0_SX66 FSMA0_SI1621 FSMA0_SI2251 FSMA0_SI991 FSMA0_SX181 FSMA0_SX271 FSMA0_SX361 FSMA0_SX451 FSMA0_SX91 FSMM0_SI1314 FSMM0_SI1944 FSMM0_SI684 FSMM0_SX144 FSMM0_SX234 FSMM0_SX324 FSMM0_SX414 FSMM0_SX54 FSMS1_SI1504 FSMS1_SI2134 FSMS1_SI874 FSMS1_SX154 FSMS1_SX244 
FSMS1_SX334 FSMS1_SX347 FSMS1_SX64 FSPM0_SI1241 FSPM0_SI1871 FSPM0_SI611 FSPM0_SX161 FSPM0_SX251 FSPM0_SX341 FSPM0_SX431 FSPM0_SX71 FSRH0_SI1719 FSRH0_SI1931 FSRH0_SI671 FSRH0_SX131 FSRH0_SX221 FSRH0_SX311 FSRH0_SX401 FSRH0_SX41 FSSB0_SI1082 FSSB0_SI1712 FSSB0_SI2342 FSSB0_SX182 FSSB0_SX272 FSSB0_SX362 FSSB0_SX452 FSSB0_SX92 FTAJ0_SI1329 FTAJ0_SI474 FTAJ0_SI699 FTAJ0_SX159 FTAJ0_SX249 FTAJ0_SX339 FTAJ0_SX429 FTAJ0_SX69 FTBR0_SI1402 FTBR0_SI2181 FTBR0_SI921 FTBR0_SX111 FTBR0_SX201 FTBR0_SX21 FTBR0_SX291 FTBR0_SX381 FTBW0_SI1345 FTBW0_SI1975 FTBW0_SI715 FTBW0_SX175 FTBW0_SX265 FTBW0_SX355 FTBW0_SX445 FTBW0_SX85 FTLG0_SI1743 FTLG0_SI483 FTLG0_SI840 FTLG0_SX123 FTLG0_SX213 FTLG0_SX303 FTLG0_SX33 FTLG0_SX393 FTMG0_SI1532 FTMG0_SI2162 FTMG0_SI902 FTMG0_SX182 FTMG0_SX272 FTMG0_SX362 FTMG0_SX452 FTMG0_SX92 FVFB0_SI1032 FVFB0_SI1510 FVFB0_SI2292 FVFB0_SX132 FVFB0_SX222 FVFB0_SX312 FVFB0_SX402 FVFB0_SX42 FVKB0_SI1159 FVKB0_SI1789 FVKB0_SI529 FVKB0_SX169 FVKB0_SX259 FVKB0_SX349 FVKB0_SX439 FVKB0_SX79 FVMH0_SI1466 FVMH0_SI2096 FVMH0_SI836 FVMH0_SX116 FVMH0_SX206 FVMH0_SX26 FVMH0_SX296 FVMH0_SX386 MABC0_SI1620 MABC0_SI2041 MABC0_SI781 MABC0_SX151 MABC0_SX241 MABC0_SX331 MABC0_SX421 MABC0_SX61 MADC0_SI1367 MADC0_SI1997 MADC0_SI737 MADC0_SX107 MADC0_SX17 MADC0_SX197 MADC0_SX287 MADC0_SX377 MADD0_SI1295 MADD0_SI1798 MADD0_SI538 MADD0_SX178 MADD0_SX268 MADD0_SX358 MADD0_SX448 MADD0_SX88 MAEB0_SI1411 MAEB0_SI2250 MAEB0_SI990 MAEB0_SX180 MAEB0_SX270 MAEB0_SX360 MAEB0_SX450 MAEB0_SX90 MAEO0_SI1326 MAEO0_SI1655 MAEO0_SI1956 MAEO0_SX156 MAEO0_SX246 MAEO0_SX336 MAEO0_SX426 MAEO0_SX66 MAFM0_SI1569 MAFM0_SI2199 MAFM0_SI939 MAFM0_SX129 MAFM0_SX219 MAFM0_SX309 MAFM0_SX39 MAFM0_SX399 MAJP0_SI1074 MAJP0_SI1704 MAJP0_SI2334 MAJP0_SX174 MAJP0_SX264 MAJP0_SX354 MAJP0_SX444 MAJP0_SX84 MAKB0_SI1016 MAKB0_SI1646 MAKB0_SI2276 MAKB0_SX116 MAKB0_SX206 MAKB0_SX26 MAKB0_SX296 MAKB0_SX386 MAKR0_SI1352 MAKR0_SI1982 MAKR0_SI722 MAKR0_SX182 MAKR0_SX272 MAKR0_SX362 MAKR0_SX452 MAKR0_SX92 MAPV0_SI1293 
MAPV0_SI1923 MAPV0_SI663 MAPV0_SX123 MAPV0_SX213 MAPV0_SX303 MAPV0_SX33 MAPV0_SX393 MARC0_SI1188 MARC0_SI1818 MARC0_SI558 MARC0_SX108 MARC0_SX18 MARC0_SX198 MARC0_SX288 MARC0_SX378 MARW0_SI1276 MARW0_SI1906 MARW0_SI646 MARW0_SX106 MARW0_SX16 MARW0_SX286 MARW0_SX349 MARW0_SX376 MBAR0_SI1319 MBAR0_SI1949 MBAR0_SI689 MBAR0_SX149 MBAR0_SX239 MBAR0_SX329 MBAR0_SX419 MBAR0_SX59 MBBR0_SI1055 MBBR0_SI1685 MBBR0_SI2315 MBBR0_SX155 MBBR0_SX245 MBBR0_SX335 MBBR0_SX425 MBBR0_SX65 MBCG0_SI2217 MBCG0_SI486 MBCG0_SI957 MBCG0_SX147 MBCG0_SX237 MBCG0_SX327 MBCG0_SX417 MBCG0_SX57 MBEF0_SI1281 MBEF0_SI1911 MBEF0_SI651 MBEF0_SX111 MBEF0_SX201 MBEF0_SX21 MBEF0_SX291 MBEF0_SX381 MBGT0_SI1341 MBGT0_SI1841 MBGT0_SI711 MBGT0_SX171 MBGT0_SX261 MBGT0_SX351 MBGT0_SX441 MBGT0_SX81 MBJV0_SI1247 MBJV0_SI1877 MBJV0_SI617 MBJV0_SX167 MBJV0_SX257 MBJV0_SX347 MBJV0_SX437 MBJV0_SX77 MBMA0_SI1222 MBMA0_SI1852 MBMA0_SI592 MBMA0_SX142 MBMA0_SX232 MBMA0_SX322 MBMA0_SX412 MBMA0_SX52 MBMA1_SI2207 MBMA1_SI2214 MBMA1_SI954 MBMA1_SX144 MBMA1_SX234 MBMA1_SX324 MBMA1_SX414 MBMA1_SX54 MBML0_SI1169 MBML0_SI1799 MBML0_SI539 MBML0_SX179 MBML0_SX269 MBML0_SX359 MBML0_SX449 MBML0_SX89 MBOM0_SI1014 MBOM0_SI1644 MBOM0_SI2274 MBOM0_SX114 MBOM0_SX204 MBOM0_SX294 MBOM0_SX311 MBOM0_SX384 MBSB0_SI1353 MBSB0_SI1983 MBSB0_SI723 MBSB0_SX183 MBSB0_SX273 MBSB0_SX3 MBSB0_SX363 MBSB0_SX93 MBTH0_SI2102 MBTH0_SI505 MBTH0_SI757 MBTH0_SX122 MBTH0_SX212 MBTH0_SX302 MBTH0_SX32 MBTH0_SX392 MBWP0_SI1531 MBWP0_SI1969 MBWP0_SI709 MBWP0_SX169 MBWP0_SX259 MBWP0_SX349 MBWP0_SX439 MBWP0_SX79 MCAE0_SI1447 MCAE0_SI2077 MCAE0_SI817 MCAE0_SX187 MCAE0_SX277 MCAE0_SX367 MCAE0_SX7 MCAE0_SX97 MCAL0_SI1138 MCAL0_SI1768 MCAL0_SI508 MCAL0_SX148 MCAL0_SX238 MCAL0_SX328 MCAL0_SX418 MCAL0_SX58 MCDC0_SI1292 MCDC0_SI1922 MCDC0_SI662 MCDC0_SX122 MCDC0_SX212 MCDC0_SX302 MCDC0_SX32 MCDC0_SX392 MCDD0_SI1513 MCDD0_SI2143 MCDD0_SI883 MCDD0_SX163 MCDD0_SX253 MCDD0_SX343 MCDD0_SX433 MCDD0_SX73 MCDR0_SI1154 MCDR0_SI1784 MCDR0_SI524 MCDR0_SX164 MCDR0_SX254 MCDR0_SX344 
MCDR0_SX434 MCDR0_SX74 MCEF0_SI1135 MCEF0_SI1765 MCEF0_SI842 MCEF0_SX145 MCEF0_SX235 MCEF0_SX325 MCEF0_SX415 MCEF0_SX55 MCEW0_SI1442 MCEW0_SI2072 MCEW0_SI812 MCEW0_SX182 MCEW0_SX272 MCEW0_SX362 MCEW0_SX452 MCEW0_SX92 MCHL0_SI1347 MCHL0_SI1404 MCHL0_SI1977 MCHL0_SX177 MCHL0_SX267 MCHL0_SX357 MCHL0_SX447 MCHL0_SX87 MCLK0_SI1660 MCLK0_SI2290 MCLK0_SI650 MCLK0_SX130 MCLK0_SX220 MCLK0_SX310 MCLK0_SX40 MCLK0_SX400 MCLM0_SI1456 MCLM0_SI2086 MCLM0_SI826 MCLM0_SX106 MCLM0_SX16 MCLM0_SX196 MCLM0_SX286 MCLM0_SX376 MCPM0_SI1194 MCPM0_SI1824 MCPM0_SI564 MCPM0_SX114 MCPM0_SX204 MCPM0_SX24 MCPM0_SX294 MCPM0_SX384 MCRE0_SI1121 MCRE0_SI1725 MCRE0_SI1751 MCRE0_SX131 MCRE0_SX221 MCRE0_SX24 MCRE0_SX401 MCRE0_SX41 MCSS0_SI1380 MCSS0_SI688 MCSS0_SI750 MCSS0_SX120 MCSS0_SX210 MCSS0_SX30 MCSS0_SX300 MCSS0_SX390 MCTH0_SI1209 MCTH0_SI1839 MCTH0_SI579 MCTH0_SX129 MCTH0_SX219 MCTH0_SX309 MCTH0_SX39 MCTH0_SX399 MCTM0_SI1350 MCTM0_SI1980 MCTM0_SI720 MCTM0_SX180 MCTM0_SX270 MCTM0_SX360 MCTM0_SX450 MCTM0_SX90 MCXM0_SI1351 MCXM0_SI1981 MCXM0_SI721 MCXM0_SX181 MCXM0_SX271 MCXM0_SX361 MCXM0_SX451 MCXM0_SX91 MDAC0_SI1261 MDAC0_SI1837 MDAC0_SI631 MDAC0_SX181 MDAC0_SX271 MDAC0_SX361 MDAC0_SX451 MDAC0_SX91 MDAS0_SI1266 MDAS0_SI1896 MDAS0_SI636 MDAS0_SX186 MDAS0_SX21 MDAS0_SX276 MDAS0_SX6 MDAS0_SX96 MDBB1_SI1006 MDBB1_SI1636 MDBB1_SI2056 MDBB1_SX106 MDBB1_SX16 MDBB1_SX196 MDBB1_SX286 MDBB1_SX376 MDBP0_SI1158 MDBP0_SI1788 MDBP0_SI528 MDBP0_SX168 MDBP0_SX258 MDBP0_SX348 MDBP0_SX438 MDBP0_SX78 MDCD0_SI1415 MDCD0_SI2045 MDCD0_SI785 MDCD0_SX155 MDCD0_SX245 MDCD0_SX335 MDCD0_SX425 MDCD0_SX65 MDCM0_SI1480 MDCM0_SI2110 MDCM0_SI850 MDCM0_SX130 MDCM0_SX220 MDCM0_SX310 MDCM0_SX40 MDCM0_SX400 MDDC0_SI1419 MDDC0_SI2049 MDDC0_SI789 MDDC0_SX159 MDDC0_SX249 MDDC0_SX339 MDDC0_SX429 MDDC0_SX69 MDED0_SI1170 MDED0_SI1800 MDED0_SI540 MDED0_SX180 MDED0_SX270 MDED0_SX360 MDED0_SX450 MDED0_SX90 MDEF0_SI1123 MDEF0_SI1563 MDEF0_SI2193 MDEF0_SX123 MDEF0_SX213 MDEF0_SX303 MDEF0_SX33 MDEF0_SX393 MDEM0_SI1868 MDEM0_SI608 MDEM0_SI800 
MDEM0_SX158 MDEM0_SX248 MDEM0_SX338 MDEM0_SX428 MDEM0_SX68 MDHL0_SI1439 MDHL0_SI2069 MDHL0_SI809 MDHL0_SX179 MDHL0_SX269 MDHL0_SX359 MDHL0_SX449 MDHL0_SX89 MDHS0_SI1530 MDHS0_SI2160 MDHS0_SI900 MDHS0_SX180 MDHS0_SX270 MDHS0_SX360 MDHS0_SX450 MDHS0_SX90 MDJM0_SI1455 MDJM0_SI2085 MDJM0_SI825 MDJM0_SX105 MDJM0_SX15 MDJM0_SX195 MDJM0_SX285 MDJM0_SX375 MDKS0_SI1066 MDKS0_SI1696 MDKS0_SI2326 MDKS0_SX166 MDKS0_SX256 MDKS0_SX346 MDKS0_SX436 MDKS0_SX76 MDLB0_SI1306 MDLB0_SI1936 MDLB0_SI676 MDLB0_SX136 MDLB0_SX226 MDLB0_SX316 MDLB0_SX406 MDLB0_SX46 MDLC0_SI1395 MDLC0_SI2025 MDLC0_SI765 MDLC0_SX135 MDLC0_SX225 MDLC0_SX315 MDLC0_SX405 MDLC0_SX45 MDLC1_SI1435 MDLC1_SI2065 MDLC1_SI2144 MDLC1_SX175 MDLC1_SX265 MDLC1_SX355 MDLC1_SX445 MDLC1_SX85 MDLC2_SI1614 MDLC2_SI2244 MDLC2_SI984 MDLC2_SX174 MDLC2_SX264 MDLC2_SX354 MDLC2_SX444 MDLC2_SX84 MDLH0_SI1960 MDLH0_SI574 MDLH0_SI700 MDLH0_SX160 MDLH0_SX250 MDLH0_SX340 MDLH0_SX430 MDLH0_SX70 MDLM0_SI1234 MDLM0_SI1864 MDLM0_SI604 MDLM0_SX154 MDLM0_SX244 MDLM0_SX334 MDLM0_SX424 MDLM0_SX64 MDLR0_SI1233 MDLR0_SI1863 MDLR0_SI603 MDLR0_SX153 MDLR0_SX243 MDLR0_SX333 MDLR0_SX423 MDLR0_SX63 MDLR1_SI1299 MDLR1_SI1929 MDLR1_SI669 MDLR1_SX129 MDLR1_SX219 MDLR1_SX309 MDLR1_SX39 MDLR1_SX399 MDMA0_SI1238 MDMA0_SI1430 MDMA0_SI2060 MDMA0_SX170 MDMA0_SX260 MDMA0_SX350 MDMA0_SX440 MDMA0_SX80 MDMT0_SI1832 MDMT0_SI2341 MDMT0_SI572 MDMT0_SX122 MDMT0_SX212 MDMT0_SX302 MDMT0_SX32 MDMT0_SX392 MDNS0_SI1011 MDNS0_SI2271 MDNS0_SI873 MDNS0_SX111 MDNS0_SX201 MDNS0_SX21 MDNS0_SX291 MDNS0_SX381 MDPB0_SI1760 MDPB0_SI2126 MDPB0_SI866 MDPB0_SX146 MDPB0_SX236 MDPB0_SX326 MDPB0_SX416 MDPB0_SX56 MDPK0_SI1053 MDPK0_SI1683 MDPK0_SI552 MDPK0_SX153 MDPK0_SX243 MDPK0_SX333 MDPK0_SX423 MDPK0_SX63 MDPS0_SI1651 MDPS0_SI1979 MDPS0_SI719 MDPS0_SX179 MDPS0_SX269 MDPS0_SX359 MDPS0_SX449 MDPS0_SX89 MDRD0_SI1382 MDRD0_SI2012 MDRD0_SI752 MDRD0_SX122 MDRD0_SX212 MDRD0_SX302 MDRD0_SX32 MDRD0_SX392 MDSJ0_SI1462 MDSJ0_SI2092 MDSJ0_SI832 MDSJ0_SX112 MDSJ0_SX22 MDSJ0_SX292 MDSJ0_SX382 
MDSJ0_SX438 MDSS0_SI1881 MDSS0_SI2087 MDSS0_SI621 MDSS0_SX171 MDSS0_SX261 MDSS0_SX351 MDSS0_SX441 MDSS0_SX81 MDSS1_SI1327 MDSS1_SI1713 MDSS1_SI697 MDSS1_SX157 MDSS1_SX247 MDSS1_SX337 MDSS1_SX427 MDSS1_SX67 MDTB0_SI1200 MDTB0_SI1830 MDTB0_SI570 MDTB0_SX120 MDTB0_SX210 MDTB0_SX300 MDTB0_SX321 MDTB0_SX390 MDWD0_SI1260 MDWD0_SI1890 MDWD0_SI557 MDWD0_SX180 MDWD0_SX270 MDWD0_SX360 MDWD0_SX450 MDWD0_SX90 MDWH0_SI1168 MDWH0_SI1925 MDWH0_SI665 MDWH0_SX125 MDWH0_SX215 MDWH0_SX305 MDWH0_SX35 MDWH0_SX395 MDWM0_SI1546 MDWM0_SI2176 MDWM0_SI916 MDWM0_SX106 MDWM0_SX16 MDWM0_SX286 MDWM0_SX376 MDWM0_SX433 MEAL0_SI1547 MEAL0_SI2177 MEAL0_SI917 MEAL0_SX107 MEAL0_SX197 MEAL0_SX287 MEAL0_SX347 MEAL0_SX377 MEDR0_SI1374 MEDR0_SI2004 MEDR0_SI744 MEDR0_SX114 MEDR0_SX204 MEDR0_SX24 MEDR0_SX294 MEDR0_SX384 MEFG0_SI465 MEFG0_SI491 MEFG0_SI598 MEFG0_SX105 MEFG0_SX15 MEFG0_SX195 MEFG0_SX285 MEFG0_SX375 MEGJ0_SI1337 MEGJ0_SI1967 MEGJ0_SI707 MEGJ0_SX167 MEGJ0_SX257 MEGJ0_SX3 MEGJ0_SX437 MEGJ0_SX77 MEJL0_SI1592 MEJL0_SI1654 MEJL0_SI962 MEJL0_SX152 MEJL0_SX242 MEJL0_SX332 MEJL0_SX422 MEJL0_SX62 MEJS0_SI1240 MEJS0_SI1870 MEJS0_SI610 MEJS0_SX160 MEJS0_SX250 MEJS0_SX340 MEJS0_SX430 MEJS0_SX70 MESG0_SI1332 MESG0_SI1962 MESG0_SI702 MESG0_SX162 MESG0_SX252 MESG0_SX342 MESG0_SX432 MESG0_SX72 MESJ0_SI2039 MESJ0_SI2257 MESJ0_SI997 MESJ0_SX187 MESJ0_SX277 MESJ0_SX367 MESJ0_SX7 MESJ0_SX97 MEWM0_SI1348 MEWM0_SI1978 MEWM0_SI718 MEWM0_SX178 MEWM0_SX268 MEWM0_SX358 MEWM0_SX448 MEWM0_SX88 MFER0_SI1492 MFER0_SI2122 MFER0_SI862 MFER0_SX142 MFER0_SX232 MFER0_SX322 MFER0_SX412 MFER0_SX52 MFMC0_SI1132 MFMC0_SI1762 MFMC0_SI502 MFMC0_SX142 MFMC0_SX232 MFMC0_SX322 MFMC0_SX412 MFMC0_SX52 MFRM0_SI1155 MFRM0_SI1717 MFRM0_SI1785 MFRM0_SX165 MFRM0_SX255 MFRM0_SX345 MFRM0_SX435 MFRM0_SX75 MFWK0_SI1249 MFWK0_SI1879 MFWK0_SI619 MFWK0_SX169 MFWK0_SX259 MFWK0_SX349 MFWK0_SX439 MFWK0_SX79 MFXS0_SI1674 MFXS0_SI2225 MFXS0_SI2304 MFXS0_SX144 MFXS0_SX234 MFXS0_SX324 MFXS0_SX414 MFXS0_SX54 MFXV0_SI1005 MFXV0_SI1342 MFXV0_SI1635 
MFXV0_SX105 MFXV0_SX15 MFXV0_SX195 MFXV0_SX285 MFXV0_SX375 MGAF0_SI1282 MGAF0_SI1912 MGAF0_SI652 MGAF0_SX112 MGAF0_SX202 MGAF0_SX22 MGAF0_SX292 MGAF0_SX382 MGAG0_SI1321 MGAG0_SI645 MGAG0_SI691 MGAG0_SX151 MGAG0_SX241 MGAG0_SX331 MGAG0_SX421 MGAG0_SX61 MGAK0_SI1036 MGAK0_SI1666 MGAK0_SI2296 MGAK0_SX136 MGAK0_SX226 MGAK0_SX316 MGAK0_SX406 MGAK0_SX46 MGAR0_SI1212 MGAR0_SI1694 MGAR0_SI1842 MGAR0_SX132 MGAR0_SX222 MGAR0_SX312 MGAR0_SX402 MGAR0_SX42 MGAW0_SI1165 MGAW0_SI1802 MGAW0_SI535 MGAW0_SX175 MGAW0_SX265 MGAW0_SX355 MGAW0_SX445 MGAW0_SX85 MGES0_SI1481 MGES0_SI2111 MGES0_SI851 MGES0_SX131 MGES0_SX221 MGES0_SX311 MGES0_SX401 MGES0_SX41 MGJC0_SI1256 MGJC0_SI1335 MGJC0_SI1965 MGJC0_SX165 MGJC0_SX255 MGJC0_SX345 MGJC0_SX435 MGJC0_SX75 MGRL0_SI1497 MGRL0_SI2127 MGRL0_SI867 MGRL0_SX147 MGRL0_SX237 MGRL0_SX327 MGRL0_SX417 MGRL0_SX57 MGRP0_SI1317 MGRP0_SI1947 MGRP0_SI687 MGRP0_SX147 MGRP0_SX237 MGRP0_SX327 MGRP0_SX417 MGRP0_SX57 MGSH0_SI1176 MGSH0_SI1806 MGSH0_SI546 MGSH0_SX127 MGSH0_SX186 MGSH0_SX276 MGSH0_SX6 MGSH0_SX96 MGSL0_SI1164 MGSL0_SI534 MGSL0_SI797 MGSL0_SX174 MGSL0_SX264 MGSL0_SX354 MGSL0_SX444 MGSL0_SX84 MGXP0_SI1087 MGXP0_SI457 MGXP0_SI525 MGXP0_SX187 MGXP0_SX277 MGXP0_SX367 MGXP0_SX7 MGXP0_SX97 MHBS0_SI1575 MHBS0_SI2205 MHBS0_SI945 MHBS0_SX135 MHBS0_SX225 MHBS0_SX315 MHBS0_SX405 MHBS0_SX45 MHIT0_SI1613 MHIT0_SI2243 MHIT0_SI983 MHIT0_SX173 MHIT0_SX263 MHIT0_SX353 MHIT0_SX443 MHIT0_SX83 MHJB0_SI1017 MHJB0_SI1647 MHJB0_SI2277 MHJB0_SX117 MHJB0_SX207 MHJB0_SX27 MHJB0_SX297 MHJB0_SX387 MHMG0_SI1365 MHMG0_SI1995 MHMG0_SI735 MHMG0_SX105 MHMG0_SX15 MHMG0_SX195 MHMG0_SX285 MHMG0_SX375 MHMR0_SI1119 MHMR0_SI1692 MHMR0_SI489 MHMR0_SX129 MHMR0_SX219 MHMR0_SX309 MHMR0_SX39 MHMR0_SX399 MHRM0_SI1475 MHRM0_SI2218 MHRM0_SI958 MHRM0_SX148 MHRM0_SX238 MHRM0_SX328 MHRM0_SX418 MHRM0_SX58 MHXL0_SI1772 MHXL0_SI512 MHXL0_SI612 MHXL0_SX152 MHXL0_SX242 MHXL0_SX332 MHXL0_SX422 MHXL0_SX62 MILB0_SI2163 MILB0_SI807 MILB0_SI903 MILB0_SX183 MILB0_SX273 MILB0_SX3 MILB0_SX363 MILB0_SX93 
MJAC0_SI1331 MJAC0_SI2148 MJAC0_SI701 MJAC0_SX251 MJAC0_SX307 MJAC0_SX341 MJAC0_SX431 MJAC0_SX71 MJAE0_SI1524 MJAE0_SI1999 MJAE0_SI2154 MJAE0_SX174 MJAE0_SX264 MJAE0_SX354 MJAE0_SX444 MJAE0_SX84 MJAI0_SI1604 MJAI0_SI682 MJAI0_SI710 MJAI0_SX164 MJAI0_SX254 MJAI0_SX344 MJAI0_SX434 MJAI0_SX74 MJBG0_SI1232 MJBG0_SI1724 MJBG0_SI1862 MJBG0_SX152 MJBG0_SX242 MJBG0_SX332 MJBG0_SX422 MJBG0_SX62 MJDA0_SI1031 MJDA0_SI1661 MJDA0_SI2291 MJDA0_SX131 MJDA0_SX221 MJDA0_SX311 MJDA0_SX401 MJDA0_SX41 MJDC0_SI1161 MJDC0_SI2165 MJDC0_SI531 MJDC0_SX171 MJDC0_SX261 MJDC0_SX351 MJDC0_SX441 MJDC0_SX81 MJDE0_SI1120 MJDE0_SI463 MJDE0_SI490 MJDE0_SX130 MJDE0_SX220 MJDE0_SX310 MJDE0_SX40 MJDE0_SX400 MJDG0_SI1042 MJDG0_SI1672 MJDG0_SI1705 MJDG0_SX142 MJDG0_SX232 MJDG0_SX322 MJDG0_SX412 MJDG0_SX52 MJDM0_SI1340 MJDM0_SI1937 MJDM0_SI974 MJDM0_SX170 MJDM0_SX260 MJDM0_SX350 MJDM0_SX440 MJDM0_SX80 MJEB0_SI1286 MJEB0_SI1916 MJEB0_SI656 MJEB0_SX170 MJEB0_SX206 MJEB0_SX26 MJEB0_SX296 MJEB0_SX386 MJEB1_SI1467 MJEB1_SI2097 MJEB1_SI837 MJEB1_SX117 MJEB1_SX207 MJEB1_SX27 MJEB1_SX297 MJEB1_SX387 MJEE0_SI1237 MJEE0_SI1867 MJEE0_SI607 MJEE0_SX157 MJEE0_SX247 MJEE0_SX337 MJEE0_SX427 MJEE0_SX67 MJFH0_SI1107 MJFH0_SI1737 MJFH0_SI477 MJFH0_SX117 MJFH0_SX207 MJFH0_SX27 MJFH0_SX297 MJFH0_SX387 MJFR0_SI1605 MJFR0_SI2235 MJFR0_SI975 MJFR0_SX165 MJFR0_SX255 MJFR0_SX345 MJFR0_SX435 MJFR0_SX75 MJHI0_SI1328 MJHI0_SI555 MJHI0_SI698 MJHI0_SX158 MJHI0_SX248 MJHI0_SX338 MJHI0_SX428 MJHI0_SX68 MJJB0_SI1139 MJJB0_SI1277 MJJB0_SI1769 MJJB0_SX149 MJJB0_SX239 MJJB0_SX329 MJJB0_SX419 MJJB0_SX59 MJJJ0_SI1163 MJJJ0_SI1793 MJJJ0_SI533 MJJJ0_SX173 MJJJ0_SX263 MJJJ0_SX353 MJJJ0_SX443 MJJJ0_SX83 MJJM0_SI1251 MJJM0_SI1457 MJJM0_SI827 MJJM0_SX107 MJJM0_SX17 MJJM0_SX197 MJJM0_SX287 MJJM0_SX377 MJKR0_SI1201 MJKR0_SI1831 MJKR0_SI571 MJKR0_SX121 MJKR0_SX211 MJKR0_SX301 MJKR0_SX31 MJKR0_SX391 MJLB0_SI1616 MJLB0_SI2246 MJLB0_SI986 MJLB0_SX176 MJLB0_SX266 MJLB0_SX356 MJLB0_SX446 MJLB0_SX86 MJLG1_SI1012 MJLG1_SI1642 MJLG1_SI2272 MJLG1_SX112 
MJLG1_SX202 MJLG1_SX22 MJLG1_SX292 MJLG1_SX382 MJLS0_SI1096 MJLS0_SI1726 MJLS0_SI466 MJLS0_SX106 MJLS0_SX16 MJLS0_SX196 MJLS0_SX286 MJLS0_SX376 MJMA0_SI1495 MJMA0_SI2125 MJMA0_SI865 MJMA0_SX145 MJMA0_SX235 MJMA0_SX325 MJMA0_SX415 MJMA0_SX55 MJMD0_SI1028 MJMD0_SI1658 MJMD0_SI2288 MJMD0_SX128 MJMD0_SX218 MJMD0_SX308 MJMD0_SX38 MJMD0_SX398 MJMM0_SI1255 MJMM0_SI1885 MJMM0_SI625 MJMM0_SX175 MJMM0_SX265 MJMM0_SX355 MJMM0_SX445 MJMM0_SX85 MJPG0_SI1191 MJPG0_SI1821 MJPG0_SI561 MJPG0_SX111 MJPG0_SX201 MJPG0_SX21 MJPG0_SX291 MJPG0_SX381 MJPM0_SI1368 MJPM0_SI1998 MJPM0_SI738 MJPM0_SX108 MJPM0_SX18 MJPM0_SX198 MJPM0_SX288 MJPM0_SX378 MJPM1_SI1897 MJPM1_SI2280 MJPM1_SI761 MJPM1_SX131 MJPM1_SX221 MJPM1_SX311 MJPM1_SX401 MJPM1_SX41 MJRA0_SI1236 MJRA0_SI1866 MJRA0_SI606 MJRA0_SX156 MJRA0_SX246 MJRA0_SX336 MJRA0_SX426 MJRA0_SX66 MJRG0_SI1366 MJRG0_SI1996 MJRG0_SI736 MJRG0_SX106 MJRG0_SX16 MJRG0_SX286 MJRG0_SX352 MJRG0_SX376 MJRH0_SI1125 MJRH0_SI1755 MJRH0_SI1840 MJRH0_SX135 MJRH0_SX225 MJRH0_SX315 MJRH0_SX405 MJRH0_SX45 MJRH1_SI1558 MJRH1_SI1774 MJRH1_SI514 MJRH1_SX154 MJRH1_SX244 MJRH1_SX334 MJRH1_SX424 MJRH1_SX64 MJRK0_SI1662 MJRK0_SI2103 MJRK0_SI880 MJRK0_SX160 MJRK0_SX250 MJRK0_SX340 MJRK0_SX430 MJRK0_SX70 MJRP0_SI1835 MJRP0_SI1845 MJRP0_SI585 MJRP0_SX135 MJRP0_SX225 MJRP0_SX315 MJRP0_SX405 MJRP0_SX45 MJSR0_SI1424 MJSR0_SI2054 MJSR0_SI794 MJSR0_SX164 MJSR0_SX254 MJSR0_SX344 MJSR0_SX434 MJSR0_SX74 MJWG0_SI2155 MJWG0_SI813 MJWG0_SI895 MJWG0_SX175 MJWG0_SX265 MJWG0_SX355 MJWG0_SX445 MJWG0_SX85 MJWS0_SI1143 MJWS0_SI1773 MJWS0_SI513 MJWS0_SX153 MJWS0_SX243 MJWS0_SX333 MJWS0_SX423 MJWS0_SX63 MJWT0_SI1291 MJWT0_SI1381 MJWT0_SI751 MJWT0_SX121 MJWT0_SX211 MJWT0_SX301 MJWT0_SX31 MJWT0_SX391 MJXA0_SI1507 MJXA0_SI2137 MJXA0_SI877 MJXA0_SX157 MJXA0_SX247 MJXA0_SX337 MJXA0_SX427 MJXA0_SX67 MJXL0_SI1172 MJXL0_SI1795 MJXL0_SI542 MJXL0_SX182 MJXL0_SX272 MJXL0_SX362 MJXL0_SX452 MJXL0_SX92 MKAG0_SI1609 MKAG0_SI2239 MKAG0_SI979 MKAG0_SX169 MKAG0_SX259 MKAG0_SX30 MKAG0_SX439 MKAG0_SX79 MKAH0_SI1528 
MKAH0_SI2158 MKAH0_SI898 MKAH0_SX178 MKAH0_SX268 MKAH0_SX358 MKAH0_SX448 MKAH0_SX88 MKAJ0_SI1414 MKAJ0_SI2044 MKAJ0_SI784 MKAJ0_SX154 MKAJ0_SX244 MKAJ0_SX334 MKAJ0_SX424 MKAJ0_SX64 MKAM0_SI1250 MKAM0_SI1316 MKAM0_SI1465 MKAM0_SX146 MKAM0_SX236 MKAM0_SX326 MKAM0_SX416 MKAM0_SX56 MKDB0_SI2132 MKDB0_SI588 MKDB0_SI872 MKDB0_SX152 MKDB0_SX242 MKDB0_SX332 MKDB0_SX422 MKDB0_SX62 MKDD0_SI1567 MKDD0_SI2197 MKDD0_SI937 MKDD0_SX127 MKDD0_SX217 MKDD0_SX307 MKDD0_SX37 MKDD0_SX397 MKDT0_SI2153 MKDT0_SI814 MKDT0_SI893 MKDT0_SX173 MKDT0_SX263 MKDT0_SX353 MKDT0_SX443 MKDT0_SX83 MKES0_SI1253 MKES0_SI1883 MKES0_SI623 MKES0_SX173 MKES0_SX263 MKES0_SX353 MKES0_SX443 MKES0_SX83 MKJO0_SI1517 MKJO0_SI2147 MKJO0_SI887 MKJO0_SX167 MKJO0_SX257 MKJO0_SX424 MKJO0_SX437 MKJO0_SX77 MKLN0_SI1598 MKLN0_SI2228 MKLN0_SI968 MKLN0_SX158 MKLN0_SX248 MKLN0_SX338 MKLN0_SX428 MKLN0_SX68 MKLR0_SI1059 MKLR0_SI1689 MKLR0_SI2319 MKLR0_SX159 MKLR0_SX249 MKLR0_SX339 MKLR0_SX429 MKLR0_SX69 MKLS0_SI1437 MKLS0_SI1533 MKLS0_SI2067 MKLS0_SX177 MKLS0_SX267 MKLS0_SX357 MKLS0_SX447 MKLS0_SX87 MKLS1_SI1545 MKLS1_SI2175 MKLS1_SI915 MKLS1_SX105 MKLS1_SX15 MKLS1_SX195 MKLS1_SX285 MKLS1_SX375 MKLW0_SI1571 MKLW0_SI1844 MKLW0_SI2201 MKLW0_SX131 MKLW0_SX221 MKLW0_SX311 MKLW0_SX401 MKLW0_SX41 MKRG0_SI1491 MKRG0_SI2121 MKRG0_SI861 MKRG0_SX141 MKRG0_SX231 MKRG0_SX31 MKRG0_SX411 MKRG0_SX51 MKXL0_SI1185 MKXL0_SI1815 MKXL0_SI1958 MKXL0_SX105 MKXL0_SX15 MKXL0_SX195 MKXL0_SX285 MKXL0_SX375 MLBC0_SI1239 MLBC0_SI1869 MLBC0_SI609 MLBC0_SX159 MLBC0_SX249 MLBC0_SX339 MLBC0_SX429 MLBC0_SX69 MLEL0_SI1246 MLEL0_SI1876 MLEL0_SI616 MLEL0_SX166 MLEL0_SX256 MLEL0_SX346 MLEL0_SX436 MLEL0_SX76 MLJC0_SI1225 MLJC0_SI1855 MLJC0_SI595 MLJC0_SX145 MLJC0_SX235 MLJC0_SX325 MLJC0_SX415 MLJC0_SX55 MLJH0_SI1324 MLJH0_SI1422 MLJH0_SI694 MLJH0_SX154 MLJH0_SX244 MLJH0_SX334 MLJH0_SX424 MLJH0_SX64 MLNS0_SI1407 MLNS0_SI2037 MLNS0_SI777 MLNS0_SX147 MLNS0_SX237 MLNS0_SX327 MLNS0_SX417 MLNS0_SX57 MLSH0_SI1417 MLSH0_SI2047 MLSH0_SI787 MLSH0_SX157 MLSH0_SX247 
MLSH0_SX337 MLSH0_SX427 MLSH0_SX67 MMAA0_SI1588 MMAA0_SI2105 MMAA0_SI845 MMAA0_SX125 MMAA0_SX215 MMAA0_SX305 MMAA0_SX35 MMAA0_SX395 MMAB1_SI1494 MMAB1_SI2124 MMAB1_SI864 MMAB1_SX144 MMAB1_SX234 MMAB1_SX324 MMAB1_SX414 MMAB1_SX54 MMAG0_SI1126 MMAG0_SI1756 MMAG0_SI496 MMAG0_SX136 MMAG0_SX226 MMAG0_SX316 MMAG0_SX406 MMAG0_SX46 MMAM0_SI1597 MMAM0_SI1668 MMAM0_SI2227 MMAM0_SX157 MMAM0_SX247 MMAM0_SX337 MMAM0_SX427 MMAM0_SX67 MMAR0_SI1336 MMAR0_SI1966 MMAR0_SI706 MMAR0_SX166 MMAR0_SX256 MMAR0_SX346 MMAR0_SX436 MMAR0_SX76 MMBS0_SI1151 MMBS0_SI1781 MMBS0_SI521 MMBS0_SX161 MMBS0_SX251 MMBS0_SX341 MMBS0_SX431 MMBS0_SX71 MMCC0_SI1338 MMCC0_SI1968 MMCC0_SI708 MMCC0_SX168 MMCC0_SX258 MMCC0_SX348 MMCC0_SX438 MMCC0_SX78 MMDB0_SI1358 MMDB0_SI1617 MMDB0_SI987 MMDB0_SX177 MMDB0_SX267 MMDB0_SX357 MMDB0_SX447 MMDB0_SX87 MMDG0_SI1780 MMDG0_SI2035 MMDG0_SI520 MMDG0_SX160 MMDG0_SX250 MMDG0_SX340 MMDG0_SX430 MMDG0_SX70 MMDM0_SI1311 MMDM0_SI1941 MMDM0_SI681 MMDM0_SX141 MMDM0_SX231 MMDM0_SX321 MMDM0_SX411 MMDM0_SX51 MMDM1_SI1650 MMDM1_SI2043 MMDM1_SI783 MMDM1_SX153 MMDM1_SX243 MMDM1_SX333 MMDM1_SX423 MMDM1_SX63 MMDS0_SI1343 MMDS0_SI1973 MMDS0_SI713 MMDS0_SX173 MMDS0_SX263 MMDS0_SX353 MMDS0_SX443 MMDS0_SX83 MMEA0_SI1388 MMEA0_SI2018 MMEA0_SI758 MMEA0_SX128 MMEA0_SX218 MMEA0_SX308 MMEA0_SX38 MMEA0_SX398 MMEB0_SI1357 MMEB0_SI1987 MMEB0_SI727 MMEB0_SX187 MMEB0_SX327 MMEB0_SX367 MMEB0_SX7 MMEB0_SX97 MMGC0_SI1305 MMGC0_SI1935 MMGC0_SI2184 MMGC0_SX135 MMGC0_SX225 MMGC0_SX315 MMGC0_SX405 MMGC0_SX45 MMGG0_SI1079 MMGG0_SI1709 MMGG0_SI2339 MMGG0_SX179 MMGG0_SX269 MMGG0_SX359 MMGG0_SX449 MMGG0_SX89 MMGK0_SI1322 MMGK0_SI1952 MMGK0_SI692 MMGK0_SX152 MMGK0_SX242 MMGK0_SX332 MMGK0_SX422 MMGK0_SX62 MMJB1_SI1408 MMJB1_SI2038 MMJB1_SI778 MMJB1_SX148 MMJB1_SX238 MMJB1_SX328 MMJB1_SX418 MMJB1_SX58 MMLM0_SI1527 MMLM0_SI2150 MMLM0_SI897 MMLM0_SX177 MMLM0_SX267 MMLM0_SX357 MMLM0_SX447 MMLM0_SX87 MMPM0_SI1061 MMPM0_SI1691 MMPM0_SI2321 MMPM0_SX161 MMPM0_SX251 MMPM0_SX341 MMPM0_SX431 MMPM0_SX71 MMRP0_SI2034 
MMRP0_SI717 MMRP0_SI774 MMRP0_SX144 MMRP0_SX234 MMRP0_SX324 MMRP0_SX414 MMRP0_SX54 MMSM0_SI1106 MMSM0_SI1736 MMSM0_SI476 MMSM0_SX116 MMSM0_SX206 MMSM0_SX26 MMSM0_SX296 MMSM0_SX386 MMVP0_SI1284 MMVP0_SI1914 MMVP0_SI654 MMVP0_SX114 MMVP0_SX204 MMVP0_SX294 MMVP0_SX347 MMVP0_SX384 MMWB0_SI1619 MMWB0_SI2249 MMWB0_SI989 MMWB0_SX179 MMWB0_SX269 MMWB0_SX359 MMWB0_SX449 MMWB0_SX89 MMWS0_SI1518 MMWS0_SI559 MMWS0_SI888 MMWS0_SX168 MMWS0_SX258 MMWS0_SX348 MMWS0_SX438 MMWS0_SX78 MMWS1_SI1071 MMWS1_SI1701 MMWS1_SI2331 MMWS1_SX261 MMWS1_SX27 MMWS1_SX351 MMWS1_SX441 MMWS1_SX81 MMXS0_SI2136 MMXS0_SI629 MMXS0_SI876 MMXS0_SX156 MMXS0_SX246 MMXS0_SX336 MMXS0_SX426 MMXS0_SX66 MNET0_SI1446 MNET0_SI2076 MNET0_SI816 MNET0_SX186 MNET0_SX276 MNET0_SX366 MNET0_SX6 MNET0_SX96 MNTW0_SI1068 MNTW0_SI1698 MNTW0_SI2328 MNTW0_SX168 MNTW0_SX202 MNTW0_SX258 MNTW0_SX348 MNTW0_SX78 MPAR0_SI1576 MPAR0_SI2206 MPAR0_SI946 MPAR0_SX136 MPAR0_SX226 MPAR0_SX316 MPAR0_SX406 MPAR0_SX46 MPEB0_SI1034 MPEB0_SI1860 MPEB0_SI600 MPEB0_SX150 MPEB0_SX240 MPEB0_SX330 MPEB0_SX420 MPEB0_SX60 MPFU0_SI1258 MPFU0_SI1888 MPFU0_SI628 MPFU0_SX178 MPFU0_SX268 MPFU0_SX358 MPFU0_SX448 MPFU0_SX88 MPGH0_SI1554 MPGH0_SI675 MPGH0_SI924 MPGH0_SX114 MPGH0_SX204 MPGH0_SX24 MPGH0_SX294 MPGH0_SX384 MPGR0_SI1410 MPGR0_SI2040 MPGR0_SI780 MPGR0_SX150 MPGR0_SX240 MPGR0_SX330 MPGR0_SX420 MPGR0_SX60 MPGR1_SI1269 MPGR1_SI1499 MPGR1_SI2129 MPGR1_SX149 MPGR1_SX239 MPGR1_SX329 MPGR1_SX419 MPGR1_SX59 MPMB0_SI1501 MPMB0_SI2131 MPMB0_SI871 MPMB0_SX151 MPMB0_SX241 MPMB0_SX331 MPMB0_SX421 MPMB0_SX61 MPPC0_SI1412 MPPC0_SI2042 MPPC0_SI782 MPPC0_SX152 MPPC0_SX242 MPPC0_SX332 MPPC0_SX422 MPPC0_SX62 MPRB0_SI1205 MPRB0_SI1215 MPRB0_SI575 MPRB0_SX125 MPRB0_SX215 MPRB0_SX305 MPRB0_SX35 MPRB0_SX395 MPRD0_SI1431 MPRD0_SI2061 MPRD0_SI801 MPRD0_SX171 MPRD0_SX261 MPRD0_SX351 MPRD0_SX441 MPRD0_SX81 MPRK0_SI1097 MPRK0_SI1727 MPRK0_SI467 MPRK0_SX107 MPRK0_SX17 MPRK0_SX197 MPRK0_SX287 MPRK0_SX377 MPRT0_SI1210 MPRT0_SI495 MPRT0_SI580 MPRT0_SX130 MPRT0_SX220 MPRT0_SX310 
MPRT0_SX40 MPRT0_SX400 MPSW0_SI1067 MPSW0_SI1697 MPSW0_SI2327 MPSW0_SX167 MPSW0_SX24 MPSW0_SX257 MPSW0_SX437 MPSW0_SX77 MRAB0_SI1224 MRAB0_SI1854 MRAB0_SI594 MRAB0_SX144 MRAB0_SX234 MRAB0_SX324 MRAB0_SX414 MRAB0_SX54 MRAB1_SI1478 MRAB1_SI2108 MRAB1_SI848 MRAB1_SX128 MRAB1_SX218 MRAB1_SX308 MRAB1_SX38 MRAB1_SX398 MRAI0_SI1954 MRAI0_SI2052 MRAI0_SI792 MRAI0_SX162 MRAI0_SX252 MRAI0_SX342 MRAI0_SX432 MRAI0_SX72 MRAM0_SI1275 MRAM0_SI1905 MRAM0_SI1951 MRAM0_SX105 MRAM0_SX15 MRAM0_SX195 MRAM0_SX285 MRAM0_SX375 MRAV0_SI1008 MRAV0_SI1638 MRAV0_SI2268 MRAV0_SX108 MRAV0_SX18 MRAV0_SX198 MRAV0_SX288 MRAV0_SX378 MRBC0_SI1665 MRBC0_SI1859 MRBC0_SI599 MRBC0_SX149 MRBC0_SX239 MRBC0_SX329 MRBC0_SX419 MRBC0_SX59 MRCG0_SI1428 MRCG0_SI2058 MRCG0_SI798 MRCG0_SX168 MRCG0_SX258 MRCG0_SX348 MRCG0_SX438 MRCG0_SX78 MRCW0_SI1371 MRCW0_SI2001 MRCW0_SI741 MRCW0_SX111 MRCW0_SX201 MRCW0_SX21 MRCW0_SX291 MRCW0_SX381 MRDD0_SI1050 MRDD0_SI1680 MRDD0_SI2310 MRDD0_SX150 MRDD0_SX240 MRDD0_SX277 MRDD0_SX330 MRDD0_SX60 MRDM0_SI1044 MRDM0_SI1595 MRDM0_SI965 MRDM0_SX155 MRDM0_SX245 MRDM0_SX335 MRDM0_SX425 MRDM0_SX65 MRDS0_SI1167 MRDS0_SI1797 MRDS0_SI537 MRDS0_SX177 MRDS0_SX267 MRDS0_SX357 MRDS0_SX447 MRDS0_SX87 MREE0_SI1104 MREE0_SI1734 MREE0_SI1959 MREE0_SX114 MREE0_SX204 MREE0_SX24 MREE0_SX294 MREE0_SX384 MREH1_SI1599 MREH1_SI2229 MREH1_SI969 MREH1_SX159 MREH1_SX249 MREH1_SX339 MREH1_SX429 MREH1_SX69 MREM0_SI1591 MREM0_SI511 MREM0_SI961 MREM0_SX151 MREM0_SX241 MREM0_SX331 MREM0_SX421 MREM0_SX61 MREW1_SI1500 MREW1_SI2130 MREW1_SI870 MREW1_SX150 MREW1_SX240 MREW1_SX330 MREW1_SX420 MREW1_SX60 MRFK0_SI1076 MRFK0_SI1706 MRFK0_SI2336 MRFK0_SX176 MRFK0_SX266 MRFK0_SX356 MRFK0_SX446 MRFK0_SX86 MRFL0_SI1156 MRFL0_SI1786 MRFL0_SI526 MRFL0_SX166 MRFL0_SX256 MRFL0_SX346 MRFL0_SX436 MRFL0_SX76 MRGM0_SI1162 MRGM0_SI1792 MRGM0_SI532 MRGM0_SX172 MRGM0_SX262 MRGM0_SX416 MRGM0_SX442 MRGM0_SX82 MRGS0_SI1356 MRGS0_SI1986 MRGS0_SI726 MRGS0_SX186 MRGS0_SX276 MRGS0_SX366 MRGS0_SX6 MRGS0_SX96 MRHL0_SI1515 MRHL0_SI2145 
MRHL0_SI885 MRHL0_SX165 MRHL0_SX255 MRHL0_SX345 MRHL0_SX435 MRHL0_SX75 MRJB1_SI1020 MRJB1_SI1413 MRJB1_SI2021 MRJB1_SX120 MRJB1_SX210 MRJB1_SX30 MRJB1_SX300 MRJB1_SX390 MRJH0_SI1519 MRJH0_SI889 MRJH0_SI914 MRJH0_SX169 MRJH0_SX259 MRJH0_SX307 MRJH0_SX439 MRJH0_SX79 MRJM0_SI1095 MRJM0_SI1228 MRJM0_SI1858 MRJM0_SX148 MRJM0_SX238 MRJM0_SX328 MRJM0_SX418 MRJM0_SX58 MRJM1_SI1298 MRJM1_SI1928 MRJM1_SI668 MRJM1_SX128 MRJM1_SX218 MRJM1_SX308 MRJM1_SX38 MRJM1_SX398 MRJT0_SI1498 MRJT0_SI1805 MRJT0_SI868 MRJT0_SX148 MRJT0_SX238 MRJT0_SX328 MRJT0_SX418 MRJT0_SX58 MRKM0_SI1267 MRKM0_SI1391 MRKM0_SI637 MRKM0_SX187 MRKM0_SX277 MRKM0_SX367 MRKM0_SX7 MRKM0_SX97 MRLD0_SI1594 MRLD0_SI2224 MRLD0_SI964 MRLD0_SX154 MRLD0_SX244 MRLD0_SX334 MRLD0_SX424 MRLD0_SX64 MRLJ0_SI1420 MRLJ0_SI2050 MRLJ0_SI790 MRLJ0_SX160 MRLJ0_SX250 MRLJ0_SX340 MRLJ0_SX430 MRLJ0_SX70 MRLJ1_SI1671 MRLJ1_SI2301 MRLJ1_SI2332 MRLJ1_SX141 MRLJ1_SX231 MRLJ1_SX321 MRLJ1_SX411 MRLJ1_SX51 MRLK0_SI1468 MRLK0_SI2140 MRLK0_SI843 MRLK0_SX123 MRLK0_SX213 MRLK0_SX303 MRLK0_SX33 MRLK0_SX393 MRLR0_SI1196 MRLR0_SI1826 MRLR0_SI566 MRLR0_SX116 MRLR0_SX206 MRLR0_SX26 MRLR0_SX296 MRLR0_SX386 MRMB0_SI1581 MRMB0_SI2211 MRMB0_SI951 MRMB0_SX141 MRMB0_SX231 MRMB0_SX321 MRMB0_SX411 MRMB0_SX51 MRMG0_SI1080 MRMG0_SI1710 MRMG0_SI2340 MRMG0_SX180 MRMG0_SX270 MRMG0_SX360 MRMG0_SX450 MRMG0_SX90 MRMH0_SI1021 MRMH0_SI1349 MRMH0_SI2281 MRMH0_SX121 MRMH0_SX211 MRMH0_SX301 MRMH0_SX31 MRMH0_SX391 MRML0_SI1421 MRML0_SI2051 MRML0_SI791 MRML0_SX161 MRML0_SX251 MRML0_SX341 MRML0_SX431 MRML0_SX71 MRMS0_SI1113 MRMS0_SI2057 MRMS0_SI2100 MRMS0_SX120 MRMS0_SX210 MRMS0_SX30 MRMS0_SX300 MRMS0_SX390 MRPC1_SI1482 MRPC1_SI2026 MRPC1_SI2112 MRPC1_SX132 MRPC1_SX222 MRPC1_SX312 MRPC1_SX402 MRPC1_SX42 MRRE0_SI1334 MRRE0_SI704 MRRE0_SI952 MRRE0_SX164 MRRE0_SX254 MRRE0_SX344 MRRE0_SX434 MRRE0_SX74 MRSO0_SI1206 MRSO0_SI1659 MRSO0_SI2289 MRSO0_SX129 MRSO0_SX219 MRSO0_SX309 MRSO0_SX39 MRSO0_SX399 MRSP0_SI1429 MRSP0_SI2059 MRSP0_SI799 MRSP0_SX169 MRSP0_SX196 MRSP0_SX259 
MRSP0_SX439 MRSP0_SX79 MRTC0_SI1458 MRTC0_SI2088 MRTC0_SI828 MRTC0_SX108 MRTC0_SX18 MRTC0_SX198 MRTC0_SX288 MRTC0_SX378 MRTJ0_SI1551 MRTJ0_SI2032 MRTJ0_SI772 MRTJ0_SX142 MRTJ0_SX232 MRTJ0_SX322 MRTJ0_SX412 MRTJ0_SX52 MRVG0_SI1140 MRVG0_SI1770 MRVG0_SI510 MRVG0_SX150 MRVG0_SX240 MRVG0_SX330 MRVG0_SX420 MRVG0_SX60 MRWA0_SI1603 MRWA0_SI2233 MRWA0_SI973 MRWA0_SX163 MRWA0_SX253 MRWA0_SX343 MRWA0_SX433 MRWA0_SX73 MRWS0_SI1102 MRWS0_SI1732 MRWS0_SI472 MRWS0_SX112 MRWS0_SX202 MRWS0_SX22 MRWS0_SX292 MRWS0_SX382 MRXB0_SI1585 MRXB0_SI2215 MRXB0_SI955 MRXB0_SX145 MRXB0_SX235 MRXB0_SX325 MRXB0_SX415 MRXB0_SX55 MSAH1_SI1049 MSAH1_SI1679 MSAH1_SI2309 MSAH1_SX149 MSAH1_SX239 MSAH1_SX329 MSAH1_SX419 MSAH1_SX59 MSAS0_SI1376 MSAS0_SI2006 MSAS0_SI746 MSAS0_SX116 MSAS0_SX206 MSAS0_SX26 MSAS0_SX296 MSAS0_SX386 MSAT0_SI1526 MSAT0_SI2156 MSAT0_SI896 MSAT0_SX176 MSAT0_SX266 MSAT0_SX356 MSAT0_SX446 MSAT0_SX86 MSAT1_SI1073 MSAT1_SI1703 MSAT1_SI2333 MSAT1_SX173 MSAT1_SX263 MSAT1_SX353 MSAT1_SX443 MSAT1_SX83 MSDB0_SI1007 MSDB0_SI1637 MSDB0_SI2267 MSDB0_SX107 MSDB0_SX17 MSDB0_SX197 MSDB0_SX287 MSDB0_SX377 MSDH0_SI2113 MSDH0_SI2240 MSDH0_SI980 MSDH0_SX170 MSDH0_SX260 MSDH0_SX350 MSDH0_SX440 MSDH0_SX80 MSDS0_SI1077 MSDS0_SI1707 MSDS0_SI2337 MSDS0_SX177 MSDS0_SX267 MSDS0_SX357 MSDS0_SX447 MSDS0_SX87 MSEM1_SI1440 MSEM1_SI2070 MSEM1_SI810 MSEM1_SX180 MSEM1_SX270 MSEM1_SX360 MSEM1_SX450 MSEM1_SX90 MSES0_SI1589 MSES0_SI2216 MSES0_SI2219 MSES0_SX149 MSES0_SX239 MSES0_SX329 MSES0_SX419 MSES0_SX59 MSFH0_SI1216 MSFH0_SI1738 MSFH0_SI586 MSFH0_SX136 MSFH0_SX226 MSFH0_SX316 MSFH0_SX406 MSFH0_SX46 MSFV0_SI1262 MSFV0_SI1892 MSFV0_SI632 MSFV0_SX182 MSFV0_SX272 MSFV0_SX362 MSFV0_SX452 MSFV0_SX92 MSJK0_SI1596 MSJK0_SI2226 MSJK0_SI966 MSJK0_SX156 MSJK0_SX246 MSJK0_SX336 MSJK0_SX426 MSJK0_SX66 MSMC0_SI1907 MSMC0_SI509 MSMC0_SI647 MSMC0_SX107 MSMC0_SX17 MSMC0_SX197 MSMC0_SX287 MSMC0_SX377 MSMR0_SI1150 MSMR0_SI1405 MSMR0_SI775 MSMR0_SX145 MSMR0_SX235 MSMR0_SX325 MSMR0_SX415 MSMR0_SX55 MSMS0_SI1433 MSMS0_SI2063 
MSMS0_SI803 MSMS0_SX173 MSMS0_SX263 MSMS0_SX353 MSMS0_SX443 MSMS0_SX83 MSRG0_SI1221 MSRG0_SI1851 MSRG0_SI591 MSRG0_SX141 MSRG0_SX231 MSRG0_SX321 MSRG0_SX411 MSRG0_SX51 MSRR0_SI1131 MSRR0_SI1761 MSRR0_SI501 MSRR0_SX141 MSRR0_SX231 MSRR0_SX30 MSRR0_SX411 MSRR0_SX51 MSTF0_SI1396 MSTF0_SI766 MSTF0_SI852 MSTF0_SX136 MSTF0_SX226 MSTF0_SX316 MSTF0_SX406 MSTF0_SX46 MSVS0_SI1568 MSVS0_SI2198 MSVS0_SI938 MSVS0_SX128 MSVS0_SX218 MSVS0_SX308 MSVS0_SX38 MSVS0_SX398 MTAB0_SI1572 MTAB0_SI2202 MTAB0_SI942 MTAB0_SX132 MTAB0_SX222 MTAB0_SX312 MTAB0_SX402 MTAB0_SX42 MTAS0_SI1385 MTAS0_SI2015 MTAS0_SI755 MTAS0_SX125 MTAS0_SX215 MTAS0_SX305 MTAS0_SX35 MTAS0_SX395 MTAT0_SI1110 MTAT0_SI1740 MTAT0_SI811 MTAT0_SX120 MTAT0_SX210 MTAT0_SX30 MTAT0_SX300 MTAT0_SX390 MTAT1_SI1409 MTAT1_SI1627 MTAT1_SI779 MTAT1_SX149 MTAT1_SX239 MTAT1_SX329 MTAT1_SX419 MTAT1_SX59 MTBC0_SI1173 MTBC0_SI1803 MTBC0_SI543 MTBC0_SX183 MTBC0_SX273 MTBC0_SX347 MTBC0_SX363 MTBC0_SX93 MTCS0_SI1972 MTCS0_SI2265 MTCS0_SI712 MTCS0_SX172 MTCS0_SX262 MTCS0_SX352 MTCS0_SX442 MTCS0_SX82 MTDB0_SI1401 MTDB0_SI2031 MTDB0_SI771 MTDB0_SX141 MTDB0_SX231 MTDB0_SX321 MTDB0_SX411 MTDB0_SX51 MTDP0_SI1274 MTDP0_SI1521 MTDP0_SI2151 MTDP0_SX171 MTDP0_SX261 MTDP0_SX351 MTDP0_SX441 MTDP0_SX81 MTER0_SI1157 MTER0_SI1787 MTER0_SI527 MTER0_SX167 MTER0_SX17 MTER0_SX257 MTER0_SX437 MTER0_SX77 MTJG0_SI1520 MTJG0_SI2157 MTJG0_SI890 MTJG0_SX170 MTJG0_SX260 MTJG0_SX350 MTJG0_SX440 MTJG0_SX80 MTJM0_SI1226 MTJM0_SI1856 MTJM0_SI655 MTJM0_SX146 MTJM0_SX236 MTJM0_SX326 MTJM0_SX416 MTJM0_SX56 MTJS0_SI1192 MTJS0_SI1822 MTJS0_SI562 MTJS0_SX112 MTJS0_SX202 MTJS0_SX22 MTJS0_SX292 MTJS0_SX382 MTJU0_SI2020 MTJU0_SI2269 MTJU0_SI760 MTJU0_SX130 MTJU0_SX220 MTJU0_SX310 MTJU0_SX40 MTJU0_SX400 MTKD0_SI1187 MTKD0_SI1817 MTKD0_SI630 MTKD0_SX107 MTKD0_SX17 MTKD0_SX197 MTKD0_SX287 MTKD0_SX377 MTKP0_SI1023 MTKP0_SI2283 MTKP0_SI454 MTKP0_SX123 MTKP0_SX213 MTKP0_SX303 MTKP0_SX33 MTKP0_SX393 MTLB0_SI1134 MTLB0_SI1764 MTLB0_SI504 MTLB0_SX144 MTLB0_SX234 MTLB0_SX324 MTLB0_SX414 
MTLB0_SX54 MTLC0_SI1313 MTLC0_SI1477 MTLC0_SI847 MTLC0_SX127 MTLC0_SX217 MTLC0_SX307 MTLC0_SX37 MTLC0_SX397 MTML0_SI1065 MTML0_SI1695 MTML0_SI2325 MTML0_SX165 MTML0_SX255 MTML0_SX345 MTML0_SX435 MTML0_SX75 MTMN0_SI1064 MTMN0_SI2324 MTMN0_SI582 MTMN0_SX164 MTMN0_SX254 MTMN0_SX344 MTMN0_SX434 MTMN0_SX74 MTMT0_SI1118 MTMT0_SI1748 MTMT0_SI488 MTMT0_SX128 MTMT0_SX218 MTMT0_SX308 MTMT0_SX38 MTMT0_SX398 MTPF0_SI1235 MTPF0_SI1865 MTPF0_SI605 MTPF0_SX155 MTPF0_SX245 MTPF0_SX335 MTPF0_SX425 MTPF0_SX65 MTPG0_SI1383 MTPG0_SI2013 MTPG0_SI753 MTPG0_SX123 MTPG0_SX213 MTPG0_SX303 MTPG0_SX33 MTPG0_SX393 MTPP0_SI1508 MTPP0_SI2138 MTPP0_SI878 MTPP0_SX158 MTPP0_SX248 MTPP0_SX338 MTPP0_SX428 MTPP0_SX68 MTPR0_SI1600 MTPR0_SI2230 MTPR0_SI506 MTPR0_SX160 MTPR0_SX250 MTPR0_SX340 MTPR0_SX430 MTPR0_SX70 MTQC0_SI1441 MTQC0_SI2071 MTQC0_SI480 MTQC0_SX181 MTQC0_SX271 MTQC0_SX361 MTQC0_SX451 MTQC0_SX91 MTRC0_SI1623 MTRC0_SI589 MTRC0_SI993 MTRC0_SX170 MTRC0_SX183 MTRC0_SX273 MTRC0_SX363 MTRC0_SX93 MTRR0_SI1548 MTRR0_SI2178 MTRR0_SI918 MTRR0_SX108 MTRR0_SX18 MTRR0_SX198 MTRR0_SX288 MTRR0_SX378 MTRT0_SI1227 MTRT0_SI1857 MTRT0_SI597 MTRT0_SX147 MTRT0_SX237 MTRT0_SX254 MTRT0_SX417 MTRT0_SX57 MTWH1_SI1512 MTWH1_SI2142 MTWH1_SI882 MTWH1_SX162 MTWH1_SX252 MTWH1_SX342 MTWH1_SX432 MTWH1_SX72 MTXS0_SI1060 MTXS0_SI1690 MTXS0_SI2320 MTXS0_SX160 MTXS0_SX250 MTXS0_SX340 MTXS0_SX430 MTXS0_SX70 MVJH0_SI1556 MVJH0_SI2186 MVJH0_SI926 MVJH0_SX116 MVJH0_SX206 MVJH0_SX26 MVJH0_SX296 MVJH0_SX386 MVLO0_SI1147 MVLO0_SI1777 MVLO0_SI517 MVLO0_SX157 MVLO0_SX247 MVLO0_SX337 MVLO0_SX427 MVLO0_SX67 MVRW0_SI1485 MVRW0_SI2115 MVRW0_SI855 MVRW0_SX135 MVRW0_SX225 MVRW0_SX315 MVRW0_SX405 MVRW0_SX45 MWAC0_SI1601 MWAC0_SI2231 MWAC0_SI971 MWAC0_SX161 MWAC0_SX251 MWAC0_SX341 MWAC0_SX431 MWAC0_SX71 MWAD0_SI1062 MWAD0_SI1749 MWAD0_SI2322 MWAD0_SX162 MWAD0_SX252 MWAD0_SX342 MWAD0_SX432 MWAD0_SX72 MWAR0_SI1045 MWAR0_SI1675 MWAR0_SI2305 MWAR0_SX145 MWAR0_SX235 MWAR0_SX325 MWAR0_SX415 MWAR0_SX55 MWCH0_SI1622 MWCH0_SI1895 MWCH0_SI2252 
MWCH0_SX182 MWCH0_SX272 MWCH0_SX362 MWCH0_SX452 MWCH0_SX92 MWDK0_SI1436 MWDK0_SI2017 MWDK0_SI806 MWDK0_SX176 MWDK0_SX266 MWDK0_SX356 MWDK0_SX446 MWDK0_SX86 MWEM0_SI1320 MWEM0_SI1393 MWEM0_SI1950 MWEM0_SX150 MWEM0_SX240 MWEM0_SX330 MWEM0_SX420 MWEM0_SX60 MWGR0_SI1606 MWGR0_SI2236 MWGR0_SI976 MWGR0_SX166 MWGR0_SX256 MWGR0_SX346 MWGR0_SX436 MWGR0_SX76 MWRE0_SI1057 MWRE0_SI1687 MWRE0_SI2317 MWRE0_SX157 MWRE0_SX247 MWRE0_SX337 MWRE0_SX427 MWRE0_SX67 MWRP0_SI1443 MWRP0_SI1525 MWRP0_SI2073 MWRP0_SX183 MWRP0_SX273 MWRP0_SX3 MWRP0_SX363 MWRP0_SX93 MWSB0_SI1626 MWSB0_SI2256 MWSB0_SI996 MWSB0_SX186 MWSB0_SX276 MWSB0_SX366 MWSB0_SX6 MWSB0_SX96 MWSH0_SI1426 MWSH0_SI2266 MWSH0_SI796 MWSH0_SX166 MWSH0_SX256 MWSH0_SX346 MWSH0_SX436 MWSH0_SX76 MZMB0_SI1166 MZMB0_SI1796 MZMB0_SI536 MZMB0_SX176 MZMB0_SX266 MZMB0_SX356 MZMB0_SX446 MZMB0_SX86 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_matched/valid.uid ================================================ FADG0_SI1279 FADG0_SI1909 FADG0_SI649 FADG0_SX109 FADG0_SX19 FADG0_SX199 FADG0_SX289 FADG0_SX379 FAKS0_SI1573 FAKS0_SI2203 FAKS0_SI943 FAKS0_SX133 FAKS0_SX223 FAKS0_SX313 FAKS0_SX403 FAKS0_SX43 FCAL1_SI1403 FCAL1_SI2033 FCAL1_SI773 FCAL1_SX143 FCAL1_SX233 FCAL1_SX323 FCAL1_SX413 FCAL1_SX53 FCMH0_SI1454 FCMH0_SI2084 FCMH0_SI824 FCMH0_SX104 FCMH0_SX14 FCMH0_SX194 FCMH0_SX284 FCMH0_SX374 FDAC1_SI1474 FDAC1_SI2104 FDAC1_SI844 FDAC1_SX124 FDAC1_SX214 FDAC1_SX304 FDAC1_SX34 FDAC1_SX394 FDMS0_SI1218 FDMS0_SI1502 FDMS0_SI1848 FDMS0_SX138 FDMS0_SX228 FDMS0_SX318 FDMS0_SX408 FDMS0_SX48 FDRW0_SI1283 FDRW0_SI1423 FDRW0_SI653 FDRW0_SX113 FDRW0_SX203 FDRW0_SX23 FDRW0_SX293 FDRW0_SX383 FEDW0_SI1084 FEDW0_SI1653 FEDW0_SI1714 FEDW0_SX184 FEDW0_SX274 FEDW0_SX364 FEDW0_SX4 FEDW0_SX94 FGJD0_SI1179 FGJD0_SI549 FGJD0_SI818 FGJD0_SX189 FGJD0_SX279 FGJD0_SX369 FGJD0_SX9 FGJD0_SX99 FJEM0_SI1264 FJEM0_SI1894 FJEM0_SI634 FJEM0_SX184 FJEM0_SX274 FJEM0_SX364 FJEM0_SX4 FJEM0_SX94 FJMG0_SI1181 FJMG0_SI1811 
FJMG0_SI551 FJMG0_SX101 FJMG0_SX11 FJMG0_SX191 FJMG0_SX281 FJMG0_SX371 FJSJ0_SI1484 FJSJ0_SI2114 FJSJ0_SI854 FJSJ0_SX134 FJSJ0_SX224 FJSJ0_SX314 FJSJ0_SX404 FJSJ0_SX44 FKMS0_SI1490 FKMS0_SI2120 FKMS0_SI860 FKMS0_SX140 FKMS0_SX230 FKMS0_SX320 FKMS0_SX410 FKMS0_SX50 FMAH0_SI1289 FMAH0_SI1919 FMAH0_SI659 FMAH0_SX119 FMAH0_SX209 FMAH0_SX29 FMAH0_SX299 FMAH0_SX389 FMML0_SI1040 FMML0_SI1670 FMML0_SI2300 FMML0_SX140 FMML0_SX230 FMML0_SX320 FMML0_SX410 FMML0_SX50 FNMR0_SI1399 FNMR0_SI2029 FNMR0_SI769 FNMR0_SX139 FNMR0_SX229 FNMR0_SX319 FNMR0_SX409 FNMR0_SX49 FREW0_SI1030 FREW0_SI1280 FREW0_SI1910 FREW0_SX110 FREW0_SX20 FREW0_SX200 FREW0_SX290 FREW0_SX380 FSEM0_SI1198 FSEM0_SI1828 FSEM0_SI568 FSEM0_SX118 FSEM0_SX208 FSEM0_SX28 FSEM0_SX298 FSEM0_SX388 MAJC0_SI1946 MAJC0_SI2095 MAJC0_SI835 MAJC0_SX115 MAJC0_SX205 MAJC0_SX25 MAJC0_SX295 MAJC0_SX385 MBDG0_SI1463 MBDG0_SI2093 MBDG0_SI833 MBDG0_SX113 MBDG0_SX203 MBDG0_SX23 MBDG0_SX293 MBDG0_SX383 MBNS0_SI1220 MBNS0_SI1850 MBNS0_SI590 MBNS0_SX140 MBNS0_SX230 MBNS0_SX320 MBNS0_SX410 MBNS0_SX50 MBWM0_SI1304 MBWM0_SI1934 MBWM0_SI674 MBWM0_SX134 MBWM0_SX224 MBWM0_SX314 MBWM0_SX404 MBWM0_SX44 MCSH0_SI1549 MCSH0_SI2179 MCSH0_SI919 MCSH0_SX109 MCSH0_SX19 MCSH0_SX199 MCSH0_SX289 MCSH0_SX379 MDLF0_SI1583 MDLF0_SI2213 MDLF0_SI953 MDLF0_SX143 MDLF0_SX233 MDLF0_SX323 MDLF0_SX413 MDLF0_SX53 MDLS0_SI1628 MDLS0_SI2258 MDLS0_SI998 MDLS0_SX188 MDLS0_SX278 MDLS0_SX368 MDLS0_SX8 MDLS0_SX98 MDVC0_SI2174 MDVC0_SI2196 MDVC0_SI936 MDVC0_SX126 MDVC0_SX216 MDVC0_SX306 MDVC0_SX36 MDVC0_SX396 MERS0_SI1019 MERS0_SI1649 MERS0_SI497 MERS0_SX119 MERS0_SX209 MERS0_SX29 MERS0_SX299 MERS0_SX389 MGJF0_SI1901 MGJF0_SI641 MGJF0_SI776 MGJF0_SX101 MGJF0_SX11 MGJF0_SX191 MGJF0_SX281 MGJF0_SX371 MGLB0_SI1534 MGLB0_SI2164 MGLB0_SI904 MGLB0_SX184 MGLB0_SX274 MGLB0_SX364 MGLB0_SX4 MGLB0_SX94 MGWT0_SI1539 MGWT0_SI2169 MGWT0_SI909 MGWT0_SX189 MGWT0_SX279 MGWT0_SX369 MGWT0_SX9 MGWT0_SX99 MJAR0_SI1988 MJAR0_SI2247 MJAR0_SI728 MJAR0_SX188 MJAR0_SX278 MJAR0_SX368 MJAR0_SX8 
MJAR0_SX98 MJFC0_SI1033 MJFC0_SI1663 MJFC0_SI2293 MJFC0_SX133 MJFC0_SX223 MJFC0_SX313 MJFC0_SX403 MJFC0_SX43 MJSW0_SI1010 MJSW0_SI1640 MJSW0_SI2270 MJSW0_SX110 MJSW0_SX20 MJSW0_SX200 MJSW0_SX290 MJSW0_SX380 MMDB1_SI1625 MMDB1_SI2255 MMDB1_SI995 MMDB1_SX185 MMDB1_SX275 MMDB1_SX365 MMDB1_SX5 MMDB1_SX95 MMDM2_SI1452 MMDM2_SI1555 MMDM2_SI2082 MMDM2_SX102 MMDM2_SX12 MMDM2_SX192 MMDM2_SX282 MMDM2_SX372 MMJR0_SI1648 MMJR0_SI2166 MMJR0_SI2278 MMJR0_SX118 MMJR0_SX208 MMJR0_SX28 MMJR0_SX298 MMJR0_SX388 MMWH0_SI1089 MMWH0_SI1301 MMWH0_SI459 MMWH0_SX189 MMWH0_SX279 MMWH0_SX369 MMWH0_SX9 MMWH0_SX99 MPDF0_SI1542 MPDF0_SI2172 MPDF0_SI912 MPDF0_SX102 MPDF0_SX12 MPDF0_SX192 MPDF0_SX282 MPDF0_SX372 MRCS0_SI1223 MRCS0_SI1853 MRCS0_SI593 MRCS0_SX143 MRCS0_SX233 MRCS0_SX323 MRCS0_SX413 MRCS0_SX53 MREB0_SI1375 MREB0_SI2005 MREB0_SI745 MREB0_SX115 MREB0_SX205 MREB0_SX25 MREB0_SX295 MREB0_SX385 MRJM4_SI1489 MRJM4_SI2119 MRJM4_SI859 MRJM4_SX139 MRJM4_SX229 MRJM4_SX319 MRJM4_SX409 MRJM4_SX49 MRJR0_SI1182 MRJR0_SI1812 MRJR0_SI2313 MRJR0_SX102 MRJR0_SX12 MRJR0_SX192 MRJR0_SX282 MRJR0_SX372 MROA0_SI1307 MROA0_SI1970 MROA0_SI677 MROA0_SX137 MROA0_SX227 MROA0_SX317 MROA0_SX407 MROA0_SX47 MRTK0_SI1093 MRTK0_SI1723 MRTK0_SI1750 MRTK0_SX103 MRTK0_SX13 MRTK0_SX193 MRTK0_SX283 MRTK0_SX373 MRWS1_SI1130 MRWS1_SI1496 MRWS1_SI500 MRWS1_SX140 MRWS1_SX230 MRWS1_SX320 MRWS1_SX410 MRWS1_SX50 MTAA0_SI1285 MTAA0_SI1915 MTAA0_SI596 MTAA0_SX115 MTAA0_SX205 MTAA0_SX25 MTAA0_SX295 MTAA0_SX385 MTDT0_SI1994 MTDT0_SI2254 MTDT0_SI994 MTDT0_SX184 MTDT0_SX274 MTDT0_SX364 MTDT0_SX4 MTDT0_SX94 MTEB0_SI1133 MTEB0_SI2064 MTEB0_SI503 MTEB0_SX143 MTEB0_SX233 MTEB0_SX323 MTEB0_SX413 MTEB0_SX53 MTHC0_SI1015 MTHC0_SI1645 MTHC0_SI2275 MTHC0_SX115 MTHC0_SX205 MTHC0_SX25 MTHC0_SX295 MTHC0_SX385 MWJG0_SI1124 MWJG0_SI1754 MWJG0_SI494 MWJG0_SX134 MWJG0_SX224 MWJG0_SX314 MWJG0_SX404 MWJG0_SX44 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_unmatched/test.uid 
================================================ FADG0_SA1 FADG0_SA2 FADG0_SI1279 FADG0_SI1909 FADG0_SI649 FADG0_SX109 FADG0_SX19 FADG0_SX199 FADG0_SX289 FADG0_SX379 FAKS0_SA1 FAKS0_SA2 FAKS0_SI1573 FAKS0_SI2203 FAKS0_SI943 FAKS0_SX133 FAKS0_SX223 FAKS0_SX313 FAKS0_SX403 FAKS0_SX43 FASW0_SA1 FASW0_SA2 FASW0_SI1550 FASW0_SI2180 FASW0_SI920 FASW0_SX110 FASW0_SX20 FASW0_SX200 FASW0_SX290 FASW0_SX380 FAWF0_SA1 FAWF0_SA2 FAWF0_SI1000 FAWF0_SI1630 FAWF0_SI2260 FAWF0_SX10 FAWF0_SX100 FAWF0_SX190 FAWF0_SX280 FAWF0_SX370 FCAL1_SA1 FCAL1_SA2 FCAL1_SI1403 FCAL1_SI2033 FCAL1_SI773 FCAL1_SX143 FCAL1_SX233 FCAL1_SX323 FCAL1_SX413 FCAL1_SX53 FCAU0_SA1 FCAU0_SA2 FCAU0_SI1037 FCAU0_SI1667 FCAU0_SI2297 FCAU0_SX137 FCAU0_SX227 FCAU0_SX317 FCAU0_SX407 FCAU0_SX47 FCFT0_SA1 FCFT0_SA2 FCFT0_SI1178 FCFT0_SI1808 FCFT0_SI548 FCFT0_SX188 FCFT0_SX278 FCFT0_SX368 FCFT0_SX8 FCFT0_SX98 FCMH0_SA1 FCMH0_SA2 FCMH0_SI1454 FCMH0_SI2084 FCMH0_SI824 FCMH0_SX104 FCMH0_SX14 FCMH0_SX194 FCMH0_SX284 FCMH0_SX374 FCMH1_SA1 FCMH1_SA2 FCMH1_SI1493 FCMH1_SI2123 FCMH1_SI863 FCMH1_SX143 FCMH1_SX233 FCMH1_SX323 FCMH1_SX413 FCMH1_SX53 FCMR0_SA1 FCMR0_SA2 FCMR0_SI1105 FCMR0_SI1735 FCMR0_SI475 FCMR0_SX115 FCMR0_SX205 FCMR0_SX25 FCMR0_SX295 FCMR0_SX385 FCRH0_SA1 FCRH0_SA2 FCRH0_SI1088 FCRH0_SI1718 FCRH0_SI458 FCRH0_SX188 FCRH0_SX278 FCRH0_SX368 FCRH0_SX8 FCRH0_SX98 FDAC1_SA1 FDAC1_SA2 FDAC1_SI1474 FDAC1_SI2104 FDAC1_SI844 FDAC1_SX124 FDAC1_SX214 FDAC1_SX304 FDAC1_SX34 FDAC1_SX394 FDHC0_SA1 FDHC0_SA2 FDHC0_SI1559 FDHC0_SI2189 FDHC0_SI929 FDHC0_SX119 FDHC0_SX209 FDHC0_SX29 FDHC0_SX299 FDHC0_SX389 FDMS0_SA1 FDMS0_SA2 FDMS0_SI1218 FDMS0_SI1502 FDMS0_SI1848 FDMS0_SX138 FDMS0_SX228 FDMS0_SX318 FDMS0_SX408 FDMS0_SX48 FDRD1_SA1 FDRD1_SA2 FDRD1_SI1544 FDRD1_SI1566 FDRD1_SI2149 FDRD1_SX104 FDRD1_SX14 FDRD1_SX194 FDRD1_SX284 FDRD1_SX374 FDRW0_SA1 FDRW0_SA2 FDRW0_SI1283 FDRW0_SI1423 FDRW0_SI653 FDRW0_SX113 FDRW0_SX203 FDRW0_SX23 FDRW0_SX293 FDRW0_SX383 FEDW0_SA1 FEDW0_SA2 FEDW0_SI1084 FEDW0_SI1653 FEDW0_SI1714 FEDW0_SX184 
FEDW0_SX274 FEDW0_SX364 FEDW0_SX4 FEDW0_SX94 FELC0_SA1 FELC0_SA2 FELC0_SI1386 FELC0_SI2016 FELC0_SI756 FELC0_SX126 FELC0_SX216 FELC0_SX306 FELC0_SX36 FELC0_SX396 FGJD0_SA1 FGJD0_SA2 FGJD0_SI1179 FGJD0_SI549 FGJD0_SI818 FGJD0_SX189 FGJD0_SX279 FGJD0_SX369 FGJD0_SX9 FGJD0_SX99 FGMD0_SA1 FGMD0_SA2 FGMD0_SI1943 FGMD0_SI2107 FGMD0_SI683 FGMD0_SX143 FGMD0_SX233 FGMD0_SX323 FGMD0_SX413 FGMD0_SX53 FGWR0_SA1 FGWR0_SA2 FGWR0_SI1578 FGWR0_SI2208 FGWR0_SI948 FGWR0_SX138 FGWR0_SX228 FGWR0_SX318 FGWR0_SX408 FGWR0_SX48 FHES0_SA1 FHES0_SA2 FHES0_SI1109 FHES0_SI1739 FHES0_SI479 FHES0_SX119 FHES0_SX209 FHES0_SX29 FHES0_SX299 FHES0_SX389 FHEW0_SA1 FHEW0_SA2 FHEW0_SI2023 FHEW0_SI690 FHEW0_SI763 FHEW0_SX133 FHEW0_SX223 FHEW0_SX313 FHEW0_SX403 FHEW0_SX43 FISB0_SA1 FISB0_SA2 FISB0_SI1579 FISB0_SI2209 FISB0_SI949 FISB0_SX139 FISB0_SX229 FISB0_SX319 FISB0_SX409 FISB0_SX49 FJAS0_SA1 FJAS0_SA2 FJAS0_SI1400 FJAS0_SI2030 FJAS0_SI770 FJAS0_SX140 FJAS0_SX230 FJAS0_SX320 FJAS0_SX410 FJAS0_SX50 FJCS0_SA1 FJCS0_SA2 FJCS0_SI1309 FJCS0_SI1833 FJCS0_SI1939 FJCS0_SX139 FJCS0_SX229 FJCS0_SX319 FJCS0_SX409 FJCS0_SX49 FJEM0_SA1 FJEM0_SA2 FJEM0_SI1264 FJEM0_SI1894 FJEM0_SI634 FJEM0_SX184 FJEM0_SX274 FJEM0_SX364 FJEM0_SX4 FJEM0_SX94 FJLM0_SA1 FJLM0_SA2 FJLM0_SI1043 FJLM0_SI1673 FJLM0_SI2303 FJLM0_SX143 FJLM0_SX233 FJLM0_SX323 FJLM0_SX413 FJLM0_SX53 FJMG0_SA1 FJMG0_SA2 FJMG0_SI1181 FJMG0_SI1811 FJMG0_SI551 FJMG0_SX101 FJMG0_SX11 FJMG0_SX191 FJMG0_SX281 FJMG0_SX371 FJRE0_SA1 FJRE0_SA2 FJRE0_SI1116 FJRE0_SI1587 FJRE0_SI1746 FJRE0_SX126 FJRE0_SX216 FJRE0_SX306 FJRE0_SX36 FJRE0_SX396 FJSA0_SA1 FJSA0_SA2 FJSA0_SI1379 FJSA0_SI2009 FJSA0_SI749 FJSA0_SX119 FJSA0_SX209 FJSA0_SX29 FJSA0_SX299 FJSA0_SX389 FJSJ0_SA1 FJSJ0_SA2 FJSJ0_SI1484 FJSJ0_SI2114 FJSJ0_SI854 FJSJ0_SX134 FJSJ0_SX224 FJSJ0_SX314 FJSJ0_SX404 FJSJ0_SX44 FJWB0_SA1 FJWB0_SA2 FJWB0_SI1265 FJWB0_SI635 FJWB0_SI992 FJWB0_SX185 FJWB0_SX275 FJWB0_SX365 FJWB0_SX5 FJWB0_SX95 FKMS0_SA1 FKMS0_SA2 FKMS0_SI1490 FKMS0_SI2120 FKMS0_SI860 FKMS0_SX140 FKMS0_SX230 
FKMS0_SX320 FKMS0_SX410 FKMS0_SX50 FLAS0_SA1 FLAS0_SA2 FLAS0_SI1026 FLAS0_SI1488 FLAS0_SI858 FLAS0_SX138 FLAS0_SX228 FLAS0_SX318 FLAS0_SX408 FLAS0_SX48 FLBW0_SA1 FLBW0_SA2 FLBW0_SI1219 FLBW0_SI1849 FLBW0_SI2253 FLBW0_SX139 FLBW0_SX229 FLBW0_SX319 FLBW0_SX409 FLBW0_SX49 FLKD0_SA1 FLKD0_SA2 FLKD0_SI1369 FLKD0_SI739 FLKD0_SI894 FLKD0_SX109 FLKD0_SX19 FLKD0_SX199 FLKD0_SX289 FLKD0_SX379 FLNH0_SA1 FLNH0_SA2 FLNH0_SI1214 FLNH0_SI584 FLNH0_SI941 FLNH0_SX134 FLNH0_SX224 FLNH0_SX314 FLNH0_SX404 FLNH0_SX44 FMAF0_SA1 FMAF0_SA2 FMAF0_SI1459 FMAF0_SI2089 FMAF0_SI829 FMAF0_SX109 FMAF0_SX19 FMAF0_SX199 FMAF0_SX289 FMAF0_SX379 FMAH0_SA1 FMAH0_SA2 FMAH0_SI1289 FMAH0_SI1919 FMAH0_SI659 FMAH0_SX119 FMAH0_SX209 FMAH0_SX29 FMAH0_SX299 FMAH0_SX389 FMCM0_SA1 FMCM0_SA2 FMCM0_SI1180 FMCM0_SI1810 FMCM0_SI550 FMCM0_SX10 FMCM0_SX100 FMCM0_SX190 FMCM0_SX280 FMCM0_SX370 FMGD0_SA1 FMGD0_SA2 FMGD0_SI1564 FMGD0_SI2194 FMGD0_SI934 FMGD0_SX124 FMGD0_SX214 FMGD0_SX304 FMGD0_SX34 FMGD0_SX394 FMLD0_SA1 FMLD0_SA2 FMLD0_SI2185 FMLD0_SI822 FMLD0_SI925 FMLD0_SX115 FMLD0_SX205 FMLD0_SX25 FMLD0_SX295 FMLD0_SX385 FMML0_SA1 FMML0_SA2 FMML0_SI1040 FMML0_SI1670 FMML0_SI2300 FMML0_SX140 FMML0_SX230 FMML0_SX320 FMML0_SX410 FMML0_SX50 FNLP0_SA1 FNLP0_SA2 FNLP0_SI1308 FNLP0_SI1938 FNLP0_SI678 FNLP0_SX138 FNLP0_SX228 FNLP0_SX318 FNLP0_SX408 FNLP0_SX48 FNMR0_SA1 FNMR0_SA2 FNMR0_SI1399 FNMR0_SI2029 FNMR0_SI769 FNMR0_SX139 FNMR0_SX229 FNMR0_SX319 FNMR0_SX409 FNMR0_SX49 FPAS0_SA1 FPAS0_SA2 FPAS0_SI1272 FPAS0_SI2204 FPAS0_SI944 FPAS0_SX134 FPAS0_SX224 FPAS0_SX314 FPAS0_SX404 FPAS0_SX44 FPKT0_SA1 FPKT0_SA2 FPKT0_SI1538 FPKT0_SI2168 FPKT0_SI908 FPKT0_SX188 FPKT0_SX278 FPKT0_SX368 FPKT0_SX8 FPKT0_SX98 FRAM1_SA1 FRAM1_SA2 FRAM1_SI1360 FRAM1_SI522 FRAM1_SI730 FRAM1_SX10 FRAM1_SX100 FRAM1_SX190 FRAM1_SX280 FRAM1_SX370 FREW0_SA1 FREW0_SA2 FREW0_SI1030 FREW0_SI1280 FREW0_SI1910 FREW0_SX110 FREW0_SX20 FREW0_SX200 FREW0_SX290 FREW0_SX380 FRNG0_SA1 FRNG0_SA2 FRNG0_SI1355 FRNG0_SI1985 FRNG0_SI725 FRNG0_SX185 FRNG0_SX275 FRNG0_SX365 
FRNG0_SX5 FRNG0_SX95 FSEM0_SA1 FSEM0_SA2 FSEM0_SI1198 FSEM0_SI1828 FSEM0_SI568 FSEM0_SX118 FSEM0_SX208 FSEM0_SX28 FSEM0_SX298 FSEM0_SX388 FSLB1_SA1 FSLB1_SA2 FSLB1_SI1904 FSLB1_SI644 FSLB1_SI891 FSLB1_SX104 FSLB1_SX14 FSLB1_SX194 FSLB1_SX284 FSLB1_SX374 FSXA0_SA1 FSXA0_SA2 FSXA0_SI1108 FSXA0_SI1846 FSXA0_SI478 FSXA0_SX118 FSXA0_SX208 FSXA0_SX28 FSXA0_SX298 FSXA0_SX388 FTLH0_SA1 FTLH0_SA2 FTLH0_SI1009 FTLH0_SI1390 FTLH0_SI1639 FTLH0_SX109 FTLH0_SX19 FTLH0_SX199 FTLH0_SX289 FTLH0_SX379 FUTB0_SA1 FUTB0_SA2 FUTB0_SI1204 FUTB0_SI1330 FUTB0_SI1834 FUTB0_SX124 FUTB0_SX214 FUTB0_SX304 FUTB0_SX34 FUTB0_SX394 MABW0_SA1 MABW0_SA2 MABW0_SI1230 MABW0_SI1664 MABW0_SI2294 MABW0_SX134 MABW0_SX224 MABW0_SX314 MABW0_SX404 MABW0_SX44 MAHH0_SA1 MAHH0_SA2 MAHH0_SI1294 MAHH0_SI1924 MAHH0_SI664 MAHH0_SX124 MAHH0_SX214 MAHH0_SX304 MAHH0_SX34 MAHH0_SX394 MAJC0_SA1 MAJC0_SA2 MAJC0_SI1946 MAJC0_SI2095 MAJC0_SI835 MAJC0_SX115 MAJC0_SX205 MAJC0_SX25 MAJC0_SX295 MAJC0_SX385 MBDG0_SA1 MBDG0_SA2 MBDG0_SI1463 MBDG0_SI2093 MBDG0_SI833 MBDG0_SX113 MBDG0_SX203 MBDG0_SX23 MBDG0_SX293 MBDG0_SX383 MBJK0_SA1 MBJK0_SA2 MBJK0_SI1175 MBJK0_SI2128 MBJK0_SI545 MBJK0_SX185 MBJK0_SX275 MBJK0_SX365 MBJK0_SX5 MBJK0_SX95 MBNS0_SA1 MBNS0_SA2 MBNS0_SI1220 MBNS0_SI1850 MBNS0_SI590 MBNS0_SX140 MBNS0_SX230 MBNS0_SX320 MBNS0_SX410 MBNS0_SX50 MBPM0_SA1 MBPM0_SA2 MBPM0_SI1577 MBPM0_SI1584 MBPM0_SI947 MBPM0_SX137 MBPM0_SX227 MBPM0_SX317 MBPM0_SX407 MBPM0_SX47 MBWM0_SA1 MBWM0_SA2 MBWM0_SI1304 MBWM0_SI1934 MBWM0_SI674 MBWM0_SX134 MBWM0_SX224 MBWM0_SX314 MBWM0_SX404 MBWM0_SX44 MCCS0_SA1 MCCS0_SA2 MCCS0_SI1469 MCCS0_SI2099 MCCS0_SI839 MCCS0_SX119 MCCS0_SX209 MCCS0_SX29 MCCS0_SX299 MCCS0_SX389 MCEM0_SA1 MCEM0_SA2 MCEM0_SI1398 MCEM0_SI2028 MCEM0_SI768 MCEM0_SX138 MCEM0_SX228 MCEM0_SX318 MCEM0_SX408 MCEM0_SX48 MCHH0_SA1 MCHH0_SA2 MCHH0_SI1004 MCHH0_SI1634 MCHH0_SI530 MCHH0_SX104 MCHH0_SX14 MCHH0_SX194 MCHH0_SX284 MCHH0_SX374 MCMB0_SA1 MCMB0_SA2 MCMB0_SI1268 MCMB0_SI1898 MCMB0_SI638 MCMB0_SX188 MCMB0_SX278 MCMB0_SX368 MCMB0_SX8 
MCMB0_SX98 MCMJ0_SA1 MCMJ0_SA2 MCMJ0_SI1094 MCMJ0_SI464 MCMJ0_SI602 MCMJ0_SX104 MCMJ0_SX14 MCMJ0_SX194 MCMJ0_SX284 MCMJ0_SX374 MCRC0_SA1 MCRC0_SA2 MCRC0_SI1092 MCRC0_SI1722 MCRC0_SI462 MCRC0_SX102 MCRC0_SX12 MCRC0_SX192 MCRC0_SX282 MCRC0_SX372 MCSH0_SA1 MCSH0_SA2 MCSH0_SI1549 MCSH0_SI2179 MCSH0_SI919 MCSH0_SX109 MCSH0_SX19 MCSH0_SX199 MCSH0_SX289 MCSH0_SX379 MCTT0_SA1 MCTT0_SA2 MCTT0_SI1144 MCTT0_SI2188 MCTT0_SI928 MCTT0_SX118 MCTT0_SX208 MCTT0_SX28 MCTT0_SX298 MCTT0_SX388 MCTW0_SA1 MCTW0_SA2 MCTW0_SI1373 MCTW0_SI2003 MCTW0_SI743 MCTW0_SX113 MCTW0_SX203 MCTW0_SX23 MCTW0_SX293 MCTW0_SX383 MDAB0_SA1 MDAB0_SA2 MDAB0_SI1039 MDAB0_SI1669 MDAB0_SI2299 MDAB0_SX139 MDAB0_SX229 MDAB0_SX319 MDAB0_SX409 MDAB0_SX49 MDAC2_SA1 MDAC2_SA2 MDAC2_SI2259 MDAC2_SI560 MDAC2_SI999 MDAC2_SX189 MDAC2_SX279 MDAC2_SX369 MDAC2_SX9 MDAC2_SX99 MDAW1_SA1 MDAW1_SA2 MDAW1_SI1453 MDAW1_SI2083 MDAW1_SI823 MDAW1_SX103 MDAW1_SX13 MDAW1_SX193 MDAW1_SX283 MDAW1_SX373 MDBB0_SA1 MDBB0_SA2 MDBB0_SI1195 MDBB0_SI1825 MDBB0_SI565 MDBB0_SX115 MDBB0_SX205 MDBB0_SX25 MDBB0_SX295 MDBB0_SX385 MDLD0_SA1 MDLD0_SA2 MDLD0_SI1543 MDLD0_SI2173 MDLD0_SI913 MDLD0_SX103 MDLD0_SX13 MDLD0_SX193 MDLD0_SX283 MDLD0_SX373 MDLF0_SA1 MDLF0_SA2 MDLF0_SI1583 MDLF0_SI2213 MDLF0_SI953 MDLF0_SX143 MDLF0_SX233 MDLF0_SX323 MDLF0_SX413 MDLF0_SX53 MDLS0_SA1 MDLS0_SA2 MDLS0_SI1628 MDLS0_SI2258 MDLS0_SI998 MDLS0_SX188 MDLS0_SX278 MDLS0_SX368 MDLS0_SX8 MDLS0_SX98 MDRB0_SA1 MDRB0_SA2 MDRB0_SI1174 MDRB0_SI2109 MDRB0_SI544 MDRB0_SX184 MDRB0_SX274 MDRB0_SX364 MDRB0_SX4 MDRB0_SX94 MDRM0_SA1 MDRM0_SA2 MDRM0_SI1013 MDRM0_SI1643 MDRM0_SI2273 MDRM0_SX113 MDRM0_SX203 MDRM0_SX23 MDRM0_SX293 MDRM0_SX383 MDSC0_SA1 MDSC0_SA2 MDSC0_SI1038 MDSC0_SI2298 MDSC0_SI967 MDSC0_SX138 MDSC0_SX228 MDSC0_SX318 MDSC0_SX408 MDSC0_SX48 MDVC0_SA1 MDVC0_SA2 MDVC0_SI2174 MDVC0_SI2196 MDVC0_SI936 MDVC0_SX126 MDVC0_SX216 MDVC0_SX306 MDVC0_SX36 MDVC0_SX396 MDWA0_SA1 MDWA0_SA2 MDWA0_SI1146 MDWA0_SI1445 MDWA0_SI519 MDWA0_SX185 MDWA0_SX275 MDWA0_SX365 MDWA0_SX5 MDWA0_SX95 
MDWK0_SA1 MDWK0_SA2 MDWK0_SI1540 MDWK0_SI2170 MDWK0_SI910 MDWK0_SX10 MDWK0_SX100 MDWK0_SX190 MDWK0_SX280 MDWK0_SX370 MERS0_SA1 MERS0_SA2 MERS0_SI1019 MERS0_SI1649 MERS0_SI497 MERS0_SX119 MERS0_SX209 MERS0_SX29 MERS0_SX299 MERS0_SX389 MESD0_SA1 MESD0_SA2 MESD0_SI1002 MESD0_SI1632 MESD0_SI2262 MESD0_SX102 MESD0_SX12 MESD0_SX192 MESD0_SX282 MESD0_SX372 MFGK0_SA1 MFGK0_SA2 MFGK0_SI1451 MFGK0_SI1744 MFGK0_SI484 MFGK0_SX124 MFGK0_SX214 MFGK0_SX304 MFGK0_SX34 MFGK0_SX394 MGJF0_SA1 MGJF0_SA2 MGJF0_SI1901 MGJF0_SI641 MGJF0_SI776 MGJF0_SX101 MGJF0_SX11 MGJF0_SX191 MGJF0_SX281 MGJF0_SX371 MGLB0_SA1 MGLB0_SA2 MGLB0_SI1534 MGLB0_SI2164 MGLB0_SI904 MGLB0_SX184 MGLB0_SX274 MGLB0_SX364 MGLB0_SX4 MGLB0_SX94 MGMM0_SA1 MGMM0_SA2 MGMM0_SI1129 MGMM0_SI1759 MGMM0_SI499 MGMM0_SX139 MGMM0_SX229 MGMM0_SX319 MGMM0_SX409 MGMM0_SX49 MGRT0_SA1 MGRT0_SA2 MGRT0_SI1450 MGRT0_SI2080 MGRT0_SI820 MGRT0_SX10 MGRT0_SX100 MGRT0_SX190 MGRT0_SX280 MGRT0_SX370 MGWT0_SA1 MGWT0_SA2 MGWT0_SI1539 MGWT0_SI2169 MGWT0_SI909 MGWT0_SX189 MGWT0_SX279 MGWT0_SX369 MGWT0_SX9 MGWT0_SX99 MHPG0_SA1 MHPG0_SA2 MHPG0_SI1090 MHPG0_SI1720 MHPG0_SI460 MHPG0_SX10 MHPG0_SX100 MHPG0_SX190 MHPG0_SX280 MHPG0_SX370 MJAR0_SA1 MJAR0_SA2 MJAR0_SI1988 MJAR0_SI2247 MJAR0_SI728 MJAR0_SX188 MJAR0_SX278 MJAR0_SX368 MJAR0_SX8 MJAR0_SX98 MJBR0_SA1 MJBR0_SA2 MJBR0_SI1001 MJBR0_SI1631 MJBR0_SI2261 MJBR0_SX101 MJBR0_SX11 MJBR0_SX191 MJBR0_SX281 MJBR0_SX371 MJDH0_SA1 MJDH0_SA2 MJDH0_SI1354 MJDH0_SI1984 MJDH0_SI724 MJDH0_SX184 MJDH0_SX274 MJDH0_SX364 MJDH0_SX4 MJDH0_SX94 MJDM1_SA1 MJDM1_SA2 MJDM1_SI1085 MJDM1_SI1715 MJDM1_SI455 MJDM1_SX185 MJDM1_SX275 MJDM1_SX365 MJDM1_SX5 MJDM1_SX95 MJES0_SA1 MJES0_SA2 MJES0_SI1384 MJES0_SI2014 MJES0_SI754 MJES0_SX124 MJES0_SX214 MJES0_SX304 MJES0_SX34 MJES0_SX394 MJFC0_SA1 MJFC0_SA2 MJFC0_SI1033 MJFC0_SI1663 MJFC0_SI2293 MJFC0_SX133 MJFC0_SX223 MJFC0_SX313 MJFC0_SX403 MJFC0_SX43 MJJG0_SA1 MJJG0_SA2 MJJG0_SI1003 MJJG0_SI1633 MJJG0_SI2263 MJJG0_SX103 MJJG0_SX13 MJJG0_SX193 MJJG0_SX283 MJJG0_SX373 MJLN0_SA1 
MJLN0_SA2 MJLN0_SI1449 MJLN0_SI2079 MJLN0_SI819 MJLN0_SX189 MJLN0_SX279 MJLN0_SX369 MJLN0_SX9 MJLN0_SX99 MJMP0_SA1 MJMP0_SA2 MJMP0_SI1535 MJMP0_SI1791 MJMP0_SI905 MJMP0_SX185 MJMP0_SX275 MJMP0_SX365 MJMP0_SX5 MJMP0_SX95 MJRF0_SA1 MJRF0_SA2 MJRF0_SI1114 MJRF0_SI2081 MJRF0_SI821 MJRF0_SX101 MJRF0_SX11 MJRF0_SX191 MJRF0_SX281 MJRF0_SX371 MJSW0_SA1 MJSW0_SA2 MJSW0_SI1010 MJSW0_SI1640 MJSW0_SI2270 MJSW0_SX110 MJSW0_SX20 MJSW0_SX200 MJSW0_SX290 MJSW0_SX380 MJTC0_SA1 MJTC0_SA2 MJTC0_SI1460 MJTC0_SI2090 MJTC0_SI830 MJTC0_SX110 MJTC0_SX20 MJTC0_SX200 MJTC0_SX290 MJTC0_SX380 MJTH0_SA1 MJTH0_SA2 MJTH0_SI1296 MJTH0_SI1926 MJTH0_SI666 MJTH0_SX126 MJTH0_SX216 MJTH0_SX306 MJTH0_SX36 MJTH0_SX396 MJVW0_SA1 MJVW0_SA2 MJVW0_SI1733 MJVW0_SI1758 MJVW0_SI473 MJVW0_SX113 MJVW0_SX203 MJVW0_SX23 MJVW0_SX293 MJVW0_SX383 MKCH0_SA1 MKCH0_SA2 MKCH0_SI1378 MKCH0_SI1425 MKCH0_SI2008 MKCH0_SX118 MKCH0_SX208 MKCH0_SX28 MKCH0_SX298 MKCH0_SX388 MKCL0_SA1 MKCL0_SA2 MKCL0_SI1091 MKCL0_SI1721 MKCL0_SI461 MKCL0_SX101 MKCL0_SX11 MKCL0_SX191 MKCL0_SX281 MKCL0_SX371 MKDR0_SA1 MKDR0_SA2 MKDR0_SI1273 MKDR0_SI1903 MKDR0_SI643 MKDR0_SX103 MKDR0_SX13 MKDR0_SX193 MKDR0_SX283 MKDR0_SX373 MKJL0_SA1 MKJL0_SA2 MKJL0_SI1100 MKJL0_SI1730 MKJL0_SI470 MKJL0_SX110 MKJL0_SX20 MKJL0_SX200 MKJL0_SX290 MKJL0_SX380 MKLT0_SA1 MKLT0_SA2 MKLT0_SI1213 MKLT0_SI1843 MKLT0_SI583 MKLT0_SX133 MKLT0_SX223 MKLT0_SX313 MKLT0_SX403 MKLT0_SX43 MLIH0_SA1 MLIH0_SA2 MLIH0_SI1183 MLIH0_SI1813 MLIH0_SI553 MLIH0_SX103 MLIH0_SX13 MLIH0_SX193 MLIH0_SX283 MLIH0_SX373 MLJB0_SA1 MLJB0_SA2 MLJB0_SI1310 MLJB0_SI1940 MLJB0_SI680 MLJB0_SX140 MLJB0_SX230 MLJB0_SX320 MLJB0_SX410 MLJB0_SX50 MLLL0_SA1 MLLL0_SA2 MLLL0_SI1363 MLLL0_SI1993 MLLL0_SI733 MLLL0_SX103 MLLL0_SX13 MLLL0_SX193 MLLL0_SX283 MLLL0_SX373 MLNT0_SA1 MLNT0_SA2 MLNT0_SI1574 MLNT0_SI1902 MLNT0_SI642 MLNT0_SX102 MLNT0_SX12 MLNT0_SX192 MLNT0_SX282 MLNT0_SX372 MMAB0_SA1 MMAB0_SA2 MMAB0_SI1362 MMAB0_SI1992 MMAB0_SI732 MMAB0_SX102 MMAB0_SX12 MMAB0_SX192 MMAB0_SX282 MMAB0_SX372 MMDB1_SA1 MMDB1_SA2 
MMDB1_SI1625 MMDB1_SI2255 MMDB1_SI995 MMDB1_SX185 MMDB1_SX275 MMDB1_SX365 MMDB1_SX5 MMDB1_SX95 MMDH0_SA1 MMDH0_SA2 MMDH0_SI1656 MMDH0_SI2118 MMDH0_SI2286 MMDH0_SX126 MMDH0_SX216 MMDH0_SX306 MMDH0_SX36 MMDH0_SX396 MMDM2_SA1 MMDM2_SA2 MMDM2_SI1452 MMDM2_SI1555 MMDM2_SI2082 MMDM2_SX102 MMDM2_SX12 MMDM2_SX192 MMDM2_SX282 MMDM2_SX372 MMJR0_SA1 MMJR0_SA2 MMJR0_SI1648 MMJR0_SI2166 MMJR0_SI2278 MMJR0_SX118 MMJR0_SX208 MMJR0_SX28 MMJR0_SX298 MMJR0_SX388 MMWH0_SA1 MMWH0_SA2 MMWH0_SI1089 MMWH0_SI1301 MMWH0_SI459 MMWH0_SX189 MMWH0_SX279 MMWH0_SX369 MMWH0_SX9 MMWH0_SX99 MNJM0_SA1 MNJM0_SA2 MNJM0_SI1580 MNJM0_SI2210 MNJM0_SI950 MNJM0_SX140 MNJM0_SX230 MNJM0_SX320 MNJM0_SX410 MNJM0_SX50 MNLS0_SA1 MNLS0_SA2 MNLS0_SI1483 MNLS0_SI1610 MNLS0_SI853 MNLS0_SX133 MNLS0_SX223 MNLS0_SX313 MNLS0_SX403 MNLS0_SX43 MPAB0_SA1 MPAB0_SA2 MPAB0_SI1103 MPAB0_SI1128 MPAB0_SI498 MPAB0_SX138 MPAB0_SX228 MPAB0_SX318 MPAB0_SX408 MPAB0_SX48 MPAM0_SA1 MPAM0_SA2 MPAM0_SI1189 MPAM0_SI1819 MPAM0_SI1961 MPAM0_SX109 MPAM0_SX19 MPAM0_SX199 MPAM0_SX289 MPAM0_SX379 MPAM1_SA1 MPAM1_SA2 MPAM1_SI1029 MPAM1_SI1836 MPAM1_SI576 MPAM1_SX126 MPAM1_SX216 MPAM1_SX306 MPAM1_SX36 MPAM1_SX396 MPCS0_SA1 MPCS0_SA2 MPCS0_SI1359 MPCS0_SI1989 MPCS0_SI729 MPCS0_SX189 MPCS0_SX279 MPCS0_SX369 MPCS0_SX9 MPCS0_SX99 MPDF0_SA1 MPDF0_SA2 MPDF0_SI1542 MPDF0_SI2172 MPDF0_SI912 MPDF0_SX102 MPDF0_SX12 MPDF0_SX192 MPDF0_SX282 MPDF0_SX372 MPGL0_SA1 MPGL0_SA2 MPGL0_SI1099 MPGL0_SI1729 MPGL0_SI469 MPGL0_SX109 MPGL0_SX19 MPGL0_SX199 MPGL0_SX289 MPGL0_SX379 MPLB0_SA1 MPLB0_SA2 MPLB0_SI1394 MPLB0_SI2024 MPLB0_SI764 MPLB0_SX134 MPLB0_SX224 MPLB0_SX314 MPLB0_SX404 MPLB0_SX44 MPWM0_SA1 MPWM0_SA2 MPWM0_SI1127 MPWM0_SI1757 MPWM0_SI2279 MPWM0_SX137 MPWM0_SX227 MPWM0_SX317 MPWM0_SX407 MPWM0_SX47 MRCS0_SA1 MRCS0_SA2 MRCS0_SI1223 MRCS0_SI1853 MRCS0_SI593 MRCS0_SX143 MRCS0_SX233 MRCS0_SX323 MRCS0_SX413 MRCS0_SX53 MRCZ0_SA1 MRCZ0_SA2 MRCZ0_SI1541 MRCZ0_SI2171 MRCZ0_SI911 MRCZ0_SX101 MRCZ0_SX11 MRCZ0_SX191 MRCZ0_SX281 MRCZ0_SX371 MREB0_SA1 MREB0_SA2 
MREB0_SI1375 MREB0_SI2005 MREB0_SI745 MREB0_SX115 MREB0_SX205 MREB0_SX25 MREB0_SX295 MREB0_SX385 MRES0_SA1 MRES0_SA2 MRES0_SI1217 MRES0_SI1847 MRES0_SI587 MRES0_SX137 MRES0_SX227 MRES0_SX317 MRES0_SX407 MRES0_SX47 MRGG0_SA1 MRGG0_SA2 MRGG0_SI1199 MRGG0_SI1829 MRGG0_SI569 MRGG0_SX119 MRGG0_SX209 MRGG0_SX29 MRGG0_SX299 MRGG0_SX389 MRJM3_SA1 MRJM3_SA2 MRJM3_SI1448 MRJM3_SI1809 MRJM3_SI2078 MRJM3_SX188 MRJM3_SX278 MRJM3_SX368 MRJM3_SX8 MRJM3_SX98 MRJM4_SA1 MRJM4_SA2 MRJM4_SI1489 MRJM4_SI2119 MRJM4_SI859 MRJM4_SX139 MRJM4_SX229 MRJM4_SX319 MRJM4_SX409 MRJM4_SX49 MRJO0_SA1 MRJO0_SA2 MRJO0_SI1364 MRJO0_SI1624 MRJO0_SI734 MRJO0_SX104 MRJO0_SX14 MRJO0_SX194 MRJO0_SX284 MRJO0_SX374 MRJR0_SA1 MRJR0_SA2 MRJR0_SI1182 MRJR0_SI1812 MRJR0_SI2313 MRJR0_SX102 MRJR0_SX12 MRJR0_SX192 MRJR0_SX282 MRJR0_SX372 MRJS0_SA1 MRJS0_SA2 MRJS0_SI1444 MRJS0_SI1523 MRJS0_SI2074 MRJS0_SX184 MRJS0_SX274 MRJS0_SX364 MRJS0_SX4 MRJS0_SX94 MRKO0_SA1 MRKO0_SA2 MRKO0_SI1397 MRKO0_SI2027 MRKO0_SI767 MRKO0_SX137 MRKO0_SX227 MRKO0_SX317 MRKO0_SX407 MRKO0_SX47 MRMS1_SA1 MRMS1_SA2 MRMS1_SI1487 MRMS1_SI2117 MRMS1_SI857 MRMS1_SX137 MRMS1_SX227 MRMS1_SX317 MRMS1_SX407 MRMS1_SX47 MROA0_SA1 MROA0_SA2 MROA0_SI1307 MROA0_SI1970 MROA0_SI677 MROA0_SX137 MROA0_SX227 MROA0_SX317 MROA0_SX407 MROA0_SX47 MRPC0_SA1 MRPC0_SA2 MRPC0_SI1753 MRPC0_SI493 MRPC0_SI933 MRPC0_SX133 MRPC0_SX223 MRPC0_SX313 MRPC0_SX403 MRPC0_SX43 MRPP0_SA1 MRPP0_SA2 MRPP0_SI1184 MRPP0_SI1814 MRPP0_SI554 MRPP0_SX104 MRPP0_SX14 MRPP0_SX194 MRPP0_SX284 MRPP0_SX374 MRRK0_SA1 MRRK0_SA2 MRRK0_SI1288 MRRK0_SI1716 MRRK0_SI1918 MRRK0_SX118 MRRK0_SX208 MRRK0_SX28 MRRK0_SX298 MRRK0_SX388 MRTK0_SA1 MRTK0_SA2 MRTK0_SI1093 MRTK0_SI1723 MRTK0_SI1750 MRTK0_SX103 MRTK0_SX13 MRTK0_SX193 MRTK0_SX283 MRTK0_SX373 MRWS1_SA1 MRWS1_SA2 MRWS1_SI1130 MRWS1_SI1496 MRWS1_SI500 MRWS1_SX140 MRWS1_SX230 MRWS1_SX320 MRWS1_SX410 MRWS1_SX50 MSFH1_SA1 MSFH1_SA2 MSFH1_SI1270 MSFH1_SI1900 MSFH1_SI640 MSFH1_SX10 MSFH1_SX100 MSFH1_SX190 MSFH1_SX280 MSFH1_SX370 MSJS1_SA1 MSJS1_SA2 
MSJS1_SI1899 MSJS1_SI639 MSJS1_SI869 MSJS1_SX189 MSJS1_SX279 MSJS1_SX369 MSJS1_SX9 MSJS1_SX99 MSLB0_SA1 MSLB0_SA2 MSLB0_SI1193 MSLB0_SI1823 MSLB0_SI563 MSLB0_SX113 MSLB0_SX203 MSLB0_SX23 MSLB0_SX293 MSLB0_SX383 MSTK0_SA1 MSTK0_SA2 MSTK0_SI1024 MSTK0_SI2222 MSTK0_SI2284 MSTK0_SX124 MSTK0_SX214 MSTK0_SX304 MSTK0_SX34 MSTK0_SX394 MTAA0_SA1 MTAA0_SA2 MTAA0_SI1285 MTAA0_SI1915 MTAA0_SI596 MTAA0_SX115 MTAA0_SX205 MTAA0_SX25 MTAA0_SX295 MTAA0_SX385 MTAS1_SA1 MTAS1_SA2 MTAS1_SI1473 MTAS1_SI2098 MTAS1_SI838 MTAS1_SX118 MTAS1_SX208 MTAS1_SX28 MTAS1_SX298 MTAS1_SX388 MTDT0_SA1 MTDT0_SA2 MTDT0_SI1994 MTDT0_SI2254 MTDT0_SI994 MTDT0_SX184 MTDT0_SX274 MTDT0_SX364 MTDT0_SX4 MTDT0_SX94 MTEB0_SA1 MTEB0_SA2 MTEB0_SI1133 MTEB0_SI2064 MTEB0_SI503 MTEB0_SX143 MTEB0_SX233 MTEB0_SX323 MTEB0_SX413 MTEB0_SX53 MTHC0_SA1 MTHC0_SA2 MTHC0_SI1015 MTHC0_SI1645 MTHC0_SI2275 MTHC0_SX115 MTHC0_SX205 MTHC0_SX25 MTHC0_SX295 MTHC0_SX385 MTLS0_SA1 MTLS0_SA2 MTLS0_SI1370 MTLS0_SI2000 MTLS0_SI740 MTLS0_SX110 MTLS0_SX20 MTLS0_SX200 MTLS0_SX290 MTLS0_SX380 MTMR0_SA1 MTMR0_SA2 MTMR0_SI1303 MTMR0_SI1933 MTMR0_SI673 MTMR0_SX133 MTMR0_SX223 MTMR0_SX313 MTMR0_SX403 MTMR0_SX43 MTWH0_SA1 MTWH0_SA2 MTWH0_SI1190 MTWH0_SI1629 MTWH0_SI1820 MTWH0_SX110 MTWH0_SX20 MTWH0_SX200 MTWH0_SX290 MTWH0_SX380 MWBT0_SA1 MWBT0_SA2 MWBT0_SI1553 MWBT0_SI2183 MWBT0_SI923 MWBT0_SX113 MWBT0_SX203 MWBT0_SX23 MWBT0_SX293 MWBT0_SX383 MWEW0_SA1 MWEW0_SA2 MWEW0_SI1361 MWEW0_SI1991 MWEW0_SI731 MWEW0_SX101 MWEW0_SX11 MWEW0_SX191 MWEW0_SX281 MWEW0_SX371 MWJG0_SA1 MWJG0_SA2 MWJG0_SI1124 MWJG0_SI1754 MWJG0_SI494 MWJG0_SX134 MWJG0_SX224 MWJG0_SX314 MWJG0_SX404 MWJG0_SX44 MWVW0_SA1 MWVW0_SA2 MWVW0_SI1476 MWVW0_SI2106 MWVW0_SI846 MWVW0_SX126 MWVW0_SX216 MWVW0_SX306 MWVW0_SX36 MWVW0_SX396 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_unmatched/train.uid ================================================ FAEM0_SA1 FAEM0_SA2 FAEM0_SI2022 FAEM0_SX132 FAEM0_SX222 FAEM0_SX312 FAEM0_SX402 FAJW0_SA2 
FAJW0_SI1893 FAJW0_SX183 FAJW0_SX273 FAJW0_SX363 FALK0_SA1 FALK0_SA2 FALK0_SI1086 FALK0_SI456 FALK0_SX276 FALK0_SX366 FALK0_SX96 FALR0_SA1 FALR0_SA2 FALR0_SI1955 FALR0_SI695 FALR0_SX155 FALR0_SX245 FALR0_SX425 FALR0_SX65 FAPB0_SA1 FAPB0_SA2 FAPB0_SI1693 FAPB0_SX163 FAPB0_SX253 FAPB0_SX343 FAPB0_SX73 FBAS0_SA2 FBAS0_SI1387 FBAS0_SX127 FBAS0_SX307 FBAS0_SX37 FBAS0_SX397 FBCG1_SA2 FBCG1_SI1612 FBCG1_SI2242 FBCG1_SI982 FBCG1_SX262 FBCG1_SX82 FBCH0_SA1 FBCH0_SA2 FBCH0_SI1586 FBCH0_SI956 FBCH0_SX146 FBCH0_SX326 FBCH0_SX56 FBJL0_SA1 FBJL0_SA2 FBJL0_SI1552 FBJL0_SI2182 FBJL0_SX112 FBJL0_SX202 FBJL0_SX22 FBJL0_SX292 FBJL0_SX382 FBLV0_SA2 FBLV0_SI2318 FBLV0_SX158 FBLV0_SX248 FBLV0_SX428 FBMH0_SA2 FBMH0_SI1766 FBMH0_SX146 FBMH0_SX236 FBMH0_SX326 FBMH0_SX416 FBMH0_SX56 FBMJ0_SA2 FBMJ0_SX156 FBMJ0_SX246 FBMJ0_SX426 FBMJ0_SX66 FCAG0_SA2 FCAG0_SI1503 FCAG0_SI1641 FCAG0_SI2133 FCAG0_SX333 FCAG0_SX423 FCAG0_SX63 FCAJ0_SA1 FCAJ0_SA2 FCAJ0_SI1804 FCAJ0_SI849 FCAJ0_SX129 FCAJ0_SX219 FCAJ0_SX39 FCAJ0_SX399 FCDR1_SA1 FCDR1_SA2 FCDR1_SX16 FCDR1_SX376 FCEG0_SA1 FCEG0_SI1248 FCEG0_SI1878 FCEG0_SI618 FCEG0_SX168 FCEG0_SX258 FCEG0_SX348 FCEG0_SX438 FCEG0_SX78 FCJF0_SA2 FCJF0_SI1027 FCJF0_SI1657 FCJF0_SI648 FCJF0_SX217 FCJF0_SX307 FCJF0_SX37 FCJF0_SX397 FCJS0_SA1 FCJS0_SA2 FCJS0_SI977 FCJS0_SX167 FCJS0_SX347 FCJS0_SX437 FCJS0_SX77 FCKE0_SA1 FCKE0_SI1111 FCKE0_SX211 FCKE0_SX301 FCKE0_SX31 FCKE0_SX391 FCLT0_SA1 FCLT0_SA2 FCLT0_SI1438 FCLT0_SX178 FCLT0_SX268 FCLT0_SX358 FCMG0_SA1 FCMG0_SI1242 FCMG0_SX162 FCMG0_SX252 FCMG0_SX342 FCMM0_SI1083 FCMM0_SI453 FCMM0_SX273 FCMM0_SX363 FCMM0_SX93 FCRZ0_SA1 FCRZ0_SA2 FCRZ0_SI1913 FCRZ0_SI793 FCRZ0_SX163 FCRZ0_SX253 FCRZ0_SX343 FCRZ0_SX73 FCYL0_SA2 FCYL0_SI1297 FCYL0_SI1927 FCYL0_SX127 FCYL0_SX217 FCYL0_SX397 FDAS1_SA1 FDAS1_SA2 FDAS1_SX111 FDAS1_SX21 FDAS1_SX291 FDAW0_SA1 FDAW0_SA2 FDAW0_SX146 FDAW0_SX236 FDAW0_SX326 FDAW0_SX416 FDAW0_SX56 FDFB0_SI1318 FDFB0_SI1948 FDFB0_SX148 FDFB0_SX238 FDFB0_SX328 FDFB0_SX418 FDJH0_SA1 FDJH0_SA2 FDJH0_SI1565 
FDJH0_SI2195 FDJH0_SX125 FDJH0_SX215 FDJH0_SX35 FDJH0_SX395 FDKN0_SA1 FDKN0_SA2 FDKN0_SI1081 FDKN0_SI1711 FDKN0_SX271 FDKN0_SX361 FDKN0_SX91 FDML0_SA1 FDML0_SI1149 FDML0_SI1779 FDML0_SI2075 FDML0_SX339 FDML0_SX69 FDMY0_SI1197 FDMY0_SX117 FDMY0_SX207 FDMY0_SX297 FDNC0_SA1 FDNC0_SA2 FDNC0_SI2287 FDNC0_SX108 FDNC0_SX18 FDNC0_SX378 FDTD0_SA2 FDTD0_SI1561 FDTD0_SI2191 FDTD0_SI931 FDTD0_SX121 FDTD0_SX301 FDTD0_SX391 FDXW0_SA2 FDXW0_SI1511 FDXW0_SI2141 FDXW0_SI881 FDXW0_SX161 FDXW0_SX431 FEAC0_SA1 FEAC0_SA2 FEAC0_SI1245 FEAC0_SI1875 FEAC0_SX255 FEAC0_SX345 FEAC0_SX435 FEAR0_SA1 FEAR0_SA2 FEAR0_SI1252 FEAR0_SI1882 FEAR0_SX172 FEAR0_SX262 FEAR0_SX442 FEAR0_SX82 FECD0_SA2 FECD0_SI2048 FECD0_SX158 FECD0_SX248 FECD0_SX338 FECD0_SX428 FEEH0_SA2 FEEH0_SI1112 FEEH0_SX212 FEEH0_SX302 FEEH0_SX32 FEEH0_SX392 FEME0_SA2 FEME0_SI1505 FEME0_SI2135 FEME0_SX245 FEME0_SX425 FETB0_SA2 FETB0_SI1778 FETB0_SI518 FETB0_SX248 FETB0_SX338 FETB0_SX428 FETB0_SX68 FEXM0_SA2 FEXM0_SI1731 FEXM0_SX111 FEXM0_SX201 FEXM0_SX291 FEXM0_SX381 FGCS0_SA1 FGCS0_SA2 FGCS0_SI1486 FGCS0_SI2116 FGCS0_SI856 FGCS0_SX46 FGDP0_SA2 FGDP0_SI1618 FGDP0_SI2248 FGDP0_SX178 FGDP0_SX268 FGDP0_SX358 FGDP0_SX448 FGMB0_SA1 FGMB0_SA2 FGMB0_SI515 FGMB0_SX155 FGMB0_SX425 FGMB0_SX65 FGRW0_SA2 FGRW0_SI1782 FGRW0_SI1990 FGRW0_SX252 FGRW0_SX342 FGRW0_SX72 FHLM0_SA1 FHLM0_SA2 FHLM0_SI1560 FHLM0_SI2190 FHLM0_SI930 FHLM0_SX210 FHLM0_SX300 FHXS0_SI2335 FHXS0_SX265 FHXS0_SX355 FHXS0_SX85 FJDM2_SI1582 FJDM2_SI1964 FJDM2_SI2212 FJDM2_SX322 FJDM2_SX412 FJEN0_SA2 FJEN0_SI1047 FJEN0_SI1677 FJEN0_SI2307 FJEN0_SX147 FJEN0_SX237 FJEN0_SX57 FJHK0_SA1 FJHK0_SA2 FJHK0_SI1022 FJHK0_SI1652 FJHK0_SX122 FJHK0_SX212 FJHK0_SX32 FJHK0_SX392 FJKL0_SA1 FJKL0_SA2 FJKL0_SI1562 FJKL0_SI2192 FJKL0_SX122 FJKL0_SX302 FJKL0_SX32 FJLG0_SA1 FJLG0_SA2 FJLG0_SI1506 FJLG0_SX179 FJLG0_SX269 FJLG0_SX359 FJLG0_SX449 FJLG0_SX89 FJLR0_SA2 FJLR0_SI1861 FJLR0_SI601 FJLR0_SX151 FJLR0_SX241 FJLR0_SX331 FJLR0_SX421 FJLR0_SX61 FJRB0_SA1 FJRB0_SA2 FJRB0_SI1302 FJRB0_SI1932 
FJRB0_SI672 FJRB0_SX132 FJRB0_SX222 FJRB0_SX312 FJRB0_SX42 FJRP1_SA2 FJRP1_SI802 FJRP1_SX172 FJRP1_SX442 FJSK0_SA2 FJSK0_SI1682 FJSK0_SI2312 FJSK0_SX152 FJSK0_SX242 FJSK0_SX332 FJSK0_SX422 FJSK0_SX62 FJSP0_SA1 FJSP0_SA2 FJSP0_SI1763 FJSP0_SI804 FJSP0_SX174 FJSP0_SX84 FJWB1_SA2 FJWB1_SI2055 FJWB1_SI795 FJWB1_SX165 FJWB1_SX255 FJWB1_SX75 FJXM0_SA2 FJXM0_SI1211 FJXM0_SI1971 FJXM0_SX131 FJXM0_SX221 FJXP0_SA2 FJXP0_SI492 FJXP0_SX222 FJXP0_SX312 FJXP0_SX402 FJXP0_SX42 FKAA0_SA2 FKAA0_SI1208 FKAA0_SI1838 FKAA0_SI578 FKAA0_SX218 FKAA0_SX308 FKAA0_SX38 FKDE0_SA2 FKDE0_SI2221 FKDE0_SX331 FKDW0_SA1 FKDW0_SA2 FKDW0_SI577 FKDW0_SX127 FKDW0_SX217 FKDW0_SX307 FKDW0_SX37 FKFB0_SA1 FKFB0_SI2238 FKFB0_SI978 FKFB0_SX168 FKFB0_SX258 FKKH0_SI660 FKKH0_SX210 FKKH0_SX30 FKKH0_SX300 FKLC0_SA1 FKLC0_SA2 FKLC0_SI1615 FKLC0_SI2245 FKLC0_SX265 FKLC0_SX445 FKLC0_SX85 FKLC1_SA1 FKLC1_SA2 FKLC1_SI1678 FKLC1_SX148 FKLC1_SX58 FKLH0_SA1 FKLH0_SI1887 FKLH0_SI627 FKLH0_SX267 FKLH0_SX357 FKLH0_SX447 FKLH0_SX87 FKSR0_SI1117 FKSR0_SX161 FKSR0_SX37 FKSR0_SX397 FLAC0_SA1 FLAC0_SA2 FLAC0_SI2161 FLAC0_SI901 FLAC0_SX181 FLAC0_SX271 FLAC0_SX361 FLAC0_SX91 FLAG0_SA1 FLAG0_SI2094 FLAG0_SX294 FLEH0_SA1 FLEH0_SA2 FLEH0_SX151 FLEH0_SX241 FLEH0_SX421 FLEH0_SX61 FLET0_SA2 FLET0_SI1137 FLET0_SI1767 FLET0_SX147 FLET0_SX237 FLET0_SX277 FLET0_SX417 FLET0_SX57 FLHD0_SA1 FLHD0_SA2 FLHD0_SI1344 FLHD0_SI1974 FLHD0_SX174 FLHD0_SX264 FLHD0_SX444 FLHD0_SX84 FLJA0_SA2 FLJA0_SI1708 FLJA0_SX268 FLJA0_SX358 FLJA0_SX448 FLJA0_SX88 FLJD0_SA1 FLJD0_SA2 FLJD0_SI2146 FLJD0_SX166 FLJD0_SX256 FLJD0_SX346 FLJD0_SX436 FLJG0_SA1 FLJG0_SI1611 FLJG0_SI2241 FLJG0_SX261 FLJG0_SX441 FLJG0_SX81 FLKM0_SI1880 FLKM0_SX116 FLMA0_SA2 FLMA0_SI1243 FLMA0_SI1873 FLMA0_SX163 FLMA0_SX253 FLMA0_SX343 FLMC0_SA1 FLMC0_SA2 FLMC0_SI2002 FLMC0_SI742 FLMC0_SX112 FLMC0_SX292 FLMC0_SX336 FLMC0_SX382 FLMK0_SA2 FLMK0_SI2295 FLMK0_SX135 FLMK0_SX225 FLMK0_SX45 FLOD0_SA1 FLOD0_SA2 FLOD0_SI1287 FLOD0_SI657 FLOD0_SX207 FLOD0_SX387 FLTM0_SA2 FLTM0_SI1700 FLTM0_SX260 
FLTM0_SX80 FMAH1_SA1 FMAH1_SI1509 FMAH1_SI2139 FMAH1_SX249 FMAH1_SX339 FMAH1_SX429 FMAH1_SX69 FMBG0_SA1 FMBG0_SI1790 FMBG0_SX260 FMBG0_SX3 FMBG0_SX350 FMBG0_SX440 FMBG0_SX80 FMEM0_SA2 FMEM0_SI1377 FMEM0_SI2007 FMEM0_SX117 FMEM0_SX207 FMEM0_SX297 FMJB0_SA1 FMJB0_SA2 FMJB0_SI1807 FMJB0_SX187 FMJB0_SX277 FMJB0_SX367 FMJB0_SX7 FMJF0_SA1 FMJF0_SI1254 FMJF0_SI1884 FMJF0_SX264 FMJF0_SX354 FMJF0_SX444 FMJU0_SA1 FMJU0_SA2 FMJU0_SI2019 FMJU0_SI759 FMJU0_SX129 FMJU0_SX219 FMJU0_SX39 FMKC0_SA1 FMKC0_SA2 FMKC0_SI1072 FMKC0_SX172 FMKC0_SX262 FMKC0_SX352 FMKF0_SA1 FMKF0_SA2 FMKF0_SI1536 FMKF0_SI906 FMKF0_SX276 FMKF0_SX366 FMKF0_SX6 FMKF0_SX96 FMMH0_SA1 FMMH0_SA2 FMMH0_SI1537 FMMH0_SI2167 FMMH0_SI907 FMMH0_SX187 FMMH0_SX367 FMMH0_SX420 FMMH0_SX7 FMMH0_SX97 FMPG0_SI1602 FMPG0_SI2232 FMPG0_SX252 FMPG0_SX72 FNKL0_SA1 FNKL0_SA2 FNKL0_SI2152 FNKL0_SX172 FNKL0_SX196 FNKL0_SX262 FNKL0_SX442 FNKL0_SX82 FNTB0_SA1 FNTB0_SA2 FNTB0_SX123 FNTB0_SX213 FNTB0_SX33 FNTB0_SX393 FPAB1_SA2 FPAB1_SX121 FPAB1_SX301 FPAB1_SX31 FPAB1_SX391 FPAC0_SA1 FPAC0_SI2011 FPAC0_SX121 FPAC0_SX211 FPAC0_SX301 FPAC0_SX31 FPAC0_SX391 FPAD0_SA1 FPAD0_SI1346 FPAD0_SI1976 FPAD0_SX266 FPAD0_SX446 FPAF0_SI1684 FPAF0_SI2314 FPAF0_SX244 FPAF0_SX334 FPAF0_SX424 FPAF0_SX64 FPAZ0_SI1593 FPAZ0_SX153 FPAZ0_SX27 FPAZ0_SX423 FPAZ0_SX63 FPJF0_SA2 FPJF0_SI1046 FPJF0_SI1676 FPJF0_SX236 FPJF0_SX326 FPLS0_SA1 FPLS0_SA2 FPLS0_SI2220 FPLS0_SX150 FPLS0_SX240 FPLS0_SX3 FPLS0_SX60 FPMY0_SA2 FPMY0_SI1783 FPMY0_SX163 FPMY0_SX196 FPMY0_SX253 FPMY0_SX73 FREH0_SI1315 FREH0_SI685 FREH0_SX145 FREH0_SX235 FREH0_SX325 FREH0_SX55 FRJB0_SA1 FRJB0_SA2 FRJB0_SI1427 FRJB0_SI1470 FRJB0_SI1794 FRJB0_SX167 FRJB0_SX257 FRJB0_SX437 FRJB0_SX77 FRLL0_SA1 FRLL0_SA2 FRLL0_SI1514 FRLL0_SI884 FRLL0_SX164 FRLL0_SX254 FRLL0_SX344 FRLL0_SX74 FSAG0_SA2 FSAG0_SI1953 FSAG0_SI693 FSAG0_SX63 FSAH0_SI1244 FSAH0_SI1874 FSAH0_SX344 FSAH0_SX74 FSAK0_SA1 FSAK0_SA2 FSAK0_SI1930 FSAK0_SI670 FSAK0_SX130 FSAK0_SX220 FSAK0_SX310 FSAK0_SX40 FSAK0_SX400 FSBK0_SA1 FSBK0_SI1699 
FSBK0_SI2329 FSBK0_SX259 FSBK0_SX439 FSBK0_SX79 FSCN0_SI1886 FSCN0_SX356 FSDC0_SA1 FSDC0_SI1942 FSDC0_SI2234 FSDC0_SX232 FSDC0_SX412 FSDJ0_SA1 FSDJ0_SA2 FSDJ0_SI1745 FSDJ0_SX125 FSDJ0_SX35 FSGF0_SA1 FSGF0_SA2 FSGF0_SI1557 FSGF0_SX207 FSGF0_SX27 FSGF0_SX297 FSGF0_SX387 FSJG0_SI1570 FSJG0_SI2200 FSJG0_SX310 FSJK1_SA1 FSJK1_SI1025 FSJK1_SI2285 FSJK1_SI696 FSJK1_SX215 FSJK1_SX305 FSJK1_SX395 FSJS0_SA2 FSJS0_SI1171 FSJS0_SI1801 FSJS0_SI541 FSJS0_SX271 FSJS0_SX361 FSJS0_SX91 FSJW0_SA1 FSJW0_SA2 FSJW0_SI703 FSJW0_SX163 FSJW0_SX253 FSJW0_SX343 FSJW0_SX73 FSKC0_SA1 FSKC0_SA2 FSKC0_SI2046 FSKC0_SX156 FSKC0_SX336 FSKC0_SX426 FSKC0_SX66 FSKL0_SA1 FSKL0_SA2 FSKL0_SI2159 FSKL0_SI899 FSKL0_SX179 FSKL0_SX269 FSKL0_SX359 FSKL0_SX89 FSKP0_SA1 FSKP0_SI1728 FSKP0_SI468 FSKP0_SX108 FSKP0_SX18 FSKP0_SX198 FSKP0_SX288 FSKP0_SX378 FSLS0_SA1 FSLS0_SA2 FSLS0_SI1056 FSLS0_SI1686 FSLS0_SI2316 FSLS0_SX202 FSLS0_SX246 FSLS0_SX66 FSMA0_SA1 FSMA0_SI1621 FSMA0_SI2251 FSMA0_SX271 FSMA0_SX361 FSMA0_SX91 FSMM0_SA1 FSMM0_SA2 FSMM0_SI1314 FSMM0_SI1944 FSMM0_SI684 FSMM0_SX414 FSMM0_SX54 FSMS1_SA1 FSMS1_SA2 FSMS1_SI1504 FSMS1_SI2134 FSMS1_SI874 FSMS1_SX154 FSMS1_SX334 FSMS1_SX64 FSPM0_SA1 FSPM0_SI1871 FSPM0_SI611 FSPM0_SX341 FSPM0_SX431 FSRH0_SA1 FSRH0_SA2 FSRH0_SI1719 FSRH0_SX131 FSRH0_SX41 FSSB0_SA1 FSSB0_SA2 FSSB0_SI1082 FSSB0_SI2342 FSSB0_SX182 FSSB0_SX272 FSSB0_SX452 FSSB0_SX92 FTAJ0_SA1 FTAJ0_SA2 FTAJ0_SI1329 FTAJ0_SI474 FTAJ0_SX339 FTAJ0_SX69 FTBR0_SA1 FTBR0_SA2 FTBR0_SI2181 FTBR0_SX111 FTBR0_SX201 FTBR0_SX291 FTBR0_SX381 FTBW0_SA2 FTBW0_SI1345 FTBW0_SI1975 FTBW0_SX265 FTBW0_SX355 FTBW0_SX445 FTBW0_SX85 FTLG0_SA1 FTLG0_SA2 FTLG0_SI840 FTLG0_SX123 FTLG0_SX213 FTLG0_SX303 FTLG0_SX33 FTLG0_SX393 FTMG0_SA1 FTMG0_SA2 FTMG0_SX182 FTMG0_SX272 FTMG0_SX362 FTMG0_SX92 FVFB0_SA1 FVFB0_SI1032 FVFB0_SI2292 FVFB0_SX222 FVFB0_SX312 FVFB0_SX402 FVKB0_SA2 FVKB0_SI1159 FVKB0_SI1789 FVKB0_SI529 FVKB0_SX169 FVKB0_SX259 FVKB0_SX439 FVKB0_SX79 FVMH0_SA1 FVMH0_SI2096 FVMH0_SX206 FVMH0_SX296 FVMH0_SX386 MABC0_SA1 
MABC0_SA2 MABC0_SX151 MABC0_SX241 MABC0_SX331 MABC0_SX421 MABC0_SX61 MADC0_SA1 MADC0_SA2 MADC0_SI1997 MADC0_SX17 MADC0_SX197 MADC0_SX287 MADD0_SA1 MADD0_SI1798 MADD0_SI538 MADD0_SX358 MADD0_SX448 MAEB0_SA1 MAEB0_SA2 MAEB0_SI2250 MAEB0_SI990 MAEB0_SX180 MAEB0_SX270 MAEB0_SX360 MAEB0_SX90 MAEO0_SA2 MAEO0_SI1655 MAEO0_SI1956 MAEO0_SX156 MAEO0_SX246 MAEO0_SX336 MAEO0_SX426 MAEO0_SX66 MAFM0_SA1 MAFM0_SA2 MAFM0_SI1569 MAFM0_SI2199 MAFM0_SX219 MAFM0_SX39 MAFM0_SX399 MAJP0_SA1 MAJP0_SI1074 MAJP0_SI2334 MAJP0_SX264 MAJP0_SX354 MAJP0_SX444 MAJP0_SX84 MAKB0_SA1 MAKB0_SX206 MAKB0_SX296 MAKR0_SA1 MAKR0_SA2 MAKR0_SI1352 MAKR0_SI1982 MAKR0_SI722 MAKR0_SX182 MAKR0_SX272 MAKR0_SX452 MAPV0_SA1 MAPV0_SA2 MAPV0_SI1923 MAPV0_SX123 MAPV0_SX303 MAPV0_SX33 MAPV0_SX393 MARC0_SA1 MARC0_SI1188 MARC0_SI1818 MARC0_SI558 MARC0_SX288 MARC0_SX378 MARW0_SA1 MARW0_SA2 MARW0_SI1276 MARW0_SI646 MARW0_SX106 MARW0_SX16 MARW0_SX376 MBAR0_SA2 MBAR0_SI1319 MBAR0_SI1949 MBAR0_SI689 MBAR0_SX149 MBAR0_SX239 MBAR0_SX329 MBBR0_SA1 MBBR0_SA2 MBBR0_SI1685 MBBR0_SX155 MBBR0_SX245 MBBR0_SX425 MBCG0_SA2 MBCG0_SI2217 MBCG0_SX147 MBCG0_SX237 MBCG0_SX417 MBCG0_SX57 MBEF0_SA1 MBEF0_SA2 MBEF0_SX111 MBEF0_SX201 MBEF0_SX291 MBGT0_SA1 MBGT0_SI1341 MBGT0_SI711 MBGT0_SX81 MBJV0_SA2 MBJV0_SI1247 MBJV0_SI1877 MBJV0_SX167 MBJV0_SX257 MBJV0_SX437 MBJV0_SX77 MBMA0_SA1 MBMA0_SA2 MBMA0_SI1852 MBMA0_SX142 MBMA0_SX322 MBMA0_SX412 MBMA1_SA1 MBMA1_SA2 MBMA1_SI2207 MBMA1_SX144 MBMA1_SX234 MBMA1_SX414 MBML0_SA1 MBML0_SI1799 MBML0_SI539 MBML0_SX179 MBML0_SX269 MBML0_SX359 MBML0_SX449 MBOM0_SA1 MBOM0_SI1014 MBOM0_SI1644 MBOM0_SX114 MBOM0_SX204 MBOM0_SX311 MBOM0_SX384 MBSB0_SA2 MBSB0_SI1353 MBSB0_SI1983 MBSB0_SI723 MBSB0_SX183 MBSB0_SX273 MBSB0_SX363 MBSB0_SX93 MBTH0_SA1 MBTH0_SI505 MBTH0_SI757 MBTH0_SX212 MBTH0_SX302 MBTH0_SX392 MBWP0_SA1 MBWP0_SA2 MBWP0_SI1531 MBWP0_SI1969 MBWP0_SI709 MBWP0_SX169 MBWP0_SX259 MBWP0_SX439 MBWP0_SX79 MCAE0_SA1 MCAE0_SA2 MCAE0_SX187 MCAE0_SX367 MCAE0_SX7 MCAE0_SX97 MCAL0_SA1 MCAL0_SI508 MCAL0_SX148 
MCAL0_SX238 MCAL0_SX328 MCAL0_SX418 MCAL0_SX58 MCDC0_SA2 MCDC0_SI1292 MCDC0_SI1922 MCDC0_SI662 MCDC0_SX122 MCDC0_SX302 MCDC0_SX32 MCDC0_SX392 MCDD0_SA1 MCDD0_SI1513 MCDD0_SI2143 MCDD0_SX163 MCDD0_SX343 MCDD0_SX73 MCDR0_SA1 MCDR0_SA2 MCDR0_SX164 MCDR0_SX254 MCDR0_SX344 MCDR0_SX434 MCDR0_SX74 MCEF0_SA1 MCEF0_SA2 MCEF0_SI1135 MCEF0_SI1765 MCEF0_SX145 MCEF0_SX325 MCEF0_SX55 MCEW0_SI1442 MCEW0_SX182 MCEW0_SX272 MCEW0_SX92 MCHL0_SA1 MCHL0_SA2 MCHL0_SI1977 MCHL0_SX177 MCHL0_SX267 MCHL0_SX357 MCHL0_SX447 MCLK0_SA1 MCLK0_SA2 MCLK0_SI1660 MCLK0_SX130 MCLK0_SX220 MCLK0_SX40 MCLK0_SX400 MCLM0_SA2 MCLM0_SI1456 MCLM0_SX106 MCLM0_SX16 MCLM0_SX196 MCLM0_SX286 MCLM0_SX376 MCPM0_SA2 MCPM0_SI1194 MCPM0_SI564 MCPM0_SX204 MCPM0_SX24 MCRE0_SA1 MCRE0_SA2 MCRE0_SI1121 MCRE0_SI1725 MCRE0_SI1751 MCRE0_SX131 MCRE0_SX221 MCRE0_SX24 MCRE0_SX401 MCRE0_SX41 MCSS0_SA1 MCSS0_SA2 MCSS0_SX120 MCSS0_SX210 MCSS0_SX30 MCSS0_SX300 MCSS0_SX390 MCTH0_SA2 MCTH0_SI1209 MCTH0_SI1839 MCTH0_SI579 MCTH0_SX129 MCTH0_SX219 MCTH0_SX309 MCTH0_SX399 MCTM0_SA1 MCTM0_SA2 MCTM0_SI720 MCTM0_SX180 MCTM0_SX270 MCTM0_SX360 MCTM0_SX450 MCTM0_SX90 MCXM0_SA1 MCXM0_SA2 MCXM0_SI1351 MCXM0_SI1981 MCXM0_SI721 MCXM0_SX181 MCXM0_SX271 MCXM0_SX361 MCXM0_SX451 MDAC0_SA2 MDAC0_SI1261 MDAC0_SI1837 MDAC0_SX271 MDAC0_SX451 MDAC0_SX91 MDAS0_SA1 MDAS0_SA2 MDAS0_SI1266 MDAS0_SX186 MDAS0_SX21 MDAS0_SX276 MDAS0_SX96 MDBB1_SA1 MDBB1_SA2 MDBB1_SI1006 MDBB1_SI1636 MDBB1_SI2056 MDBB1_SX196 MDBB1_SX286 MDBP0_SA1 MDBP0_SA2 MDBP0_SI1158 MDBP0_SI1788 MDBP0_SX258 MDBP0_SX348 MDBP0_SX78 MDCD0_SA1 MDCD0_SA2 MDCD0_SI2045 MDCD0_SX155 MDCD0_SX65 MDCM0_SA1 MDCM0_SA2 MDCM0_SI2110 MDCM0_SI850 MDCM0_SX130 MDCM0_SX220 MDCM0_SX310 MDDC0_SA1 MDDC0_SA2 MDDC0_SX249 MDDC0_SX339 MDDC0_SX429 MDED0_SI1170 MDED0_SI1800 MDED0_SX180 MDED0_SX270 MDED0_SX360 MDED0_SX450 MDED0_SX90 MDEF0_SA1 MDEF0_SA2 MDEF0_SI1563 MDEF0_SI2193 MDEF0_SX213 MDEF0_SX33 MDEF0_SX393 MDEM0_SA2 MDEM0_SI1868 MDEM0_SX158 MDEM0_SX248 MDEM0_SX338 MDEM0_SX68 MDHL0_SA1 MDHL0_SA2 MDHL0_SI2069 MDHL0_SI809 
MDHL0_SX179 MDHL0_SX359 MDHL0_SX89 MDHS0_SX180 MDHS0_SX270 MDHS0_SX360 MDHS0_SX450 MDHS0_SX90 MDJM0_SA1 MDJM0_SA2 MDJM0_SI2085 MDJM0_SI825 MDJM0_SX195 MDJM0_SX285 MDJM0_SX375 MDKS0_SA1 MDKS0_SA2 MDKS0_SI1066 MDKS0_SI1696 MDKS0_SI2326 MDKS0_SX256 MDKS0_SX76 MDLB0_SA1 MDLB0_SI1936 MDLB0_SI676 MDLB0_SX226 MDLB0_SX316 MDLB0_SX46 MDLC0_SA1 MDLC0_SA2 MDLC0_SI765 MDLC0_SX135 MDLC0_SX225 MDLC0_SX315 MDLC0_SX45 MDLC1_SA1 MDLC1_SX175 MDLC1_SX265 MDLC1_SX355 MDLC1_SX85 MDLC2_SA1 MDLC2_SA2 MDLC2_SI1614 MDLC2_SI984 MDLC2_SX174 MDLC2_SX264 MDLC2_SX444 MDLC2_SX84 MDLH0_SA1 MDLH0_SI1960 MDLH0_SI574 MDLH0_SI700 MDLH0_SX250 MDLH0_SX340 MDLH0_SX70 MDLM0_SA1 MDLM0_SA2 MDLM0_SX244 MDLM0_SX334 MDLM0_SX64 MDLR0_SI1233 MDLR0_SX243 MDLR0_SX423 MDLR0_SX63 MDLR1_SI1299 MDLR1_SI1929 MDLR1_SX129 MDLR1_SX219 MDLR1_SX309 MDLR1_SX39 MDLR1_SX399 MDMA0_SA1 MDMA0_SA2 MDMA0_SI1238 MDMA0_SI2060 MDMT0_SI2341 MDMT0_SI572 MDMT0_SX212 MDMT0_SX302 MDMT0_SX392 MDNS0_SA1 MDNS0_SX111 MDNS0_SX291 MDNS0_SX381 MDPB0_SA1 MDPB0_SA2 MDPB0_SI2126 MDPB0_SX146 MDPB0_SX236 MDPB0_SX326 MDPB0_SX56 MDPK0_SA1 MDPK0_SA2 MDPK0_SI1683 MDPK0_SI552 MDPK0_SX153 MDPK0_SX243 MDPK0_SX63 MDPS0_SA1 MDPS0_SA2 MDPS0_SI1651 MDPS0_SI1979 MDPS0_SX179 MDPS0_SX269 MDPS0_SX449 MDPS0_SX89 MDRD0_SA2 MDRD0_SI1382 MDRD0_SI2012 MDRD0_SX122 MDRD0_SX212 MDRD0_SX302 MDRD0_SX392 MDSJ0_SA1 MDSJ0_SA2 MDSJ0_SI832 MDSJ0_SX112 MDSJ0_SX22 MDSJ0_SX292 MDSJ0_SX382 MDSS0_SA1 MDSS0_SI1881 MDSS0_SI2087 MDSS0_SI621 MDSS0_SX171 MDSS0_SX261 MDSS0_SX351 MDSS0_SX81 MDSS1_SA2 MDSS1_SI1713 MDSS1_SX247 MDSS1_SX337 MDSS1_SX427 MDTB0_SA1 MDTB0_SA2 MDTB0_SI570 MDTB0_SX210 MDTB0_SX300 MDTB0_SX321 MDTB0_SX390 MDWD0_SA1 MDWD0_SI1890 MDWD0_SI557 MDWD0_SX180 MDWD0_SX360 MDWD0_SX450 MDWH0_SA2 MDWH0_SI1925 MDWH0_SX125 MDWH0_SX35 MDWH0_SX395 MDWM0_SI1546 MDWM0_SI2176 MDWM0_SX106 MDWM0_SX376 MDWM0_SX433 MEAL0_SA1 MEAL0_SI1547 MEAL0_SI917 MEAL0_SX197 MEAL0_SX287 MEAL0_SX377 MEDR0_SI744 MEDR0_SX114 MEDR0_SX204 MEDR0_SX24 MEDR0_SX294 MEDR0_SX384 MEFG0_SA2 MEFG0_SI465 MEFG0_SX105 
MEFG0_SX15 MEFG0_SX195 MEFG0_SX285 MEFG0_SX375 MEGJ0_SI1967 MEGJ0_SX437 MEGJ0_SX77 MEJL0_SA2 MEJL0_SI1592 MEJL0_SI1654 MEJL0_SI962 MEJL0_SX332 MEJL0_SX422 MEJL0_SX62 MEJS0_SA1 MEJS0_SA2 MEJS0_SI1870 MEJS0_SX250 MEJS0_SX430 MEJS0_SX70 MESG0_SA1 MESG0_SA2 MESG0_SI1332 MESG0_SI1962 MESG0_SX162 MESG0_SX252 MESG0_SX342 MESG0_SX72 MESJ0_SA1 MESJ0_SA2 MESJ0_SI2257 MESJ0_SI997 MESJ0_SX277 MESJ0_SX367 MESJ0_SX7 MEWM0_SA1 MEWM0_SA2 MEWM0_SI1348 MEWM0_SI1978 MEWM0_SX268 MEWM0_SX358 MEWM0_SX448 MFER0_SA1 MFER0_SA2 MFER0_SI1492 MFER0_SI2122 MFER0_SX232 MFER0_SX322 MFER0_SX412 MFER0_SX52 MFMC0_SA1 MFMC0_SA2 MFMC0_SI1132 MFMC0_SI1762 MFMC0_SI502 MFMC0_SX142 MFMC0_SX232 MFMC0_SX322 MFMC0_SX412 MFMC0_SX52 MFRM0_SA1 MFRM0_SA2 MFRM0_SI1155 MFRM0_SI1717 MFRM0_SI1785 MFRM0_SX165 MFRM0_SX255 MFRM0_SX75 MFWK0_SA1 MFWK0_SA2 MFWK0_SI1249 MFWK0_SI619 MFWK0_SX259 MFWK0_SX439 MFWK0_SX79 MFXS0_SA1 MFXS0_SA2 MFXS0_SI1674 MFXS0_SI2225 MFXS0_SI2304 MFXS0_SX144 MFXS0_SX234 MFXS0_SX414 MFXV0_SA1 MFXV0_SI1635 MFXV0_SX15 MFXV0_SX195 MFXV0_SX285 MFXV0_SX375 MGAF0_SA2 MGAF0_SI1912 MGAF0_SI652 MGAF0_SX112 MGAF0_SX202 MGAF0_SX292 MGAG0_SA1 MGAG0_SI1321 MGAG0_SI645 MGAG0_SX151 MGAG0_SX241 MGAG0_SX331 MGAG0_SX421 MGAG0_SX61 MGAK0_SA1 MGAK0_SA2 MGAK0_SI1666 MGAK0_SI2296 MGAK0_SX316 MGAK0_SX406 MGAR0_SA1 MGAR0_SA2 MGAR0_SI1212 MGAR0_SI1694 MGAR0_SI1842 MGAR0_SX222 MGAR0_SX402 MGAR0_SX42 MGAW0_SA1 MGAW0_SA2 MGAW0_SI1802 MGAW0_SX265 MGAW0_SX355 MGAW0_SX445 MGAW0_SX85 MGES0_SA2 MGES0_SI1481 MGES0_SX131 MGES0_SX221 MGES0_SX401 MGES0_SX41 MGJC0_SA1 MGJC0_SI1256 MGJC0_SI1335 MGJC0_SI1965 MGJC0_SX165 MGJC0_SX255 MGJC0_SX345 MGRL0_SA1 MGRL0_SA2 MGRL0_SI1497 MGRL0_SX237 MGRL0_SX417 MGRL0_SX57 MGRP0_SA1 MGRP0_SI1947 MGRP0_SI687 MGRP0_SX147 MGRP0_SX237 MGRP0_SX417 MGRP0_SX57 MGSH0_SA1 MGSH0_SX186 MGSH0_SX96 MGSL0_SA2 MGSL0_SI1164 MGSL0_SX174 MGSL0_SX354 MGSL0_SX444 MGSL0_SX84 MGXP0_SA1 MGXP0_SA2 MGXP0_SI457 MGXP0_SX277 MGXP0_SX367 MGXP0_SX97 MHBS0_SA1 MHBS0_SA2 MHBS0_SI1575 MHBS0_SI2205 MHBS0_SX135 MHBS0_SX225 
MHBS0_SX405 MHIT0_SA2 MHIT0_SI1613 MHIT0_SI2243 MHIT0_SX173 MHIT0_SX263 MHIT0_SX353 MHIT0_SX443 MHIT0_SX83 MHJB0_SA2 MHJB0_SI1647 MHJB0_SI2277 MHJB0_SX117 MHJB0_SX207 MHJB0_SX27 MHJB0_SX297 MHJB0_SX387 MHMG0_SA1 MHMG0_SA2 MHMG0_SI1365 MHMG0_SI1995 MHMG0_SX105 MHMG0_SX15 MHMG0_SX285 MHMG0_SX375 MHMR0_SA2 MHMR0_SI1119 MHMR0_SX129 MHMR0_SX219 MHMR0_SX309 MHMR0_SX39 MHMR0_SX399 MHRM0_SA2 MHRM0_SI1475 MHRM0_SI2218 MHRM0_SX238 MHRM0_SX328 MHRM0_SX418 MHXL0_SA1 MHXL0_SA2 MHXL0_SI512 MHXL0_SI612 MHXL0_SX152 MHXL0_SX332 MHXL0_SX422 MHXL0_SX62 MILB0_SA1 MILB0_SI2163 MILB0_SI807 MILB0_SX183 MILB0_SX273 MILB0_SX3 MILB0_SX363 MILB0_SX93 MJAC0_SA1 MJAC0_SA2 MJAC0_SI1331 MJAC0_SI2148 MJAC0_SX341 MJAC0_SX431 MJAE0_SA1 MJAE0_SA2 MJAE0_SI1524 MJAE0_SI1999 MJAE0_SI2154 MJAE0_SX264 MJAE0_SX354 MJAE0_SX444 MJAI0_SI1604 MJAI0_SX164 MJAI0_SX254 MJAI0_SX344 MJAI0_SX434 MJAI0_SX74 MJBG0_SA1 MJBG0_SA2 MJBG0_SI1232 MJBG0_SI1724 MJBG0_SI1862 MJBG0_SX152 MJBG0_SX242 MJBG0_SX332 MJBG0_SX422 MJDA0_SA1 MJDA0_SA2 MJDA0_SI1661 MJDA0_SI2291 MJDA0_SX131 MJDA0_SX221 MJDA0_SX401 MJDA0_SX41 MJDC0_SA1 MJDC0_SA2 MJDC0_SI1161 MJDC0_SI2165 MJDC0_SX171 MJDC0_SX261 MJDC0_SX351 MJDC0_SX441 MJDC0_SX81 MJDE0_SA2 MJDE0_SX130 MJDE0_SX310 MJDE0_SX40 MJDE0_SX400 MJDG0_SA1 MJDG0_SI1672 MJDG0_SX142 MJDG0_SX232 MJDG0_SX322 MJDG0_SX412 MJDG0_SX52 MJDM0_SA2 MJDM0_SI1937 MJDM0_SX260 MJDM0_SX440 MJDM0_SX80 MJEB0_SA1 MJEB0_SA2 MJEB0_SI1286 MJEB0_SI1916 MJEB0_SX206 MJEB0_SX26 MJEB0_SX386 MJEB1_SA1 MJEB1_SI2097 MJEB1_SX117 MJEB1_SX27 MJEB1_SX297 MJEE0_SA2 MJEE0_SI1237 MJEE0_SI1867 MJEE0_SI607 MJEE0_SX157 MJEE0_SX427 MJEE0_SX67 MJFH0_SA1 MJFH0_SI1737 MJFH0_SI477 MJFH0_SX117 MJFH0_SX207 MJFH0_SX27 MJFH0_SX297 MJFH0_SX387 MJFR0_SA2 MJFR0_SI1605 MJFR0_SI2235 MJFR0_SI975 MJFR0_SX165 MJFR0_SX255 MJFR0_SX345 MJHI0_SA2 MJHI0_SI555 MJHI0_SI698 MJHI0_SX248 MJHI0_SX338 MJHI0_SX428 MJHI0_SX68 MJJB0_SA2 MJJB0_SI1139 MJJB0_SI1277 MJJB0_SI1769 MJJB0_SX149 MJJB0_SX329 MJJB0_SX419 MJJB0_SX59 MJJJ0_SA1 MJJJ0_SA2 MJJJ0_SI1793 MJJJ0_SI533 
MJJJ0_SX173 MJJJ0_SX263 MJJJ0_SX353 MJJJ0_SX83 MJJM0_SA1 MJJM0_SI1457 MJJM0_SX17 MJJM0_SX197 MJJM0_SX287 MJJM0_SX377 MJKR0_SA2 MJKR0_SI1201 MJKR0_SI1831 MJKR0_SX121 MJKR0_SX211 MJKR0_SX301 MJKR0_SX31 MJKR0_SX391 MJLB0_SA1 MJLB0_SA2 MJLB0_SI2246 MJLB0_SI986 MJLB0_SX266 MJLB0_SX356 MJLB0_SX446 MJLB0_SX86 MJLG1_SA1 MJLG1_SA2 MJLG1_SI1012 MJLG1_SI1642 MJLG1_SI2272 MJLG1_SX112 MJLG1_SX202 MJLG1_SX22 MJLG1_SX382 MJLS0_SA1 MJLS0_SA2 MJLS0_SI1096 MJLS0_SI466 MJLS0_SX16 MJLS0_SX196 MJLS0_SX286 MJLS0_SX376 MJMA0_SI1495 MJMA0_SI865 MJMA0_SX145 MJMA0_SX235 MJMA0_SX325 MJMA0_SX415 MJMA0_SX55 MJMD0_SA1 MJMD0_SI1028 MJMD0_SI1658 MJMD0_SX128 MJMD0_SX218 MJMD0_SX398 MJMM0_SA1 MJMM0_SA2 MJMM0_SI1885 MJMM0_SI625 MJMM0_SX265 MJMM0_SX355 MJMM0_SX445 MJPG0_SA1 MJPG0_SA2 MJPG0_SI561 MJPG0_SX291 MJPG0_SX381 MJPM0_SA1 MJPM0_SI1998 MJPM0_SI738 MJPM0_SX108 MJPM0_SX18 MJPM0_SX198 MJPM0_SX288 MJPM1_SA1 MJPM1_SA2 MJPM1_SI1897 MJPM1_SI761 MJPM1_SX131 MJPM1_SX221 MJPM1_SX41 MJRA0_SI606 MJRA0_SX156 MJRA0_SX246 MJRA0_SX66 MJRG0_SA1 MJRG0_SA2 MJRG0_SX106 MJRG0_SX16 MJRG0_SX286 MJRH0_SA1 MJRH0_SA2 MJRH0_SI1125 MJRH0_SI1755 MJRH0_SX135 MJRH0_SX315 MJRH0_SX405 MJRH0_SX45 MJRH1_SA2 MJRH1_SI1774 MJRH1_SX334 MJRH1_SX64 MJRK0_SI2103 MJRK0_SX340 MJRK0_SX70 MJRP0_SI1835 MJRP0_SI585 MJRP0_SX135 MJRP0_SX315 MJRP0_SX405 MJRP0_SX45 MJSR0_SA2 MJSR0_SX164 MJSR0_SX254 MJSR0_SX434 MJSR0_SX74 MJWG0_SA2 MJWG0_SI2155 MJWG0_SX355 MJWG0_SX445 MJWG0_SX85 MJWS0_SA1 MJWS0_SA2 MJWS0_SI1143 MJWS0_SI1773 MJWS0_SX243 MJWS0_SX423 MJWT0_SA2 MJWT0_SI751 MJXA0_SA1 MJXA0_SA2 MJXA0_SI1507 MJXA0_SI2137 MJXA0_SI877 MJXA0_SX157 MJXA0_SX247 MJXA0_SX337 MJXA0_SX67 MJXL0_SA1 MJXL0_SA2 MJXL0_SI1795 MJXL0_SX182 MJXL0_SX272 MJXL0_SX362 MJXL0_SX452 MJXL0_SX92 MKAG0_SA2 MKAG0_SI1609 MKAG0_SI2239 MKAG0_SX169 MKAG0_SX30 MKAG0_SX439 MKAG0_SX79 MKAH0_SA1 MKAH0_SA2 MKAH0_SI1528 MKAH0_SI2158 MKAH0_SI898 MKAH0_SX268 MKAH0_SX358 MKAH0_SX448 MKAH0_SX88 MKAJ0_SA1 MKAJ0_SI1414 MKAJ0_SI2044 MKAJ0_SI784 MKAJ0_SX244 MKAJ0_SX334 MKAJ0_SX424 MKAJ0_SX64 
MKAM0_SA2 MKAM0_SI1316 MKAM0_SX236 MKAM0_SX416 MKDB0_SI2132 MKDB0_SI588 MKDB0_SI872 MKDB0_SX242 MKDB0_SX332 MKDB0_SX422 MKDB0_SX62 MKDD0_SA1 MKDD0_SX127 MKDD0_SX217 MKDD0_SX307 MKDD0_SX37 MKDD0_SX397 MKDT0_SA1 MKDT0_SA2 MKDT0_SI2153 MKDT0_SI893 MKDT0_SX173 MKDT0_SX263 MKDT0_SX353 MKDT0_SX443 MKDT0_SX83 MKES0_SA2 MKES0_SX263 MKES0_SX353 MKES0_SX443 MKES0_SX83 MKJO0_SA1 MKJO0_SA2 MKJO0_SI2147 MKJO0_SX167 MKJO0_SX257 MKJO0_SX424 MKJO0_SX77 MKLN0_SA1 MKLN0_SA2 MKLN0_SI1598 MKLN0_SI2228 MKLN0_SX158 MKLN0_SX338 MKLN0_SX428 MKLN0_SX68 MKLR0_SA1 MKLR0_SI1059 MKLR0_SI2319 MKLR0_SX159 MKLR0_SX249 MKLR0_SX339 MKLR0_SX429 MKLR0_SX69 MKLS0_SA2 MKLS0_SI1533 MKLS0_SX177 MKLS0_SX267 MKLS0_SX447 MKLS1_SI1545 MKLS1_SI2175 MKLS1_SX105 MKLS1_SX15 MKLS1_SX195 MKLS1_SX285 MKLW0_SA2 MKLW0_SI1844 MKLW0_SI2201 MKLW0_SX131 MKLW0_SX221 MKLW0_SX401 MKLW0_SX41 MKRG0_SA1 MKRG0_SA2 MKRG0_SI1491 MKRG0_SI2121 MKRG0_SX141 MKRG0_SX231 MKRG0_SX31 MKRG0_SX51 MKXL0_SA1 MKXL0_SI1185 MKXL0_SX105 MKXL0_SX195 MKXL0_SX285 MLBC0_SA2 MLBC0_SI609 MLBC0_SX159 MLBC0_SX339 MLBC0_SX429 MLBC0_SX69 MLEL0_SI1876 MLEL0_SX346 MLEL0_SX76 MLJC0_SA1 MLJC0_SA2 MLJC0_SI1855 MLJC0_SI595 MLJC0_SX235 MLJC0_SX325 MLJC0_SX55 MLJH0_SI1324 MLJH0_SX154 MLJH0_SX334 MLJH0_SX424 MLNS0_SA1 MLNS0_SA2 MLNS0_SI1407 MLNS0_SI777 MLNS0_SX147 MLNS0_SX237 MLNS0_SX327 MLNS0_SX417 MLNS0_SX57 MLSH0_SA1 MLSH0_SA2 MLSH0_SI2047 MLSH0_SI787 MLSH0_SX157 MLSH0_SX337 MLSH0_SX427 MLSH0_SX67 MMAA0_SI2105 MMAA0_SX125 MMAA0_SX215 MMAA0_SX305 MMAA0_SX395 MMAB1_SA1 MMAB1_SA2 MMAB1_SI2124 MMAB1_SX144 MMAB1_SX414 MMAB1_SX54 MMAG0_SI496 MMAG0_SX226 MMAG0_SX406 MMAG0_SX46 MMAM0_SA1 MMAM0_SA2 MMAM0_SI1597 MMAM0_SI1668 MMAM0_SX247 MMAM0_SX337 MMAM0_SX67 MMAR0_SA1 MMAR0_SA2 MMAR0_SI1336 MMAR0_SI706 MMAR0_SX436 MMAR0_SX76 MMBS0_SA1 MMBS0_SA2 MMBS0_SI1151 MMBS0_SX251 MMBS0_SX341 MMBS0_SX431 MMBS0_SX71 MMCC0_SA1 MMCC0_SI1968 MMCC0_SI708 MMCC0_SX168 MMCC0_SX258 MMCC0_SX348 MMCC0_SX438 MMCC0_SX78 MMDB0_SA1 MMDB0_SA2 MMDB0_SI1358 MMDB0_SI1617 MMDB0_SX267 MMDB0_SX357 
MMDB0_SX447 MMDB0_SX87 MMDG0_SI2035 MMDG0_SX340 MMDG0_SX430 MMDG0_SX70 MMDM0_SA1 MMDM0_SA2 MMDM0_SX231 MMDM0_SX321 MMDM0_SX411 MMDM0_SX51 MMDM1_SA1 MMDM1_SI1650 MMDM1_SI783 MMDM1_SX243 MMDS0_SA2 MMDS0_SI1343 MMDS0_SI1973 MMDS0_SI713 MMDS0_SX173 MMDS0_SX263 MMDS0_SX353 MMDS0_SX443 MMDS0_SX83 MMEA0_SA2 MMEA0_SI1388 MMEA0_SI2018 MMEA0_SI758 MMEA0_SX218 MMEA0_SX308 MMEA0_SX38 MMEB0_SA1 MMEB0_SI1357 MMEB0_SI1987 MMEB0_SI727 MMEB0_SX7 MMEB0_SX97 MMGC0_SA1 MMGC0_SI1935 MMGC0_SI2184 MMGC0_SX315 MMGC0_SX405 MMGC0_SX45 MMGG0_SA1 MMGG0_SA2 MMGG0_SI1709 MMGG0_SI2339 MMGG0_SX179 MMGG0_SX359 MMGG0_SX89 MMGK0_SA1 MMGK0_SA2 MMGK0_SI1322 MMGK0_SI1952 MMGK0_SI692 MMGK0_SX152 MMGK0_SX242 MMGK0_SX422 MMJB1_SA1 MMJB1_SI1408 MMJB1_SI2038 MMJB1_SI778 MMJB1_SX148 MMJB1_SX238 MMJB1_SX328 MMJB1_SX418 MMJB1_SX58 MMLM0_SA1 MMLM0_SA2 MMLM0_SI1527 MMLM0_SI897 MMLM0_SX177 MMLM0_SX267 MMLM0_SX357 MMLM0_SX447 MMLM0_SX87 MMPM0_SA1 MMPM0_SA2 MMPM0_SI1061 MMPM0_SI1691 MMPM0_SI2321 MMPM0_SX251 MMPM0_SX341 MMPM0_SX431 MMPM0_SX71 MMRP0_SA1 MMRP0_SI2034 MMRP0_SI717 MMRP0_SI774 MMRP0_SX234 MMRP0_SX414 MMRP0_SX54 MMSM0_SA1 MMSM0_SA2 MMSM0_SI1736 MMSM0_SX26 MMSM0_SX296 MMSM0_SX386 MMVP0_SI1284 MMVP0_SI1914 MMVP0_SX114 MMVP0_SX204 MMVP0_SX294 MMVP0_SX384 MMWB0_SA2 MMWB0_SI1619 MMWB0_SX179 MMWB0_SX269 MMWS0_SA1 MMWS0_SI1518 MMWS0_SI559 MMWS0_SI888 MMWS0_SX258 MMWS0_SX78 MMWS1_SA1 MMWS1_SA2 MMWS1_SI1071 MMWS1_SI2331 MMWS1_SX261 MMWS1_SX27 MMWS1_SX351 MMWS1_SX441 MMWS1_SX81 MMXS0_SA1 MMXS0_SA2 MMXS0_SI629 MMXS0_SI876 MMXS0_SX156 MMXS0_SX336 MMXS0_SX66 MNET0_SA1 MNET0_SA2 MNET0_SI1446 MNET0_SI2076 MNET0_SX186 MNET0_SX276 MNET0_SX366 MNET0_SX96 MNTW0_SA1 MNTW0_SI2328 MNTW0_SX202 MNTW0_SX258 MNTW0_SX348 MPAR0_SA1 MPAR0_SA2 MPAR0_SI1576 MPAR0_SX226 MPAR0_SX406 MPAR0_SX46 MPEB0_SA1 MPEB0_SA2 MPEB0_SX150 MPEB0_SX420 MPEB0_SX60 MPFU0_SA1 MPFU0_SA2 MPFU0_SI1888 MPFU0_SX178 MPFU0_SX268 MPFU0_SX358 MPFU0_SX88 MPGH0_SA1 MPGH0_SA2 MPGH0_SI1554 MPGH0_SI924 MPGH0_SX204 MPGH0_SX294 MPGH0_SX384 MPGR0_SA1 MPGR0_SA2 MPGR0_SI2040 
MPGR0_SI780 MPGR0_SX150 MPGR0_SX420 MPGR0_SX60 MPGR1_SA1 MPGR1_SA2 MPGR1_SI1269 MPGR1_SI2129 MPGR1_SX239 MPGR1_SX329 MPGR1_SX419 MPGR1_SX59 MPMB0_SX241 MPPC0_SA2 MPPC0_SI2042 MPPC0_SI782 MPPC0_SX152 MPPC0_SX242 MPPC0_SX332 MPPC0_SX422 MPPC0_SX62 MPRB0_SA1 MPRB0_SA2 MPRB0_SI1205 MPRB0_SX125 MPRB0_SX215 MPRB0_SX305 MPRB0_SX35 MPRB0_SX395 MPRD0_SA2 MPRD0_SI1431 MPRD0_SI2061 MPRK0_SA2 MPRK0_SX17 MPRK0_SX197 MPRT0_SA2 MPRT0_SI1210 MPRT0_SI495 MPRT0_SI580 MPRT0_SX130 MPRT0_SX220 MPRT0_SX40 MPRT0_SX400 MPSW0_SA1 MPSW0_SA2 MPSW0_SI1697 MPSW0_SI2327 MPSW0_SX24 MPSW0_SX257 MPSW0_SX77 MRAB0_SA1 MRAB0_SA2 MRAB0_SI1224 MRAB0_SI594 MRAB0_SX144 MRAB0_SX234 MRAB0_SX324 MRAB0_SX414 MRAB0_SX54 MRAB1_SA1 MRAB1_SA2 MRAB1_SI1478 MRAB1_SI2108 MRAB1_SX218 MRAB1_SX38 MRAB1_SX398 MRAI0_SI1954 MRAI0_SX162 MRAI0_SX252 MRAI0_SX342 MRAM0_SI1275 MRAM0_SI1905 MRAM0_SX105 MRAM0_SX195 MRAM0_SX285 MRAM0_SX375 MRAV0_SA1 MRAV0_SA2 MRAV0_SI1008 MRAV0_SI1638 MRAV0_SI2268 MRAV0_SX108 MRAV0_SX18 MRAV0_SX198 MRAV0_SX288 MRAV0_SX378 MRBC0_SA1 MRBC0_SA2 MRBC0_SI1665 MRBC0_SI599 MRBC0_SX149 MRBC0_SX239 MRBC0_SX59 MRCG0_SA1 MRCG0_SI2058 MRCG0_SX258 MRCG0_SX78 MRCW0_SA2 MRCW0_SI1371 MRCW0_SI2001 MRCW0_SX111 MRCW0_SX201 MRCW0_SX21 MRCW0_SX381 MRDD0_SA1 MRDD0_SA2 MRDD0_SI1050 MRDD0_SI2310 MRDD0_SX240 MRDD0_SX330 MRDM0_SA1 MRDM0_SA2 MRDM0_SI965 MRDM0_SX155 MRDM0_SX245 MRDM0_SX425 MRDS0_SA2 MRDS0_SI1167 MRDS0_SI1797 MRDS0_SI537 MRDS0_SX177 MRDS0_SX267 MRDS0_SX357 MRDS0_SX447 MRDS0_SX87 MREE0_SA1 MREE0_SA2 MREE0_SI1734 MREE0_SX114 MREE0_SX204 MREE0_SX294 MREE0_SX384 MREH1_SA2 MREH1_SI2229 MREH1_SX159 MREH1_SX339 MREH1_SX429 MREM0_SA1 MREM0_SI1591 MREM0_SI961 MREM0_SX151 MREM0_SX241 MREM0_SX331 MREM0_SX421 MREM0_SX61 MREW1_SA1 MREW1_SA2 MREW1_SI1500 MREW1_SI2130 MREW1_SX150 MREW1_SX240 MREW1_SX330 MREW1_SX420 MREW1_SX60 MRFK0_SA1 MRFK0_SA2 MRFK0_SI1706 MRFK0_SI2336 MRFK0_SX176 MRFK0_SX266 MRFK0_SX356 MRFK0_SX86 MRFL0_SA2 MRFL0_SI1786 MRFL0_SX346 MRGM0_SA1 MRGM0_SI1162 MRGM0_SI1792 MRGM0_SX416 MRGM0_SX82 MRGS0_SA1 
MRGS0_SI1986 MRGS0_SX276 MRGS0_SX366 MRGS0_SX96 MRHL0_SA1 MRHL0_SA2 MRHL0_SI1515 MRHL0_SI2145 MRHL0_SX165 MRHL0_SX255 MRHL0_SX75 MRJB1_SI1020 MRJB1_SX300 MRJH0_SA1 MRJH0_SI914 MRJH0_SX259 MRJH0_SX439 MRJM0_SA1 MRJM0_SA2 MRJM0_SI1095 MRJM0_SI1228 MRJM0_SI1858 MRJM0_SX238 MRJM0_SX328 MRJM0_SX418 MRJM0_SX58 MRJM1_SA1 MRJM1_SI668 MRJM1_SX218 MRJM1_SX308 MRJM1_SX38 MRJM1_SX398 MRJT0_SA1 MRJT0_SI1805 MRJT0_SX148 MRJT0_SX238 MRKM0_SA1 MRKM0_SX187 MRKM0_SX277 MRKM0_SX7 MRKM0_SX97 MRLD0_SA1 MRLD0_SI1594 MRLD0_SI964 MRLD0_SX244 MRLD0_SX334 MRLD0_SX64 MRLJ0_SA2 MRLJ0_SI1420 MRLJ0_SI2050 MRLJ0_SX160 MRLJ0_SX430 MRLJ0_SX70 MRLJ1_SI1671 MRLJ1_SI2332 MRLJ1_SX141 MRLJ1_SX231 MRLJ1_SX411 MRLJ1_SX51 MRLK0_SA1 MRLK0_SA2 MRLK0_SI2140 MRLK0_SX303 MRLK0_SX33 MRLK0_SX393 MRLR0_SA1 MRLR0_SA2 MRLR0_SI1826 MRLR0_SI566 MRLR0_SX116 MRLR0_SX206 MRLR0_SX26 MRLR0_SX296 MRLR0_SX386 MRMB0_SA1 MRMB0_SI2211 MRMB0_SI951 MRMB0_SX141 MRMB0_SX231 MRMB0_SX321 MRMB0_SX51 MRMG0_SA2 MRMG0_SI1710 MRMG0_SI2340 MRMG0_SX180 MRMG0_SX270 MRMG0_SX360 MRMG0_SX90 MRMH0_SA1 MRMH0_SA2 MRMH0_SI1021 MRMH0_SX211 MRMH0_SX301 MRMH0_SX31 MRMH0_SX391 MRML0_SI2051 MRML0_SI791 MRML0_SX431 MRML0_SX71 MRMS0_SA1 MRMS0_SA2 MRMS0_SI1113 MRMS0_SI2100 MRMS0_SX120 MRMS0_SX210 MRMS0_SX30 MRMS0_SX300 MRMS0_SX390 MRPC1_SA1 MRPC1_SA2 MRPC1_SI1482 MRPC1_SI2026 MRPC1_SX132 MRPC1_SX222 MRPC1_SX312 MRPC1_SX402 MRPC1_SX42 MRRE0_SI704 MRRE0_SX254 MRRE0_SX434 MRSO0_SA1 MRSO0_SA2 MRSO0_SI1659 MRSO0_SI2289 MRSO0_SX219 MRSO0_SX309 MRSO0_SX399 MRSP0_SA1 MRSP0_SA2 MRSP0_SI2059 MRSP0_SI799 MRSP0_SX169 MRSP0_SX196 MRSP0_SX439 MRSP0_SX79 MRTC0_SA1 MRTC0_SA2 MRTC0_SI2088 MRTC0_SI828 MRTC0_SX108 MRTC0_SX18 MRTC0_SX198 MRTC0_SX288 MRTJ0_SA2 MRTJ0_SI1551 MRTJ0_SI2032 MRTJ0_SX322 MRTJ0_SX412 MRVG0_SA1 MRVG0_SA2 MRVG0_SI1770 MRVG0_SI510 MRVG0_SX150 MRVG0_SX330 MRVG0_SX420 MRVG0_SX60 MRWA0_SA1 MRWA0_SA2 MRWA0_SI1603 MRWA0_SI2233 MRWA0_SX253 MRWA0_SX343 MRWA0_SX433 MRWS0_SA1 MRWS0_SA2 MRWS0_SX112 MRWS0_SX202 MRWS0_SX292 MRXB0_SA1 MRXB0_SI1585 MRXB0_SX145 
MRXB0_SX235 MRXB0_SX325 MRXB0_SX55 MSAH1_SA1 MSAH1_SA2 MSAH1_SI1049 MSAH1_SI2309 MSAH1_SX149 MSAH1_SX239 MSAH1_SX329 MSAH1_SX419 MSAH1_SX59 MSAS0_SA1 MSAS0_SA2 MSAS0_SI2006 MSAS0_SX26 MSAS0_SX296 MSAT0_SA2 MSAT0_SI1526 MSAT0_SI2156 MSAT0_SI896 MSAT0_SX176 MSAT0_SX266 MSAT0_SX356 MSAT0_SX446 MSAT0_SX86 MSAT1_SA1 MSAT1_SA2 MSAT1_SI1073 MSAT1_SI1703 MSAT1_SI2333 MSAT1_SX173 MSAT1_SX353 MSDB0_SA1 MSDB0_SA2 MSDB0_SI1007 MSDB0_SI1637 MSDB0_SI2267 MSDB0_SX107 MSDB0_SX17 MSDH0_SA1 MSDH0_SA2 MSDH0_SI2113 MSDH0_SX260 MSDH0_SX350 MSDS0_SA2 MSDS0_SI1707 MSDS0_SI2337 MSDS0_SX177 MSDS0_SX447 MSDS0_SX87 MSEM1_SA1 MSEM1_SA2 MSEM1_SX360 MSEM1_SX450 MSEM1_SX90 MSES0_SA1 MSES0_SA2 MSES0_SI2216 MSES0_SI2219 MSES0_SX149 MSES0_SX329 MSES0_SX59 MSFH0_SA2 MSFH0_SI1216 MSFH0_SI586 MSFH0_SX226 MSFH0_SX46 MSFV0_SA1 MSFV0_SA2 MSFV0_SI1262 MSFV0_SX182 MSFV0_SX272 MSFV0_SX452 MSJK0_SA1 MSJK0_SA2 MSJK0_SI2226 MSJK0_SI966 MSJK0_SX156 MSJK0_SX246 MSJK0_SX426 MSJK0_SX66 MSMC0_SA1 MSMC0_SA2 MSMC0_SI1907 MSMC0_SI647 MSMC0_SX107 MSMC0_SX17 MSMC0_SX197 MSMC0_SX287 MSMC0_SX377 MSMR0_SA1 MSMR0_SA2 MSMR0_SI1405 MSMR0_SI775 MSMR0_SX145 MSMR0_SX235 MSMR0_SX325 MSMR0_SX55 MSMS0_SA2 MSMS0_SI2063 MSMS0_SI803 MSMS0_SX263 MSMS0_SX353 MSMS0_SX443 MSRG0_SA2 MSRG0_SI1851 MSRG0_SI591 MSRG0_SX141 MSRG0_SX231 MSRG0_SX321 MSRG0_SX411 MSRG0_SX51 MSRR0_SA1 MSRR0_SA2 MSRR0_SI1131 MSRR0_SX141 MSRR0_SX231 MSRR0_SX30 MSRR0_SX411 MSRR0_SX51 MSTF0_SA1 MSTF0_SA2 MSTF0_SI1396 MSTF0_SX136 MSTF0_SX226 MSTF0_SX406 MSVS0_SA1 MSVS0_SI1568 MSVS0_SX128 MSVS0_SX218 MSVS0_SX38 MTAB0_SA1 MTAB0_SA2 MTAB0_SI2202 MTAB0_SI942 MTAB0_SX132 MTAB0_SX222 MTAB0_SX402 MTAB0_SX42 MTAS0_SA1 MTAS0_SA2 MTAS0_SI1385 MTAS0_SI2015 MTAS0_SI755 MTAS0_SX125 MTAS0_SX305 MTAT0_SA2 MTAT0_SI1740 MTAT0_SX120 MTAT0_SX210 MTAT0_SX30 MTAT0_SX300 MTAT1_SA1 MTAT1_SA2 MTAT1_SI1409 MTAT1_SI1627 MTAT1_SX239 MTAT1_SX419 MTBC0_SA1 MTBC0_SA2 MTBC0_SI1173 MTBC0_SX183 MTBC0_SX273 MTBC0_SX347 MTBC0_SX363 MTBC0_SX93 MTCS0_SA1 MTCS0_SI1972 MTCS0_SX172 MTCS0_SX262 MTCS0_SX352 
MTCS0_SX442 MTDB0_SA1 MTDB0_SA2 MTDB0_SI2031 MTDB0_SX141 MTDB0_SX231 MTDB0_SX321 MTDB0_SX411 MTDB0_SX51 MTDP0_SI1274 MTDP0_SI2151 MTDP0_SX261 MTDP0_SX441 MTDP0_SX81 MTER0_SI527 MTER0_SX167 MTER0_SX17 MTER0_SX257 MTER0_SX77 MTJG0_SA2 MTJG0_SI1520 MTJG0_SI890 MTJG0_SX350 MTJG0_SX440 MTJG0_SX80 MTJM0_SA1 MTJM0_SA2 MTJM0_SI1226 MTJM0_SI655 MTJM0_SX236 MTJM0_SX326 MTJM0_SX416 MTJM0_SX56 MTJS0_SA1 MTJS0_SI1192 MTJS0_SX112 MTJS0_SX202 MTJS0_SX22 MTJS0_SX292 MTJU0_SA1 MTJU0_SA2 MTJU0_SI2269 MTJU0_SI760 MTJU0_SX220 MTJU0_SX310 MTJU0_SX40 MTKD0_SA1 MTKD0_SA2 MTKD0_SI1187 MTKD0_SI1817 MTKD0_SX17 MTKD0_SX197 MTKD0_SX377 MTKP0_SA1 MTKP0_SA2 MTKP0_SX123 MTKP0_SX213 MTKP0_SX303 MTKP0_SX33 MTKP0_SX393 MTLB0_SA2 MTLB0_SI1764 MTLB0_SI504 MTLB0_SX144 MTLB0_SX414 MTLB0_SX54 MTLC0_SA2 MTLC0_SI847 MTLC0_SX127 MTLC0_SX217 MTLC0_SX307 MTLC0_SX37 MTLC0_SX397 MTML0_SA1 MTML0_SA2 MTML0_SI1065 MTML0_SI1695 MTML0_SX255 MTML0_SX345 MTML0_SX75 MTMN0_SA1 MTMN0_SX164 MTMN0_SX254 MTMN0_SX344 MTMN0_SX74 MTMT0_SA1 MTMT0_SI1118 MTMT0_SX128 MTMT0_SX218 MTMT0_SX308 MTMT0_SX38 MTMT0_SX398 MTPF0_SA1 MTPF0_SA2 MTPF0_SI1235 MTPF0_SI1865 MTPF0_SI605 MTPF0_SX155 MTPF0_SX245 MTPF0_SX335 MTPF0_SX425 MTPG0_SA1 MTPG0_SA2 MTPG0_SI2013 MTPG0_SX123 MTPG0_SX213 MTPG0_SX33 MTPG0_SX393 MTPP0_SA1 MTPP0_SA2 MTPP0_SI2138 MTPP0_SI878 MTPP0_SX158 MTPP0_SX248 MTPP0_SX428 MTPP0_SX68 MTPR0_SA1 MTPR0_SA2 MTPR0_SI1600 MTPR0_SI506 MTPR0_SX250 MTPR0_SX70 MTQC0_SA2 MTQC0_SI2071 MTQC0_SX271 MTQC0_SX361 MTRC0_SA1 MTRC0_SA2 MTRC0_SI1623 MTRC0_SI993 MTRC0_SX170 MTRC0_SX183 MTRC0_SX273 MTRC0_SX363 MTRC0_SX93 MTRR0_SA1 MTRR0_SA2 MTRR0_SI1548 MTRR0_SI2178 MTRR0_SX108 MTRR0_SX18 MTRR0_SX378 MTRT0_SA1 MTRT0_SI1857 MTRT0_SI597 MTRT0_SX147 MTRT0_SX237 MTRT0_SX417 MTWH1_SA1 MTWH1_SA2 MTWH1_SI1512 MTWH1_SI2142 MTWH1_SI882 MTWH1_SX162 MTWH1_SX252 MTWH1_SX342 MTWH1_SX432 MTXS0_SI1690 MTXS0_SX250 MTXS0_SX340 MTXS0_SX70 MVJH0_SA1 MVJH0_SA2 MVJH0_SI2186 MVJH0_SX116 MVJH0_SX26 MVJH0_SX386 MVLO0_SA2 MVLO0_SI1147 MVLO0_SI1777 MVLO0_SX157 MVLO0_SX247 
MVLO0_SX337 MVLO0_SX427 MVLO0_SX67 MVRW0_SA1 MVRW0_SI1485 MVRW0_SI2115 MVRW0_SI855 MVRW0_SX315 MVRW0_SX405 MVRW0_SX45 MWAC0_SA1 MWAC0_SI2231 MWAC0_SI971 MWAC0_SX71 MWAD0_SA1 MWAD0_SA2 MWAD0_SI1062 MWAD0_SI1749 MWAD0_SI2322 MWAD0_SX162 MWAD0_SX252 MWAD0_SX342 MWAR0_SA2 MWAR0_SI2305 MWAR0_SX145 MWAR0_SX235 MWAR0_SX325 MWAR0_SX415 MWAR0_SX55 MWCH0_SA1 MWCH0_SA2 MWCH0_SI1622 MWCH0_SX272 MWCH0_SX362 MWCH0_SX92 MWDK0_SX266 MWDK0_SX356 MWDK0_SX446 MWEM0_SA1 MWEM0_SI1950 MWEM0_SX240 MWEM0_SX330 MWEM0_SX60 MWGR0_SA1 MWGR0_SA2 MWGR0_SI1606 MWGR0_SI2236 MWGR0_SI976 MWGR0_SX166 MWGR0_SX256 MWGR0_SX436 MWGR0_SX76 MWRE0_SA1 MWRE0_SI1687 MWRE0_SI2317 MWRE0_SX157 MWRP0_SA2 MWRP0_SI1525 MWRP0_SI2073 MWRP0_SX183 MWRP0_SX3 MWRP0_SX93 MWSB0_SA1 MWSB0_SA2 MWSB0_SI1626 MWSB0_SI2256 MWSB0_SX186 MWSB0_SX366 MWSB0_SX6 MWSB0_SX96 MWSH0_SA1 MWSH0_SA2 MWSH0_SI2266 MWSH0_SX346 MWSH0_SX436 MZMB0_SA2 MZMB0_SI1166 MZMB0_SI1796 MZMB0_SI536 MZMB0_SX176 MZMB0_SX266 MZMB0_SX356 MZMB0_SX446 MZMB0_SX86 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_unmatched/train_text.uid ================================================ FAEM0_SI762 FAEM0_SX42 FAJW0_SA1 FAJW0_SX3 FAJW0_SX93 FALK0_SX186 FALK0_SX6 FALR0_SI1325 FBAS0_SA1 FBAS0_SX217 FBCG1_SA1 FBCG1_SX172 FBCG1_SX442 FBCH0_SX236 FBCH0_SX416 FBLV0_SA1 FBLV0_SI1058 FBLV0_SX338 FBLV0_SX68 FBMH0_SA1 FBMJ0_SI815 FCAG0_SA1 FCAG0_SX153 FCAG0_SX243 FCAJ0_SI1479 FCAJ0_SX309 FCDR1_SX106 FCDR1_SX196 FCEG0_SA2 FCJF0_SA1 FCJF0_SX127 FCJS0_SI1607 FCJS0_SI2237 FCJS0_SX257 FCKE0_SA2 FCKE0_SX121 FCLT0_SI2068 FCLT0_SX448 FCLT0_SX88 FCMG0_SA2 FCMG0_SI1872 FCMG0_SX72 FCMM0_SA1 FCMM0_SA2 FCMM0_SX183 FCRZ0_SI2053 FCRZ0_SX433 FCYL0_SA1 FCYL0_SX37 FDAS1_SI2091 FDAS1_SX201 FDAS1_SX381 FDAW0_SI1406 FDFB0_SA1 FDFB0_SA2 FDFB0_SI2010 FDFB0_SX58 FDJH0_SX305 FDML0_SA2 FDML0_SX159 FDML0_SX249 FDML0_SX429 FDMY0_SA2 FDMY0_SX27 FDNC0_SX198 FDNC0_SX288 FDTD0_SX211 FDXW0_SA1 FDXW0_SX251 FDXW0_SX341 FDXW0_SX71 FEAC0_SX165 FEAC0_SX75 
FEAR0_SI622 FECD0_SX68 FEEH0_SA1 FEEH0_SI1742 FEEH0_SI471 FEEH0_SX122 FEME0_SA1 FEME0_SX155 FEME0_SX65 FETB0_SA1 FETB0_SI1148 FETB0_SX158 FEXM0_SI1101 FGCS0_SX136 FGCS0_SX226 FGCS0_SX316 FGCS0_SX406 FGDP0_SA1 FGMB0_SI1775 FGMB0_SX245 FHLM0_SX390 FHXS0_SA2 FHXS0_SX445 FJDM2_SA1 FJDM2_SX232 FJDM2_SX52 FJHK0_SX302 FJKL0_SX212 FJKL0_SX392 FJLG0_SI2306 FJLR0_SA1 FJRP1_SI2062 FJRP1_SX82 FJSK0_SA1 FJSP0_SX264 FJSP0_SX354 FJSP0_SX444 FJWB1_SA1 FJWB1_SX345 FJWB1_SX435 FJXM0_SA1 FJXM0_SI581 FJXM0_SX401 FJXP0_SA1 FJXP0_SI1122 FJXP0_SX132 FKAA0_SX128 FKAA0_SX398 FKDE0_SA1 FKDE0_SX151 FKDE0_SX241 FKDE0_SX421 FKDE0_SX61 FKDW0_SX397 FKFB0_SA2 FKFB0_SX348 FKFB0_SX78 FKKH0_SA1 FKKH0_SA2 FKKH0_SX120 FKKH0_SX390 FKLC0_SX355 FKLC1_SI2308 FKLC1_SX238 FKLC1_SX328 FKLC1_SX418 FKLH0_SA2 FKLH0_SX177 FKSR0_SA1 FKSR0_SA2 FKSR0_SI1747 FKSR0_SI487 FKSR0_SX217 FLAC0_SX451 FLAG0_SA2 FLAG0_SX114 FLAG0_SX204 FLAG0_SX24 FLAG0_SX384 FLEH0_SI1681 FLEH0_SI2311 FLEH0_SX331 FLET0_SA1 FLHD0_SI1827 FLHD0_SX354 FLJA0_SA1 FLJA0_SI2338 FLJD0_SI886 FLJD0_SX76 FLJG0_SA2 FLKM0_SA2 FLKM0_SI686 FLKM0_SX260 FLKM0_SX80 FLMA0_SA1 FLMA0_SI613 FLMA0_SX433 FLMA0_SX73 FLMC0_SX22 FLMK0_SI1035 FLMK0_SX315 FLMK0_SX405 FLOD0_SI1917 FLOD0_SX117 FLOD0_SX171 FLOD0_SX297 FLTM0_SA1 FLTM0_SI1070 FLTM0_SI2330 FMAH1_SA2 FMAH1_SX159 FMBG0_SA2 FMBG0_SI2264 FMEM0_SI747 FMEM0_SX387 FMJB0_SI547 FMJB0_SX97 FMJF0_SA2 FMJU0_SX309 FMJU0_SX399 FMKC0_SI1702 FMKC0_SX442 FMKC0_SX82 FMKF0_SX186 FMPG0_SA2 FNKL0_SI1522 FNTB0_SI1203 FNTB0_SI573 FNTB0_SX303 FPAB1_SI1471 FPAB1_SX211 FPAC0_SA2 FPAD0_SA2 FPAD0_SX356 FPAD0_SX86 FPAF0_SA2 FPAF0_SX154 FPAZ0_SA1 FPAZ0_SA2 FPAZ0_SX243 FPJF0_SA1 FPJF0_SX146 FPJF0_SX56 FPLS0_SI1590 FPLS0_SX330 FPMY0_SA1 FPMY0_SX343 FREH0_SA1 FREH0_SA2 FREH0_SX415 FRJB0_SX347 FRLL0_SX434 FSAG0_SA1 FSAG0_SX243 FSAH0_SA1 FSAH0_SA2 FSAH0_SX164 FSAH0_SX434 FSBK0_SA2 FSBK0_SI1069 FSBK0_SX169 FSCN0_SA2 FSCN0_SI626 FSCN0_SX266 FSCN0_SX446 FSCN0_SX86 FSDC0_SA2 FSDC0_SX142 FSDC0_SX322 FSDC0_SX52 FSDJ0_SI485 FSDJ0_SX215 FSDJ0_SX305 
FSDJ0_SX395 FSGF0_SX117 FSJG0_SX130 FSJK1_SA2 FSJK1_SX125 FSJK1_SX35 FSJS0_SX181 FSJW0_SI1963 FSJW0_SX433 FSKC0_SI1416 FSKC0_SI786 FSKC0_SX246 FSKL0_SI1529 FSKL0_SX449 FSKP0_SA2 FSLS0_SX156 FSLS0_SX426 FSMA0_SA2 FSMA0_SX181 FSMM0_SX144 FSMM0_SX234 FSMS1_SX244 FSMS1_SX347 FSPM0_SA2 FSPM0_SX161 FSPM0_SX71 FSRH0_SI1931 FSRH0_SI671 FSRH0_SX221 FSRH0_SX401 FTAJ0_SI699 FTAJ0_SX159 FTAJ0_SX249 FTAJ0_SX429 FTBR0_SX21 FTBW0_SA1 FTMG0_SI1532 FTMG0_SI2162 FTMG0_SX452 FVFB0_SA2 FVFB0_SX132 FVFB0_SX42 FVKB0_SA1 FVMH0_SA2 FVMH0_SX116 FVMH0_SX26 MABC0_SI1620 MABC0_SI2041 MABC0_SI781 MADC0_SX107 MADC0_SX377 MADD0_SA2 MADD0_SI1295 MADD0_SX178 MADD0_SX268 MADD0_SX88 MAEB0_SX450 MAEO0_SA1 MAFM0_SI939 MAFM0_SX129 MAFM0_SX309 MAJP0_SA2 MAKB0_SI1646 MAKB0_SX26 MAKB0_SX386 MAKR0_SX362 MAKR0_SX92 MAPV0_SX213 MARC0_SA2 MARC0_SX108 MARC0_SX18 MARC0_SX198 MARW0_SI1906 MBAR0_SA1 MBAR0_SX419 MBAR0_SX59 MBBR0_SI2315 MBBR0_SX65 MBCG0_SA1 MBCG0_SI486 MBEF0_SI1281 MBEF0_SI1911 MBEF0_SI651 MBEF0_SX21 MBEF0_SX381 MBGT0_SA2 MBGT0_SX261 MBGT0_SX351 MBGT0_SX441 MBJV0_SA1 MBJV0_SI617 MBJV0_SX347 MBMA0_SI592 MBMA0_SX232 MBMA0_SX52 MBMA1_SI2214 MBMA1_SX54 MBML0_SA2 MBML0_SI1169 MBML0_SX89 MBOM0_SA2 MBOM0_SI2274 MBOM0_SX294 MBSB0_SA1 MBSB0_SX3 MBTH0_SA2 MBTH0_SX122 MBTH0_SX32 MCAE0_SX277 MCAL0_SA2 MCAL0_SI1768 MCDC0_SA1 MCDC0_SX212 MCDD0_SA2 MCDD0_SI883 MCDD0_SX253 MCDD0_SX433 MCDR0_SI1154 MCEF0_SX235 MCEF0_SX415 MCEW0_SA2 MCHL0_SX87 MCLK0_SX310 MCLM0_SA1 MCLM0_SI2086 MCLM0_SI826 MCPM0_SA1 MCPM0_SX114 MCPM0_SX294 MCPM0_SX384 MCSS0_SI750 MCTH0_SA1 MCTH0_SX39 MCXM0_SX91 MDAC0_SA1 MDAC0_SX181 MDAC0_SX361 MDAS0_SX6 MDBB1_SX106 MDBB1_SX16 MDBB1_SX376 MDBP0_SX168 MDCD0_SI1415 MDCD0_SX245 MDCD0_SX425 MDCM0_SX40 MDCM0_SX400 MDDC0_SI2049 MDDC0_SI789 MDDC0_SX159 MDDC0_SX69 MDED0_SA1 MDED0_SA2 MDEF0_SX123 MDEF0_SX303 MDHL0_SI1439 MDHL0_SX269 MDHL0_SX449 MDHS0_SA1 MDHS0_SA2 MDHS0_SI1530 MDHS0_SI2160 MDJM0_SX105 MDJM0_SX15 MDKS0_SX436 MDLB0_SA2 MDLC0_SX405 MDLC1_SA2 MDLC1_SI2065 MDLC1_SI2144 MDLC1_SX445 MDLC2_SI2244 
MDLC2_SX354 MDLH0_SA2 MDLM0_SI1234 MDLM0_SI1864 MDLM0_SX154 MDLM0_SX424 MDLR0_SA1 MDLR0_SA2 MDLR0_SI1863 MDLR0_SI603 MDLR0_SX153 MDLR1_SA1 MDLR1_SA2 MDMA0_SI1430 MDMA0_SX260 MDMA0_SX80 MDMT0_SA1 MDMT0_SA2 MDMT0_SI1832 MDMT0_SX122 MDMT0_SX32 MDNS0_SA2 MDNS0_SI2271 MDNS0_SX201 MDNS0_SX21 MDPB0_SX416 MDPK0_SI1053 MDPK0_SX333 MDPK0_SX423 MDPS0_SI719 MDPS0_SX359 MDRD0_SA1 MDRD0_SX32 MDSJ0_SI2092 MDSS0_SA2 MDSS0_SX441 MDSS1_SA1 MDSS1_SI1327 MDSS1_SI697 MDSS1_SX157 MDSS1_SX67 MDTB0_SI1200 MDTB0_SI1830 MDTB0_SX120 MDWD0_SA2 MDWD0_SX270 MDWD0_SX90 MDWH0_SX215 MDWH0_SX305 MDWM0_SA1 MDWM0_SA2 MDWM0_SX16 MDWM0_SX286 MEAL0_SA2 MEAL0_SI2177 MEAL0_SX107 MEAL0_SX347 MEDR0_SA1 MEDR0_SA2 MEDR0_SI1374 MEFG0_SA1 MEGJ0_SA2 MEGJ0_SX257 MEGJ0_SX3 MEJL0_SA1 MEJL0_SX152 MEJL0_SX242 MEJS0_SI610 MEJS0_SX160 MEJS0_SX340 MESG0_SX432 MESJ0_SX187 MESJ0_SX97 MEWM0_SI718 MEWM0_SX178 MEWM0_SX88 MFER0_SI862 MFER0_SX142 MFRM0_SX345 MFRM0_SX435 MFWK0_SI1879 MFWK0_SX169 MFXS0_SX54 MFXV0_SA2 MFXV0_SX105 MGAF0_SA1 MGAF0_SX22 MGAF0_SX382 MGAG0_SA2 MGAK0_SX226 MGAK0_SX46 MGAR0_SX132 MGAW0_SI535 MGAW0_SX175 MGES0_SA1 MGES0_SI2111 MGES0_SI851 MGJC0_SA2 MGJC0_SX75 MGRL0_SI2127 MGRL0_SI867 MGRL0_SX147 MGRP0_SA2 MGSH0_SA2 MGSH0_SI1806 MGSH0_SX127 MGSH0_SX276 MGSH0_SX6 MGSL0_SA1 MGSL0_SI534 MGSL0_SX264 MGXP0_SX187 MGXP0_SX7 MHBS0_SX315 MHBS0_SX45 MHIT0_SA1 MHJB0_SA1 MHJB0_SI1017 MHMG0_SX195 MHMR0_SA1 MHMR0_SI489 MHRM0_SA1 MHRM0_SI958 MHRM0_SX148 MHRM0_SX58 MHXL0_SI1772 MHXL0_SX242 MILB0_SA2 MJAC0_SX307 MJAC0_SX71 MJAE0_SX174 MJAI0_SA1 MJAI0_SA2 MJBG0_SX62 MJDA0_SI1031 MJDA0_SX311 MJDE0_SI463 MJDG0_SA2 MJDG0_SI1042 MJDG0_SI1705 MJDM0_SA1 MJDM0_SI974 MJEB0_SI656 MJEB0_SX296 MJEB1_SA2 MJEB1_SX207 MJEB1_SX387 MJEE0_SA1 MJEE0_SX247 MJEE0_SX337 MJFH0_SA2 MJFH0_SI1107 MJFR0_SX75 MJHI0_SA1 MJHI0_SX158 MJJB0_SA1 MJJB0_SX239 MJJJ0_SX443 MJJM0_SA2 MJJM0_SI827 MJJM0_SX107 MJKR0_SA1 MJKR0_SI571 MJLB0_SX176 MJLG1_SX292 MJLS0_SX106 MJMA0_SA1 MJMA0_SA2 MJMD0_SA2 MJMD0_SX308 MJMD0_SX38 MJMM0_SX85 MJPG0_SI1191 MJPG0_SX111 
MJPG0_SX201 MJPG0_SX21 MJPM0_SA2 MJPM0_SX378 MJPM1_SI2280 MJPM1_SX401 MJRA0_SA1 MJRA0_SA2 MJRA0_SI1236 MJRA0_SI1866 MJRA0_SX426 MJRG0_SI1366 MJRG0_SI1996 MJRG0_SX376 MJRH0_SX225 MJRH1_SA1 MJRH1_SI514 MJRH1_SX154 MJRH1_SX244 MJRH1_SX424 MJRK0_SA1 MJRK0_SA2 MJRK0_SI1662 MJRK0_SX160 MJRK0_SX250 MJRK0_SX430 MJRP0_SA1 MJRP0_SA2 MJRP0_SX225 MJSR0_SA1 MJSR0_SI1424 MJSR0_SX344 MJWG0_SA1 MJWG0_SX265 MJWS0_SI513 MJWS0_SX153 MJWS0_SX63 MJWT0_SA1 MJWT0_SX121 MJWT0_SX211 MJWT0_SX301 MJWT0_SX31 MJWT0_SX391 MJXA0_SX427 MJXL0_SI542 MKAG0_SA1 MKAG0_SX259 MKAJ0_SA2 MKAJ0_SX154 MKAM0_SA1 MKAM0_SX146 MKAM0_SX326 MKAM0_SX56 MKDB0_SA1 MKDB0_SA2 MKDB0_SX152 MKDD0_SA2 MKES0_SA1 MKES0_SI1253 MKES0_SI1883 MKES0_SX173 MKJO0_SI1517 MKJO0_SI887 MKJO0_SX437 MKLN0_SI968 MKLN0_SX248 MKLR0_SA2 MKLR0_SI1689 MKLS0_SA1 MKLS0_SX357 MKLS0_SX87 MKLS1_SA1 MKLS1_SA2 MKLS1_SX375 MKLW0_SA1 MKRG0_SX411 MKXL0_SA2 MKXL0_SX15 MKXL0_SX375 MLBC0_SA1 MLBC0_SI1869 MLBC0_SX249 MLEL0_SA1 MLEL0_SA2 MLEL0_SI1246 MLEL0_SX256 MLEL0_SX436 MLJC0_SX145 MLJC0_SX415 MLJH0_SX64 MLNS0_SI2037 MMAA0_SA1 MMAA0_SA2 MMAA0_SX35 MMAB1_SI1494 MMAB1_SX234 MMAG0_SA2 MMAG0_SI1126 MMAG0_SX316 MMAM0_SI2227 MMAM0_SX157 MMAM0_SX427 MMAR0_SX256 MMBS0_SI1781 MMCC0_SA2 MMDB0_SX177 MMDG0_SA1 MMDG0_SA2 MMDG0_SI520 MMDG0_SX160 MMDG0_SX250 MMDM0_SI1941 MMDM0_SI681 MMDM0_SX141 MMDM1_SA2 MMDM1_SI2043 MMDM1_SX423 MMDM1_SX63 MMDS0_SA1 MMEA0_SA1 MMEA0_SX128 MMEA0_SX398 MMEB0_SA2 MMEB0_SX187 MMEB0_SX367 MMGC0_SA2 MMGC0_SX135 MMGC0_SX225 MMGG0_SX269 MMGK0_SX332 MMGK0_SX62 MMJB1_SA2 MMRP0_SA2 MMRP0_SX144 MMSM0_SX116 MMSM0_SX206 MMVP0_SA1 MMVP0_SA2 MMWB0_SI989 MMWB0_SX89 MMWS0_SA2 MMWS0_SX168 MMWS0_SX348 MMWS0_SX438 MMWS1_SI1701 MMXS0_SI2136 MMXS0_SX246 MMXS0_SX426 MNET0_SI816 MNET0_SX6 MNTW0_SA2 MNTW0_SX168 MNTW0_SX78 MPAR0_SI2206 MPAR0_SI946 MPAR0_SX136 MPAR0_SX316 MPEB0_SI1034 MPEB0_SI1860 MPEB0_SX240 MPEB0_SX330 MPFU0_SI628 MPFU0_SX448 MPGH0_SX114 MPGH0_SX24 MPGR0_SX240 MPGR0_SX330 MPGR1_SX149 MPPC0_SA1 MPRD0_SA1 MPRD0_SX261 MPRD0_SX351 MPRD0_SX441 
MPRD0_SX81 MPRK0_SI1727 MPRK0_SX107 MPRK0_SX377 MPRT0_SA1 MPRT0_SX310 MPSW0_SI1067 MPSW0_SX167 MPSW0_SX437 MRAB1_SX128 MRAB1_SX308 MRAI0_SA1 MRAI0_SA2 MRAI0_SX72 MRAM0_SA1 MRAM0_SA2 MRAM0_SX15 MRBC0_SI1859 MRBC0_SX329 MRBC0_SX419 MRCG0_SI798 MRCG0_SX168 MRCW0_SA1 MRCW0_SX291 MRDD0_SI1680 MRDD0_SX150 MRDD0_SX277 MRDD0_SX60 MRDM0_SI1595 MRDM0_SX65 MRDS0_SA1 MREE0_SX24 MREH1_SX249 MREH1_SX69 MREM0_SA2 MREW1_SI870 MRFK0_SX446 MRFL0_SA1 MRFL0_SX256 MRFL0_SX436 MRFL0_SX76 MRGM0_SA2 MRGM0_SX262 MRGS0_SA2 MRGS0_SX186 MRHL0_SI885 MRHL0_SX345 MRHL0_SX435 MRJB1_SA1 MRJB1_SA2 MRJB1_SX210 MRJB1_SX30 MRJB1_SX390 MRJH0_SA2 MRJH0_SX307 MRJH0_SX79 MRJM0_SX148 MRJM1_SA2 MRJM1_SI1298 MRJM1_SI1928 MRJM1_SX128 MRJT0_SA2 MRJT0_SI1498 MRJT0_SX328 MRJT0_SX418 MRKM0_SA2 MRKM0_SX367 MRLD0_SA2 MRLD0_SI2224 MRLD0_SX154 MRLD0_SX424 MRLJ0_SA1 MRLJ0_SX250 MRLJ0_SX340 MRLJ1_SA1 MRLJ1_SA2 MRLJ1_SX321 MRLK0_SI843 MRLK0_SX123 MRLK0_SX213 MRMB0_SA2 MRMB0_SI1581 MRMB0_SX411 MRMG0_SA1 MRMG0_SI1080 MRMG0_SX450 MRMH0_SI1349 MRMH0_SI2281 MRMH0_SX121 MRML0_SA2 MRML0_SX341 MRPC1_SI2112 MRRE0_SA2 MRRE0_SX164 MRRE0_SX344 MRRE0_SX74 MRSO0_SX129 MRSO0_SX39 MRSP0_SX259 MRTC0_SX378 MRVG0_SI1140 MRVG0_SX240 MRWA0_SI973 MRWA0_SX163 MRWA0_SX73 MRWS0_SI1732 MRWS0_SI472 MRWS0_SX22 MRWS0_SX382 MRXB0_SA2 MRXB0_SX415 MSAH1_SI1679 MSAS0_SX116 MSAS0_SX206 MSAS0_SX386 MSAT0_SA1 MSAT1_SX263 MSAT1_SX443 MSAT1_SX83 MSDB0_SX197 MSDB0_SX287 MSDB0_SX377 MSDH0_SI2240 MSDH0_SX440 MSDH0_SX80 MSDS0_SA1 MSEM1_SI1440 MSEM1_SX180 MSEM1_SX270 MSES0_SI1589 MSES0_SX239 MSES0_SX419 MSFH0_SX316 MSFV0_SI1892 MSFV0_SX362 MSFV0_SX92 MSMR0_SX415 MSMS0_SA1 MSMS0_SX173 MSMS0_SX83 MSRG0_SA1 MSRG0_SI1221 MSTF0_SI766 MSTF0_SX316 MSTF0_SX46 MSVS0_SA2 MSVS0_SX308 MTAS0_SX215 MTAS0_SX35 MTAS0_SX395 MTAT0_SX390 MTAT1_SX59 MTBC0_SI1803 MTCS0_SA2 MTCS0_SI2265 MTCS0_SX82 MTDP0_SA2 MTER0_SA2 MTER0_SI1787 MTJG0_SA1 MTJG0_SI2157 MTJG0_SX260 MTJM0_SI1856 MTJM0_SX146 MTJU0_SX130 MTJU0_SX400 MTKD0_SX107 MTKD0_SX287 MTKP0_SI1023 MTLB0_SA1 MTLB0_SX234 MTLC0_SA1 
MTML0_SI2325 MTML0_SX165 MTMN0_SA2 MTMN0_SI1064 MTMN0_SI2324 MTMN0_SX434 MTMT0_SA2 MTMT0_SI1748 MTPF0_SX65 MTPG0_SI1383 MTPG0_SI753 MTPG0_SX303 MTPP0_SX338 MTPR0_SX340 MTQC0_SI480 MTQC0_SX91 MTRR0_SX198 MTRR0_SX288 MTRT0_SA2 MTRT0_SX254 MTRT0_SX57 MTWH1_SX72 MTXS0_SA1 MTXS0_SA2 MVJH0_SI926 MVJH0_SX206 MVJH0_SX296 MVLO0_SA1 MVRW0_SA2 MVRW0_SX135 MVRW0_SX225 MWAC0_SA2 MWAC0_SX341 MWAC0_SX431 MWAD0_SX432 MWAD0_SX72 MWAR0_SA1 MWAR0_SI1675 MWCH0_SI1895 MWCH0_SI2252 MWCH0_SX182 MWCH0_SX452 MWDK0_SA1 MWDK0_SA2 MWDK0_SI2017 MWDK0_SI806 MWDK0_SX176 MWDK0_SX86 MWEM0_SA2 MWEM0_SI1320 MWEM0_SI1393 MWEM0_SX150 MWGR0_SX346 MWRE0_SX247 MWRE0_SX337 MWRE0_SX427 MWRP0_SA1 MWRP0_SX273 MWRP0_SX363 MWSB0_SX276 MWSH0_SX256 MWSH0_SX76 MZMB0_SA1 ================================================ FILE: examples/wav2vec/unsupervised/config/timit_unmatched/valid.uid ================================================ FAEM0_SI1392 FAJW0_SI1263 FAJW0_SI633 FALK0_SI658 FALR0_SX335 FAPB0_SI1063 FAPB0_SI2323 FAPB0_SX433 FBAS0_SI1472 FBAS0_SI2066 FBCG1_SX352 FBCH0_SI959 FBJL0_SI922 FBLV0_SI1688 FBMH0_SI1136 FBMH0_SI970 FBMJ0_SA1 FBMJ0_SI1776 FBMJ0_SI516 FBMJ0_SX336 FCDR1_SI1186 FCDR1_SI1816 FCDR1_SI556 FCDR1_SX286 FCKE0_SI1741 FCKE0_SI481 FCLT0_SI808 FCMG0_SI1142 FCMG0_SX432 FCMM0_SI1957 FCMM0_SX420 FCYL0_SI667 FCYL0_SX349 FDAS1_SI1461 FDAS1_SI831 FDAW0_SI1271 FDAW0_SI2036 FDJH0_SI935 FDKN0_SI1202 FDKN0_SX181 FDKN0_SX451 FDMY0_SA1 FDMY0_SI567 FDMY0_SI714 FDMY0_SX387 FDNC0_SI1278 FDNC0_SI1908 FDTD0_SA1 FDTD0_SX321 FEAC0_SI615 FEAR0_SX352 FECD0_SA1 FECD0_SI1418 FECD0_SI788 FEME0_SI875 FEME0_SX335 FEXM0_SA1 FEXM0_SI482 FEXM0_SX366 FGDP0_SI988 FGDP0_SX88 FGMB0_SI1145 FGMB0_SX335 FGRW0_SA1 FGRW0_SI1152 FGRW0_SX162 FGRW0_SX432 FHLM0_SX120 FHLM0_SX349 FHXS0_SA1 FHXS0_SI1075 FHXS0_SI2302 FHXS0_SX175 FJDM2_SA2 FJDM2_SX142 FJEN0_SA1 FJEN0_SX327 FJEN0_SX417 FJHK0_SI2282 FJKL0_SI932 FJLG0_SI1889 FJLR0_SI1231 FJRB0_SX402 FJRP1_SA1 FJRP1_SI1432 FJRP1_SX262 FJRP1_SX352 FJSK0_SI1052 FJSP0_SI1434 FJWB1_SI748 
FJXM0_SX311 FJXM0_SX41 FJXP0_SI1752 FKAA0_SA1 FKDE0_SI1141 FKDE0_SI1771 FKDW0_SI1207 FKDW0_SI1891 FKFB0_SI1608 FKFB0_SX438 FKKH0_SI1290 FKKH0_SI1920 FKLC0_SI985 FKLC0_SX175 FKLC1_SI1048 FKLH0_SI1257 FKSR0_SX366 FLAC0_SI1339 FLAG0_SI1464 FLAG0_SI834 FLEH0_SI1051 FLET0_SI507 FLJA0_SI1078 FLJA0_SX178 FLJD0_SI1516 FLJG0_SI981 FLJG0_SX171 FLJG0_SX351 FLKM0_SA1 FLKM0_SI620 FLKM0_SX350 FLKM0_SX440 FLMC0_SI1372 FLMK0_SA1 FLMK0_SI1229 FLTM0_SX170 FLTM0_SX350 FLTM0_SX440 FMAH1_SI879 FMBG0_SI1160 FMEM0_SA1 FMEM0_SX333 FMJB0_SI1177 FMJF0_SI624 FMJF0_SX174 FMJF0_SX84 FMJU0_SI1389 FMKC0_SI1041 FMKF0_SI1018 FMPG0_SA1 FMPG0_SI972 FMPG0_SX162 FMPG0_SX342 FMPG0_SX432 FNKL0_SI892 FNTB0_SI679 FPAB1_SA1 FPAB1_SI2101 FPAB1_SI841 FPAC0_SI1921 FPAC0_SI661 FPAD0_SI716 FPAD0_SX176 FPAF0_SA1 FPAF0_SI1054 FPAZ0_SI2223 FPAZ0_SI963 FPJF0_SI1259 FPJF0_SX352 FPLS0_SI960 FPMY0_SI1153 FPMY0_SI523 FREH0_SI1945 FRLL0_SI805 FSAG0_SI1323 FSAG0_SX153 FSAG0_SX333 FSAG0_SX423 FSAH0_SI614 FSAH0_SX327 FSAK0_SI1300 FSBK0_SX349 FSCN0_SA1 FSCN0_SI705 FSCN0_SX176 FSDC0_SI1312 FSDJ0_SI1115 FSGF0_SI2187 FSGF0_SI927 FSJG0_SA1 FSJG0_SA2 FSJG0_SI940 FSJG0_SX220 FSJG0_SX40 FSJG0_SX400 FSJS0_SA1 FSJS0_SX451 FSJW0_SI1333 FSKP0_SI1098 FSMA0_SI991 FSMA0_SX451 FSMM0_SX324 FSPM0_SI1241 FSPM0_SX251 FSRH0_SX311 FSSB0_SI1712 FSSB0_SX362 FTBR0_SI1402 FTBR0_SI921 FTBW0_SI715 FTBW0_SX175 FTLG0_SI1743 FTLG0_SI483 FTMG0_SI902 FVFB0_SI1510 FVKB0_SX349 FVMH0_SI1466 FVMH0_SI836 MADC0_SI1367 MADC0_SI737 MAEB0_SI1411 MAEO0_SI1326 MAJP0_SI1704 MAJP0_SX174 MAKB0_SA2 MAKB0_SI1016 MAKB0_SI2276 MAKB0_SX116 MAPV0_SI1293 MAPV0_SI663 MARW0_SX286 MARW0_SX349 MBBR0_SI1055 MBBR0_SX335 MBCG0_SI957 MBCG0_SX327 MBGT0_SI1841 MBGT0_SX171 MBMA0_SI1222 MBMA1_SI954 MBMA1_SX324 MBTH0_SI2102 MBWP0_SX349 MCAE0_SI1447 MCAE0_SI2077 MCAE0_SI817 MCAL0_SI1138 MCDR0_SI1784 MCDR0_SI524 MCEF0_SI842 MCEW0_SA1 MCEW0_SI2072 MCEW0_SI812 MCEW0_SX362 MCEW0_SX452 MCHL0_SI1347 MCHL0_SI1404 MCLK0_SI2290 MCLK0_SI650 MCPM0_SI1824 MCSS0_SI1380 MCSS0_SI688 MCTM0_SI1350 
MCTM0_SI1980 MDAC0_SI631 MDAS0_SI1896 MDAS0_SI636 MDBP0_SI528 MDBP0_SX438 MDCD0_SI785 MDCD0_SX335 MDCM0_SI1480 MDDC0_SI1419 MDED0_SI540 MDEF0_SI1123 MDEM0_SA1 MDEM0_SI608 MDEM0_SI800 MDEM0_SX428 MDHS0_SI900 MDJM0_SI1455 MDKS0_SX166 MDKS0_SX346 MDLB0_SI1306 MDLB0_SX136 MDLB0_SX406 MDLC0_SI1395 MDLC0_SI2025 MDLC1_SI1435 MDLH0_SX160 MDLH0_SX430 MDLM0_SI604 MDLR0_SX333 MDLR1_SI669 MDMA0_SX170 MDMA0_SX350 MDMA0_SX440 MDNS0_SI1011 MDNS0_SI873 MDPB0_SI1760 MDPB0_SI866 MDRD0_SI752 MDSJ0_SI1462 MDSJ0_SX438 MDWD0_SI1260 MDWH0_SA1 MDWH0_SI1168 MDWH0_SI665 MDWM0_SI916 MEDR0_SI2004 MEFG0_SI491 MEFG0_SI598 MEGJ0_SA1 MEGJ0_SI1337 MEGJ0_SI707 MEGJ0_SX167 MEJS0_SI1240 MESG0_SI702 MESJ0_SI2039 MFWK0_SX349 MFXS0_SX324 MFXV0_SI1005 MFXV0_SI1342 MGAF0_SI1282 MGAG0_SI691 MGAK0_SI1036 MGAK0_SX136 MGAR0_SX312 MGAW0_SI1165 MGES0_SX311 MGJC0_SX435 MGRL0_SX327 MGRP0_SI1317 MGRP0_SX327 MGSH0_SI1176 MGSH0_SI546 MGSL0_SI797 MGXP0_SI1087 MGXP0_SI525 MHBS0_SI945 MHIT0_SI983 MHMG0_SI735 MHMR0_SI1692 MILB0_SI903 MJAC0_SI701 MJAC0_SX251 MJAE0_SX84 MJAI0_SI682 MJAI0_SI710 MJDC0_SI531 MJDE0_SA1 MJDE0_SI1120 MJDE0_SI490 MJDE0_SX220 MJDM0_SI1340 MJDM0_SX170 MJDM0_SX350 MJEB0_SX170 MJEB1_SI1467 MJEB1_SI837 MJFR0_SA1 MJFR0_SX435 MJHI0_SI1328 MJJJ0_SI1163 MJJM0_SI1251 MJLB0_SI1616 MJLS0_SI1726 MJMA0_SI2125 MJMD0_SI2288 MJMM0_SI1255 MJMM0_SX175 MJPG0_SI1821 MJPM0_SI1368 MJPM1_SX311 MJRA0_SX336 MJRG0_SI736 MJRG0_SX352 MJRH0_SI1840 MJRH1_SI1558 MJRK0_SI880 MJRP0_SI1845 MJSR0_SI2054 MJSR0_SI794 MJWG0_SI813 MJWG0_SI895 MJWG0_SX175 MJWS0_SX333 MJWT0_SI1291 MJWT0_SI1381 MJXL0_SI1172 MKAG0_SI979 MKAH0_SX178 MKAM0_SI1250 MKAM0_SI1465 MKDD0_SI1567 MKDD0_SI2197 MKDD0_SI937 MKDT0_SI814 MKES0_SI623 MKLS0_SI1437 MKLS0_SI2067 MKLS1_SI915 MKLW0_SI1571 MKLW0_SX311 MKRG0_SI861 MKXL0_SI1815 MKXL0_SI1958 MLBC0_SI1239 MLEL0_SI616 MLEL0_SX166 MLJC0_SI1225 MLJH0_SA1 MLJH0_SA2 MLJH0_SI1422 MLJH0_SI694 MLJH0_SX244 MLSH0_SI1417 MLSH0_SX247 MMAA0_SI1588 MMAA0_SI845 MMAB1_SI864 MMAB1_SX324 MMAG0_SA1 MMAG0_SI1756 MMAG0_SX136 
MMAR0_SI1966 MMAR0_SX166 MMAR0_SX346 MMBS0_SI521 MMBS0_SX161 MMCC0_SI1338 MMDB0_SI987 MMDG0_SI1780 MMDM0_SI1311 MMDM1_SX153 MMDM1_SX333 MMEB0_SX327 MMGC0_SI1305 MMGG0_SI1079 MMGG0_SX449 MMLM0_SI2150 MMPM0_SX161 MMRP0_SX324 MMSM0_SI1106 MMSM0_SI476 MMVP0_SI654 MMVP0_SX347 MMWB0_SA1 MMWB0_SI2249 MMWB0_SX359 MMWB0_SX449 MNTW0_SI1068 MNTW0_SI1698 MPEB0_SI600 MPFU0_SI1258 MPGH0_SI675 MPGR0_SI1410 MPGR1_SI1499 MPMB0_SA1 MPMB0_SA2 MPMB0_SI1501 MPMB0_SI2131 MPMB0_SI871 MPMB0_SX151 MPMB0_SX331 MPMB0_SX421 MPMB0_SX61 MPPC0_SI1412 MPRB0_SI1215 MPRB0_SI575 MPRD0_SI801 MPRD0_SX171 MPRK0_SA1 MPRK0_SI1097 MPRK0_SI467 MPRK0_SX287 MRAB0_SI1854 MRAB1_SI848 MRAI0_SI2052 MRAI0_SI792 MRAI0_SX432 MRAM0_SI1951 MRCG0_SA2 MRCG0_SI1428 MRCG0_SX348 MRCG0_SX438 MRCW0_SI741 MRDM0_SI1044 MRDM0_SX335 MREE0_SI1104 MREE0_SI1959 MREH1_SA1 MREH1_SI1599 MREH1_SI969 MREM0_SI511 MRFK0_SI1076 MRFL0_SI1156 MRFL0_SI526 MRFL0_SX166 MRGM0_SI532 MRGM0_SX172 MRGM0_SX442 MRGS0_SI1356 MRGS0_SI726 MRGS0_SX6 MRJB1_SI1413 MRJB1_SI2021 MRJB1_SX120 MRJH0_SI1519 MRJH0_SI889 MRJH0_SX169 MRJT0_SI868 MRJT0_SX58 MRKM0_SI1267 MRKM0_SI1391 MRKM0_SI637 MRLJ0_SI790 MRLJ1_SI2301 MRLK0_SI1468 MRLR0_SI1196 MRML0_SA1 MRML0_SI1421 MRML0_SX161 MRML0_SX251 MRMS0_SI2057 MRRE0_SA1 MRRE0_SI1334 MRRE0_SI952 MRSO0_SI1206 MRSP0_SI1429 MRTC0_SI1458 MRTJ0_SA1 MRTJ0_SI772 MRTJ0_SX142 MRTJ0_SX232 MRTJ0_SX52 MRWS0_SI1102 MRXB0_SI2215 MRXB0_SI955 MSAS0_SI1376 MSAS0_SI746 MSDH0_SI980 MSDH0_SX170 MSDS0_SI1077 MSDS0_SX267 MSDS0_SX357 MSEM1_SI2070 MSEM1_SI810 MSFH0_SA1 MSFH0_SI1738 MSFH0_SX136 MSFH0_SX406 MSFV0_SI632 MSJK0_SI1596 MSJK0_SX336 MSMC0_SI509 MSMR0_SI1150 MSMS0_SI1433 MSRR0_SI1761 MSRR0_SI501 MSTF0_SI852 MSVS0_SI2198 MSVS0_SI938 MSVS0_SX398 MTAB0_SI1572 MTAB0_SX312 MTAT0_SA1 MTAT0_SI1110 MTAT0_SI811 MTAT1_SI779 MTAT1_SX149 MTAT1_SX329 MTBC0_SI543 MTCS0_SI712 MTDB0_SI1401 MTDB0_SI771 MTDP0_SA1 MTDP0_SI1521 MTDP0_SX171 MTDP0_SX351 MTER0_SA1 MTER0_SI1157 MTER0_SX437 MTJG0_SX170 MTJS0_SA2 MTJS0_SI1822 MTJS0_SI562 MTJS0_SX382 MTJU0_SI2020 
MTKD0_SI630 MTKP0_SI2283 MTKP0_SI454 MTLB0_SI1134 MTLB0_SX324 MTLC0_SI1313 MTLC0_SI1477 MTML0_SX435 MTMN0_SI582 MTMT0_SI488 MTPP0_SI1508 MTPR0_SI2230 MTPR0_SX160 MTPR0_SX430 MTQC0_SA1 MTQC0_SI1441 MTQC0_SX181 MTQC0_SX451 MTRC0_SI589 MTRR0_SI918 MTRT0_SI1227 MTXS0_SI1060 MTXS0_SI2320 MTXS0_SX160 MTXS0_SX430 MVJH0_SI1556 MVLO0_SI517 MWAC0_SI1601 MWAC0_SX161 MWAC0_SX251 MWAR0_SI1045 MWDK0_SI1436 MWEM0_SX420 MWRE0_SA2 MWRE0_SI1057 MWRE0_SX67 MWRP0_SI1443 MWSB0_SI996 MWSH0_SI1426 MWSH0_SI796 MWSH0_SX166 ================================================ FILE: examples/wav2vec/unsupervised/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .extracted_features_dataset import ExtractedFeaturesDataset from .random_input_dataset import RandomInputDataset __all__ = [ "ExtractedFeaturesDataset", "RandomInputDataset", ] ================================================ FILE: examples/wav2vec/unsupervised/data/extracted_features_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class ExtractedFeaturesDataset(FairseqDataset):
    """Dataset over pre-extracted features stored as one flat ``.npy`` matrix.

    ``{path}/{split}.npy`` is memory-mapped and sliced per utterance using the
    row counts listed (one per line) in ``{path}/{split}.lengths``. Optional
    label lines (``{path}/{split}.{labels}``) and auxiliary integer targets
    (``{path}/{split}.{aux_target_postfix}``) are read line by line alongside.

    Args:
        path: directory containing the split files.
        split: file stem of the split (e.g. ``train``).
        min_length: utterances with fewer rows than this are dropped.
        max_length: utterances with more rows than this are dropped
            (``None`` disables the upper bound).
        labels: extension of the label file; targets are disabled when the
            file does not exist.
        label_dict: dictionary used to encode label lines; required when
            ``labels`` is given.
        shuffle: randomise the base order in :meth:`ordered_indices`.
        sort_by_length: order indices by length (longest first).
        aux_target_postfix: extension of the auxiliary target file.
    """

    def __init__(
        self,
        path,
        split,
        min_length=3,
        max_length=None,
        labels=None,
        label_dict=None,
        shuffle=True,
        sort_by_length=True,
        aux_target_postfix=None,
    ):
        super().__init__()

        self.min_length = min_length
        self.max_length = max_length
        self.shuffle = shuffle
        self.sort_by_length = sort_by_length
        self.label_dict = label_dict

        if labels is not None:
            assert label_dict is not None

        self.sizes = []
        self.offsets = []
        self.labels = []
        self.aux_tgt = None

        path = os.path.join(path, split)
        data_path = path
        self.data = np.load(data_path + ".npy", mmap_mode="r")

        # A missing label file silently disables targets for this split.
        # Only probe the filesystem when a label extension was requested.
        if labels is not None and not os.path.exists(path + f".{labels}"):
            labels = None

        offset = 0
        skipped = 0
        num_raw = 0  # number of lines in the .lengths file
        kept = []  # raw line numbers of the utterances that pass the filter

        with open(data_path + ".lengths", "r") as len_f, (
            open(path + f".{labels}", "r")
            if labels is not None
            else contextlib.ExitStack()
        ) as lbl_f:
            for raw_idx, line in enumerate(len_f):
                num_raw = raw_idx + 1
                length = int(line.rstrip())
                # Always consume the label line so both files stay in step,
                # even when the utterance is filtered out below.
                lbl = None if labels is None else next(lbl_f).rstrip().split()
                if length >= min_length and (
                    max_length is None or length <= max_length
                ):
                    self.sizes.append(length)
                    self.offsets.append(offset)
                    kept.append(raw_idx)
                    if lbl is not None:
                        self.labels.append(lbl)
                else:
                    skipped += 1
                # The offset advances for skipped utterances too: their rows
                # are still present in the .npy matrix.
                offset += length

        self.sizes = np.asarray(self.sizes)
        self.offsets = np.asarray(self.offsets)

        if aux_target_postfix is not None:
            aux_path = path + f".{aux_target_postfix}"
            if not os.path.exists(aux_path):
                logger.info(f"auxaliry target for {split} missing")
            else:
                with open(aux_path, "r") as t_f:
                    aux_tgt = [
                        torch.LongTensor(list(map(int, seg.strip().split())))
                        for seg in t_f
                    ]
                # __getitem__ indexes aux targets with the *filtered* index.
                # When the file has one row per raw utterance, drop the rows
                # of skipped utterances so the two stay aligned.
                if skipped > 0 and len(aux_tgt) == num_raw:
                    aux_tgt = [aux_tgt[i] for i in kept]
                self.aux_tgt = aux_tgt

        logger.info(f"loaded {len(self.offsets)}, skipped {skipped} samples")

    def __getitem__(self, index):
        """Return a dict with ``id``, float ``features`` and optional targets."""
        offset = self.offsets[index]
        end = self.sizes[index] + offset
        # Copy out of the read-only memory map before handing it to torch.
        feats = torch.from_numpy(self.data[offset:end].copy()).float()

        res = {"id": index, "features": feats}
        if len(self.labels) > 0:
            res["target"] = self.label_dict.encode_line(
                self.labels[index],
                line_tokenizer=lambda x: x,
                append_eos=False,
            )

        if self.aux_tgt:
            res["aux_target"] = self.aux_tgt[index]

        return res

    def __len__(self):
        return len(self.sizes)

    def collater(self, samples):
        """Zero-pad features to the longest sample and build a padding mask.

        Returns an empty dict for an empty batch. ``padding_mask`` is ``True``
        at padded positions. Auxiliary targets are padded with ``-1``.
        """
        if len(samples) == 0:
            return {}

        features = [s["features"] for s in samples]
        sizes = [len(s) for s in features]

        target_size = max(sizes)

        collated_features = features[0].new_zeros(
            len(features), target_size, features[0].size(-1)
        )
        padding_mask = torch.BoolTensor(collated_features.shape[:-1]).fill_(False)
        for i, (f, size) in enumerate(zip(features, sizes)):
            collated_features[i, :size] = f
            padding_mask[i, size:] = True

        res = {
            "id": torch.LongTensor([s["id"] for s in samples]),
            "net_input": {"features": collated_features, "padding_mask": padding_mask},
        }

        if len(self.labels) > 0:
            target = data_utils.collate_tokens(
                [s["target"] for s in samples],
                pad_idx=self.label_dict.pad(),
                left_pad=False,
            )
            res["target"] = target

        if self.aux_tgt:
            idxs = torch.nn.utils.rnn.pad_sequence(
                [s["aux_target"] for s in samples],
                batch_first=True,
                padding_value=-1,
            )
            res["net_input"]["aux_target"] = idxs

        return res

    def num_tokens(self, index):
        return self.size(index)

    def size(self, index):
        return self.sizes[index]

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        if self.shuffle:
            order = [np.random.permutation(len(self))]
        else:
            order = [np.arange(len(self))]

        if self.sort_by_length:
            # lexsort uses the last key (sizes) as primary; reversing the
            # result puts the longest utterances first.
            order.append(self.sizes)
            return np.lexsort(order)[::-1]
        else:
            return order[0]
class RandomInputDataset(BaseWrapperDataset):
    """Wrapper that overwrites one field of every item with a random sample.

    The field is addressed by ``input_key_path``, a sequence of nested keys
    (a bare string is treated as a one-element path). On every item access the
    field is replaced by an element drawn with ``random.choice`` from
    ``random_input_dataset``; at collation time the drawn values are padded
    with ``pad_idx`` and stored under the same key path in the batch (inside
    ``net_input`` when ``add_to_input`` is true).
    """

    def __init__(
        self,
        dataset,
        random_input_dataset,
        input_key_path: List[str],
        add_to_input,
        pad_idx,
    ):
        super().__init__(dataset)
        if isinstance(input_key_path, str):
            input_key_path = [input_key_path]
        assert len(input_key_path) > 0

        self.random_input_dataset = random_input_dataset
        self.input_key_path = input_key_path
        self.add_to_input = add_to_input
        self.pad_idx = pad_idx

    def get_target(self, item):
        """Return ``(last_key, container)`` for the configured key path."""
        *parents, leaf = self.input_key_path
        container = item
        for key in parents:
            container = container[key]
        return leaf, container

    def get_target_value(self, item):
        """Return the value currently stored at the configured key path."""
        leaf, container = self.get_target(item)
        return container[leaf]

    def __getitem__(self, index):
        sample = self.dataset[index]
        leaf, container = self.get_target(sample)
        container[leaf] = random.choice(self.random_input_dataset)
        return sample

    def collater(self, samples):
        """Collate with the wrapped dataset, then attach the padded random inputs."""
        batch = self.dataset.collater(samples)
        if len(batch) == 0:
            return batch

        # Only keep values for samples the wrapped collater actually retained.
        retained_ids = set(batch["id"].tolist())
        values = [
            self.get_target_value(sample)
            for sample in samples
            if sample["id"] in retained_ids
        ]
        padded = data_utils.collate_tokens(
            values,
            pad_idx=self.pad_idx,
            left_pad=False,
        )

        destination = batch["net_input"] if self.add_to_input else batch
        leaf, container = self.get_target(destination)
        container[leaf] = padded
        return batch
## Training

Assuming the following have been prepared:
- `w2v_dir`: contains features `{train,valid}.{npy,lengths}`, real transcripts `{train,valid}.${label}`, and dict `dict.${label}.txt`
- `lab_dir`: contains pseudo labels `{train,valid}.txt`
- `arpa_lm`: Arpa-format n-gram phone LM for decoding
- `arpa_lm_bin`: Arpa-format n-gram phone LM for unsupervised model selection to be used with KenLM

Set these variables in `train.sh`, as well as `out_dir`, the output directory, and then run it.

The output will be:
```
==== WER w.r.t. real transcript (select based on unsupervised metric)
INFO:root:./out/exp/mono/decode_valid/scoring/14.0.0.tra.txt: score 0.9178 wer 28.71% lm_ppl 24.4500 gt_wer 25.57%
INFO:root:./out/exp/tri1/decode_valid/scoring/17.1.0.tra.txt: score 0.9257 wer 26.99% lm_ppl 30.8494 gt_wer 21.90%
INFO:root:./out/exp/tri2b/decode_valid/scoring/8.0.0.tra.txt: score 0.7506 wer 23.15% lm_ppl 25.5944 gt_wer 15.78%
```
where `wer` is the word error rate with respect to the pseudo label, `gt_wer` to the ground truth label, `lm_ppl` the language model perplexity of HMM predicted transcripts, and `score` is the unsupervised metric for model selection. We choose the model and the LM parameter of the one with the lowest score. In the example above, it is `tri2b`, `8.0.0`.

## Decoding into Phones

In `decode_phone.sh`, set `out_dir` the same as used in `train.sh`, set `dec_exp` and `dec_lmparam` to the selected model and LM parameter (e.g. `tri2b` and `8.0.0` in the above example). `dec_script` needs to be set according to `dec_exp`: for mono/tri1/tri2b, use `decode.sh`; for tri3b, use `decode_fmllr.sh`.

The output will be saved at `out_dir/dec_data`.

## Decoding into Words

`decode_word_step1.sh` prepares WFSTs for word decoding.
Besides the variables mentioned above, set - `wrd_arpa_lm`: Arpa-format n-gram word LM for decoding - `wrd_arpa_lm_bin`: Arpa-format n-gram word LM for unsupervised model selection `decode_word_step1.sh` decodes the `train` and `valid` split into word and runs unsupervised model selection using the `valid` split. The output is like: ``` INFO:root:./out/exp/tri2b/decodeword_valid/scoring/17.0.0.tra.txt: score 1.8693 wer 24.97% lm_ppl 1785.5333 gt_wer 31.45% ``` After determining the LM parameter (`17.0.0` in the example above), set it in `decode_word_step2.sh` and run it. The output will be saved at `out_dir/dec_data_word`. ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/cmd.sh ================================================ # you can change cmd.sh depending on what type of queue you are using. # If you have no queueing system and want to run on a local machine, you # can change all instances 'queue.pl' to run.pl (but be careful and run # commands one by one: most recipes will exhaust the memory on your # machine). queue.pl works with GridEngine (qsub). slurm.pl works # with slurm. Different queues are configured differently, with different # queue names and different ways of specifying things like memory; # to account for these differences you can create and edit the file # conf/queue.conf to match your queue's configuration. Search for # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. export train_cmd="run.pl --mem 2G" export decode_cmd="run.pl --mem 4G" export mkgraph_cmd="run.pl --mem 8G" ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/decode_phone.sh ================================================ #!/bin/bash # decode into phones (and prepare a new data directory for HMM outputs) . 
./path.sh set -eu out_dir= # same as in train.sh dec_lmparam= # LM hyperparameters (e.g., 7.0.0) dec_exp= dec_script= dec_splits="train valid" dec_data_dir=$out_dir/dec_data # where to write HMM output data_dir=${out_dir}/data local/decode.sh --nj 40 --graph_name graph \ --val_sets "$dec_splits" --decode_script $dec_script \ $out_dir/exp/$dec_exp $data_dir $data_dir/lang_test if [ ! -z $dec_lmparam ]; then for x in $dec_splits; do mkdir -p $dec_data_dir/$x cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $dec_data_dir/$x/ tra=$out_dir/exp/$dec_exp/decode_${x}/scoring/${dec_lmparam}.tra cat $tra | utils/int2sym.pl -f 2- $data_dir/lang/words.txt | \ sed 's:<UNK>::g' | sed 's:<SIL>::g' > $dec_data_dir/${x}/text utils/fix_data_dir.sh $dec_data_dir/${x} echo "WER on ${x} is" $(compute-wer ark:$data_dir/${x}_gt/text ark:$dec_data_dir/$x/text | cut -d" " -f2-) done fi ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step1.sh ================================================ #!/bin/bash # prepare word WFSTs, reference data, and decode set -eu w2v_dir= # same as in train.sh out_dir= # same as in train.sh lexicon= # word to phone mapping wrd_arpa_lm= # word LM wrd_arpa_lm_bin= # word LM for KenLM, used in unsupervised selection dec_exp= # what HMM stage to decode (e.g., tri3b) dec_script= # what decoding script to use (e.g., steps/decode_fmllr.sh) phn_label=phnc wrd_label=wrd dec_suffix=word dec_splits="train valid" valid_split="valid" data_dir=$out_dir/data wrd_data_dir=$out_dir/data_word lexicon_clean=$(mktemp) cat $lexicon | sort | uniq > $lexicon_clean local/prepare_lang_word.sh $w2v_dir/dict.${phn_label}.txt $data_dir $lexicon_clean && rm $lexicon_clean local/prepare_lm.sh --langdir $data_dir/lang_word --lmdir $data_dir/lang_test_word $wrd_arpa_lm $data_dir for x in $dec_splits; do x_gt=${x}_gt mkdir -p $wrd_data_dir/$x_gt cp $data_dir/$x_gt/{feats.scp,cmvn.scp,utt2spk,spk2utt} $wrd_data_dir/$x_gt/ 
python local/copy_aligned_text.py < $w2v_dir/$x.$wrd_label > $wrd_data_dir/$x_gt/text done local/decode.sh --nj 40 --graph_name graph${dec_suffix} --decode_suffix $dec_suffix \ --val_sets "$dec_splits" --decode_script $dec_script \ $out_dir/exp/$dec_exp $data_dir $data_dir/lang_test_word local/unsup_select_decode_word.sh \ --split $valid_split --kenlm_path $wrd_arpa_lm_bin \ --ref_txt $wrd_data_dir/${valid_split}_gt/text \ --psd_txt $data_dir/${valid_split}/text \ --dec_name decode${dec_suffix} --graph_name graph${dec_suffix} \ --phonemize_lexicon $data_dir/local/dict_word/lexicon.txt \ $out_dir/exp ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/decode_word_step2.sh ================================================ #!/bin/bash # prepare a new data directory of HMM word output . ./path.sh set -eu out_dir= # same as in train.sh dec_lmparam= # LM hyperparameters (e.g., 7.0.0) dec_exp=tri3b # what HMM stage to decode (e.g., tri3b) dec_suffix=word dec_splits="train valid" dec_data_dir=$out_dir/dec_data_word # where to write HMM output data_dir=$out_dir/data wrd_data_dir=$out_dir/data_word for x in $dec_splits; do mkdir -p $dec_data_dir/$x cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $dec_data_dir/$x/ tra=$out_dir/exp/$dec_exp/decode${dec_suffix}_${x}/scoring/${dec_lmparam}.tra cat $tra | utils/int2sym.pl -f 2- $data_dir/lang_word/words.txt | \ sed 's:<UNK>::g' | sed 's:<SIL>::g' > $dec_data_dir/$x/text utils/fix_data_dir.sh $dec_data_dir/$x echo "WER on $x is" $(compute-wer ark:$wrd_data_dir/${x}_gt/text ark:$dec_data_dir/$x/text | cut -d" " -f2-) done ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/copy_aligned_text.py ================================================ import sys for idx, line in enumerate(sys.stdin): print(f"utt{idx:010d} {line}", end='') ================================================ FILE: 
def get_parser():
    """Build the command-line parser for the wav2vec-to-Kaldi conversion."""
    import argparse

    parser = argparse.ArgumentParser()
    for name, help_text in (
        ("w2v_dir", "wav2vec feature and text directory"),
        ("tar_root", "output data directory in kaldi's format"),
        ("split", "name of the subset"),
    ):
        parser.add_argument(name, help=help_text)
    parser.add_argument("--label", default="", help="if specified, copy labels too")
    return parser


def main():
    """Convert one split of flat ``.npy`` features into a Kaldi data directory.

    Writes ``feats.ark``/``feats.scp`` (through a ``copy-feats`` pipe),
    ``utt2spk`` and ``spk2utt`` (every utterance is its own speaker) and, when
    ``--label`` is given, ``text``. Utterance ids are ``utt`` followed by the
    zero-padded line index.
    """
    args = get_parser().parse_args()

    def utt_id(idx):
        return f"utt{idx:010d}"

    out_dir = os.path.join(args.tar_root, args.split)
    os.makedirs(out_dir, exist_ok=True)

    with open(os.path.join(args.w2v_dir, f"{args.split}.lengths")) as f:
        lengths = [int(line.rstrip()) for line in f]

    feats = np.load(
        os.path.join(args.w2v_dir, f"{args.split}.npy"),
        mmap_mode="r"
    )
    assert feats.shape[0] == sum(lengths), \
        f"lengths mismatch {feats.shape[0]} != {sum(lengths)}"

    ark_path = os.path.join(out_dir, "feats.ark")
    scp_path = os.path.join(out_dir, "feats.scp")
    wspec = f"ark:| copy-feats --compress=true ark:- ark,scp:{ark_path},{scp_path}"
    with kaldi_io.open_or_fd(wspec, "wb") as f:
        # Utterances are stored back to back, so a running start row suffices.
        start = 0
        for idx, length in enumerate(lengths):
            kaldi_io.write_mat(f, feats[start:start + length], key=utt_id(idx))
            start += length

    with open(os.path.join(out_dir, "utt2spk"), "w") as f_u2s, \
            open(os.path.join(out_dir, "spk2utt"), "w") as f_s2u:
        for idx in range(len(lengths)):
            mapping = f"{utt_id(idx)} {utt_id(idx)}\n"
            f_u2s.write(mapping)
            f_s2u.write(mapping)

    if not args.label:
        return

    lab_path = os.path.join(args.w2v_dir, f"{args.split}.{args.label}")
    with open(lab_path) as f_lab, open(os.path.join(out_dir, "text"), "w") as f_txt:
        for idx, line in enumerate(f_lab):
            # `line` keeps its own trailing newline.
            f_txt.write(f"{utt_id(idx)} {line}")


if __name__ == "__main__":
    main()
parse_options.sh set -eux dict=$1 data_dir=$2 dict_dir=$data_dir/local/dict tmplm_dir=$data_dir/local/lang_tmp lm_dir=$data_dir/lang mkdir -p $dict_dir $tmplm_dir $lm_dir # prepare dict echo "SIL" > $dict_dir/silence_phones.txt echo "SIL" > $dict_dir/optional_silence.txt awk '{print $1}' $dict > $dict_dir/nonsilence_phones.txt echo "SIL SIL" > $dict_dir/lexicon.txt echo "<UNK> SIL" >> $dict_dir/lexicon.txt awk '{print $1" "$1}' $dict >> $dict_dir/lexicon.txt echo "SIL" > $dict_dir/extra_questions.txt awk '{printf $1" "} END {printf "\n"}' $dict >> $dict_dir/extra_questions.txt # prepare lang utils/prepare_lang.sh --sil-prob $sil_prob --position-dependent-phones false \ --num_sil_states $num_sil_states --num_nonsil_states $num_nonsil_states \ $dict_dir "<UNK>" $tmplm_dir $lm_dir ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lang_word.sh ================================================ #!/bin/bash num_sil_states=3 num_nonsil_states=1 . ./cmd.sh . ./path.sh . parse_options.sh set -eux dict=$1 data_dir=$2 lexicon=$3 dict_dir=$data_dir/local/dict_word tmplm_dir=$data_dir/local/lang_tmp_word lm_dir=$data_dir/lang_word mkdir -p $dict_dir $tmplm_dir $lm_dir # prepare dict echo "SIL" > $dict_dir/silence_phones.txt echo "SIL" > $dict_dir/optional_silence.txt awk '{print $1}' $dict > $dict_dir/nonsilence_phones.txt (echo "!SIL SIL"; echo "<UNK> SIL";) | cat - $lexicon > $dict_dir/lexicon.txt echo "SIL" > $dict_dir/extra_questions.txt awk '{printf $1" "} END {printf "\n"}' $dict >> $dict_dir/extra_questions.txt # prepare lang utils/prepare_lang.sh --position-dependent-phones false \ --num_sil_states $num_sil_states --num_nonsil_states $num_nonsil_states \ $dict_dir "<UNK>" $tmplm_dir $lm_dir ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/prepare_lm.sh ================================================ #!/usr/bin/env bash langdir="" lmdir="" . 
./cmd.sh . ./path.sh . parse_options.sh arpa_lm=$1 data=$2 if [ -z $langdir ]; then langdir=$data/lang fi if [ -z $lmdir ]; then lmdir=$data/lang_test fi if [ ! -d $langdir ]; then echo "$langdir not found. run local/prepare_lang.sh first" && exit 1 fi mkdir -p $lmdir cp -r $langdir/* $lmdir if [[ "$arpa_lm" == *.gz ]]; then gunzip -c $arpa_lm | arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt - $lmdir/G.fst else arpa2fst --disambig-symbol=#0 --read-symbol-table=$lmdir/words.txt $arpa_lm $lmdir/G.fst fi fstisstochastic $lmdir/G.fst utils/validate_lang.pl $lmdir || exit 1 echo "done preparing lm ($lmdir)" ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/score.sh ================================================ #!/usr/bin/env bash # Copyright 2012 Johns Hopkins University (Author: Daniel Povey) # 2014 Guoguo Chen # Apache 2.0 [ -f ./path.sh ] && . ./path.sh # begin configuration section. cmd=run.pl stage=0 decode_mbr=true word_ins_penalty=0.0,0.5,1.0 min_lmwt=7 max_lmwt=17 iter=final #end configuration section. [ -f ./path.sh ] && . ./path.sh . parse_options.sh || exit 1; if [ $# -ne 3 ]; then echo "Usage: local/score.sh [--cmd (run.pl|queue.pl...)] <data-dir> <lang-dir|graph-dir> <decode-dir>" echo " Options:" echo " --cmd (run.pl|queue.pl...) # specify how to run the sub-processes." echo " --stage (0|1|2) # start scoring script from part-way through." echo " --decode_mbr (true/false) # maximum bayes risk decoding (confusion network)." echo " --min_lmwt <int> # minumum LM-weight for lattice rescoring " echo " --max_lmwt <int> # maximum LM-weight for lattice rescoring " exit 1; fi data=$1 lang_or_graph=$2 dir=$3 symtab=$lang_or_graph/words.txt for f in $symtab $dir/lat.1.gz $data/text; do [ ! 
-f $f ] && echo "score.sh: no such file $f" && exit 1; done mkdir -p $dir/scoring/log cat $data/text | sed 's:<NOISE>::g' | sed 's:<SPOKEN_NOISE>::g' > $dir/scoring/test_filt.txt for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/best_path.LMWT.$wip.log \ lattice-scale --inv-acoustic-scale=LMWT "ark:gunzip -c $dir/lat.*.gz|" ark:- \| \ lattice-add-penalty --word-ins-penalty=$wip ark:- ark:- \| \ lattice-best-path --word-symbol-table=$symtab \ ark:- ark,t:$dir/scoring/LMWT.$wip.tra || exit 1; done # Note: the double level of quoting for the sed command for wip in $(echo $word_ins_penalty | sed 's/,/ /g'); do $cmd LMWT=$min_lmwt:$max_lmwt $dir/scoring/log/score.LMWT.$wip.log \ cat $dir/scoring/LMWT.$wip.tra \| \ utils/int2sym.pl -f 2- $symtab \| sed 's:\<UNK\>::g' \| \ compute-wer --text --mode=present \ ark:$dir/scoring/test_filt.txt ark,p:- ">&" $dir/wer_LMWT_$wip || exit 1; done exit 0; ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/show_wer.sh ================================================ #!/bin/bash split="dev_other" ref_data="" get_best_wer=true dec_name="decode" graph_name="graph" . ./cmd.sh . ./path.sh . parse_options.sh exp_root=$1 set -eu echo "==== WER w.r.t. pseudo transcript" for x in $exp_root/*/${dec_name}_${split}*; do grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh; done if [ ! -z $ref_data ]; then echo "==== WER w.r.t. real transcript (select based on pseudo WER)" ref_txt=$ref_data/$split/text for x in $exp_root/*/${dec_name}_${split}*; do lang=$(dirname $x)/$graph_name lmwt=$( grep WER $x/wer_* 2>/dev/null | utils/best_wer.sh | sed 's/.*wer_\(.*\)$/\1/g' | sed 's/_/./g' ) tra=$x/scoring/$lmwt.tra cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \ compute-wer --text --mode=present \ ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra done fi if [ ! 
-z $ref_data ] && $get_best_wer; then echo "==== WER w.r.t. real transcript (select based on true WER)" ref_txt=$ref_data/$split/text for x in $exp_root/*/${dec_name}_${split}*; do lang=$(dirname $x)/$graph_name for tra in $x/scoring/*.tra; do cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' | sed 's:<SIL>::g' | \ compute-wer --text --mode=present \ ark:$ref_txt ark,p:- 2> /dev/null | grep WER | xargs -I{} echo {} $tra done | sort -k2n | head -n1 done fi exit 0; ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/train_subset_lgbeam.sh ================================================ #!/usr/bin/env bash out_root=/tmp out_name=train_${RANDOM} num_nonsil_states=1 valid="dev_other" train="train" mono_size="-1" # 2000 tri1_size="-1" # 5000 tri2b_size="-1" # 10000 tri3b_size="-1" # 10000 # Acoustic model parameters numLeavesTri1=2000 numGaussTri1=10000 numLeavesMLLT=2500 numGaussMLLT=15000 numLeavesSAT=2500 numGaussSAT=15000 stage=1 max_stage=1 . ./cmd.sh . ./path.sh . parse_options.sh data=$1 lang=$2 lang_test=$3 exp_root=$out_root/$out_name # you might not want to do this for interactive shells. set -e if [ $stage -le 1 ] && [ $max_stage -ge 1 ]; then # train a monophone system if [ ! $mono_size -eq -1 ]; then utils/subset_data_dir.sh $data/$train $mono_size $data/${train}_${mono_size} mono_train=${train}_${mono_size} else mono_train=${train} fi steps/train_mono.sh --boost-silence 1.25 --nj 20 --cmd "$train_cmd" \ --initial-beam 40 --regular-beam 60 --retry-beam 120 \ $data/$mono_train $lang $exp_root/mono utils/mkgraph.sh $lang_test $exp_root/mono $exp_root/mono/graph steps/decode.sh --nj 20 --cmd "$decode_cmd" \ $exp_root/mono/graph $data/$valid $exp_root/mono/decode_$valid & fi if [ $stage -le 2 ] && [ $max_stage -ge 2 ]; then # train a first delta + delta-delta triphone system on a subset of 5000 utterances if [ ! 
$tri1_size -eq -1 ]; then utils/subset_data_dir.sh $data/$train $tri1_size $data/${train}_${tri1_size} tri1_train=${train}_${tri1_size} else tri1_train=${train} fi steps/align_si.sh --boost-silence 1.25 --nj 10 --cmd "$train_cmd" \ $data/$tri1_train $lang \ $exp_root/mono $exp_root/mono_ali_${tri1_train} steps_gan/train_deltas.sh --boost-silence 1.25 --cmd "$train_cmd" \ --num_nonsil_states $num_nonsil_states $numLeavesTri1 $numGaussTri1 \ $data/$tri1_train $lang \ $exp_root/mono_ali_${tri1_train} $exp_root/tri1 utils/mkgraph.sh $lang_test $exp_root/tri1 $exp_root/tri1/graph steps/decode.sh --nj 20 --cmd "$decode_cmd" \ $exp_root/tri1/graph $data/$valid $exp_root/tri1/decode_$valid & fi if [ $stage -le 3 ] && [ $max_stage -ge 3 ]; then # train an LDA+MLLT system. if [ ! $tri2b_size -eq -1 ]; then utils/subset_data_dir.sh $data/$train $tri2b_size $data/${train}_${tri2b_size} tri2b_train=${train}_${tri2b_size} else tri2b_train=${train} fi steps/align_si.sh --nj 10 --cmd "$train_cmd" \ $data/$tri2b_train $lang \ $exp_root/tri1 $exp_root/tri1_ali_${tri2b_train} steps_gan/train_lda_mllt.sh --cmd "$train_cmd" \ --num_nonsil_states $num_nonsil_states \ --splice-opts "--left-context=3 --right-context=3" $numLeavesMLLT $numGaussMLLT \ $data/$tri2b_train $lang \ $exp_root/tri1_ali_${tri2b_train} $exp_root/tri2b utils/mkgraph.sh $lang_test $exp_root/tri2b $exp_root/tri2b/graph steps/decode.sh --nj 20 --cmd "$decode_cmd" \ $exp_root/tri2b/graph $data/$valid $exp_root/tri2b/decode_$valid & fi if [ $stage -le 4 ] && [ $max_stage -ge 4 ]; then # Train tri3b, which is LDA+MLLT+SAT on 10k utts if [ ! 
$tri3b_size -eq -1 ]; then utils/subset_data_dir.sh $data/$train $tri3b_size $data/${train}_${tri3b_size} tri3b_train=${train}_${tri3b_size} else tri3b_train=${train} fi steps/align_si.sh --nj 10 --cmd "$train_cmd" --use-graphs true \ $data/$tri3b_train $lang \ $exp_root/tri2b $exp_root/tri2b_ali_${tri2b_train} steps_gan/train_sat.sh --cmd "$train_cmd" \ --num_nonsil_states $num_nonsil_states $numLeavesSAT $numGaussSAT \ $data/$tri3b_train $lang \ $exp_root/tri2b_ali_${tri2b_train} $exp_root/tri3b utils/mkgraph.sh $lang_test $exp_root/tri3b $exp_root/tri3b/graph steps/decode_fmllr.sh --nj 20 --cmd "$decode_cmd" \ $exp_root/tri3b/graph $data/$valid $exp_root/tri3b/decode_$valid & fi wait ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select.py ================================================ """ Implement unsupervised metric for decoding hyperparameter selection: $$ alpha * LM_PPL + ViterbitUER(%) * 100 $$ """ import argparse import logging import math import sys import kenlm import editdistance from g2p_en import G2p logging.root.setLevel(logging.INFO) logging.basicConfig(stream=sys.stdout, level=logging.INFO) logger = logging.getLogger(__name__) def get_parser(): parser = argparse.ArgumentParser() parser.add_argument("ref_tra", help="reference pseudo labels") parser.add_argument("hyp_tra", help="decoded pseudo labels to be assess") parser.add_argument("--kenlm_path", default="/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o5.bin", help="") parser.add_argument("--uppercase", action="store_true", help="") parser.add_argument("--skipwords", default="", help="") parser.add_argument("--gt_tra", default="", help="ground truth pseudo labels for computing oracle WER") parser.add_argument("--min_vt_uer", default=0.0, type=float) parser.add_argument("--phonemize", action="store_true", help="phonemize word hypotheses, used when reference is phone transcript") 
def load_tra(tra_path):
    """Read a transcript file into a ``{utterance_id: transcript}`` dict.

    Each non-blank line is ``<uid> <tok> <tok> ...``; the tokens are re-joined
    with single spaces. A line holding only an id maps to an empty transcript.
    Blank lines are ignored instead of raising ``IndexError``.
    """
    uid_to_tra = {}
    with open(tra_path, "r") as f:
        for line in f:
            toks = line.split()
            if not toks:
                continue
            uid_to_tra[toks[0]] = " ".join(toks[1:])
    logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}")
    return uid_to_tra


def load_lex(lex_path):
    """Read a lexicon file into a ``{word: [phone, ...]}`` dict.

    Each non-blank line is ``<word> <phone> <phone> ...``. Blank lines are
    ignored and a word without a pronunciation maps to an empty list (both
    used to raise ``ValueError``). If a word appears several times, the last
    entry wins.
    """
    w2p = {}
    with open(lex_path, "r") as f:
        for line in f:
            parts = line.split(None, 1)
            if not parts:
                continue
            w2p[parts[0]] = parts[1].split() if len(parts) > 1 else []
    return w2p


def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict):
    """Return the corpus-level error rate of the hypotheses against the references.

    The rate is the summed edit distance divided by the total number of
    reference tokens, over the utterances present in ``hyp_uid_to_tra``.

    Args:
        ref_uid_to_tra: reference transcripts keyed by utterance id.
        hyp_uid_to_tra: hypothesis transcripts keyed by utterance id; every id
            must also be present in ``ref_uid_to_tra``.
        g2p: optional callable turning a hypothesis string into a token
            sequence; used only when ``g2p_dict`` is ``None``. Quote and space
            tokens are dropped and a trailing digit is stripped from each
            remaining token.
        g2p_dict: optional ``{word: [phone, ...]}`` lexicon used to expand each
            hypothesis word; unknown words are logged and skipped.

    Returns:
        The error rate as a fraction. With no reference tokens it is ``0.0``
        when there are no errors and ``inf`` otherwise (instead of raising
        ``ZeroDivisionError``).

    Raises:
        KeyError: if a hypothesis id has no reference.
    """
    d_cnt = 0
    w_cnt = 0
    w_cnt_h = 0
    for uid in hyp_uid_to_tra:
        ref = ref_uid_to_tra[uid].split()
        if g2p_dict is not None:
            hyp = []
            for word in hyp_uid_to_tra[uid].split():
                if word in g2p_dict:
                    # extend in place; `hyp = hyp + ...` re-copied the list
                    # for every word.
                    hyp.extend(g2p_dict[word])
                else:
                    logger.warning(f"{word} not in g2p_dict")
        elif g2p is not None:
            hyp = g2p(hyp_uid_to_tra[uid])
            hyp = [p for p in hyp if p != "'" and p != " "]
            # p[-1:] (not p[-1]) so an empty token cannot raise IndexError.
            hyp = [p[:-1] if p[-1:].isnumeric() else p for p in hyp]
        else:
            hyp = hyp_uid_to_tra[uid].split()
        logger.debug((
            f"======================\n"
            f"HYP: {' '.join(hyp)}\n"
            f"REF: {' '.join(ref)}"
        ))
        d_cnt += editdistance.eval(ref, hyp)
        w_cnt += len(ref)
        w_cnt_h += len(hyp)

    if w_cnt > 0:
        wer = float(d_cnt) / w_cnt
    else:
        # No reference tokens: any edit is an insertion, so the rate is
        # unbounded unless the hypotheses are empty as well.
        wer = 0.0 if d_cnt == 0 else math.inf
    logger.debug((
        f"wer = {wer*100:.2f}%; num. of ref words = {w_cnt}; "
        f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}"
    ))
    return wer
of words = {w_cnt}") return lm_ppl def main(): args = get_parser().parse_args() logger.debug(f"Args: {args}") ref_uid_to_tra = load_tra(args.ref_tra) hyp_uid_to_tra = load_tra(args.hyp_tra) assert not bool(set(hyp_uid_to_tra.keys()) - set(ref_uid_to_tra.keys())) lm = kenlm.Model(args.kenlm_path) skipwords = set(args.skipwords.split(",")) def compute_lm_score(s): s = " ".join(w for w in s.split() if w not in skipwords) s = s.upper() if args.uppercase else s return lm.score(s) g2p, g2p_dict = None, None if args.phonemize: if args.phonemize_lexicon: g2p_dict = load_lex(args.phonemize_lexicon) else: g2p = G2p() wer = compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p, g2p_dict) lm_ppl = compute_lm_ppl(hyp_uid_to_tra, compute_lm_score) gt_wer = -math.inf if args.gt_tra: gt_uid_to_tra = load_tra(args.gt_tra) gt_wer = compute_wer(gt_uid_to_tra, hyp_uid_to_tra, None, None) score = math.log(lm_ppl) * max(wer, args.min_vt_uer) logging.info(f"{args.hyp_tra}: score={score:.4f}; wer={wer*100:.2f}%; lm_ppl={lm_ppl:.4f}; gt_wer={gt_wer*100:.2f}%") if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/local/unsup_select_decode.sh ================================================ #!/bin/bash split="dev_other" ref_txt="" # ground truth transcript path psd_txt="" # pseudo transcript path get_best_wer=true dec_name="decode" graph_name="graph" kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin . ./cmd.sh . ./path.sh . parse_options.sh exp_root=$1 unsup_args="" if [ $# -ge 2 ]; then unsup_args=$2 fi set -eu if [ ! -z $ref_txt ] && $get_best_wer; then echo "==== WER w.r.t. 
#!/bin/bash

# Unsupervised selection of the word-decoding LM parameters: for every
# decode directory under <exp_root>, score each scoring/*.tra with
# local/unsup_select.py and print the entry with the lowest score.
#
# Usage: local/unsup_select_decode_word.sh [options] <exp_root>

split="dev_other"
ref_txt="" # ground truth transcript path
psd_txt="" # pseudo transcript path
get_best_wer=true
dec_name="decode"
graph_name="graph"
kenlm_path=/checkpoint/abaevski/data/speech/libri/librispeech_lm_novox.phnc_o6.bin
phonemize_lexicon=""

. ./cmd.sh
. ./path.sh
. parse_options.sh

# Site-specific environment of the original author; only source it where it
# actually exists so the script does not emit an error everywhere else.
env_sh=/private/home/wnhsu/unsup_asr/fairseq-py-unsup/env.sh
if [ -f "$env_sh" ]; then
  . "$env_sh"
fi

exp_root=$1

set -eu

if [ -n "$ref_txt" ] && $get_best_wer; then
  echo "==== WER w.r.t. real transcript (select based on unsupervised metric)"
  for x in $exp_root/*/${dec_name}_${split}*; do
    lang=$(dirname $x)/$graph_name

    for tra in $x/scoring/*.tra; do
      # Match the literal token <UNK>. The previous pattern '\<UNK\>' was in
      # plain single quotes, so sed read \< and \> as word boundaries and
      # left a stray "<>" in the hypothesis.
      cat $tra | utils/int2sym.pl -f 2- $lang/words.txt | sed 's:<UNK>::g' > $tra.txt
      python local/unsup_select.py $psd_txt $tra.txt \
        --kenlm_path $kenlm_path --gt_tra $ref_txt --phonemize \
        --phonemize_lexicon "$phonemize_lexicon"
    done | grep "score=" | sed 's/=/ /g' | sed 's/;//g' | sort -k3n | head -n1
  done
fi
&& exit 1
. $KALDI_ROOT/tools/config/common_path.sh
export LC_ALL=C

================================================
FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_deltas.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0

# Trains a GMM system on CMVN + delta features, starting from the alignments
# in <alignment-dir>: builds a phonetic decision tree, initialises a model
# from it and then runs $num_iters-1 passes of re-estimation, re-aligning on
# the iterations listed in $realign_iters.
#
# Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>

# Begin configuration.
stage=-4 #  This allows restarting after partway, when something went wrong.
config=
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
num_iters=35    # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
beam=10
careful=false
retry_beam=40
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
norm_vars=false # deprecated.  Prefer --cmvn-opts "--norm-vars=true"
                # use the option --cmvn-opts "--norm-means=false"
cmvn_opts=
delta_opts=
context_opts=   # use"--context-width=5 --central-position=2" for quinphone
num_nonsil_states=3
# End configuration.

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh;
. parse_options.sh || exit 1;

if [ $# != 6 ]; then
   echo "Usage: steps/train_deltas.sh <num-leaves> <tot-gauss> <data-dir> <lang-dir> <alignment-dir> <exp-dir>"
   echo "e.g.: steps/train_deltas.sh 2000 10000 data/train_si84_half data/lang exp/mono_ali exp/tri1"
   echo "main options (for others, see top of script file)"
   echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
   echo " --config <config-file> # config containing options"
   echo " --stage <stage> # stage to do partial re-run from."
   exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "train_deltas.sh: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf and add $incgauss per iteration until
# iteration $max_iter_inc (see the end of the training loop).
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter increment for #Gauss
oov=`cat $lang/oov.int` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;
mkdir -p $dir/log
echo $nj > $dir/num_jobs

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;


[ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
  echo "$0: warning: ignoring CMVN options from source directory $alidir"
$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
echo $cmvn_opts  > $dir/cmvn_opts # keep track of options to CMVN.
# Quote the expansion: delta options usually contain several words, and an
# unquoted "[ ! -z a b ]" fails with "too many arguments", which silently
# skipped writing $dir/delta_opts.
[ ! -z "$delta_opts" ] && echo $delta_opts > $dir/delta_opts

feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |"

rm $dir/.error 2>/dev/null

if [ $stage -le -3 ]; then
  echo "$0: accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts \
    --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  sum-tree-stats $dir/treeacc $dir/*.treeacc 2>$dir/log/sum_tree_acc.log || exit 1;
  rm $dir/*.treeacc
fi

if [ $stage -le -2 ]; then
  echo "$0: getting questions for tree-building, via clustering"
  # preparing questions, roots file...
  # The pdf-class list is num_nonsil_states / 2 (integer division; 1 for the
  # default of 3 states).
  cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts \
    $dir/treeacc $lang/phones/sets.int \
    $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int \
    $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;

  $cmd $dir/log/init_model.log \
    gmm-init-model --write-occs=$dir/1.occs \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl || exit 1;
  if grep 'no stats' $dir/log/init_model.log; then
     echo "** The warnings above about 'no stats' generally mean you have phones **"
     echo "** (or groups of phones) in your phone set that had no corresponding data. **"
     echo "** You should probably figure out whether something went wrong, **"
     echo "** or whether your data just doesn't happen to have examples of those **"
     echo "** phones. **"
  fi

  gmm-mixup --mix-up=$numgauss $dir/1.mdl $dir/1.occs $dir/1.mdl 2>$dir/log/mixup.log || exit 1;
  rm $dir/treeacc
fi

if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

if [ $stage -le 0 ]; then
  echo "$0: compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

x=1
while [ $x -lt $num_iters ]; do
  echo "$0: training pass $x"
  if [ $stage -le $x ]; then
    if echo $realign_iters | grep -w $x >/dev/null; then
      echo "$0: aligning data"
      mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
      $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
        gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
         "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
         "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
    fi
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
       "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --mix-up=$numgauss --power=$power \
        --write-occs=$dir/$[$x+1].occs $dir/$x.mdl \
       "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    # Only the newest model/occs are kept; earlier iterations are deleted.
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done

# After the loop $x equals $num_iters, i.e. the last model that was written.
rm $dir/final.mdl $dir/final.occs 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training system with delta+delta-delta features in $dir"

exit 0

================================================
FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_lda_mllt.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey)
#
# LDA+MLLT refers to the way we transform the features after computing
# the MFCCs: we splice across several frames, reduce the dimension (to 40
# by default) using Linear Discriminant Analysis), and then later estimate,
# over multiple iterations, a diagonalizing transform known as MLLT or STC.
# See http://kaldi-asr.org/doc/transform.html for more explanation.
#
# Apache 2.0.
#
# Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>

# Begin configuration.
cmd=run.pl
config=
stage=-5
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
realign_iters="10 20 30";
mllt_iters="2 4 6 12";
num_iters=35    # Number of iterations of training
max_iter_inc=25  # Last iter to increase #Gauss on.
dim=40
beam=10
retry_beam=40
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
power=0.25 # Exponent for number of gaussians according to occurrence counts
randprune=4.0 # This is approximately the ratio by which we will speed up the
              # LDA and MLLT calculations via randomized pruning.
splice_opts=
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
norm_vars=false # deprecated.  Prefer --cmvn-opts "--norm-vars=true"
                # (that is the option this flag prepends to $cmvn_opts below).
cmvn_opts=
context_opts=   # use "--context-width=5 --central-position=2" for quinphone.
# End configuration.
train_tree=true  # if false, don't actually train the tree.
use_lda_mat=  # If supplied, use this LDA[+MLLT] matrix.
num_nonsil_states=3

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 6 ]; then
  echo "Usage: steps/train_lda_mllt.sh [options] <#leaves> <#gauss> <data> <lang> <alignments> <dir>"
  echo " e.g.: steps/train_lda_mllt.sh 2500 15000 data/train_si84 data/lang exp/tri1_ali_si84 exp/tri2b"
  echo "Main options (for others, see top of script file)"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --config <config-file> # config containing options"
  echo " --stage <stage> # stage to do partial re-run from."
  exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

for f in $alidir/final.mdl $alidir/ali.1.gz $data/feats.scp $lang/phones.txt; do
  [ ! -f $f ] && echo "train_lda_mllt.sh: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf; $incgauss is added after every iteration
# up to and including iteration $max_iter_inc.
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc] # per-iter #gauss increment
oov=`cat $lang/oov.int` || exit 1;
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl` || exit 1;
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;

mkdir -p $dir/log

utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
echo "$splice_opts" >$dir/splice_opts # keep track of frame-splicing options
           # so that later stages of system building can know what they were.


[ $(cat $alidir/cmvn_opts 2>/dev/null | wc -c) -gt 1 ] && [ -z "$cmvn_opts" ] && \
  echo "$0: warning: ignoring CMVN options from source directory $alidir"
$norm_vars && cmvn_opts="--norm-vars=true $cmvn_opts"
echo $cmvn_opts > $dir/cmvn_opts # keep track of options to CMVN.

sdata=$data/split$nj;
split_data.sh $data $nj || exit 1;

splicedfeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- |"
# Note: $feats gets overwritten later in the script.
feats="$splicedfeats transform-feats $dir/0.mat ark:- ark:- |"



# Stage -5: obtain the initial transform $dir/0.mat, either estimated by LDA
# from the source alignments or copied from --use-lda-mat.
if [ $stage -le -5 ]; then
  if [ -z "$use_lda_mat" ]; then
    echo "$0: Accumulating LDA statistics."
    rm $dir/lda.*.acc 2>/dev/null
    $cmd JOB=1:$nj $dir/log/lda_acc.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post 0.0 $silphonelist $alidir/final.mdl ark:- ark:- \| \
      acc-lda --rand-prune=$randprune $alidir/final.mdl "$splicedfeats" ark,s,cs:- \
      $dir/lda.JOB.acc || exit 1;
    est-lda --write-full-matrix=$dir/full.mat --dim=$dim $dir/0.mat $dir/lda.*.acc \
      2>$dir/log/lda_est.log || exit 1;
    rm $dir/lda.*.acc
  else
    echo "$0: Using supplied LDA matrix $use_lda_mat"
    cp $use_lda_mat $dir/0.mat || exit 1;
    [ ! -z "$mllt_iters" ] && \
      echo "$0: Warning: using supplied LDA matrix $use_lda_mat but we will do MLLT," && \
      echo " which you might not want; to disable MLLT, specify --mllt-iters ''" && \
      sleep 5
  fi
fi

# Index of the most recent transform matrix ($dir/$cur_lda_iter.mat).
cur_lda_iter=0

if [ $stage -le -4 ] && $train_tree; then
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts \
    --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ `ls $dir/*.treeacc | wc -w` -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi


if [ $stage -le -3 ] && $train_tree; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) $context_opts $dir/treeacc $lang/phones/sets.int \
    $dir/questions.int 2> $dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $lang/topo $dir/questions.int \
    $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  if $train_tree; then
    gmm-init-model --write-occs=$dir/1.occs \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
    grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
    rm $dir/treeacc
  else
    # Without tree training, reuse the tree from the alignment directory.
    cp $alidir/tree $dir/ || exit 1;
    $cmd JOB=1 $dir/log/init_model.log \
      gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
        "$feats subset-feats ark:- ark:-|" || exit 1;
  fi
fi


if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $data/split$nj/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi


x=1
while [ $x -lt $num_iters ]; do
  echo Training pass $x
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi
  if echo $mllt_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo "$0: Estimating MLLT"
      $cmd JOB=1:$nj $dir/log/macc.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post 0.0 $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-acc-mllt --rand-prune=$randprune $dir/$x.mdl "$feats" ark:- $dir/$x.JOB.macc \
        || exit 1;
      est-mllt $dir/$x.mat.new $dir/$x.*.macc 2> $dir/log/mupdate.$x.log || exit 1;
      gmm-transform-means $dir/$x.mat.new $dir/$x.mdl $dir/$x.mdl \
        2> $dir/log/transform_means.$x.log || exit 1;
      # The new transform is composed with the previous matrix to give $x.mat.
      compose-transforms --print-args=false $dir/$x.mat.new $dir/$cur_lda_iter.mat $dir/$x.mat || exit 1;
      rm $dir/$x.*.macc
    fi
    # Updated even when the stage check skips estimation, so a restarted run
    # still points $feats at the matrix for this iteration.
    feats="$splicedfeats transform-feats $dir/$x.mat ark:- ark:- |"
    cur_lda_iter=$x
  fi

  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss --power=$power \
      $dir/$x.mdl "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc $dir/$x.occs
  fi
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done

# After the loop $x equals $num_iters, i.e. the last model that was written.
rm $dir/final.{mdl,mat,occs} 2>/dev/null
ln -s $x.mdl $dir/final.mdl
ln -s $x.occs $dir/final.occs
ln -s $cur_lda_iter.mat $dir/final.mat

steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir

# Summarize warning messages...
utils/summarize_warnings.pl $dir/log

steps/info/gmm_dir_info.pl $dir

echo "$0: Done training system with LDA+MLLT features in $dir"

exit 0

================================================
FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/steps_gan/train_sat.sh
================================================
#!/usr/bin/env bash

# Copyright 2012  Johns Hopkins University (Author: Daniel Povey).  Apache 2.0.


# This does Speaker Adapted Training (SAT), i.e. train on
# fMLLR-adapted features.  It can be done on top of either LDA+MLLT, or
# delta and delta-delta features.  If there are no transforms supplied
# in the alignment directory, it will estimate transforms itself before
# building the tree (and in any case, it estimates transforms a number
# of times during training).


# Begin configuration section.
stage=-5
exit_stage=-100 # you can use this to require it to exit at the
                # beginning of a specific stage.  Not all values are
                # supported.
fmllr_update_type=full
cmd=run.pl
scale_opts="--transition-scale=1.0 --acoustic-scale=0.1 --self-loop-scale=0.1"
beam=10
retry_beam=40
careful=false
boost_silence=1.0 # Factor by which to boost silence likelihoods in alignment
context_opts=  # e.g. set this to "--context-width 5 --central-position 2" for quinphone.
realign_iters="10 20 30";
fmllr_iters="2 4 6 12";
silence_weight=0.0 # Weight on silence in fMLLR estimation.
num_iters=35   # Number of iterations of training
max_iter_inc=25 # Last iter to increase #Gauss on.
power=0.2 # Exponent for number of gaussians according to occurrence counts
cluster_thresh=-1  # for build-tree control final bottom-up clustering of leaves
phone_map=
train_tree=true
tree_stats_opts=
cluster_phones_opts=
compile_questions_opts=
# End configuration section.
num_nonsil_states=3

echo "$0 $@"  # Print the command line for logging

[ -f path.sh ] && . ./path.sh
. parse_options.sh || exit 1;

if [ $# != 6 ]; then
  echo "Usage: steps/train_sat.sh <#leaves> <#gauss> <data> <lang> <ali-dir> <exp-dir>"
  echo " e.g.: steps/train_sat.sh 2500 15000 data/train_si84 data/lang exp/tri2b_ali_si84 exp/tri3b"
  echo "Main options (for others, see top of script file)"
  echo " --cmd (utils/run.pl|utils/queue.pl <queue opts>) # how to run jobs."
  echo " --config <config-file> # config containing options"
  echo " --stage <stage> # stage to do partial re-run from."
  exit 1;
fi

numleaves=$1
totgauss=$2
data=$3
lang=$4
alidir=$5
dir=$6

for f in $data/feats.scp $lang/phones.txt $alidir/final.mdl $alidir/ali.1.gz; do
  [ ! -f $f ] && echo "train_sat.sh: no such file $f" && exit 1;
done

# Start with one Gaussian per leaf; $incgauss is added after every iteration
# up to and including iteration $max_iter_inc.
numgauss=$numleaves
incgauss=$[($totgauss-$numgauss)/$max_iter_inc]  # per-iter #gauss increment
oov=`cat $lang/oov.int`
nj=`cat $alidir/num_jobs` || exit 1;
silphonelist=`cat $lang/phones/silence.csl`
ciphonelist=`cat $lang/phones/context_indep.csl` || exit 1;
sdata=$data/split$nj;
# Feature options are inherited from the alignment directory (empty when the
# corresponding file does not exist).
splice_opts=`cat $alidir/splice_opts 2>/dev/null` # frame-splicing options.
cmvn_opts=`cat $alidir/cmvn_opts 2>/dev/null`
delta_opts=`cat $alidir/delta_opts 2>/dev/null`
phone_map_opt=
[ ! -z "$phone_map" ] && phone_map_opt="--phone-map='$phone_map'"

mkdir -p $dir/log
cp $alidir/splice_opts $dir 2>/dev/null # frame-splicing options.
cp $alidir/cmvn_opts $dir 2>/dev/null # cmn/cmvn option.
cp $alidir/delta_opts $dir 2>/dev/null # delta option.
utils/lang/check_phones_compatible.sh $lang/phones.txt $alidir/phones.txt || exit 1;
cp $lang/phones.txt $dir || exit 1;

echo $nj >$dir/num_jobs
# Re-split the data unless $sdata exists and is newer than feats.scp.
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;

# Set up features.

# An LDA matrix in the alignment directory selects spliced+transformed
# features; otherwise delta features are used.
if [ -f $alidir/final.mat ]; then feat_type=lda; else feat_type=delta; fi
echo "$0: feature type is $feat_type"

## Set up speaker-independent features.
case $feat_type in
  delta) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | add-deltas $delta_opts ark:- ark:- |";;
  lda) sifeats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $alidir/final.mat ark:- ark:- |"
    cp $alidir/final.mat $dir
    cp $alidir/full.mat $dir 2>/dev/null
    ;;
  *) echo "$0: invalid feature type $feat_type" && exit 1;
esac

## Get initial fMLLR transforms (possibly from alignment dir)
if [ -f $alidir/trans.1 ]; then
  echo "$0: Using transforms from $alidir"
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$alidir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$alidir
else
  if [ $stage -le -5 ]; then
    echo "$0: obtaining initial fMLLR transforms since not present in $alidir"
    # The next line is necessary because of $silphonelist otherwise being incorrect; would require
    # old $lang dir which would require another option.  Not needed anyway.
    [ ! -z "$phone_map" ] && \
       echo "$0: error: you must provide transforms if you use the --phone-map option." && exit 1;
    $cmd JOB=1:$nj $dir/log/fmllr.0.JOB.log \
      ali-to-post "ark:gunzip -c $alidir/ali.JOB.gz|" ark:- \| \
      weight-silence-post $silence_weight $silphonelist $alidir/final.mdl ark:- ark:- \| \
      gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
      --spk2utt=ark:$sdata/JOB/spk2utt $alidir/final.mdl "$sifeats" \
      ark:- ark:$dir/trans.JOB || exit 1;
  fi
  feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark,s,cs:$dir/trans.JOB ark:- ark:- |"
  cur_trans_dir=$dir
fi

if [ $stage -le -4 ] && $train_tree; then
  # Get tree stats.
  echo "$0: Accumulating tree stats"
  $cmd JOB=1:$nj $dir/log/acc_tree.JOB.log \
    acc-tree-stats $context_opts $tree_stats_opts $phone_map_opt --ci-phones=$ciphonelist $alidir/final.mdl "$feats" \
    "ark:gunzip -c $alidir/ali.JOB.gz|" $dir/JOB.treeacc || exit 1;
  [ "`ls $dir/*.treeacc | wc -w`" -ne "$nj" ] && echo "$0: Wrong #tree-accs" && exit 1;
  $cmd $dir/log/sum_tree_acc.log \
    sum-tree-stats $dir/treeacc $dir/*.treeacc || exit 1;
  rm $dir/*.treeacc
fi

if [ $stage -le -3 ] && $train_tree; then
  echo "$0: Getting questions for tree clustering."
  # preparing questions, roots file...
  cluster-phones --pdf-class-list=$(($num_nonsil_states / 2)) \
    $cluster_phones_opts $context_opts \
    $dir/treeacc $lang/phones/sets.int $dir/questions.int 2>$dir/log/questions.log || exit 1;
  cat $lang/phones/extra_questions.int >> $dir/questions.int
  compile-questions $context_opts $compile_questions_opts $lang/topo $dir/questions.int $dir/questions.qst 2>$dir/log/compile_questions.log || exit 1;

  echo "$0: Building the tree"
  $cmd $dir/log/build_tree.log \
    build-tree $context_opts --verbose=1 --max-leaves=$numleaves \
    --cluster-thresh=$cluster_thresh $dir/treeacc $lang/phones/roots.int \
    $dir/questions.qst $lang/topo $dir/tree || exit 1;
fi

if [ $stage -le -2 ]; then
  echo "$0: Initializing the model"
  if $train_tree; then
    gmm-init-model --write-occs=$dir/1.occs \
      $dir/tree $dir/treeacc $lang/topo $dir/1.mdl 2> $dir/log/init_model.log || exit 1;
    grep 'no stats' $dir/log/init_model.log && echo "This is a bad warning.";
    rm $dir/treeacc
  else
    # Without tree training, reuse the tree from the alignment directory.
    cp $alidir/tree $dir/ || exit 1;
    $cmd JOB=1 $dir/log/init_model.log \
      gmm-init-model-flat $dir/tree $lang/topo $dir/1.mdl \
        "$feats subset-feats ark:- ark:-|" || exit 1;
  fi
fi

if [ $stage -le -1 ]; then
  # Convert the alignments.
  echo "$0: Converting alignments from $alidir to use current tree"
  $cmd JOB=1:$nj $dir/log/convert.JOB.log \
    convert-ali $phone_map_opt $alidir/final.mdl $dir/1.mdl $dir/tree \
     "ark:gunzip -c $alidir/ali.JOB.gz|" "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
fi

[ "$exit_stage" -eq 0 ] && echo "$0: Exiting early: --exit-stage $exit_stage" && exit 0;

if [ $stage -le 0 ] && [ "$realign_iters" != "" ]; then
  echo "$0: Compiling graphs of transcripts"
  $cmd JOB=1:$nj $dir/log/compile_graphs.JOB.log \
    compile-train-graphs --read-disambig-syms=$lang/phones/disambig.int $dir/tree $dir/1.mdl $lang/L.fst \
     "ark:utils/sym2int.pl --map-oov $oov -f 2- $lang/words.txt < $sdata/JOB/text |" \
      "ark:|gzip -c >$dir/fsts.JOB.gz" || exit 1;
fi

x=1
while [ $x -lt $num_iters ]; do
  echo Pass $x
  if echo $realign_iters | grep -w $x >/dev/null && [ $stage -le $x ]; then
    echo Aligning data
    mdl="gmm-boost-silence --boost=$boost_silence `cat $lang/phones/optional_silence.csl` $dir/$x.mdl - |"
    $cmd JOB=1:$nj $dir/log/align.$x.JOB.log \
      gmm-align-compiled $scale_opts --beam=$beam --retry-beam=$retry_beam --careful=$careful "$mdl" \
      "ark:gunzip -c $dir/fsts.JOB.gz|" "$feats" \
      "ark:|gzip -c >$dir/ali.JOB.gz" || exit 1;
  fi

  if echo $fmllr_iters | grep -w $x >/dev/null; then
    if [ $stage -le $x ]; then
      echo Estimating fMLLR transforms
      # We estimate a transform that's additional to the previous transform;
      # we'll compose them.
      $cmd JOB=1:$nj $dir/log/fmllr.$x.JOB.log \
        ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
        weight-silence-post $silence_weight $silphonelist $dir/$x.mdl ark:- ark:- \| \
        gmm-est-fmllr --fmllr-update-type=$fmllr_update_type \
        --spk2utt=ark:$sdata/JOB/spk2utt $dir/$x.mdl \
        "$feats" ark:- ark:$dir/tmp_trans.JOB || exit 1;
      for n in `seq $nj`; do
        # The leading "!" inverts the subshell status, so the error branch
        # runs only when composing/moving/removing failed.
        # NOTE(review): every job writes to the same compose_transforms.$x.log,
        # so only the last job's stderr survives.
        ! ( compose-transforms --b-is-affine=true \
          ark:$dir/tmp_trans.$n ark:$cur_trans_dir/trans.$n ark:$dir/composed_trans.$n \
          && mv $dir/composed_trans.$n $dir/trans.$n && \
          rm $dir/tmp_trans.$n ) 2>$dir/log/compose_transforms.$x.log \
          && echo "$0: Error composing transforms" && exit 1;
      done
    fi
    # From here on the transforms always live in $dir.
    feats="$sifeats transform-feats --utt2spk=ark:$sdata/JOB/utt2spk ark:$dir/trans.JOB ark:- ark:- |"
    cur_trans_dir=$dir
  fi

  if [ $stage -le $x ]; then
    $cmd JOB=1:$nj $dir/log/acc.$x.JOB.log \
      gmm-acc-stats-ali $dir/$x.mdl "$feats" \
      "ark,s,cs:gunzip -c $dir/ali.JOB.gz|" $dir/$x.JOB.acc || exit 1;
    [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
    $cmd $dir/log/update.$x.log \
      gmm-est --power=$power --write-occs=$dir/$[$x+1].occs --mix-up=$numgauss $dir/$x.mdl \
      "gmm-sum-accs - $dir/$x.*.acc |" $dir/$[$x+1].mdl || exit 1;
    rm $dir/$x.mdl $dir/$x.*.acc
    rm $dir/$x.occs
  fi
  [ $x -le $max_iter_inc ] && numgauss=$[$numgauss+$incgauss];
  x=$[$x+1];
done


if [ $stage -le $x ]; then
  # Accumulate stats for "alignment model"-- this model is
  # computed with the speaker-independent features, but matches Gaussian-for-Gaussian
  # with the final speaker-adapted model.
  $cmd JOB=1:$nj $dir/log/acc_alimdl.JOB.log \
    ali-to-post "ark:gunzip -c $dir/ali.JOB.gz|" ark:- \| \
    gmm-acc-stats-twofeats $dir/$x.mdl "$feats" "$sifeats" \
    ark,s,cs:- $dir/$x.JOB.acc || exit 1;
  [ `ls $dir/$x.*.acc | wc -w` -ne "$nj" ] && echo "$0: Wrong #accs" && exit 1;
  # Update model.
$cmd $dir/log/est_alimdl.log \ gmm-est --power=$power --remove-low-count-gaussians=false $dir/$x.mdl \ "gmm-sum-accs - $dir/$x.*.acc|" $dir/$x.alimdl || exit 1; rm $dir/$x.*.acc fi rm $dir/final.{mdl,alimdl,occs} 2>/dev/null ln -s $x.mdl $dir/final.mdl ln -s $x.occs $dir/final.occs ln -s $x.alimdl $dir/final.alimdl steps/diagnostic/analyze_alignments.sh --cmd "$cmd" $lang $dir utils/summarize_warnings.pl $dir/log ( echo "$0: Likelihood evolution:" for x in `seq $[$num_iters-1]`; do tail -n 30 $dir/log/acc.$x.*.log | awk '/Overall avg like/{l += $(NF-3)*$(NF-1); t += $(NF-1); } /Overall average logdet/{d += $(NF-3)*$(NF-1); t2 += $(NF-1);} END{ d /= t2; l /= t; printf("%s ", d+l); } ' done echo ) | tee $dir/log/summary.log steps/info/gmm_dir_info.pl $dir echo "$0: done training SAT system in $dir" exit 0 ================================================ FILE: examples/wav2vec/unsupervised/kaldi_self_train/st/train.sh ================================================ #!/bin/bash set -eu w2v_dir= # contains features `{train,valid}.{npy,lengths}`, real transcripts `{train,valid}.${label}`, and dict `dict.${label}.txt` lab_dir= # contains pseudo labels `{train,valid}.txt` out_dir= # output root arpa_lm= # phone LM arpa_lm_bin= # (binary) phone LM for KenLM, used in unsupervised selection label=phnc train_name="train" valid_name="valid" data_dir=${out_dir}/data mkdir -p ${out_dir}/exp local/prepare_lang.sh $w2v_dir/dict.${label}.txt $data_dir local/prepare_lm.sh $arpa_lm $data_dir for x in $train_name $valid_name; do x_gt=${x}_gt # prepare pseudo data python local/prepare_data_from_w2v.py $w2v_dir $data_dir $x steps/compute_cmvn_stats.sh $data_dir/$x $out_dir/exp/make_feat/$x $out_dir/feats/$x python local/copy_aligned_text.py < $lab_dir/$x.txt > $data_dir/$x/text # prepare ground truth data mkdir $data_dir/$x_gt cp $data_dir/$x/{feats.scp,cmvn.scp,utt2spk,spk2utt} $data_dir/$x_gt/ python local/copy_aligned_text.py < $w2v_dir/$x.$label > $data_dir/$x_gt/text done 
local/train_subset_lgbeam.sh \ --out_root ${out_dir} --out_name exp --train $train_name --valid $valid_name \ --mono_size 2000 --tri1_size 5000 --tri2b_size -1 --tri3b_size -1 \ --stage 1 --max_stage 3 $data_dir $data_dir/lang $data_dir/lang_test local/unsup_select_decode.sh \ --split $valid_name --kenlm_path $arpa_lm_bin \ --ref_txt $data_dir/${valid_name}_gt/text \ --psd_txt $data_dir/${valid_name}/text \ $out_dir/exp ================================================ FILE: examples/wav2vec/unsupervised/models/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .wav2vec_u import Wav2vec_U __all__ = [ "Wav2vec_U", ] ================================================ FILE: examples/wav2vec/unsupervised/models/wav2vec_u.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from dataclasses import dataclass, field
from enum import Enum, auto
import math
import numpy as np
from typing import Tuple, List, Optional, Dict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import autograd

from fairseq import checkpoint_utils, utils
from fairseq.dataclass import FairseqDataclass
from fairseq.models import BaseFairseqModel, register_model
from fairseq.modules import (
    SamePad,
    TransposeLast,
)


class SegmentationType(Enum):
    """Available segmentation strategies (see ``SegmentationConfig.type``)."""

    NONE = auto()
    RANDOM = auto()
    UNIFORM_RANDOM = auto()
    UNIFORM_RANDOM_JOIN = auto()
    JOIN = auto()


@dataclass
class SegmentationConfig(FairseqDataclass):
    """Options controlling how the time axis is segmented / subsampled."""

    # which segmentation strategy to use
    type: SegmentationType = SegmentationType.NONE
    # fraction of time steps kept: segmenters target ceil(T * subsample_rate)
    subsample_rate: float = 0.25
    # uniform-random segmentation: average each segment (True) or sample one
    # frame offset per sequence (False)
    mean_pool: bool = True
    mean_pool_join: bool = False
    remove_zeros: bool = False


@dataclass
class Wav2vec_UConfig(FairseqDataclass):
    """Hyper-parameters of the wav2vec-U model."""

    # discriminator options
    discriminator_kernel: int = 3
    discriminator_dilation: int = 1
    discriminator_dim: int = 256
    discriminator_causal: bool = True
    discriminator_linear_emb: bool = False
    discriminator_depth: int = 1
    discriminator_max_pool: bool = False
    discriminator_act_after_linear: bool = False
    discriminator_dropout: float = 0.0
    discriminator_spectral_norm: bool = False
    discriminator_weight_norm: bool = False

    # generator options
    generator_kernel: int = 4
    generator_dilation: int = 1
    generator_stride: int = 1
    generator_pad: int = -1
    generator_bias: bool = False
    generator_dropout: float = 0.0
    generator_batch_norm: int = 0
    generator_residual: bool = False

    blank_weight: float = 0
    blank_mode: str = "add"
    blank_is_sil: bool = False
    no_softmax: bool = False

    # loss weights / regularisation
    smoothness_weight: float = 0.0
    smoothing: float = 0.0
    smoothing_one_sided: bool = False
    gradient_penalty: float = 0.0
    probabilistic_grad_penalty_slicing: bool = False
    code_penalty: float = 0.0
    mmi_weight: float = 0.0
    target_dim: int = 64
    target_downsample_rate: int = 2
    gumbel: bool = False
    hard_gumbel: bool = True
    temp: Tuple[float, float, float] = (2, 0.1, 0.99995)
    input_dim: int = 128

    # A dataclass instance is a mutable (unhashable) default: Python >= 3.11
    # rejects it with ValueError at class-creation time, and on older versions
    # every config would share one SegmentationConfig object. Build a fresh
    # one per instance instead.
    segmentation: SegmentationConfig = field(default_factory=SegmentationConfig)
class Segmenter(nn.Module): cfg: SegmentationConfig def __init__(self, cfg: SegmentationConfig): super().__init__() self.cfg = cfg self.subsample_rate = cfg.subsample_rate def pre_segment(self, dense_x, dense_padding_mask): return dense_x, dense_padding_mask def logit_segment(self, logits, padding_mask): return logits, padding_mask class RandomSegmenter(Segmenter): def pre_segment(self, dense_x, dense_padding_mask): target_num = math.ceil(dense_x.size(1) * self.subsample_rate) ones = torch.ones(dense_x.shape[:-1], device=dense_x.device) indices, _ = ones.multinomial(target_num).sort(dim=-1) indices_ld = indices.unsqueeze(-1).expand(-1, -1, dense_x.size(-1)) dense_x = dense_x.gather(1, indices_ld) dense_padding_mask = dense_padding_mask.gather(1, index=indices) return dense_x, dense_padding_mask class UniformRandomSegmenter(Segmenter): def pre_segment(self, dense_x, dense_padding_mask): bsz, tsz, fsz = dense_x.shape target_num = math.ceil(tsz * self.subsample_rate) rem = tsz % target_num if rem > 0: dense_x = F.pad(dense_x, [0, 0, 0, target_num - rem]) dense_padding_mask = F.pad( dense_padding_mask, [0, target_num - rem], value=True ) dense_x = dense_x.view(bsz, target_num, -1, fsz) dense_padding_mask = dense_padding_mask.view(bsz, target_num, -1) if self.cfg.mean_pool: dense_x = dense_x.mean(dim=-2) dense_padding_mask = dense_padding_mask.all(dim=-1) else: ones = torch.ones((bsz, dense_x.size(2)), device=dense_x.device) indices = ones.multinomial(1) indices = indices.unsqueeze(-1).expand(-1, target_num, -1) indices_ld = indices.unsqueeze(-1).expand(-1, -1, -1, fsz) dense_x = dense_x.gather(2, indices_ld).reshape(bsz, -1, fsz) dense_padding_mask = dense_padding_mask.gather(2, index=indices).reshape( bsz, -1 ) return dense_x, dense_padding_mask class JoinSegmenter(Segmenter): def logit_segment(self, logits, padding_mask): preds = logits.argmax(dim=-1) if padding_mask.any(): preds[padding_mask] = -1 # mark pad uniques = [] bsz, tsz, csz = logits.shape for p in preds: 
uniques.append( p.cpu().unique_consecutive(return_inverse=True, return_counts=True) ) new_tsz = max(u[0].numel() for u in uniques) new_logits = logits.new_zeros(bsz, new_tsz, csz) new_pad = padding_mask.new_zeros(bsz, new_tsz) for b in range(bsz): u, idx, c = uniques[b] keep = u != -1 if self.cfg.remove_zeros: keep.logical_and_(u != 0) if self.training and not self.cfg.mean_pool_join: u[0] = 0 u[1:] = c.cumsum(0)[:-1] m = c > 1 r = torch.rand(m.sum()) o = (c[m] * r).long() u[m] += o new_logits[b, : u.numel()] = logits[b, u] else: new_logits[b].index_add_( dim=0, index=idx.to(new_logits.device), source=logits[b] ) new_logits[b, : c.numel()] /= c.unsqueeze(-1).to(new_logits.device) new_sz = keep.sum() if not keep.all(): kept_logits = new_logits[b, : c.numel()][keep] new_logits[b, :new_sz] = kept_logits if new_sz < new_tsz: pad = new_tsz - new_sz new_logits[b, -pad:] = 0 new_pad[b, -pad:] = True return new_logits, new_pad class UniformRandomJoinSegmenter(UniformRandomSegmenter, JoinSegmenter): pass SEGMENT_FACTORY = { SegmentationType.NONE: Segmenter, SegmentationType.RANDOM: RandomSegmenter, SegmentationType.UNIFORM_RANDOM: UniformRandomSegmenter, SegmentationType.UNIFORM_RANDOM_JOIN: UniformRandomJoinSegmenter, SegmentationType.JOIN: JoinSegmenter, } class Discriminator(nn.Module): def __init__(self, dim, cfg: Wav2vec_UConfig): super().__init__() inner_dim = cfg.discriminator_dim kernel = cfg.discriminator_kernel dilation = cfg.discriminator_dilation self.max_pool = cfg.discriminator_max_pool if cfg.discriminator_causal: padding = kernel - 1 else: padding = kernel // 2 def make_conv(in_d, out_d, k, p=0, has_dilation=True): conv = nn.Conv1d( in_d, out_d, kernel_size=k, padding=p, dilation=dilation if has_dilation else 1, ) if cfg.discriminator_spectral_norm: conv = nn.utils.spectral_norm(conv) elif cfg.discriminator_weight_norm: conv = nn.utils.weight_norm(conv) return conv inner_net = [ nn.Sequential( make_conv(inner_dim, inner_dim, kernel, padding), 
SamePad(kernel_size=kernel, causal=cfg.discriminator_causal), nn.Dropout(cfg.discriminator_dropout), nn.GELU(), ) for _ in range(cfg.discriminator_depth - 1) ] + [ make_conv(inner_dim, 1, kernel, padding, has_dilation=False), SamePad(kernel_size=kernel, causal=cfg.discriminator_causal), ] if cfg.discriminator_linear_emb: emb_net = [make_conv(dim, inner_dim, 1)] else: emb_net = [ make_conv(dim, inner_dim, kernel, padding), SamePad(kernel_size=kernel, causal=cfg.discriminator_causal), ] if cfg.discriminator_act_after_linear: emb_net.append(nn.GELU()) self.net = nn.Sequential( *emb_net, nn.Dropout(cfg.discriminator_dropout), *inner_net, ) def forward(self, x, padding_mask): x = x.transpose(1, 2) # BTC -> BCT x = self.net(x) x = x.transpose(1, 2) x_sz = x.size(1) if padding_mask is not None and padding_mask.any() and padding_mask.dim() > 1: padding_mask = padding_mask[:, : x.size(1)] x[padding_mask] = float("-inf") if self.max_pool else 0 x_sz = x_sz - padding_mask.sum(dim=-1) x = x.squeeze(-1) if self.max_pool: x, _ = x.max(dim=-1) else: x = x.sum(dim=-1) x = x / x_sz return x class Generator(nn.Module): def __init__(self, input_dim, output_dim, cfg: Wav2vec_UConfig): super().__init__() self.cfg = cfg self.output_dim = output_dim self.stride = cfg.generator_stride self.dropout = nn.Dropout(cfg.generator_dropout) self.batch_norm = cfg.generator_batch_norm != 0 self.residual = cfg.generator_residual padding = ( cfg.generator_kernel // 2 if cfg.generator_pad < 0 else cfg.generator_pad ) self.proj = nn.Sequential( TransposeLast(), nn.Conv1d( input_dim, output_dim, kernel_size=cfg.generator_kernel, stride=cfg.generator_stride, dilation=cfg.generator_dilation, padding=padding, bias=cfg.generator_bias, ), TransposeLast(), ) if self.batch_norm: self.bn = nn.BatchNorm1d(input_dim) self.bn.weight.data.fill_(cfg.generator_batch_norm) if self.residual: self.in_proj = nn.Linear(input_dim, input_dim) def forward(self, dense_x, tokens, dense_padding_mask): result = {} if 
self.batch_norm: dense_x = self.bn_padded_data(dense_x, dense_padding_mask) if self.residual: inter_x = self.in_proj(self.dropout(dense_x)) dense_x = dense_x + inter_x result["inter_x"] = inter_x dense_x = self.dropout(dense_x) dense_x = self.proj(dense_x) if self.stride > 1: dense_padding_mask = dense_padding_mask[:, :: self.stride] if dense_padding_mask.size(1) != dense_x.size(1): new_padding = dense_padding_mask.new_zeros(dense_x.shape[:-1]) diff = new_padding.size(1) - dense_padding_mask.size(1) if diff > 0: new_padding[:, diff:] = dense_padding_mask else: assert diff < 0 new_padding = dense_padding_mask[:, :diff] dense_padding_mask = new_padding token_x = None if tokens is not None: token_x = dense_x.new_zeros(tokens.numel(), self.output_dim) token_x.scatter_(1, tokens.view(-1, 1).long(), 1) token_x = token_x.view(tokens.shape + (self.output_dim,)) result["dense_x"] = dense_x result["token_x"] = token_x result["dense_padding_mask"] = dense_padding_mask return result def bn_padded_data(self, feature, padding_mask): normed_feature = feature.clone() normed_feature[~padding_mask] = self.bn( feature[~padding_mask].unsqueeze(-1) ).squeeze(-1) return normed_feature @register_model("wav2vec_u", dataclass=Wav2vec_UConfig) class Wav2vec_U(BaseFairseqModel): def calc_gradient_penalty(self, real_data, fake_data): b_size = min(real_data.size(0), fake_data.size(0)) t_size = min(real_data.size(1), fake_data.size(1)) if self.cfg.probabilistic_grad_penalty_slicing: def get_slice(data, dim, target_size): size = data.size(dim) diff = size - target_size if diff <= 0: return data start = np.random.randint(0, diff + 1) return data.narrow(dim=dim, start=start, length=target_size) real_data = get_slice(real_data, 0, b_size) real_data = get_slice(real_data, 1, t_size) fake_data = get_slice(fake_data, 0, b_size) fake_data = get_slice(fake_data, 1, t_size) else: real_data = real_data[:b_size, :t_size] fake_data = fake_data[:b_size, :t_size] alpha = torch.rand(real_data.size(0), 1, 1) 
alpha = alpha.expand(real_data.size()) alpha = alpha.to(real_data.device) interpolates = alpha * real_data + ((1 - alpha) * fake_data) disc_interpolates = self.discriminator(interpolates, None) gradients = autograd.grad( outputs=disc_interpolates, inputs=interpolates, grad_outputs=torch.ones(disc_interpolates.size(), device=real_data.device), create_graph=True, retain_graph=True, only_inputs=True, )[0] gradient_penalty = (gradients.norm(2, dim=1) - 1) ** 2 return gradient_penalty def set_num_updates(self, num_updates): super().set_num_updates(num_updates) self.update_num = num_updates self.curr_temp = max( self.max_temp * self.temp_decay ** num_updates, self.min_temp ) def discrim_step(self, num_updates): return num_updates % 2 == 1 def get_groups_for_update(self, num_updates): return "discriminator" if self.discrim_step(num_updates) else "generator" def __init__(self, cfg: Wav2vec_UConfig, target_dict): super().__init__() self.cfg = cfg self.zero_index = target_dict.index("<SIL>") if "<SIL>" in target_dict else 0 self.smoothness_weight = cfg.smoothness_weight output_size = len(target_dict) self.pad = target_dict.pad() self.eos = target_dict.eos() self.smoothing = cfg.smoothing self.smoothing_one_sided = cfg.smoothing_one_sided self.no_softmax = cfg.no_softmax self.gumbel = cfg.gumbel self.hard_gumbel = cfg.hard_gumbel self.last_acc = None self.gradient_penalty = cfg.gradient_penalty self.code_penalty = cfg.code_penalty self.mmi_weight = cfg.mmi_weight self.blank_weight = cfg.blank_weight self.blank_mode = cfg.blank_mode self.blank_index = target_dict.index("<SIL>") if cfg.blank_is_sil else 0 assert self.blank_index != target_dict.unk() self.discriminator = Discriminator(output_size, cfg) for p in self.discriminator.parameters(): p.param_group = "discriminator" self.pca_A = self.pca_b = None d = cfg.input_dim self.segmenter = SEGMENT_FACTORY[cfg.segmentation.type](cfg.segmentation) self.generator = Generator(d, output_size, cfg) for p in 
self.generator.parameters(): p.param_group = "generator" for p in self.segmenter.parameters(): p.param_group = "generator" self.max_temp, self.min_temp, self.temp_decay = cfg.temp self.curr_temp = self.max_temp self.update_num = 0 if self.mmi_weight > 0: self.target_downsample_rate = cfg.target_downsample_rate self.decoder = nn.Linear(d, cfg.target_dim) for p in self.decoder.parameters(): p.param_group = "generator" @classmethod def build_model(cls, cfg, task): return cls(cfg, task.target_dictionary) def get_logits( self, net_output: Optional[Dict[str, List[Optional[torch.Tensor]]]], normalize: bool = False, ): logits = net_output["logits"] if self.blank_weight != 0: if self.blank_mode == "add": logits[..., self.blank_index] += self.blank_weight elif self.blank_mode == "set": logits[..., self.blank_index] = self.blank_weight else: raise Exception(f"invalid blank mode {self.blank_mode}") padding = net_output["padding_mask"] if padding.any(): logits[padding] = float("-inf") logits[padding][..., self.blank_index] = float("inf") if normalize: logits = utils.log_softmax(logits.float(), dim=-1) return logits.transpose(0, 1) def get_normalized_probs( self, net_output: Tuple[ torch.Tensor, Optional[Dict[str, List[Optional[torch.Tensor]]]] ], log_probs: bool, sample: Optional[Dict[str, torch.Tensor]] = None, ): logits = self.get_logits(net_output) probs = super().get_normalized_probs(logits, log_probs, sample) # BTC -> TBC for ctc probs = probs.transpose(0, 1) return probs def normalize(self, dense_x): bsz, tsz, csz = dense_x.shape if dense_x.numel() == 0: raise Exception(dense_x.shape) _, k = dense_x.max(-1) hard_x = ( dense_x.new_zeros(bsz * tsz, csz) .scatter_(-1, k.view(-1, 1), 1.0) .view(-1, csz) ) hard_probs = torch.mean(hard_x.float(), dim=0) code_perplexity = torch.exp( -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) ) avg_probs = torch.softmax(dense_x.reshape(-1, csz).float(), dim=-1).mean(dim=0) prob_perplexity = torch.exp( -torch.sum(avg_probs * 
torch.log(avg_probs + 1e-7), dim=-1) ) if not self.no_softmax: if self.training and self.gumbel: dense_x = F.gumbel_softmax( dense_x.float(), tau=self.curr_temp, hard=self.hard_gumbel ).type_as(dense_x) else: dense_x = dense_x.softmax(-1) return dense_x, code_perplexity, prob_perplexity def forward( self, features, padding_mask, random_label=None, dense_x_only=False, segment=True, aux_target=None, ): if segment: features, padding_mask = self.segmenter.pre_segment(features, padding_mask) orig_size = features.size(0) * features.size(1) - padding_mask.sum() gen_result = self.generator(features, random_label, padding_mask) orig_dense_x, token_x = gen_result["dense_x"], gen_result["token_x"] orig_dense_padding_mask = gen_result["dense_padding_mask"] if segment: dense_x, dense_padding_mask = self.segmenter.logit_segment( orig_dense_x, orig_dense_padding_mask ) else: dense_x = orig_dense_x dense_padding_mask = orig_dense_padding_mask dense_logits = dense_x prob_perplexity = None code_perplexity = None if not (self.no_softmax and dense_x_only): dense_x, code_perplexity, prob_perplexity = self.normalize(dense_logits) if dense_x_only or self.discriminator is None: return { "logits": dense_x, "padding_mask": dense_padding_mask, } token_padding_mask = random_label == self.pad dense_y = self.discriminator(dense_x, dense_padding_mask) token_y = self.discriminator(token_x, token_padding_mask) sample_size = features.size(0) d_step = self.discrim_step(self.update_num) fake_smooth = self.smoothing real_smooth = self.smoothing if self.smoothing_one_sided: fake_smooth = 0 zero_loss = None smoothness_loss = None code_pen = None mmi_loss = None if d_step: loss_dense = F.binary_cross_entropy_with_logits( dense_y, dense_y.new_ones(dense_y.shape) - fake_smooth, reduction="sum", ) loss_token = F.binary_cross_entropy_with_logits( token_y, token_y.new_zeros(token_y.shape) + real_smooth, reduction="sum", ) if self.training and self.gradient_penalty > 0: grad_pen = 
self.calc_gradient_penalty(token_x, dense_x) grad_pen = grad_pen.sum() * self.gradient_penalty else: grad_pen = None else: grad_pen = None loss_token = None loss_dense = F.binary_cross_entropy_with_logits( dense_y, dense_y.new_zeros(dense_y.shape) + fake_smooth, reduction="sum", ) num_vars = dense_x.size(-1) if prob_perplexity is not None: code_pen = (num_vars - prob_perplexity) / num_vars code_pen = code_pen * sample_size * self.code_penalty if self.smoothness_weight > 0: smoothness_loss = F.mse_loss( dense_logits[:, :-1], dense_logits[:, 1:], reduction="none" ) smoothness_loss[dense_padding_mask[:, 1:]] = 0 smoothness_loss = ( smoothness_loss.mean() * sample_size * self.smoothness_weight ) if (self.mmi_weight > 0) and (aux_target is not None): inter_x = self.decoder(gen_result["inter_x"]) if self.target_downsample_rate > 1: aux_target = aux_target[:, :: self.target_downsample_rate] max_t_len = min(aux_target.shape[1], inter_x.shape[1]) mmi_loss = F.cross_entropy( inter_x[:, :max_t_len].transpose(1, 2), aux_target[:, :max_t_len], ignore_index=-1, reduction="none", ) mmi_loss = mmi_loss.mean() * mmi_loss.shape[0] * self.mmi_weight result = { "losses": { "grad_pen": grad_pen, "code_pen": code_pen, "smoothness": smoothness_loss, "mmi": mmi_loss, }, "temp": self.curr_temp, "code_ppl": code_perplexity, "prob_ppl": prob_perplexity, "d_steps": int(d_step), "sample_size": sample_size, } suff = "_d" if d_step else "_g" result["losses"]["dense" + suff] = loss_dense result["losses"]["token" + suff] = loss_token return result ================================================ FILE: examples/wav2vec/unsupervised/scripts/apply_pca.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse import os import os.path as osp import math import numpy as np import tqdm import torch from shutil import copyfile from npy_append_array import NpyAppendArray def get_parser(): parser = argparse.ArgumentParser( description="transforms features via a given pca and stored them in target dir" ) # fmt: off parser.add_argument('source', help='directory with features') parser.add_argument('--split', help='which split to read', required=True) parser.add_argument('--save-dir', help='where to save the output', required=True) parser.add_argument('--pca-path', type=str, help='pca location. will append _A.npy and _b.npy', required=True) parser.add_argument('--batch-size', type=int, default=2048000, help='batch size') parser.add_argument('--unfiltered', action='store_true', help='process the unfiltered version') # fmt: on return parser def main(): parser = get_parser() args = parser.parse_args() source_path = osp.join(args.source, args.split) data_poth = source_path + "_unfiltered" if args.unfiltered else source_path print(f"data path: {data_poth}") features = np.load(data_poth + ".npy", mmap_mode="r") pca_A = torch.from_numpy(np.load(args.pca_path + "_A.npy")).cuda() pca_b = torch.from_numpy(np.load(args.pca_path + "_b.npy")).cuda() os.makedirs(args.save_dir, exist_ok=True) save_path = osp.join(args.save_dir, args.split) copyfile(source_path + ".tsv", save_path + ".tsv") copyfile(data_poth + ".lengths", save_path + ".lengths") if osp.exists(source_path + ".phn"): copyfile(source_path + ".phn", save_path + ".phn") if osp.exists(source_path + ".wrd"): copyfile(source_path + ".wrd", save_path + ".wrd") if osp.exists(save_path + ".npy"): os.remove(save_path + ".npy") npaa = NpyAppendArray(save_path + ".npy") batches = math.ceil(features.shape[0] / args.batch_size) with torch.no_grad(): for b in tqdm.trange(batches): start = b * args.batch_size end = start + args.batch_size x = torch.from_numpy(features[start:end]).cuda() x = torch.matmul(x, pca_A) + pca_b 
npaa.append(x.cpu().numpy()) if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/copy_labels.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import sys for idx, line in enumerate(sys.stdin): print(f"utt{idx:010d} {line}", end="") ================================================ FILE: examples/wav2vec/unsupervised/scripts/filter_lexicon.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import sys from fairseq.data import Dictionary def get_parser(): parser = argparse.ArgumentParser( description="filters a lexicon given a unit dictionary" ) parser.add_argument("-d", "--unit-dict", help="unit dictionary", required=True) return parser def main(): parser = get_parser() args = parser.parse_args() d = Dictionary.load(args.unit_dict) symbols = set(d.symbols) for line in sys.stdin: items = line.rstrip().split() skip = len(items) < 2 for x in items[1:]: if x not in symbols: skip = True break if not skip: print(line, end="") if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/filter_tsv.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import os import argparse import sys parser = argparse.ArgumentParser() parser.add_argument("--tsv", required=True, type=str) parser.add_argument("--no-skip", action="store_true") parser.add_argument("--keep", action="store_true") params = parser.parse_args() def get_fname(line): p = os.path.basename(line.split("\t")[0]) p = os.path.splitext(p)[0] return p # filenames to exclude seen = set() with open(params.tsv) as f: if not params.no_skip: root = next(f).rstrip() for line in f: seen.add(get_fname(line)) for i, line in enumerate(sys.stdin): exists = get_fname(line) in seen keep = (exists and params.keep) or (not exists and not params.keep) if i == 0 or keep: print(line, end="") ================================================ FILE: examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import sys from g2p_en import G2p def main(): parser = argparse.ArgumentParser() parser.add_argument( "--compact", action="store_true", help="if set, compacts phones", ) args = parser.parse_args() compact = args.compact wrd_to_phn = {} g2p = G2p() for line in sys.stdin: words = line.strip().split() phones = [] for w in words: if w not in wrd_to_phn: wrd_to_phn[w] = g2p(w) if compact: wrd_to_phn[w] = [ p[:-1] if p[-1].isnumeric() else p for p in wrd_to_phn[w] ] phones.extend(wrd_to_phn[w]) try: print(" ".join(phones)) except: print(wrd_to_phn, words, phones, file=sys.stderr) raise if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/ltr_to_wrd.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. 
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import sys


def main():
    """Turn letter transcripts from stdin into word transcripts on stdout.

    All spaces are removed first, then every ``|`` becomes a space and the
    line is stripped.
    """
    for line in sys.stdin:
        print(line.replace(" ", "").replace("|", " ").strip())


if __name__ == "__main__":
    main()


================================================
FILE: examples/wav2vec/unsupervised/scripts/mean_pool.py
================================================
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os
import os.path as osp
import math
import numpy as np
import tqdm
import torch
import torch.nn.functional as F
from shutil import copyfile

from npy_append_array import NpyAppendArray


def get_parser():
    """Build the command-line parser of the mean-pooling script."""
    parser = argparse.ArgumentParser(
        description="mean pools representations by compressing uniform splits of the data"
    )
    # fmt: off
    parser.add_argument('source', help='directory with features')
    parser.add_argument('--split', help='which split to read', required=True)
    parser.add_argument('--save-dir', help='where to save the output', required=True)
    parser.add_argument('--subsample-rate', type=float, default=0.5, help='size to subsample data to')

    parser.add_argument('--remove-extra', action='store_true', help='if true, removes extra states that cant be pooled, otherwise pads with 0s')
    # fmt: on

    return parser


def main():
    """Mean-pool every utterance of a split down to fewer frames.

    Reads ``<source>/<split>.npy`` and ``.lengths``, pools each utterance on
    the GPU into ``ceil(length * subsample_rate)`` groups (one fewer when
    ``--remove-extra`` trims frames) and writes the pooled features and the
    new lengths to ``--save-dir``. The ``.tsv`` file is copied; ``.phn``,
    ``.wrd`` and ``dict.phn.txt`` are copied when they exist.
    """
    parser = get_parser()
    args = parser.parse_args()

    source_path = osp.join(args.source, args.split)

    print(f"data path: {source_path}")

    features = np.load(source_path + ".npy", mmap_mode="r")

    os.makedirs(args.save_dir, exist_ok=True)
    save_path = osp.join(args.save_dir, args.split)

    copyfile(source_path + ".tsv", save_path + ".tsv")

    if os.path.exists(source_path + ".phn"):
        copyfile(source_path + ".phn", save_path + ".phn")
    if os.path.exists(source_path + ".wrd"):
        copyfile(source_path + ".wrd", save_path + ".wrd")

    if os.path.exists(osp.join(args.source, "dict.phn.txt")):
        copyfile(
            osp.join(args.source, "dict.phn.txt"),
            osp.join(args.save_dir, "dict.phn.txt"),
        )

    # the writer appends, so remove any previous output first
    if osp.exists(save_path + ".npy"):
        os.remove(save_path + ".npy")
    npaa = NpyAppendArray(save_path + ".npy")

    with open(source_path + ".lengths", "r") as lf:
        lengths = lf.readlines()

    fsz = features.shape[-1]
    # running offset of the current utterance inside the flat feature array
    start = 0
    with torch.no_grad():
        with open(save_path + ".lengths", "w") as lengths_out:
            for length in tqdm.tqdm(lengths):
                length = int(length)
                end = start + length
                feats = features[start:end]
                start += length
                x = torch.from_numpy(feats).cuda()
                target_num = math.ceil(length * args.subsample_rate)
                # NOTE(review): a length of 0 gives target_num == 0 and the
                # modulo below would raise ZeroDivisionError -- presumably
                # lengths are always positive; confirm.
                rem = length % target_num

                if rem > 0:
                    if args.remove_extra:
                        # NOTE(review): trimming and using one group fewer
                        # divides evenly for the default 0.5 rate; other rates
                        # look like they can leave a frame count that the
                        # view() below cannot reshape -- confirm.
                        to_rem = target_num - rem
                        target_num -= 1
                        x = x[:-to_rem]
                    else:
                        # pad up to a multiple of target_num; the padded rows
                        # are then overwritten with the last real frame (so
                        # they are not left as zeros)
                        to_add = target_num - rem
                        x = F.pad(x, [0, 0, 0, to_add])
                        x[-to_add:] = x[-to_add - 1]

                # (target_num, frames_per_group, fsz) -> mean over each group
                x = x.view(target_num, -1, fsz)
                x = x.mean(dim=-2)
                print(target_num, file=lengths_out)
                npaa.append(x.cpu().numpy())


if __name__ == "__main__":
    main()


================================================
FILE: examples/wav2vec/unsupervised/scripts/merge_clusters.py
================================================
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse import os import os.path as osp import numpy as np import tqdm import torch import random from shutil import copyfile from npy_append_array import NpyAppendArray def get_parser(): parser = argparse.ArgumentParser( description="transforms features via a given pca and stored them in target dir" ) # fmt: off parser.add_argument('source', help='directory with features') parser.add_argument('--split', help='which split to read', required=True) parser.add_argument('--save-dir', help='where to save the output', required=True) parser.add_argument('--cluster-dir', help='where the clusters are') parser.add_argument('--pooling', type=str, default='mean', choices=['mean', 'sample'], help='how to pool') # fmt: on return parser def main(): parser = get_parser() args = parser.parse_args() source_path = osp.join(args.source, args.split) cluster_path = osp.join(args.cluster_dir, args.split + ".src") print(f"data path: {source_path}") features = np.load(source_path + ".npy", mmap_mode="r") sizes = [] offsets = [] offset = 0 with open(source_path + ".lengths", "r") as len_f: for line in len_f: length = int(line.rstrip()) sizes.append(length) offsets.append(offset) offset += length clusters = [] with open(cluster_path, "r") as cf: for line in cf: line = line.rstrip() items = line.split() items = list(map(int, items)) clusters.append(items) os.makedirs(args.save_dir, exist_ok=True) save_path = osp.join(args.save_dir, args.split) copyfile(source_path + ".tsv", save_path + ".tsv") if os.path.exists(source_path + ".phn"): copyfile(source_path + ".phn", save_path + ".phn") if os.path.exists(osp.join(args.source, "dict.phn.txt")): copyfile( osp.join(args.source, "dict.phn.txt"), osp.join(args.save_dir, "dict.phn.txt"), ) if os.path.exists(source_path + ".wrd"): copyfile(source_path + ".wrd", save_path + ".wrd") if osp.exists(save_path + ".npy"): os.remove(save_path + ".npy") npaa = NpyAppendArray(save_path + ".npy") def merge(feats, clust): feats = 
torch.from_numpy(feats.copy()) clust = torch.LongTensor(clust) _, counts = clust.unique_consecutive(return_counts=True) curr = 0 merged = [] for c in counts: c = c.item() start = curr end = curr + c curr += c if args.pooling == "mean": new_x = feats[start:end].mean(dim=0) elif args.pooling == "sample": new_x = feats[start + int(random.random() * c)] else: raise NotImplementedError() merged.append(new_x) return torch.stack(merged, dim=0).numpy() with open(save_path + ".lengths", "w") as l_f: for size, offset, clust in tqdm.tqdm( zip(sizes, offsets, clusters), total=len(sizes) ): end = size + offset feats = features[offset:end] feats = merge(feats, clust) print(len(feats), file=l_f) npaa.append(feats) if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import fasttext as ft import os import regex import sys def get_parser(): parser = argparse.ArgumentParser( description="reads text from stdin and outputs normalized, lid-filtered version to stdout" ) parser.add_argument( "--fasttext-model", help="path to fasttext model", default="lid.187.bin", ) parser.add_argument("--lang", help="language id", required=True) parser.add_argument( "--lid-threshold", type=float, help="threshold for this lang id probability", default=0.4, ) return parser def main(): parser = get_parser() args = parser.parse_args() filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]") lg = args.lang.lower() lg_label = f"__label__{lg}" thresh = args.lid_threshold if os.path.exists(args.fasttext_model): model = ft.load_model(args.fasttext_model) else: print( f"fasttext language id model {args.fasttext_model} not found. 
Proceeding without language filtering. " f"To enable language filtering, please download the latest language id model " f"from https://fasttext.cc/docs/en/language-identification.html", file=sys.stderr, ) model = None for line in sys.stdin: line = line.strip() line = filter_r.sub(" ", line) line = " ".join(line.split()) if model is not None: lid, prob = model.predict(line, k=100) try: target_idx = lid.index(lg_label) except ValueError: continue if target_idx == 0 or prob[target_idx] >= thresh: print(line) else: print(line) if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/normalize_text.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import regex import sys def main(): filter_r = regex.compile(r"[^\p{L}\p{N}\p{M}\' \-]") for line in sys.stdin: line = line.strip() line = filter_r.sub(" ", line) line = " ".join(line.split()) print(line) if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/pca.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse import os import os.path as osp import numpy as np import faiss def get_parser(): parser = argparse.ArgumentParser( description="compute a pca matrix given an array of numpy features" ) # fmt: off parser.add_argument('data', help='numpy file containing features') parser.add_argument('--output', help='where to save the pca matrix', required=True) parser.add_argument('--dim', type=int, help='dim for pca reduction', required=True) parser.add_argument('--eigen-power', type=float, default=0, help='eigen power, -0.5 for whitening') return parser def main(): parser = get_parser() args = parser.parse_args() print("Reading features") x = np.load(args.data, mmap_mode="r") print("Computing PCA") pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power) pca.train(x) b = faiss.vector_to_array(pca.b) A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in) os.makedirs(args.output, exist_ok=True) prefix = str(args.dim) if args.eigen_power != 0: prefix += f"_{args.eigen_power}" np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T) np.save(osp.join(args.output, f"{prefix}_pca_b"), b) if __name__ == "__main__": main() ================================================ FILE: examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def get_parser():
    """Build the command-line parser for the word-to-phone converter.

    Returns:
        argparse.ArgumentParser: parser exposing ``--sil-prob``/``-s``,
        ``--surround`` and the required ``--lexicon`` option.
    """
    cli = argparse.ArgumentParser(
        description="converts words to phones adding optional silences around in between words"
    )

    # (flags, keyword arguments) pairs, registered in declaration order
    options = (
        (
            ("--sil-prob", "-s"),
            {
                "type": float,
                "default": 0,
                "help": "probability of inserting silence between each word",
            },
        ),
        (
            ("--surround",),
            {
                "action": "store_true",
                "help": "if set, surrounds each example with silence",
            },
        ),
        (
            ("--lexicon",),
            {"help": "lexicon to convert to phones", "required": True},
        ),
    )
    for flags, kwargs in options:
        cli.add_argument(*flags, **kwargs)

    return cli


def main():
    """Convert each stdin sentence to phones using the lexicon.

    The lexicon has one ``word phone phone ...`` entry per line. Sentences
    containing a word missing from the lexicon are skipped. ``<SIL>`` is
    inserted after a non-final word when its random draw falls below
    ``--sil-prob``, and at both ends of the sentence when ``--surround`` is set.
    """
    args = get_parser().parse_args()

    silence = "<SIL>"
    edge = [silence] if args.surround else []

    lexicon = {}
    with open(args.lexicon, "r") as lexicon_file:
        for entry in lexicon_file:
            fields = entry.rstrip().split()
            assert len(fields) > 1, entry
            word, pronunciation = fields[0], fields[1:]
            assert word not in lexicon, fields
            lexicon[word] = pronunciation

    for sentence in sys.stdin:
        words = sentence.strip().split()

        if any(word not in lexicon for word in words):
            continue

        # one draw per gap between consecutive words, taken up front
        draws = None
        if args.sil_prob > 0 and len(words) > 1:
            draws = np.random.random(len(words) - 1)

        phones = list(edge)
        for position, word in enumerate(words):
            phones += lexicon[word]
            if draws is None or position >= len(draws):
                continue
            if draws[position] < args.sil_prob:
                phones.append(silence)
        phones += edge

        print(" ".join(phones))


if __name__ == "__main__":
    main()
# Positional arguments:
#   $1 source_dir - directory holding the input manifests (*.tsv) and labels
#   $2 tgt_dir    - directory that receives every output
#   $3 model      - checkpoint handed to the feature / clustering scripts
#   $4 dim        - PCA dimension (optional, 512 when empty)
#   $5 layer      - layer given to the feature extractor (optional, 14 when empty)
source_dir=$1
tgt_dir=$2
model=$3

dim=${4:-512}
echo "using $dim dim for PCA"

layer=${5:-14}
echo "extracting from layer $layer"

train_split=train
valid_split=valid
test_split=test

# train is always processed; valid and test only when their manifest exists
all_splits=($train_split)
[[ -f "$source_dir/valid.tsv" ]] && all_splits+=($valid_split)
[[ -f "$source_dir/test.tsv" ]] && all_splits+=($test_split)

echo "processing splits: $all_splits"

mkdir -p $tgt_dir

# copy manifests, label files and dictionaries next to the features
cp $source_dir/*.tsv $tgt_dir
cp $source_dir/*.wrd $tgt_dir
cp $source_dir/*.ltr $tgt_dir
cp $source_dir/*.phn $tgt_dir
cp $source_dir/dict* $tgt_dir

setopt shwordsplit

scripts_dir=$FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts

# 1) dump features for every split
for split in $all_splits; do
  python $scripts_dir/wav2vec_extract_features.py $source_dir --split $split \
  --save-dir $tgt_dir --checkpoint $model --layer $layer
done

# 2) fit the CLUS128 codebook on the train manifest
python $scripts_dir/wav2vec_cluster_faiss.py $tgt_dir/${train_split}.tsv \
--checkpoint $model --save-dir $tgt_dir -f "CLUS128" --sample-pct 1.0

# 3) assign every split to the codebook
for split in $all_splits; do
  python $scripts_dir/wav2vec_apply_cluster_faiss.py $tgt_dir \
  --checkpoint $model --path $tgt_dir/CLUS128 --split $split
done

# 4) fit PCA on the train features
python $scripts_dir/pca.py $tgt_dir/${train_split}.npy --output $tgt_dir/pca --dim $dim

# 5) per split: apply PCA, merge by cluster id, then mean-pool
for split in $all_splits; do
  python $scripts_dir/apply_pca.py $tgt_dir --split $split --save-dir $tgt_dir/precompute_pca$dim --pca-path $tgt_dir/pca/${dim}_pca --batch-size 1048000

  python $scripts_dir/merge_clusters.py $tgt_dir/precompute_pca$dim --cluster-dir $tgt_dir/CLUS128 \
  --split $split --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean --pooling mean

  python $scripts_dir/mean_pool.py $tgt_dir/precompute_pca${dim}_cls128_mean \
  --save-dir $tgt_dir/precompute_pca${dim}_cls128_mean_pooled --split $split
done
#!/usr/bin/env zsh
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Positional arguments:
#   $1 source_dir - directory holding the input manifests (*.tsv) and labels
#   $2 tgt_dir    - directory that receives every output
#   $3 model      - checkpoint handed to the feature extractor
#   $4 dim        - number of clusters for the auxiliary target (optional, 64 when empty)
#   $5 layer      - layer given to the feature extractor (optional, 14 when empty)
source_dir=$1
tgt_dir=$2
model=$3

dim=${4:-64}
echo "using $dim clusters for auxilary target"

layer=${5:-14}
echo "extracting from layer $layer"

train_split=train
valid_split=valid
test_split=test

# train is always processed; valid and test only when their manifest exists
all_splits=($train_split)
[[ -f "$source_dir/valid.tsv" ]] && all_splits+=($valid_split)
[[ -f "$source_dir/test.tsv" ]] && all_splits+=($test_split)

echo "processing splits: $all_splits"

mkdir -p $tgt_dir

# copy manifests, label files and dictionaries next to the features
cp $source_dir/*.tsv $tgt_dir
cp $source_dir/*.wrd $tgt_dir
cp $source_dir/*.ltr $tgt_dir
cp $source_dir/*.phn $tgt_dir
cp $source_dir/dict* $tgt_dir

setopt shwordsplit

for split in $all_splits; do
  python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/wav2vec_extract_features.py $source_dir --split $split \
  --save-dir $tgt_dir --checkpoint $model --layer $layer
done


mkdir -p $tgt_dir/mfcc

hubert_kmeans=$FAIRSEQ_ROOT/examples/hubert/simple_kmeans

# Consider splitting the corpus into chunks for large corpora; see HuBERT preprocessing for more details
python $hubert_kmeans/dump_mfcc_feature.py \
  $tgt_dir $train_split 1 0 $tgt_dir/mfcc
# NOTE(review): $tgt_dir/mfcc/cls$dim is presumably the k-means model read by
# dump_km_label.py, but nothing in this script creates it - confirm whether a
# k-means training step has to run before this command.
python $hubert_kmeans/dump_km_label.py \
  $tgt_dir/mfcc $train_split $tgt_dir/mfcc/cls$dim 1 0 $tgt_dir/mfcc/cls${dim}_idx
cp $tgt_dir/mfcc/cls${dim}_idx/${train_split}_0_1.km $tgt_dir/$train_split.km
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Positional arguments:
#   $1 lg          - language code (also used to derive the phonemizer language)
#   $2 text_path   - raw text corpus
#   $3 target_dir  - output directory
#   $4 min_phones  - minimum phone count (--thresholdsrc of the phone dictionary)
#   $5 phonemizer  - one of: espeak, espeak-ng, G2P
#   $6 lid_path    - fasttext language-id model (optional, "lid.187.bin" when empty)
#   $7 sil_prob    - silence insertion probability passed to phonemize_with_sil.py
lg=$1
text_path=$2
target_dir=$3
min_phones=$4
phonemizer=$5
lid_path=$6
sil_prob=$7

if [ -z "$lid_path" ]; then
  lid_path="lid.187.bin"
fi

# default phonemizer language is the lower-cased language code
ph_lg=${lg:l}
if test "$lg" = 'fr'; then
  ph_lg='fr-fr'
elif test "$lg" = 'en'; then
  ph_lg='en-us'
elif test "$lg" = 'pt'; then
  ph_lg='pt-br'
fi

ESPEAK_PATH=''
if test "$phonemizer" = 'espeak'; then
  ESPEAK_PATH=$(which espeak)
elif test "$phonemizer" = 'espeak-ng'; then
  ESPEAK_PATH=$(which espeak-ng)
elif test "$phonemizer" = 'G2P'; then
  ESPEAK_PATH=''
else
  echo "Unknown phonemizer $phonemizer. Valid options are espeak, espeak-ng and G2P"
  exit 1
fi

# An empty ESPEAK_PATH selects the G2P branch further down, so a failed lookup
# of a requested espeak binary must stop here instead of silently switching
# phonemizers (or handing a "not found" message to phonemize as a path).
if test "$phonemizer" != 'G2P' && [ ! -x "$ESPEAK_PATH" ]; then
  echo "Could not find an executable for phonemizer $phonemizer (got '$ESPEAK_PATH')" >&2
  exit 1
fi

echo $lg
echo $ph_lg
echo $text_path
echo $target_dir
echo "min phone seen threshold is $min_phones"

mkdir -p $target_dir
# normalize + language-filter the corpus, then build the word dictionary
python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/normalize_and_filter_text.py --lang $lg --fasttext-model $lid_path < $text_path | grep -v '\-\-\-' >! $target_dir/lm.upper.lid.txt
python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/lm.upper.lid.txt --only-source --destdir $target_dir --thresholdsrc 2 --padding-factor 1 --dict-only
cut -f1 -d' ' $target_dir/dict.txt | grep -v -x '[[:punct:]]*' | grep -Pv '\d\d\d\d\d+' >! $target_dir/words.txt


if [ -z "$ESPEAK_PATH" ]; then
  python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/g2p_wrd_to_phn.py --compact < $target_dir/words.txt > $target_dir/phones.txt
else
  # echoing 1 into corpus will prevent the mismatch lines between lexicon and phones in case the phonemizer fails
  one=$(echo "1" | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -p ' ' -w '' -l $ph_lg --language-switch remove-flags)
  sed 's/$/ 1/' $target_dir/words.txt | PHONEMIZER_ESPEAK_PATH=$ESPEAK_PATH phonemize -o $target_dir/phones.txt -p ' ' -w '' -l $ph_lg -j 70 --language-switch remove-flags
  echo "one is ${one}"
  sed -i "s/${one}$//" $target_dir/phones.txt
fi

# word -> phones lexicon, then drop entries using phones below the threshold
paste $target_dir/words.txt $target_dir/phones.txt >! $target_dir/lexicon.lst

python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones.txt --only-source --destdir $target_dir/phones --thresholdsrc $min_phones --padding-factor 1 --dict-only

python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/filter_lexicon.py -d $target_dir/phones/dict.txt < $target_dir/lexicon.lst >! $target_dir/lexicon_filtered.lst
python $FAIRSEQ_ROOT/examples/wav2vec/unsupervised/scripts/phonemize_with_sil.py -s $sil_prob --surround --lexicon $target_dir/lexicon_filtered.lst < $target_dir/lm.upper.lid.txt >! $target_dir/phones/lm.phones.filtered.txt
cp $target_dir/phones/dict.txt $target_dir/phones/dict.phn.txt
echo "<SIL> 0" >> $target_dir/phones/dict.phn.txt
python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $target_dir/phones/lm.phones.filtered.txt --workers 70 --only-source --destdir $target_dir/phones --srcdict $target_dir/phones/dict.phn.txt

# word-level 4-gram LM and the phone-to-word decoding graphs
$KENLM_ROOT/lmplz -o 4 < $target_dir/lm.upper.lid.txt --discount_fallback --prune 0 0 0 3 >! $target_dir/kenlm.wrd.o40003.arpa
$KENLM_ROOT/build_binary $target_dir/kenlm.wrd.o40003.arpa $target_dir/kenlm.wrd.o40003.bin

lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words_sil lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'"
lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_words lm_arpa=$target_dir/kenlm.wrd.o40003.arpa wav2letter_lexicon=$target_dir/lexicon_filtered.lst data_dir=$target_dir/phones in_labels=phn

# phone-level 4-gram and 6-gram LMs and the phone-to-phone decoding graph
$KENLM_ROOT/lmplz -o 4 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.04.arpa
$KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.04.arpa $target_dir/phones/lm.phones.filtered.04.bin
$KENLM_ROOT/lmplz -o 6 < $target_dir/phones/lm.phones.filtered.txt --discount_fallback >! $target_dir/phones/lm.phones.filtered.06.arpa
$KENLM_ROOT/build_binary $target_dir/phones/lm.phones.filtered.06.arpa $target_dir/phones/lm.phones.filtered.06.bin

lg=$lg python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$target_dir/fst/phn_to_phn_sil lm_arpa=$target_dir/phones/lm.phones.filtered.06.arpa data_dir=$target_dir/phones in_labels=phn "blank_symbol='<SIL>'"
# Positional arguments:
#   $1 timit_root - TIMIT corpus root containing TRAIN/ and TEST/
#   $2 tgt_dir    - output directory (made absolute below)
#   $3 model      - checkpoint forwarded to scripts/prepare_audio.sh
# Uses relative paths (config/timit_*/, scripts/prepare_audio.sh), so it must be
# run from the directory that contains them.
timit_root=$1  # assume it is the upper-cased version
tgt_dir=$2
model=$3

set -eu

setups="matched unmatched"
splits="test valid train train_text"

tgt_dir=$(realpath $tgt_dir)
sph2wav=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
wav_dir=$tgt_dir/wav


mkdir -p $tgt_dir $wav_dir
# list every sphere file and derive a "<parent dir>_<basename>" utterance id
find $timit_root/{TRAIN,TEST} -iname "*.WAV" > $tgt_dir/all_sph.flist
cat $tgt_dir/all_sph.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).WAV#\1_\2#g' > $tgt_dir/all.uid
# generate and run one sph2pipe conversion command per utterance
paste -d' ' $tgt_dir/{all_sph.flist,all.uid} | \
  awk -v sph2wav=$sph2wav -v wav_dir=$wav_dir '{print sph2wav " -f wav " $1 " > " wav_dir "/" $2 ".wav"}' \
  > $tgt_dir/sph2wav.sh
bash $tgt_dir/sph2wav.sh
# wav_dir is already absolute (tgt_dir went through realpath above), so it must
# not be prefixed with $(pwd): that produced paths to files that do not exist.
cat $tgt_dir/all.uid | awk -v wav_dir=$wav_dir '{print $1" "wav_dir"/"$1".wav"}' | sort > $tgt_dir/all_wav.scp
cut -d' ' -f2 $tgt_dir/all_wav.scp | xargs -I{} soxi -s {} > $tgt_dir/all.dur
paste -d' ' $tgt_dir/{all_wav.scp,all.dur} > $tgt_dir/all_wav_dur.scp
rm $tgt_dir/{all.uid,all_sph.flist,sph2wav.sh}

# collect the 60-phone transcripts, one utterance per line
find $timit_root/{TRAIN,TEST} -iname "*.PHN" > $tgt_dir/all_phn60.flist
while read line; do
  if [ ! -f $line ]; then
    >&2 echo "Cannot find transcription file '$line'" && exit 1;
  fi
  cut -f3 -d' ' "$line" | tr '\n' ' ' | perl -ape 's: *$:\n:;'
done < $tgt_dir/all_phn60.flist > $tgt_dir/all.phn60
# prefix each transcript with its utterance id and map to the 39-phone set
cat $tgt_dir/all_phn60.flist | sed -e 's#//*#/#g' -e 's#.*/\([^/]*\)/\([^/]*\).PHN#\1_\2#g' | \
  paste -d' ' - $tgt_dir/all.phn60 | \
  $KALDI_ROOT/egs/timit/s5/local/timit_norm_trans.pl -i - -m $KALDI_ROOT/egs/timit/s5/conf/phones.60-48-39.map -to 39 | \
  sort > $tgt_dir/all.phn
echo "done preparing wav and 39-phone transcripts"


for s in $setups; do
  mkdir -p $tgt_dir/$s
  for x in $splits; do
    uid_path=config/timit_${s}/${x}.uid
    grep -w -f $uid_path $tgt_dir/all.phn | cut -d' ' -f2- > $tgt_dir/$s/$x.phn
    ln -sf $(realpath $tgt_dir/$s/$x.phn) $tgt_dir/$s/$x.wrd

    # manifest: root "/" on the first line, then "<wav path>\t<num samples>"
    echo "/" > $tgt_dir/$s/$x.tsv &&  grep -w -f $uid_path $tgt_dir/all_wav_dur.scp | cut -d' ' -f2- | sed 's# #\t#'  >> $tgt_dir/$s/$x.tsv
  done

  # phone dictionary: every phone seen in any split, each with count 1
  for x in $splits; do
    cat $tgt_dir/$s/$x.phn
  done | tr ' ' '\n' | sort -u | awk '{print $1" "1}' > $tgt_dir/$s/dict.phn.txt
  ln -sf $(realpath $tgt_dir/$s/dict.phn.txt) $tgt_dir/$s/dict.wrd.txt
done
echo "done preparing unmatched and matched setups for TIMIT"


for s in $setups; do
  zsh scripts/prepare_audio.sh $tgt_dir/$s $tgt_dir/$s/feat $model

  lm_dir=$tgt_dir/$s/phones
  fst_dir=$tgt_dir/$s/fst/phn_to_phn

  python $FAIRSEQ_ROOT/fairseq_cli/preprocess.py --dataset-impl mmap --trainpref $tgt_dir/$s/train_text.phn --workers 10 --only-source --destdir $lm_dir --srcdict $tgt_dir/$s/dict.phn.txt
  $KENLM_ROOT/lmplz -o 3 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.03.arpa
  $KENLM_ROOT/build_binary $lm_dir/train_text_phn.03.arpa $lm_dir/train_text_phn.03.bin
  $KENLM_ROOT/lmplz -o 4 < $tgt_dir/$s/train_text.phn --discount_fallback >$lm_dir/train_text_phn.04.arpa
  $KENLM_ROOT/build_binary $lm_dir/train_text_phn.04.arpa $lm_dir/train_text_phn.04.bin

  python $FAIRSEQ_ROOT/examples/speech_recognition/kaldi/kaldi_initializer.py kaldi_root=$KALDI_ROOT fst_dir=$fst_dir lm_arpa=$lm_dir/train_text_phn.03.arpa data_dir=$tgt_dir/$s in_labels=phn
done
echo "done preprocessing audio and text for wav2vec-U"
remove silences) for i in tqdm.trange(len(paths)): data, _ = torchaudio.load(paths[i]) if len(list_intervals[i]) > 0: data_filtered = torch.cat( [data[0][int(it[0]) : int(it[1])] for it in list_intervals[i]] ).unsqueeze(0) else: data_filtered = data # YOU MAY NEED TO MODIFY THIS TO GET THE RIGHT SUBPATH # outpath = params.out + '/'.join(paths[i].split('/')[-1]) outpath = params.out + "/" + "/".join(paths[i].split("/")[-2:]) if not os.path.isdir("/".join(outpath.split("/")[:-1])): os.makedirs("/".join(outpath.split("/")[:-1])) if not os.path.exists(outpath): torchaudio.save(outpath, data_filtered, sample_rate=16000) else: print(outpath, "exists!") ================================================ FILE: examples/wav2vec/unsupervised/scripts/vads.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def get_parser():
    """Build the command-line parser for the VAD segment script.

    Returns:
        argparse.ArgumentParser: parser with the required ``--rvad-home``/``-r``
        option.
    """
    cli = argparse.ArgumentParser(description="compute vad segments")
    rvad_home_options = {
        "help": "path to rvad home (see https://github.com/zhenghuatan/rVADfast)",
        "required": True,
    }
    cli.add_argument("--rvad-home", "-r", **rvad_home_options)
    return cli


def rvad(speechproc, path):
    """Run the rVAD pipeline on one audio file.

    Args:
        speechproc: the ``speechproc`` module imported from the rVAD home.
        path: audio file to read; its sample rate must be 16 kHz.

    Returns:
        tuple: ``(vad_seg, data)`` - the result of ``speechproc.snre_vad`` and
        the samples read from ``path``.
    """
    win_len, overlap_len, n_fft = 0.025, 0.01, 512
    flatness_threshold = 0.5
    vad_threshold = 0.4
    pitch_opts = 1
    energy_floor = np.exp(-50)

    data, fs = sf.read(path)
    assert fs == 16_000, "sample rate must be 16khz"
    ft, flen, fsh10, nfr10 = speechproc.sflux(data, fs, win_len, overlap_len, n_fft)

    # --spectral flatness --
    # 1 where the flatness value is at or below the threshold, 0 elsewhere
    pv01 = np.zeros(ft.shape[0])
    pv01[np.less_equal(ft, flatness_threshold)] = 1
    pitch = deepcopy(ft)

    pvblk = speechproc.pitchblockdetect(pv01, pitch, nfr10, pitch_opts)

    # --filtering--
    numerator = np.array([0.9770, -0.9770])
    denominator = np.array([1.0000, -0.9540])
    fdata = lfilter(numerator, denominator, data, axis=0)

    # --pass 1--
    noise_samp, _noise_seg, n_noise_samp = speechproc.snre_highenergy(
        fdata, nfr10, flen, fsh10, energy_floor, pv01, pvblk
    )

    # sets noisy segments to zero (both bounds inclusive)
    for row in range(n_noise_samp):
        first, last = int(noise_samp[row, 0]), int(noise_samp[row, 1])
        fdata[range(first, last + 1)] = 0

    vad_seg = speechproc.snre_vad(
        fdata, nfr10, flen, fsh10, energy_floor, pv01, pvblk, vad_threshold
    )
    return vad_seg, data


def main():
    """Print one line of ``start:end`` sample intervals per manifest entry.

    The manifest is read from stdin: the first line is the root directory and
    each following line starts with a path relative to it. Frame indices from
    ``rvad`` are converted to samples with a stride of 160; an interval still
    open at the last frame is closed at the length of the waveform.
    """
    args = get_parser().parse_args()

    # speechproc lives in the rVAD checkout, so it is importable only after
    # that directory is on sys.path
    sys.path.append(args.rvad_home)
    import speechproc

    stride = 160
    manifest = sys.stdin.readlines()
    root = manifest[0].rstrip()
    for entry in tqdm(manifest[1:]):
        audio_path = osp.join(root, entry.split()[0])
        frame_flags, wav = rvad(speechproc, audio_path)

        intervals = []
        open_start = None
        for frame, flag in enumerate(frame_flags):
            if open_start is None:
                if flag == 1:
                    open_start = frame * stride
            elif flag == 0:
                intervals.append(f"{open_start}:{frame * stride}")
                open_start = None
        if open_start is not None:
            intervals.append(f"{open_start}:{len(wav)}")

        print(" ".join(intervals))


if __name__ == "__main__":
    main()
def get_parser():
    """Build the command-line parser for applying a trained cluster model.

    Returns:
        argparse.ArgumentParser: parser with the positional ``data`` directory
        and the split / labels / path / checkpoint / layer / max-tsz options.
    """
    parser = argparse.ArgumentParser(description="apply clusters")
    # fmt: off
    parser.add_argument('data', help='location of tsv files')
    parser.add_argument('--split', help='split to process', required=True)
    parser.add_argument('--labels', help='extension of the label file to copy alongside the clusters', default="phn")
    parser.add_argument('--path', help='path to pca and centroids', required=True)
    parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True)
    parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14)
    parser.add_argument('--max-tsz', type=int, help='batch kmeans up to this much', default=14)
    # fmt: on

    return parser


def get_iterator(args):
    """Prepare a feature generator over the files of one split.

    Reads ``<data>/<split>.tsv`` (first line: root directory; following lines:
    tab-separated entries whose first column is a path relative to the root)
    and, when it exists, ``<data>/<split>.<labels>``.

    Args:
        args: parsed arguments; ``data``, ``split``, ``labels``, ``checkpoint``
            and ``layer`` are read.

    Returns:
        tuple: ``(iterate, num, root)`` where ``iterate()`` yields
        ``(features, manifest_line, label_or_None)`` per file, ``num`` is the
        number of files and ``root`` is the manifest root directory.
    """
    with open(osp.join(args.data, f"{args.split}.tsv"), "r") as fp:
        lines = fp.read().split("\n")
    root = lines.pop(0).strip()
    files = [line.rstrip() for line in lines if len(line) > 0]

    label_path = osp.join(args.data, f"{args.split}.{args.labels}")
    if osp.exists(label_path):
        # read every label up front so the handle is closed before returning
        with open(label_path, "r") as lp:
            lbls = [line.rstrip() for line in lp]
    else:
        lbls = [None] * len(files)

    num = len(files)
    reader = Wav2VecFeatureReader(args.checkpoint, args.layer)

    def iterate():
        for fname, lbl in zip(files, lbls):
            file = osp.join(root, fname.split("\t")[0])
            feats = reader.get_feats(file)
            yield feats.data, fname, lbl

    return iterate, num, root


def main():
    """Assign every frame of a split to its nearest centroid.

    The faiss spec is taken from the last component of ``--path`` (for example
    ``CLUS128``). Outputs written into ``--path``: ``<split>.src`` (one line of
    cluster ids per file), ``<split>.tsv`` (the manifest) and
    ``<split>.<labels>`` (removed again when no labels were available).
    """
    parser = get_parser()
    args = parser.parse_args()

    # strip trailing slashes before taking the basename: basename("a/CLUS128/")
    # is "", which would leave no spec to parse
    spec = osp.basename(args.path.rstrip("/"))

    try:
        faiss_spec = parse_faiss_specs(spec)[0]
    except Exception:
        print(spec)
        raise

    print("Faiss Spec:", faiss_spec, file=sys.stderr)

    if faiss_spec.pca:
        A = torch.from_numpy(np.load(osp.join(args.path, "pca_A.npy"))).cuda()
        b = torch.from_numpy(np.load(osp.join(args.path, "pca_b.npy"))).cuda()
        print("Loaded PCA", file=sys.stderr)

    centroids = np.load(osp.join(args.path, "centroids.npy"))
    print("Loaded centroids", centroids.shape, file=sys.stderr)

    res = faiss.StandardGpuResources()
    # inner-product index for spherical k-means, L2 otherwise
    index_flat = (
        faiss.IndexFlatL2(centroids.shape[1])
        if not faiss_spec.sphere
        else faiss.IndexFlatIP(centroids.shape[1])
    )
    faiss_index = faiss.index_cpu_to_gpu(res, 0, index_flat)
    faiss_index.add(centroids)

    generator, num, root = get_iterator(args)
    iterator = generator()

    had_labels = False
    label_path = osp.join(args.path, f"{args.split}.{args.labels}")

    with torch.no_grad():
        with open(osp.join(args.path, f"{args.split}.src"), "w") as fp, open(
            osp.join(args.path, f"{args.split}.tsv"), "w"
        ) as pp, open(label_path, "w") as lp:
            print(root, file=pp)
            for f, fname, lbl in tqdm.tqdm(iterator, total=num):
                if faiss_spec.pca:
                    f = torch.mm(f, A) + b
                if faiss_spec.norm:
                    f = F.normalize(f, p=2, dim=-1)

                f = f.cpu().numpy()

                # nearest centroid (k=1) for every frame
                _, z = faiss_index.search(f, 1)

                print(" ".join(str(x.item()) for x in z), file=fp)
                print(fname, file=pp)

                if lbl is not None:
                    print(lbl, file=lp)
                    had_labels = True
    # the label file is opened unconditionally above; drop it if it stayed empty
    if not had_labels:
        os.remove(label_path)


if __name__ == "__main__":
    main()
def get_parser():
    """Build the command-line parser for the clustering script.

    Returns:
        argparse.ArgumentParser: parser with the positional ``data`` manifest
        and the save-dir / checkpoint / sample-pct / layer / faiss-specs options.
    """
    parser = argparse.ArgumentParser(
        description="compute kmeans codebook from kaldi-computed feats"
    )
    # fmt: off
    parser.add_argument('data', help='location of tsv files')
    parser.add_argument('--save-dir', help='where to save the output', required=True)
    parser.add_argument('--checkpoint', type=str, help='checkpoint for wav2vec model (if using wav2vec features)', required=True)
    parser.add_argument('--sample-pct', '-r', type=float, help='percentage of timesteps to sample', default=0)
    parser.add_argument('--layer', '-l', type=int, help='which layer to read', default=14)
    # NOTE(review): the default 'l2' contains no CLUSx component, so
    # parse_faiss_specs rejects it; callers have to pass -f explicitly.
    parser.add_argument('--faiss-specs', '-f', type=str,
                        help='faiss index specs; separated by space '
                             'format is: PCAx_NORM_CLUSx_SPHERICAL -> '
                                'PCAx if exists first apply PCA '
                                'NORM if exists, normalize the vector by L2 norm '
                                'CLUSx must exist, cluster to x clusters '
                                'SPHERICAL if exists, apply spherical kmeans',
                        default='l2')
    # fmt: on

    return parser


# One parsed spec: PCA dimension (0 = no PCA), L2-normalize flag, number of
# clusters, spherical k-means flag, and the original spec string.
faiss_spec = namedtuple("faiss_spec", ["pca", "norm", "n_clus", "sphere", "spec_str"])


def parse_faiss_specs(specs_str):
    """Parse a whitespace-separated list of faiss spec strings.

    Each spec is a ``_``-joined list of components: ``PCAx``, ``NORM``,
    ``CLUSx`` and ``SPHERICAL``. Components that match none of these are
    ignored.

    Args:
        specs_str: for example ``"PCA512_NORM_CLUS128 CLUS64"``.

    Returns:
        list: one ``faiss_spec`` per spec, in input order (empty for an empty
        string).

    Raises:
        ValueError: if a spec has no ``CLUSx`` component with ``x > 0``, or a
            numeric suffix is not an integer.
    """
    specs = []
    for ss in specs_str.split():
        pca = 0
        norm = False
        n_clus = 0
        sphere = False
        for c in ss.split("_"):
            if c.startswith("PCA"):
                pca = int(c[3:])
            elif c == "NORM":
                norm = True
            elif c.startswith("CLUS"):
                n_clus = int(c[4:])
            elif c == "SPHERICAL":
                sphere = True
        # explicit check instead of assert: asserts vanish under "python -O"
        if n_clus <= 0:
            raise ValueError(
                f"faiss spec '{ss}' must contain a CLUSx component with x > 0"
            )
        specs.append(
            faiss_spec(pca=pca, norm=norm, n_clus=n_clus, sphere=sphere, spec_str=ss)
        )
    return specs


class Wav2VecFeatureReader(object):
    """Loads a checkpoint onto the GPU and extracts one layer's features."""

    def __init__(self, cp_file, layer):
        """Build the model from ``cp_file`` and remember which layer to read.

        Checkpoints that carry a ``cfg`` entry are built from it; others are
        built from their ``args`` entry.
        """
        state = fairseq.checkpoint_utils.load_checkpoint_to_cpu(cp_file)

        self.layer = layer

        if "cfg" in state:
            w2v_args = state["cfg"]
            task = fairseq.tasks.setup_task(w2v_args.task)
            model = task.build_model(w2v_args.model)
        else:
            w2v_args = state["args"]
            task = fairseq.tasks.setup_task(w2v_args)
            model = task.build_model(w2v_args)
        model.load_state_dict(state["model"], strict=True)
        model.eval()
        model.cuda()
        self.model = model

    def read_audio(self, fname):
        """Load an audio file and return its samples (must be 16 kHz)."""
        wav, sr = sf.read(fname)
        assert sr == 16e3

        return wav

    def get_feats(self, loc):
        """Return the features of ``self.layer`` for the audio file ``loc``."""
        x = self.read_audio(loc)
        with torch.no_grad():
            source = torch.from_numpy(x).view(1, -1).float().cuda()
            res = self.model(
                source=source, mask=False, features_only=True, layer=self.layer
            )
            return res["layer_results"][self.layer][0].squeeze(1)


def get_iterator(args):
    """Prepare a feature generator over the files listed in ``args.data``.

    The manifest's first line is the root directory; each following line is a
    tab-separated entry whose first column is a path relative to the root.
    When ``args.sample_pct`` is positive, a random sample of
    ``int(sample_pct * number_of_files)`` files is used.

    Returns:
        tuple: ``(iterate, num)`` where ``iterate()`` yields one numpy feature
        array per file and ``num`` is the number of files.
    """
    with open(args.data, "r") as fp:
        lines = fp.read().split("\n")
    root = lines.pop(0).strip()
    files = [osp.join(root, line.split("\t")[0]) for line in lines if len(line) > 0]

    if getattr(args, "sample_pct", 0) > 0:
        files = random.sample(files, int(args.sample_pct * len(files)))
    num = len(files)
    reader = Wav2VecFeatureReader(args.checkpoint, args.layer)

    def iterate():
        for fname in files:
            feats = reader.get_feats(fname)
            yield feats.cpu().numpy()

    return iterate, num


def main():
    """Train one k-means codebook (optionally after PCA / L2 norm) per spec.

    Features are loaded from ``<save-dir>/features.npy`` when that file exists,
    otherwise extracted with the checkpoint. For every spec a sub-directory
    named after the spec string receives ``centroids.npy`` and, when PCA is
    used, ``pca_A.npy`` / ``pca_b.npy``.
    """
    parser = get_parser()
    args = parser.parse_args()

    faiss_specs = parse_faiss_specs(args.faiss_specs)
    print("Faiss Specs:", faiss_specs)

    feat_path = osp.join(args.save_dir, "features")
    if osp.exists(feat_path + ".npy"):
        feats = np.load(feat_path + ".npy")
    else:
        generator, num = get_iterator(args)
        iterator = generator()

        feats = []
        for f in tqdm.tqdm(iterator, total=num):
            feats.append(f)

        del iterator
        del generator

        feats = np.concatenate(feats)

        print(feats.shape)

        os.makedirs(args.save_dir, exist_ok=True)

        gc.collect()
        torch.cuda.empty_cache()

    reload = False
    for spec_idx, spec in enumerate(faiss_specs):
        print("Processing spec", spec)

        if reload:
            print("Reloading...")
            del feats
            gc.collect()
            feats = np.load(feat_path + ".npy")
            reload = False

        save_path = osp.join(args.save_dir, spec.spec_str)
        os.makedirs(save_path, exist_ok=True)
        d = feats.shape[-1]
        x = feats
        if spec.pca > 0:
            print("Computing PCA")
            pca = faiss.PCAMatrix(d, spec.pca)
            pca.train(x)
            d = spec.pca
            b = faiss.vector_to_array(pca.b)
            A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)
            np.save(osp.join(save_path, "pca_A"), A.T)
            np.save(osp.join(save_path, "pca_b"), b)
            print("Applying PCA")
            x = pca.apply_py(x)

        if spec.norm:
            # Without PCA, x is feats itself and normalize_L2 overwrites it in
            # place, so the next spec has to reload the unnormalized features.
            reload = spec.pca <= 0
            has_next_spec = spec_idx + 1 < len(faiss_specs)
            if reload and has_next_spec and not osp.exists(feat_path + ".npy"):
                # persist the raw features first, otherwise the reload above
                # would fail on a file that was never written
                np.save(feat_path, feats)
            print("Normalizing")
            faiss.normalize_L2(x)

        print("Computing kmeans")
        kmeans = faiss.Kmeans(
            d,
            spec.n_clus,
            niter=50,
            verbose=True,
            spherical=spec.sphere,
            max_points_per_centroid=feats.shape[0],
            gpu=True,
            nredo=3,
        )
        kmeans.train(x)
        np.save(osp.join(save_path, "centroids"), kmeans.centroids)
        del kmeans
        del x
        gc.collect()


if __name__ == "__main__":
    main()
def get_parser():
    """Build the command-line parser for the feature extraction script.

    Returns:
        argparse.ArgumentParser: parser with the positional ``data`` directory
        and the split / save-dir / checkpoint / layer options.
    """
    cli = argparse.ArgumentParser(
        description="compute kmeans codebook from kaldi-computed feats"
    )
    cli.add_argument("data", help="location of tsv files")

    # (flag, keyword arguments) pairs, registered in declaration order
    options = (
        ("--split", {"help": "which split to read", "required": True}),
        ("--save-dir", {"help": "where to save the output", "required": True}),
        (
            "--checkpoint",
            {"type": str, "help": "checkpoint for wav2vec ctc model", "required": True},
        ),
        ("--layer", {"type": int, "default": 14, "help": "which layer to use"}),
    )
    for flag, kwargs in options:
        cli.add_argument(flag, **kwargs)

    return cli


class Wav2VecFeatureReader(object):
    """Loads a checkpoint onto the GPU and extracts features for audio files."""

    def __init__(self, cp_file, layer):
        """Load the first model of the checkpoint ensemble and put it in eval mode."""
        models, _cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [cp_file]
        )
        net = models[0]
        net.eval()
        net.cuda()
        self.model = net
        self.task = task
        self.layer = layer

    def read_audio(self, fname):
        """Load an audio file and return its samples (must be 16 kHz)."""
        samples, sample_rate = sf.read(fname)
        assert sample_rate == 16e3

        return samples

    def get_feats(self, loc):
        """Return the model's ``x`` output for ``loc`` as a CPU tensor.

        The waveform is layer-normalized first when the task config asks for
        normalization.
        """
        waveform = self.read_audio(loc)
        with torch.no_grad():
            source = torch.from_numpy(waveform).float().cuda()
            if self.task.cfg.normalize:
                assert source.dim() == 1, source.dim()
                source = F.layer_norm(source, source.shape)
            batch = source.view(1, -1)

            output = self.model(
                source=batch, mask=False, features_only=True, layer=self.layer
            )
            return output["x"].squeeze(0).cpu()


def get_iterator(args):
    """Prepare a feature generator over ``<data>/<split>.tsv``.

    The manifest's first line is the root directory; each following line is a
    tab-separated entry whose first column is a path relative to the root.

    Returns:
        tuple: ``(iterate, num)`` where ``iterate()`` yields one feature tensor
        per file and ``num`` is the number of files.
    """
    manifest_path = osp.join(args.data, args.split) + ".tsv"
    with open(manifest_path, "r") as manifest:
        rows = manifest.read().split("\n")
    root = rows.pop(0).strip()
    audio_paths = [osp.join(root, row.split("\t")[0]) for row in rows if len(row) > 0]

    reader = Wav2VecFeatureReader(args.checkpoint, args.layer)

    def iterate():
        for audio_path in audio_paths:
            yield reader.get_feats(audio_path)

    return iterate, len(audio_paths)


def main():
    """Extract features for one split and append them to ``<split>.npy``.

    Also copies the split's ``.tsv`` (and ``.wrd`` / ``.phn`` when present) into
    the save directory and writes ``<split>.lengths`` with one feature length
    per file. Files with zero-length features add a length line but no rows.
    """
    args = get_parser().parse_args()

    os.makedirs(args.save_dir, exist_ok=True)

    def create_files(dest):
        source_prefix = osp.join(args.data, args.split)
        copyfile(source_prefix + ".tsv", dest + ".tsv")
        # label files are optional
        for extension in (".wrd", ".phn"):
            if osp.exists(source_prefix + extension):
                copyfile(source_prefix + extension, dest + extension)

        # start from a clean array file: the writer appends
        if osp.exists(dest + ".npy"):
            os.remove(dest + ".npy")
        return NpyAppendArray(dest + ".npy")

    save_path = osp.join(args.save_dir, args.split)
    npaa = create_files(save_path)

    generator, num = get_iterator(args)
    iterator = generator()

    with open(save_path + ".lengths", "w") as lengths_file:
        for w2v_feats in tqdm.tqdm(iterator, total=num):
            n_frames = len(w2v_feats)
            print(n_frames, file=lengths_file)

            if n_frames > 0:
                npaa.append(w2v_feats.numpy())


if __name__ == "__main__":
    main()
logging.root.setLevel(logging.INFO)
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logger = logging.getLogger(__name__)


def get_parser():
    """Build the command-line parser (``--hypo``/``-s`` and ``--reference``/``-r``)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--hypo", help="hypo transcription", required=True)
    parser.add_argument(
        "-r", "--reference", help="reference transcription", required=True
    )
    return parser


def compute_wer(ref_uid_to_tra, hyp_uid_to_tra, g2p):
    """Compute the error rate of hypotheses against references.

    Args:
        ref_uid_to_tra: mapping from utterance id to reference transcript.
        hyp_uid_to_tra: mapping from utterance id to hypothesis transcript;
            only these ids are scored.
        g2p: optional callable applied to each hypothesis string. Its output
            is filtered of ``"'"`` and ``" "`` items, and a trailing numeric
            character is dropped from each remaining item. When ``None`` the
            hypothesis is split on whitespace.

    Returns:
        float: total edit distance divided by the total reference length.

    Raises:
        KeyError: if a hypothesis id has no reference.
        ZeroDivisionError: if the scored references contain no tokens.
    """
    d_cnt = 0
    w_cnt = 0
    w_cnt_h = 0
    for uid in hyp_uid_to_tra:
        ref = ref_uid_to_tra[uid].split()
        if g2p is not None:
            hyp = g2p(hyp_uid_to_tra[uid])
            hyp = [p for p in hyp if p != "'" and p != " "]
            hyp = [p[:-1] if p[-1].isnumeric() else p for p in hyp]
        else:
            hyp = hyp_uid_to_tra[uid].split()
        d_cnt += editdistance.eval(ref, hyp)
        w_cnt += len(ref)
        w_cnt_h += len(hyp)
    wer = float(d_cnt) / w_cnt
    logger.debug(
        (
            f"wer = {wer * 100:.2f}%; num. of ref words = {w_cnt}; "
            f"num. of hyp words = {w_cnt_h}; num. of sentences = {len(ref_uid_to_tra)}"
        )
    )
    return wer


def load_tra(tra_path):
    """Load a ``uid transcript`` file into a dict.

    Each line holds an utterance id followed by its transcript. The transcript
    is stored exactly as it appears after the first run of whitespace
    (including the trailing newline). A line that carries only an id maps to
    an empty transcript, and blank lines are skipped.

    Args:
        tra_path: path of the transcript file.

    Returns:
        dict: utterance id -> transcript string.
    """
    uid_to_tra = {}
    with open(tra_path, "r") as f:
        for line in f:
            parts = line.split(None, 1)
            if not parts:
                # blank line: nothing to record
                continue
            # an utterance with an empty transcript has no second field
            uid_to_tra[parts[0]] = parts[1] if len(parts) > 1 else ""
    logger.debug(f"loaded {len(uid_to_tra)} utterances from {tra_path}")
    return uid_to_tra


def main():
    """Log the unit error rate between two line-aligned transcript files.

    Line ``i`` of ``--hypo`` is compared with line ``i`` of ``--reference``
    after whitespace tokenization; iteration stops at the shorter file.
    """
    args = get_parser().parse_args()

    errs = 0
    count = 0
    with open(args.hypo, "r") as hf, open(args.reference, "r") as rf:
        for h, r in zip(hf, rf):
            h = h.rstrip().split()
            r = r.rstrip().split()
            errs += editdistance.eval(r, h)
            count += len(r)

    logger.info(f"UER: {errs / count * 100:.2f}%")


if __name__ == "__main__":
    main()
def main():
    """Convert each stdin line of words into space-separated letters.

    Spaces between words become ``|``, every character is separated by a
    single space, and each output line ends with `` |``.
    """
    for raw_line in sys.stdin:
        letters = raw_line.strip().replace(" ", "|")
        spaced = " ".join(letters)
        print(f"{spaced} |")


if __name__ == "__main__":
    main()
@dataclass
class UnpairedAudioTextConfig(FairseqDataclass):
    """Configuration of the ``unpaired_audio_text`` task.

    ``data`` and ``text_data`` are mandatory (``MISSING`` defaults); every
    other option has a usable default.
    """

    data: str = field(
        default=MISSING, metadata={"help": "path to data directory containing audio"}
    )
    text_data: str = field(
        default=MISSING, metadata={"help": "path to data directory containing text"}
    )
    max_length: Optional[int] = None
    labels: Optional[str] = field(
        default=None,
        metadata={"help": "extension of the label file to load, used for fine-tuning"},
    )
    aux_target_postfix: Optional[str] = field(
        default=None,
        metadata={"help": "auxaliry target filename extension"},
    )
    unfiltered: bool = field(
        default=False, metadata={"help": "load data with _unfiltered suffix"}
    )
    ctc_eval: bool = field(
        default=False, metadata={"help": "eval UER as if computed by CTC"}
    )
    sort_by_length: bool = field(
        default=True, metadata={"help": "sort examples by length of audio timesteps"}
    )
    shuffle: bool = field(default=True, metadata={"help": "shuffle examples"})
    append_eos: bool = field(default=False, metadata={"help": "append eos"})
    uppercase: Optional[bool] = field(
        default=False, metadata={"help": "uppercase for LM score computation"}
    )
    skipwords: Optional[str] = field(
        default="",
        metadata={
            "help": "comma-separated words to be removed for LM score computation"
        },
    )
    # Path of the KenLM model the task loads for unit-level LM scoring.
    kenlm_path: Optional[str] = None
    # Exponent applied to the vocabulary-usage ratio in the weighted LM metrics.
    vocab_usage_power: float = 2

    # Optional Kaldi decoder used to obtain word-level scores at validation.
    word_decoder_config: Optional[KaldiDecoderConfig] = None
    # Path of the KenLM model the task loads for word-level LM scoring.
    word_kenlm_path: Optional[str] = None

    # A dataclass instance must not be used directly as a default: it would be
    # shared by every config object, and Python 3.11+ rejects it outright with
    # "mutable default ... is not allowed: use default_factory".
    decoding_config: DecodingConfig = field(default_factory=DecodingConfig)
def valid_step(self, sample, model, criterion):
    """Run one validation batch and collect unit-error and LM statistics.

    The model is called with ``dense_x_only=True``; the arg-max of the
    returned ``"logits"`` is taken as the predicted unit sequence. Special
    symbols, symbols outside the first ``num_symbols`` entries and (when
    present in the dictionary) ``<SIL>`` are removed before scoring.

    Args:
        sample: batch dict providing ``"net_input"`` and ``"id"``; ``"target"``
            is optional. Without targets no edit distance is computed and the
            reported length is the predicted length.
        model: model returning a dict with ``"logits"`` and ``"padding_mask"``.
        criterion: unused; present for interface compatibility.

    Returns:
        ``(c_err, c_len, logging_output)`` where ``c_err`` is the summed edit
        distance, ``c_len`` the reference length and ``logging_output`` the
        dict of raw counters consumed by ``reduce_metrics``.
    """
    res = model(
        **sample["net_input"],
        dense_x_only=True,
    )

    dense_x = res["logits"]
    padding_mask = res["padding_mask"]

    word_scores = None
    if self.compute_word_score is not None:
        word_scores = self.compute_word_score(dense_x.cpu(), padding_mask.cpu())

    z = dense_x.argmax(-1)
    z[padding_mask] = self.target_dictionary.pad()

    vocab_seen = torch.zeros(self.num_symbols, dtype=torch.bool)

    import editdistance

    nspecial = self.target_dictionary.nspecial
    targets = sample["target"] if "target" in sample else [None] * len(z)

    c_err = 0
    c_len = 0
    pred_c_len = 0
    lm_score_sum = 0
    for x, t, sample_id in zip(z, targets, sample["id"]):
        if t is not None:
            t = t[(t >= nspecial)]
        # Keep only real (non-special, non-"madeup") symbols.
        x = x[(x >= nspecial) & (x < (self.num_symbols + nspecial))]
        if self.sil_id >= 0:
            x = x[x != self.sil_id]

        # Record which vocabulary entries were predicted at least once.
        vocab_seen[x - nspecial] = True

        pred_units_arr = x
        if self.cfg.ctc_eval:
            # CTC-style collapse: merge repeats, then drop index 0.
            pred_units_arr = pred_units_arr.unique_consecutive()
            pred_units_arr = pred_units_arr[pred_units_arr != 0]

        if sample_id == 0:
            # Log one example (the utterance with id 0) for inspection.
            if t is not None:
                logger.info(f"REF: {self.target_dictionary.string(t)}")
            logger.info(f"HYP: {self.target_dictionary.string(pred_units_arr)}")

            if self.kenlm is not None:
                if t is not None:
                    ref_lm_s = self.compute_lm_score(
                        self.target_dictionary.string(t)
                    )
                    logger.info(
                        f"LM [REF]: {ref_lm_s}, {math.pow(10, -ref_lm_s / (len(t) + 1))}"
                    )

                hyp_lm_s = self.compute_lm_score(
                    self.target_dictionary.string(pred_units_arr)
                )
                logger.info(
                    f"LM [HYP]: {hyp_lm_s}, {math.pow(10, -hyp_lm_s / (len(pred_units_arr) + 1))}"
                )

        pred_units_arr = pred_units_arr.tolist()

        pred_c_len += len(pred_units_arr)

        if t is not None:
            t = t.tolist()
            c_err += editdistance.eval(pred_units_arr, t)
            c_len += len(t)
        else:
            # No reference available: report the predicted length instead.
            c_len = pred_c_len

        if self.kenlm is not None:
            pred_str = self.target_dictionary.string(pred_units_arr)
            lm_score_sum += self.compute_lm_score(pred_str)

    kaldi_score_sum = 0
    word_lm_sum = 0
    num_words = 0
    if word_scores is not None:
        for score, words in word_scores:
            kaldi_score_sum += score
            num_words += len(words)
            if self.word_kenlm is not None:
                # Score words with the word-level LM. This used to call
                # ``self.kenlm``, i.e. the unit-level model, which is the
                # wrong LM and is None when only word_kenlm_path is set.
                word_lm_sum += self.word_kenlm.score(" ".join(words))

    try:
        world_size = get_data_parallel_world_size()
    except Exception:
        # Best effort: fall back to a single worker when the distributed
        # state cannot be queried, without swallowing KeyboardInterrupt.
        world_size = 1

    logging_output = {
        "loss": c_err,
        "_num_char_errors": c_err,
        "_num_chars": c_len,
        "_num_pred_chars": pred_c_len,
        "ntokens": c_len,
        "nsentences": z.size(0),
        "sample_size": c_len,
        "_world_size": world_size,
        "_lm_score_sum": lm_score_sum,
        "_kaldi_score_sum": kaldi_score_sum,
        "_word_lm_sum": word_lm_sum,
        "_num_words": num_words,
        "_vocab_seen": vocab_seen,
    }

    return c_err, c_len, logging_output
@property
def target_dictionary(self):
    """Return the target dictionary this task was constructed with.

    This is the ``target_dictionary`` argument stored by ``__init__``; it is
    the dictionary used for ``label_dict`` and for loading the unpaired text
    dataset in ``load_dataset``.
    """
    return self._target_dictionary
def build_model(self, cfg: FairseqDataclass, from_checkpoint=False):
    """Build the model for this task by delegating to the parent task.

    Args:
        cfg: model configuration.
        from_checkpoint: whether the model is being built in order to restore
            a checkpoint. It is forwarded to the parent implementation; it was
            previously accepted here but silently dropped.

    Returns:
        The model instance created by the parent ``build_model``.
    """
    return super().build_model(cfg, from_checkpoint)
@dataclass
class UnsupGenerateConfig(FairseqDataclass):
    """Options of the wav2vec-U generation / decoding script.

    ``fairseq`` nests the regular fairseq configuration; the remaining fields
    select the decoder and control decoding, result files and the
    unsupervised tuning metric.
    """

    # A dataclass instance must not be used directly as a default: it would be
    # shared by every config object, and Python 3.11+ rejects it outright with
    # "mutable default ... is not allowed: use default_factory".
    fairseq: FairseqConfig = field(default_factory=FairseqConfig)
    lm_weight: float = field(
        default=2.0,
        metadata={"help": "language model weight"},
    )
    w2l_decoder: DecoderType = field(
        default=DecoderType.VITERBI,
        metadata={"help": "type of decoder to use"},
    )
    # Required (asserted in ``generate``) when ``w2l_decoder`` is KALDI.
    kaldi_decoder_config: Optional[KaldiDecoderConfig] = None
    lexicon: Optional[str] = field(
        default=None,
        metadata={
            "help": "path to lexicon. This is also used to 'phonemize' for unsupvised param tuning"
        },
    )
    lm_model: Optional[str] = field(
        default=None,
        metadata={"help": "path to language model (kenlm or fairseq)"},
    )
    decode_stride: Optional[float] = field(
        default=None,
        metadata={"help": "changing the decoding frequency of the generator"},
    )
    unit_lm: bool = field(
        default=False,
        metadata={"help": "whether to use unit lm"},
    )
    beam_threshold: float = field(
        default=50.0,
        metadata={"help": "beam score threshold"},
    )
    beam_size_token: float = field(
        default=100.0,
        metadata={"help": "max tokens per beam"},
    )
    beam: int = field(
        default=5,
        metadata={"help": "decoder beam size"},
    )
    nbest: int = field(
        default=1,
        metadata={"help": "number of results to return"},
    )
    word_score: float = field(
        default=1.0,
        metadata={"help": "word score to add at end of word"},
    )
    unk_weight: float = field(
        default=-math.inf,
        metadata={"help": "unknown token weight"},
    )
    sil_weight: float = field(
        default=0.0,
        metadata={"help": "silence token weight"},
    )
    targets: Optional[str] = field(
        default=None,
        metadata={"help": "extension of ground truth labels to compute UER"},
    )
    results_path: Optional[str] = field(
        default=None,
        metadata={"help": "where to store results"},
    )
    post_process: Optional[str] = field(
        default=None,
        metadata={"help": "how to post process results"},
    )
    vocab_usage_power: float = field(
        default=2,
        metadata={"help": "for unsupervised param tuning"},
    )

    viterbi_transcript: Optional[str] = field(
        default=None,
        metadata={"help": "for unsupervised param tuning"},
    )
    min_lm_ppl: float = field(
        default=0,
        metadata={"help": "for unsupervised param tuning"},
    )
    min_vt_uer: float = field(
        default=0,
        metadata={"help": "for unsupervised param tuning"},
    )

    blank_weight: float = field(
        default=0,
        metadata={"help": "value to add or set for blank emission"},
    )
    blank_mode: str = field(
        default="set",
        metadata={
            "help": "can be add or set, how to modify blank emission with blank weight"
        },
    )
    sil_is_blank: bool = field(
        default=False,
        metadata={"help": "if true, <SIL> token is same as blank token"},
    )

    unsupervised_tuning: bool = field(
        default=False,
        metadata={
            "help": "if true, returns a score based on unsupervised param selection metric instead of UER"
        },
    )
    is_ax: bool = field(
        default=False,
        metadata={
            "help": "if true, assumes we are using ax for tuning and returns a tuple for ax to consume"
        },
    )
def prepare_result_files(cfg: UnsupGenerateConfig):
    """Open the line-buffered text files that receive decoding results.

    Files are created inside ``cfg.results_path`` and named
    ``<gen_subset><suffix>.txt``; when the dataset is sharded the suffix is
    prefixed with ``<shard_id>_``.

    Returns:
        ``None`` when ``cfg.results_path`` is unset or empty, otherwise a dict
        mapping ``"hypo.words"``, ``"hypo.units"``, ``"ref.words"``,
        ``"ref.units"`` and ``"hypo.nbest.words"`` to open file objects. The
        caller is responsible for closing them.
    """
    if not cfg.results_path:
        return None

    dataset_cfg = cfg.fairseq.dataset

    def open_result(suffix):
        if dataset_cfg.num_shards > 1:
            suffix = f"{dataset_cfg.shard_id}_{suffix}"
        filename = "{}{}.txt".format(dataset_cfg.gen_subset, suffix)
        return open(os.path.join(cfg.results_path, filename), "w", buffering=1)

    # Order matters only in that files are created in this sequence.
    suffix_by_key = (
        ("hypo.words", ""),
        ("hypo.units", "_units"),
        ("ref.words", "_ref"),
        ("ref.units", "_ref_units"),
        ("hypo.nbest.words", "_nbest_words"),
    )
    return {key: open_result(suffix) for key, suffix in suffix_by_key}
"lengths_t", "lm_score_t", "num_feats", "num_sentences", "num_symbols", "vt_err_t", "vt_length_t", ], ) def generate(cfg: UnsupGenerateConfig, models, saved_cfg, use_cuda): task = tasks.setup_task(cfg.fairseq.task) saved_cfg.task.labels = cfg.fairseq.task.labels task.load_dataset(cfg.fairseq.dataset.gen_subset, task_cfg=saved_cfg.task) # Set dictionary tgt_dict = task.target_dictionary logger.info( "| {} {} {} examples".format( cfg.fairseq.task.data, cfg.fairseq.dataset.gen_subset, len(task.dataset(cfg.fairseq.dataset.gen_subset)), ) ) # Load dataset (possibly sharded) itr = get_dataset_itr(cfg, task) # Initialize generator gen_timer = StopwatchMeter() def build_generator(cfg: UnsupGenerateConfig): w2l_decoder = cfg.w2l_decoder if w2l_decoder == DecoderType.VITERBI: from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder return W2lViterbiDecoder(cfg, task.target_dictionary) elif w2l_decoder == DecoderType.KENLM: from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder return W2lKenLMDecoder(cfg, task.target_dictionary) elif w2l_decoder == DecoderType.FAIRSEQ: from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder return W2lFairseqLMDecoder(cfg, task.target_dictionary) elif w2l_decoder == DecoderType.KALDI: from examples.speech_recognition.kaldi.kaldi_decoder import KaldiDecoder assert cfg.kaldi_decoder_config is not None return KaldiDecoder( cfg.kaldi_decoder_config, cfg.beam, ) else: raise NotImplementedError( "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment but found " + str(w2l_decoder) ) generator = build_generator(cfg) kenlm = None fairseq_lm = None if cfg.lm_model is not None: import kenlm kenlm = kenlm.Model(cfg.lm_model) num_sentences = 0 if cfg.results_path is not None and not os.path.exists(cfg.results_path): os.makedirs(cfg.results_path) res_files = prepare_result_files(cfg) errs_t = 0 lengths_hyp_t = 0 lengths_hyp_unit_t = 0 lengths_t = 0 count = 0 num_feats = 0 
all_hyp_pieces = [] all_hyp_words = [] num_symbols = ( len([s for s in tgt_dict.symbols if not s.startswith("madeup")]) - tgt_dict.nspecial ) targets = None if cfg.targets is not None: tgt_path = os.path.join( cfg.fairseq.task.data, cfg.fairseq.dataset.gen_subset + "." + cfg.targets ) if os.path.exists(tgt_path): with open(tgt_path, "r") as f: targets = f.read().splitlines() viterbi_transcript = None if cfg.viterbi_transcript is not None and len(cfg.viterbi_transcript) > 0: logger.info(f"loading viterbi transcript from {cfg.viterbi_transcript}") with open(cfg.viterbi_transcript, "r") as vf: viterbi_transcript = vf.readlines() viterbi_transcript = [v.rstrip().split() for v in viterbi_transcript] gen_timer.start() start = 0 end = len(itr) hypo_futures = None if cfg.w2l_decoder == DecoderType.KALDI: logger.info("Extracting features") hypo_futures = [] samples = [] with progress_bar.build_progress_bar(cfg.fairseq.common, itr) as t: for i, sample in enumerate(t): if "net_input" not in sample or i < start or i >= end: continue if "padding_mask" not in sample["net_input"]: sample["net_input"]["padding_mask"] = None hypos, num_feats = gen_hypos( generator, models, num_feats, sample, task, use_cuda ) hypo_futures.append(hypos) samples.append(sample) itr = list(zip(hypo_futures, samples)) start = 0 end = len(itr) logger.info("Finished extracting features") with progress_bar.build_progress_bar(cfg.fairseq.common, itr) as t: for i, sample in enumerate(t): if i < start or i >= end: continue if hypo_futures is not None: hypos, sample = sample hypos = [h.result() for h in hypos] else: if "net_input" not in sample: continue hypos, num_feats = gen_hypos( generator, models, num_feats, sample, task, use_cuda ) for i, sample_id in enumerate(sample["id"].tolist()): if targets is not None: target_tokens = targets[sample_id] elif "target" in sample or "target_label" in sample: toks = ( sample["target"][i, :] if "target_label" not in sample else sample["target_label"][i, :] ) 
def gen_hypos(generator, models, num_feats, sample, task, use_cuda):
    """Run one inference step and keep a running count of feature frames.

    When the batch carries ``net_input["features"]``, ``dense_x_only`` is
    switched on in the net input and ``num_feats`` grows by the product of the
    first two dimensions of the feature tensor.

    Returns:
        ``(hypos, num_feats)``: the result of ``task.inference_step`` and the
        updated feature count.
    """
    if use_cuda:
        sample = utils.move_to_cuda(sample)

    net_input = sample["net_input"]
    if "features" in net_input:
        net_input["dense_x_only"] = True
        feature_shape = net_input["features"].shape
        num_feats += feature_shape[0] * feature_shape[1]

    return task.inference_step(generator, models, sample, None), num_feats
@hydra.main(
    config_path=os.path.join("../../..", "fairseq", "config"), config_name="config"
)
def hydra_main(cfg):
    """Hydra entry point: normalise the config, run ``main``, return its score.

    Returns:
        ``(score, None)`` when ``cfg.is_ax`` is set, otherwise ``score``.
    """
    with open_dict(cfg):
        # Copy Hydra's job-logging settings onto the config so that logging
        # works with ddp (see https://github.com/facebookresearch/hydra/issues/1126).
        cfg.job_logging_cfg = OmegaConf.to_container(
            HydraConfig.get().job_logging, resolve=True
        )

    # Round-trip through a plain container, then lock the structure.
    plain_cfg = OmegaConf.to_container(cfg, resolve=False, enum_to_str=False)
    cfg = OmegaConf.create(plain_cfg)
    OmegaConf.set_struct(cfg, True)
    logger.info(cfg)

    utils.import_user_module(cfg.fairseq.common)

    _task, score = main(cfg)

    return (score, None) if cfg.is_ax else score
================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Helper script to pre-compute embeddings for a flashlight (previously called wav2letter++) dataset """ import argparse import glob import os import os.path as osp import pprint import soundfile as sf import torch import fairseq from torch import nn from torch.utils.data import DataLoader try: import tqdm except: print("Install tqdm to use --log-format=tqdm") class FilesDataset: def __init__(self, files, labels): self.files = files if labels and osp.exists(labels): with open(labels, "r") as lbl_f: self.labels = [line.rstrip() for line in lbl_f] else: self.labels = labels def __len__(self): return len(self.files) def __getitem__(self, index): fname = self.files[index] wav, sr = sf.read(fname) assert sr == 16000 wav = torch.from_numpy(wav).float() lbls = None if self.labels: if isinstance(self.labels, str): lbl_file = osp.splitext(fname)[0] + "." 
def iterate(self, files):
    """Yield ``(codes, label)`` for every audio file in ``files``.

    Each waveform is moved to the GPU, split along its last dimension into
    chunks, passed through the feature extractor (and, unless
    ``quantize_location`` is ``"encoder"``, the feature aggregator) and
    quantized with ``vector_quantizer.forward_idx``. The index rows of all
    chunks are concatenated and rendered as space-separated tokens whose
    components are joined by ``-``.

    Args:
        files: list of audio file paths handed to ``load_data``.

    Yields:
        Tuples ``(line, lbl)`` where ``line`` is the rendered index string and
        ``lbl`` is the label delivered by the dataset for that file.
    """
    data = self.load_data(files)
    # ``len(data)`` is the actual number of batches; the former
    # ``len(files) // 32`` under-counted whenever the last batch was partial.
    for samples in tqdm.tqdm(data, total=len(data)):
        for wav, lbl in samples:
            x = wav.unsqueeze(0).float().cuda()

            # Smallest chunk count for which size // div fits in max_size.
            div = 1
            while x.size(-1) // div > self.args.max_size:
                div += 1

            result = []
            for chunk in x.chunk(div, dim=-1):
                torch.cuda.empty_cache()
                # The whole forward pass is inference only. The feature
                # extractor used to run outside no_grad and so built an
                # autograd graph for every chunk. The context is kept inside
                # the loop so it is never active across the ``yield`` below.
                with torch.no_grad():
                    feats = self.model.feature_extractor(chunk)
                    if self.quantize_location != "encoder":
                        feats = self.model.feature_aggregator(feats)
                    _, idx = self.model.vector_quantizer.forward_idx(feats)
                result.append(idx.squeeze(0).cpu())

            idx = torch.cat(result, dim=0)
            yield " ".join("-".join(map(str, a.tolist())) for a in idx), lbl
fairseq.checkpoint_utils.load_model_ensemble_and_task([self.checkpoint]) model = model[0] self.quantize_location = getattr(cfg.model, "vq", "encoder") model.eval().float() model.cuda() if self.data_parallel: model = nn.DataParallel(model) return model def __call__(self): self.process_splits() if hasattr(self.model.feature_extractor, "vars") and ( self.args.shard is None or self.args.shard == 0 ): vars = ( self.model.feature_extractor.vars.view( self.model.feature_extractor.banks, self.model.feature_extractor.num_vars, -1, ) .cpu() .detach() ) print("writing learned latent variable embeddings: ", vars.shape) torch.save(vars, self.var_file()) if __name__ == "__main__": write_data = DatasetWriter() write_data() print("Done.") ================================================ FILE: examples/wav2vec/wav2vec_featurize.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
""" Helper script to pre-compute embeddings for a flashlight (previously called wav2letter++) dataset """ import argparse import glob import os from shutil import copy import h5py import numpy as np import soundfile as sf import torch import tqdm import fairseq from torch import nn def read_audio(fname): """ Load an audio file and return PCM along with the sample rate """ wav, sr = sf.read(fname) assert sr == 16e3 return wav, 16e3 class PretrainedWav2VecModel(nn.Module): def __init__(self, fname): super().__init__() model, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task([fname]) model = model[0] model.eval() self.model = model def forward(self, x): with torch.no_grad(): z = self.model.feature_extractor(x) if isinstance(z, tuple): z = z[0] c = self.model.feature_aggregator(z) return z, c class EmbeddingWriterConfig(argparse.ArgumentParser): def __init__(self): super().__init__("Pre-compute embeddings for flashlight datasets") kwargs = {"action": "store", "type": str, "required": True} self.add_argument("--input", "-i", help="Input Directory", **kwargs) self.add_argument("--output", "-o", help="Output Directory", **kwargs) self.add_argument("--model", help="Path to model checkpoint", **kwargs) self.add_argument("--split", help="Dataset Splits", nargs="+", **kwargs) self.add_argument( "--ext", default="wav", required=False, help="Audio file extension" ) self.add_argument( "--no-copy-labels", action="store_true", help="Do not copy label files. 
Useful for large datasets, use --targetdir in flashlight then.", ) self.add_argument( "--use-feat", action="store_true", help="Use the feature vector ('z') instead of context vector ('c') for features", ) self.add_argument("--gpu", help="GPU to use", default=0, type=int) class Prediction: """ Lightweight wrapper around a fairspeech embedding model """ def __init__(self, fname, gpu=0): self.gpu = gpu self.model = PretrainedWav2VecModel(fname).cuda(gpu) def __call__(self, x): x = torch.from_numpy(x).float().cuda(self.gpu) with torch.no_grad(): z, c = self.model(x.unsqueeze(0)) return z.squeeze(0).cpu().numpy(), c.squeeze(0).cpu().numpy() class H5Writer: """ Write features as hdf5 file in flashlight compatible format """ def __init__(self, fname): self.fname = fname os.makedirs(os.path.dirname(self.fname), exist_ok=True) def write(self, data): channel, T = data.shape with h5py.File(self.fname, "w") as out_ds: data = data.T.flatten() out_ds["features"] = data out_ds["info"] = np.array([16e3 // 160, T, channel]) class EmbeddingDatasetWriter(object): """Given a model and a flashlight dataset, pre-compute and store embeddings Args: input_root, str : Path to the flashlight dataset output_root, str : Desired output directory. 
Will be created if non-existent split, str : Dataset split """ def __init__( self, input_root, output_root, split, model_fname, extension="wav", gpu=0, verbose=False, use_feat=False, ): assert os.path.exists(model_fname) self.model_fname = model_fname self.model = Prediction(self.model_fname, gpu) self.input_root = input_root self.output_root = output_root self.split = split self.verbose = verbose self.extension = extension self.use_feat = use_feat assert os.path.exists(self.input_path), "Input path '{}' does not exist".format( self.input_path ) def _progress(self, iterable, **kwargs): if self.verbose: return tqdm.tqdm(iterable, **kwargs) return iterable def require_output_path(self, fname=None): path = self.get_output_path(fname) os.makedirs(path, exist_ok=True) @property def input_path(self): return self.get_input_path() @property def output_path(self): return self.get_output_path() def get_input_path(self, fname=None): if fname is None: return os.path.join(self.input_root, self.split) return os.path.join(self.get_input_path(), fname) def get_output_path(self, fname=None): if fname is None: return os.path.join(self.output_root, self.split) return os.path.join(self.get_output_path(), fname) def copy_labels(self): self.require_output_path() labels = list( filter( lambda x: self.extension not in x, glob.glob(self.get_input_path("*")) ) ) for fname in tqdm.tqdm(labels): copy(fname, self.output_path) @property def input_fnames(self): return sorted(glob.glob(self.get_input_path("*.{}".format(self.extension)))) def __len__(self): return len(self.input_fnames) def write_features(self): paths = self.input_fnames fnames_context = map( lambda x: os.path.join( self.output_path, x.replace("." 
+ self.extension, ".h5context") ), map(os.path.basename, paths), ) for name, target_fname in self._progress( zip(paths, fnames_context), total=len(self) ): wav, sr = read_audio(name) z, c = self.model(wav) feat = z if self.use_feat else c writer = H5Writer(target_fname) writer.write(feat) def __repr__(self): return "EmbeddingDatasetWriter ({n_files} files)\n\tinput:\t{input_root}\n\toutput:\t{output_root}\n\tsplit:\t{split})".format( n_files=len(self), **self.__dict__ ) if __name__ == "__main__": args = EmbeddingWriterConfig().parse_args() for split in args.split: writer = EmbeddingDatasetWriter( input_root=args.input, output_root=args.output, split=split, model_fname=args.model, gpu=args.gpu, extension=args.ext, use_feat=args.use_feat, ) print(writer) writer.require_output_path() print("Writing Features...") writer.write_features() print("Done.") if not args.no_copy_labels: print("Copying label data...") writer.copy_labels() print("Done.") ================================================ FILE: examples/wav2vec/wav2vec_manifest.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Data pre-processing: build vocabularies and binarize training data. 
""" import argparse import glob import os import random import soundfile def get_parser(): parser = argparse.ArgumentParser() parser.add_argument( "root", metavar="DIR", help="root directory containing flac files to index" ) parser.add_argument( "--valid-percent", default=0.01, type=float, metavar="D", help="percentage of data to use as validation set (between 0 and 1)", ) parser.add_argument( "--dest", default=".", type=str, metavar="DIR", help="output directory" ) parser.add_argument( "--ext", default="flac", type=str, metavar="EXT", help="extension to look for" ) parser.add_argument("--seed", default=42, type=int, metavar="N", help="random seed") parser.add_argument( "--path-must-contain", default=None, type=str, metavar="FRAG", help="if set, path must contain this substring for a file to be included in the manifest", ) return parser def main(args): assert args.valid_percent >= 0 and args.valid_percent <= 1.0 if not os.path.exists(args.dest): os.makedirs(args.dest) dir_path = os.path.realpath(args.root) search_path = os.path.join(dir_path, "**/*." 
+ args.ext) rand = random.Random(args.seed) valid_f = ( open(os.path.join(args.dest, "valid.tsv"), "w") if args.valid_percent > 0 else None ) with open(os.path.join(args.dest, "train.tsv"), "w") as train_f: print(dir_path, file=train_f) if valid_f is not None: print(dir_path, file=valid_f) for fname in glob.iglob(search_path, recursive=True): file_path = os.path.realpath(fname) if args.path_must_contain and args.path_must_contain not in file_path: continue frames = soundfile.info(fname).frames dest = train_f if rand.random() > args.valid_percent else valid_f print( "{}\t{}".format(os.path.relpath(file_path, dir_path), frames), file=dest ) if valid_f is not None: valid_f.close() if __name__ == "__main__": parser = get_parser() args = parser.parse_args() main(args) ================================================ FILE: examples/wav2vec/xlsr/README.md ================================================ # XLS-R XLS-R is a set of large-scale models for self-supervised cross-lingual speech representation learning based on wav2vec 2.0. It was pretrained on 128 languages and approximately 436K hours of unlabeled speech data. With finetuning, these models achieve state of the art performance in speech translation, speech recognition and language identification. We evaluate the model across multiple benchmarks such as CoVoST-2 for speech translation, BABEL / MLS / CommonVoice / VoxPopuli for automatic speech recognition, and VoxLingua107 for language identification as we llas VoxCeleb1 for speaker identification. More details about this work can be found in our [paper](https://arxiv.org/pdf/2111.09296.pdf) and download links can be found below. 
Model | Link |------|------ XLS-R 300M | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_300m.pt) XLS-R 1B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_960m_1000k.pt) XLS-R 2B | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr2_2B_1000k.pt) You can also download these models [here](https://huggingface.co/models?other=xls_r) and read more about it in the [blogpost](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) from Hugging Face. ## Speech Translation Finetuned Models We multilingually finetune XLS-R models on [CoVoST 2](https://github.com/facebookresearch/covost), which has 21 into-English and 15 out-of-English directions. Model | Directions | Link |------|------|------ XLS-R 300M | 21 langs → En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_21_en.pt) XLS-R 300M | En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_300m_en_15.pt) XLS-R 1B | 21 langs → En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_21_en.pt) XLS-R 1B | En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_1b_en_15.pt) XLS-R 2B | 21 langs → En | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_21_en.pt) XLS-R 2B | En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_en_15.pt) XLS-R 2B | 21 langs → En + En → 15 langs | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xls_r_2b_22_16.pt) ## ASR Finetuning You can refer to the original wav2vec documentation for detailed instructions on how to finetune a pretrained model with CTC [here](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#fine-tune-a-pre-trained-model-with-ctc). Below is an example command and you can find the values for different hyperparameters to reproduce the results in our paper. 
```shell script $ fairseq-hydra-train \ distributed_training.distributed_port=$PORT \ task.data=/path/to/data \ model.w2v_path=/path/to/model.pt \ --config-dir /path/to/fairseq-py/examples/wav2vec/xlsr/config \ --config-name finetune ``` For finetuning the 300M as well as 1B model, we use the same hyperparameter setting defined in `finetune.yaml`. We vary `optimization.max_update` as described in the below table and the `optimization.lr` is picked from the interval [2e-5, 3e-4] based on dev word error rate. Benchmark | Total Number of Updates |------|------ Babel | 26000 Common Voice | 13000 VoxPopuli | 50000 MLS 10h | 20000 For finetuning the 2B model, we make some additional changes for `finetune.yaml`. We use the fully_sharded `distributed_training.ddp_backend` provided by the [fairscale](https://github.com/facebookresearch/fairscale) library and set `model.activation_checkpoint` to true. We also increase `dataset.max_tokens` to 2560000 and use a total effective batch size of 2560000*24. We sweep for the best `optimization.lr` within the interval [3e−6,3e−5] using dev error rate. For common voice dataset, we pick the `model.mask_prob` for different languages among {0.30, 0.40} based on best dev error rate. ## LID Inference Model | Link |------|------ XLS-R 300M + ft Voxlingua107 | [download](https://dl.fbaipublicfiles.com/fairseq/wav2vec/xlsr_300m_voxlingua107_ft.pt) How to run inference & calculate accuracy (step-by-step): 1. Download the Voxlingua107 checkpoint from the table above. 1. Use this python script to extract logit/embedding from the XLSR model: https://github.com/fairinternal/fairseq-py/blob/xlsr2/examples/wav2vec/gen_audio_embedding.py ```shell command CUDA_VISIBLE_DEVICES=0 PYTHONPATH=. 
python3 examples/wav2vec/gen_audio_embedding.py \ /fsx/data/VoxLingua107/manifest --path "/path/to/checkpoint.pt" \ --task audio_classification --batch-size 90 --gen-subset test \ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path /tmp/tmp_voxling_infer.npz ``` 2. Calculate the overall accuracy, 0-5 seconds and 5-20 seconds: ```shell command PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \ --task cls --merge mean_logit --data /tmp/tmp_voxling_infer.npz Output: | run classification evaluation | acc = 94.34% -- err = 5.66% -- correct=1518 total=1609 | acc 0to5 = 90.91% -- err = 9.09% -- c_5=230.0 t_5=253 | acc 5to20 = 94.99% -- err = 5.01% -- c_20=1288.0 t_20=1356 ``` ## Citation Please cite as: ``` bibtex @article{babu2021xlsr, title={XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale}, author={Arun Babu and Changhan Wang and Andros Tjandra and Kushal Lakhotia and Qiantong Xu and Naman Goyal and Kritika Singh and Patrick von Platen and Yatharth Saraf and Juan Pino and Alexei Baevski and Alexis Conneau and Michael Auli}, year={2021}, volume={abs/2111.09296}, journal={arXiv}, } ``` ================================================ FILE: examples/wav2vec/xlsr/config/finetune.yaml ================================================ # @package _group_ common: fp16: true log_format: json log_interval: 200 tensorboard_logdir: tb checkpoint: save_interval: 1000 save_interval_updates: 1000 keep_interval_updates: 1 no_epoch_checkpoints: true best_checkpoint_metric: wer task: _name: audio_finetuning data: ??? normalize: true labels: ltr dataset: num_workers: 6 max_tokens: 1280000 skip_invalid_size_inputs_valid_test: true validate_after_updates: 10000 validate_interval_updates: 1000 valid_subset: valid distributed_training: ddp_backend: legacy_ddp distributed_world_size: 4 criterion: _name: ctc zero_infinity: true optimization: max_update: ??? 
lr: [0.0003] sentence_avg: true update_freq: [5] optimizer: _name: adam adam_betas: (0.9,0.98) adam_eps: 1e-08 lr_scheduler: _name: tri_stage phase_ratio: [0.1, 0.4, 0.5] final_lr_scale: 0.05 model: _name: wav2vec_ctc w2v_path: ??? apply_mask: true mask_prob: 0.75 mask_channel_prob: 0.25 mask_channel_length: 64 layerdrop: 0.1 activation_dropout: 0.1 feature_grad_mult: 0.0 freeze_finetune_updates: 10000 checkpoint_activations: false ================================================ FILE: examples/wav2vec/xlsr/scripts/eval_speaker_clf_task.py ================================================ """ Usage: This scripts it to evaluate the classification accuracy/error rate from the embedding extracted by gen_audio_embedding.py Example (LID classification) PYTHONPATH='.' python examples/wav2vec/eval_speaker_clf_task.py \ --data /fsx/androstj/exps/lid_voxlingua/infer/atj_xlsr2_100pct_300M_mean_fast_upd_100k_new.npz \ --task cls --merge mean_logit """ import numpy as np import sklearn from sklearn.metrics.pairwise import cosine_similarity from sklearn.preprocessing import StandardScaler from tqdm import tqdm import ipdb import logging import argparse from scipy.special import softmax log=logging.getLogger(__name__) log.setLevel(logging.INFO) def calculate_eer(y_label, y_score): # y denotes groundtruth scores, # y_score denotes the prediction scores. from scipy.optimize import brentq from sklearn.metrics import roc_curve from scipy.interpolate import interp1d fpr, tpr, thresholds = roc_curve(y_label, y_score, pos_label=1) eer = brentq(lambda x : 1. - x - interp1d(fpr, tpr)(x), 0., 1.) 
optimal_threshold = interp1d(fpr, thresholds)(eer) return eer, optimal_threshold def calculate_minDCF(y_label, y_score, p_target=0.01, c_miss=1, c_fa=1): # https://github.com/kaldi-asr/kaldi/blob/master/egs/sre08/v1/sid/compute_min_dcf.py from sklearn.metrics import det_curve fpr, fnr, thresholds = det_curve(y_label, y_score, pos_label=1) min_c_det = float("inf") min_c_det_threshold = thresholds[0] for i in range(0, len(fpr)): # See Equation (2). it is a weighted sum of false negative # and false positive errors. c_det = c_miss * fnr[i] * p_target + c_fa * fpr[i] * (1 - p_target) if c_det < min_c_det: min_c_det = c_det min_c_det_threshold = thresholds[i] # See Equations (3) and (4). Now we normalize the cost. c_def = min(c_miss * p_target, c_fa * (1 - p_target)) min_dcf = min_c_det / c_def return min_dcf, min_c_det_threshold if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--data', help='npz contains name & latent file') parser.add_argument('--task', choices=['cls', 'veri', 'cls_voxlingua']) parser.add_argument('--merge', choices=['mean_logit', 'first_logit', 'mean_latent_sim', 'first_latent_sim', 'mean_logit_sim', 'first_logit_sim']) parser.add_argument('--veri-pair', help='verification file contains 1/0 utt_x utt_y') parser.add_argument('--scaler', type=str, choices=['mean_var']) parser.add_argument('--compress-method', choices=['pca']) parser.add_argument('--compress-dim', type=int) args = parser.parse_args() if args.task in ['cls', 'cls_voxlingua']: print('| run classification evaluation') data = np.load(args.data) data_logit = data['logit'] data_target = data['target'] data_src_len = data['src_len'] assert data_logit.shape[0] == data_target.shape[0] B = data_logit.shape[0] correct = 0 total = 0 data_prob = softmax(data_logit, axis=2) correct_vs_len = np.empty((B, 2)) for ii in range(B): _target = data_target[ii] if args.merge == 'mean_logit': _prob = np.mean(data_prob[ii], axis=0) top_1 = np.argmax(_prob) elif args.merge == 
'first_logit': _prob = data_prob[ii][0] top_1 = np.argmax(_prob) else : raise ValueError() is_top_1 = (1 if top_1 == _target else 0) correct += is_top_1 total += 1 _src_len = data_src_len[ii] / 16000 correct_vs_len[ii] = [is_top_1, _src_len] acc = correct / total * 100 t_5 = correct_vs_len[:, 1] <= 5 t_20 = correct_vs_len[:, 1] > 5 c_5 = correct_vs_len[t_5, 0].sum() c_20 = correct_vs_len[t_20, 0].sum() t_5 = t_5.sum() t_20 = t_20.sum() acc_5 = c_5 / t_5 * 100 acc_20 = c_20 / t_20 * 100 print(f'| acc = {acc:.2f}% -- err = {100-acc:.2f}% -- {correct=} {total=}') print(f'| acc 0to5 = {acc_5:.2f}% -- err = {100-acc_5:.2f}% -- {c_5=} {t_5=}') print(f'| acc 5to20 = {acc_20:.2f}% -- err = {100-acc_20:.2f}% -- {c_20=} {t_20=}') if args.task == 'veri': print('| run verification evaluation') veri_pairs = [] with open(args.veri_pair) as ff: for fi in ff: a,b,c = fi.split() a = int(a) veri_pairs.append([a,b,c]) data = np.load(args.data) if 'logit' in args.merge: data_latent = data['logit'] elif 'latent' in args.merge: data_latent = data['latent'] else : raise ValueError() data_name = data['name'] assert len(data_name) == len(data_latent) map_name_latent = {} from sklearn.pipeline import make_pipeline pipe = [] if args.scaler == 'mean_var': print(f'| apply StandardScaler') pipe.append(StandardScaler()) if args.compress_method == 'pca': n_comp = args.compress_dim print(f'| apply PCA with {n_comp=}') from sklearn.decomposition import PCA pipe.append(PCA(n_components=n_comp)) if len(pipe) > 0 : pipe = make_pipeline(*pipe) data_latent_2d = data_latent.reshape(-1, data_latent.shape[-1]) pipe.fit(data_latent_2d) data_latent_2d = pipe.transform(data_latent_2d) data_latent = data_latent_2d.reshape(data_latent.shape[0], data_latent.shape[1], -1) for ii in range(len(data_name)): map_name_latent[data_name[ii]] = data_latent[ii] labels = [] scores = [] for lbl, pair_a, pair_b in tqdm(veri_pairs): labels.append(lbl) pair_a = map_name_latent[pair_a] pair_b = map_name_latent[pair_b] assert 
pair_a.ndim == pair_b.ndim == 2 score = cosine_similarity(pair_a, pair_b) if args.merge.startswith('mean'): score = np.mean(score) elif args.merge.startswith('first'): score = score[0, 0] else : raise ValueError() scores.append(score) labels = np.array(labels) scores = np.array(scores) eer, eer_threshold = calculate_eer(labels, scores) minDCF, minDCF_threshold = calculate_minDCF(labels, scores) print('='*40) print(f'| EER = {eer*100:.2f}%\tthreshold = {eer_threshold:.2f}') print(f'| minDCF = {minDCF:.2f}\tthreshold = {minDCF_threshold:.2f}') ================================================ FILE: examples/wav2vec/xlsr/scripts/gen_audio_embedding.py ================================================ """ Usage: This script is used to extract the embedding / logit for speech classification task. 1. Set fdir into your model checkpoint directory 2. Run the following command (preferrably on GPU machine to speed up the inference process) CUDA_VISIBLE_DEVICES=0 python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \ --task audio_classification --batch-size 90 --gen-subset test \ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir Example: Case: LID logit extraction fdir='/fsx/androstj/exps/voxlingua_lid_train_all/ckpt_100pct_300m_voxling-act_linear-pool_mean_fast-lr_1e-4-phase_0.1_0.4_0.5-maxupd_100000-ufreq_1-mprob_0.5-fz_0-cr_softmax/0/checkpoints/checkpoint_best.pt' python3 examples/wav2vec/gen_audio_embedding.py /fsx/data/VoxLingua107/manifest --path ${fdir} \ --task audio_classification --batch-size 90 --gen-subset test \ --infer-manifest /fsx/data/VoxLingua107/manifest/test.tsv \ --infer-xtimes 10 --infer-max-sample-size 160000 --output-path $odir """ import torch from fairseq import checkpoint_utils, distributed_utils, options, utils from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging import metrics, progress_bar from 
fairseq import checkpoint_utils, data, options, tasks from fairseq.data import FileAudioDataset, AddTargetDataset, Dictionary from fairseq.tasks.audio_classification import LabelEncoder import ipdb import copy import sys from tqdm import tqdm import tempfile import numpy as np import sklearn def subset_manifest(infer_manifest, veri_pair): with open(infer_manifest) as ff, open(veri_pair) as gg, \ tempfile.NamedTemporaryFile('w', delete=False) as ww: fnames = ff.read().strip().split("\n") basedir = fnames[0] needed_fname = [] for gi in gg.read().strip().split('\n'): _, x1, x2 = gi.split() needed_fname.append(x1) needed_fname.append(x2) needed_fname = set(needed_fname) ww.write(basedir+'\n') for ii in range(1, len(fnames)): x1,x2 = fnames[ii].split() if x1 in needed_fname: ww.write(fnames[ii]+'\n') print(f'| subset manifest for verification: {ww.name}') return ww.name def wrap_target_dataset(infer_manifest, dataset, task): label_path = infer_manifest.replace(".tsv", ".label") with open(label_path, "r") as f: labels = f.read().strip().split("\n") assert len(labels) == len(dataset) process_label = LabelEncoder(task.target_dictionary) dataset = AddTargetDataset(dataset, labels, pad=task.target_dictionary.pad(), eos=task.target_dictionary.eos(), batch_targets=True, process_label=process_label, add_to_input=False) return dataset def resample_data(source, padding_mask, n_sample, max_sample_len): # source: BxT # padding_mask: BxT B = source.shape[0] T = source.shape[1] sources = [] padding_masks = [] seq_len = (~padding_mask).sum(1) for jj in range(n_sample): new_source = source.new_zeros(B, max_sample_len) new_padding_mask = padding_mask.new_zeros(B, max_sample_len) for ii in range(B): if seq_len[ii] > max_sample_len: start = np.random.randint(0, seq_len[ii]-max_sample_len+1) end = start + max_sample_len else : start = 0 end = seq_len[ii] new_source[ii, 0:end-start] = source[ii, start:end] new_padding_mask[ii, end-start+1:] = True sources.append(new_source) 
padding_masks.append(new_padding_mask) return sources, padding_masks def resample_sample(sample, n_sample, max_sample_len): new_sources, new_padding_masks = resample_data(sample['net_input']['source'], sample['net_input']['padding_mask'], n_sample, max_sample_len) new_samples = [] for ii in range(n_sample): new_sample = copy.deepcopy(sample) new_sample['net_input']['source'] = new_sources[ii] new_sample['net_input']['padding_mask'] = new_padding_masks[ii] new_samples.append(new_sample) return new_samples if __name__ == '__main__': np.random.seed(123) # Parse command-line arguments for generation parser = options.get_generation_parser(default_task='audio_classification') # parser.add_argument('--infer-merge', type=str, default='mean') parser.add_argument('--infer-xtimes', type=int, default=1) parser.add_argument('--infer-max-sample-size', type=int, default=5*16000) # 5 secs parser.add_argument('--infer-manifest', type=str) parser.add_argument('--verification-pair', type=str, required=False, help=''' a file that contains pairs of utts to evaluated if they are from same speaker or not format: (following voxceleb) 1/0 <wav_pair_a> <wav_pair_b> ''') parser.add_argument('--output-path', type=str) # parser.add_argument('--infer-xtimes', type=int, default=1) args = options.parse_args_and_arch(parser) # Setup task # task = tasks.setup_task(args) use_cuda = not args.cpu # Load model & task print('| loading model from {}'.format(args.path)) arg_overrides = { 'data': args.data, # 'mask_prob': 0 #'max_sample_size': sys.maxsize, #'min_sample_size': 0, } state = checkpoint_utils.load_checkpoint_to_cpu(args.path) # move to AWS state['cfg']['model']['w2v_path'] = state['cfg']['model']['w2v_path'].replace('/checkpoint/arbabu/XLSR2/model_versions/', '/fsx/data/model_versions/').replace('/checkpoint/kushall/final_model_checkpoints/wav2vec2/', '/fsx/data/wav2vec_ckpt/') state['cfg']['task']['data'] = state['cfg']['task']['data'].replace('/checkpoint/kushall/data/', '/fsx/data/') 
models, _model_args, task = checkpoint_utils.load_model_ensemble_and_task([args.path], arg_overrides=arg_overrides, task=None, state=state) model = models[0] model.eval() if use_cuda: model.cuda() # Load dataset task.load_dataset(args.gen_subset) dataset = task.dataset(args.gen_subset) infer_manifest = args.infer_manifest # only decode needed utts # infer_manifest = subset_manifest(infer_manifest, # args.verification_pair) infer_dataset = FileAudioDataset(infer_manifest, sample_rate=task.cfg.sample_rate, max_sample_size=10**10, #task.cfg.max_sample_size, min_sample_size=1, #task.cfg.min_sample_size, pad=True, normalize=task.cfg.normalize) # add target (if needed) infer_dataset = wrap_target_dataset(infer_manifest, infer_dataset, task) itr = task.get_batch_iterator( dataset=infer_dataset, max_sentences=args.batch_size, ).next_epoch_itr(shuffle=False) # correct = 0 # total = 0 list_uttname = [] list_latent = [] list_logit = [] list_target = [] list_src_len = [] with torch.no_grad(): for _, sample in tqdm(enumerate(itr)): # resample if needed samples = resample_sample(sample, args.infer_xtimes, args.infer_max_sample_size) list_uttname.extend(sample['name']) list_target.extend(sample['target'][:, 0].cpu().numpy()) list_src_len.extend((~sample['net_input']['padding_mask']).sum(1).cpu().numpy()) latents = [] logits = [] for sample in samples: sample = utils.move_to_cuda(sample) if use_cuda else sample try: latent = model.forward_latent(**sample['net_input']) latents.append(latent.detach().cpu().numpy()) except: latent = None logit = model.forward(**sample['net_input']) logits.append(logit.detach().cpu().numpy()) if len(latents) > 0: latents = np.stack(latents, 1) # B,X,D logits = np.stack(logits, 1) # B,X,Cls list_latent.extend(latents) list_logit.extend(logits) # create big npz list_uttname = np.array(list_uttname) list_latent = np.array(list_latent) list_target = np.array(list_target) list_logit = np.array(list_logit) list_src_len = np.array(list_src_len) # save to npz 
output_path = args.output_path if (output_path is None): output_path = tempfile.NamedTemporaryFile('wb', delete=False).name with open(output_path, 'wb') as ww: np.savez(ww, name=list_uttname, latent=list_latent, target=list_target, logit=list_logit, src_len=list_src_len) print("="*10 + " REPORT " + "="*10) print(f'| latent saved in {output_path}') print(f'| {list_uttname.shape=}, {list_latent.shape=}, {list_target.shape=}, {list_logit.shape=}, {list_src_len.shape=}') ================================================ FILE: examples/wmt19/README.md ================================================ # WMT 19 This page provides pointers to the models of Facebook-FAIR's WMT'19 news translation task submission [(Ng et al., 2019)](https://arxiv.org/abs/1907.06616). ## Pre-trained models Model | Description | Download ---|---|--- `transformer.wmt19.en-de` | En->De Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz) `transformer.wmt19.de-en` | De->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz) `transformer.wmt19.en-ru` | En->Ru Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz) `transformer.wmt19.ru-en` | Ru->En Ensemble | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz) `transformer_lm.wmt19.en` | En Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.en.tar.gz) `transformer_lm.wmt19.de` | De Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.de.tar.gz) `transformer_lm.wmt19.ru` | Ru Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/lm/wmt19.ru.tar.gz) ## Pre-trained single models before finetuning Model | Description | Download ---|---|--- `transformer.wmt19.en-de` | En->De Single, no finetuning | [download 
(.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.ffn8192.tar.gz) `transformer.wmt19.de-en` | De->En Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.ffn8192.tar.gz) `transformer.wmt19.en-ru` | En->Ru Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ffn8192.tar.gz) `transformer.wmt19.ru-en` | Ru->En Single, no finetuning | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ffn8192.tar.gz) ## Example usage (torch.hub) #### Requirements We require a few additional Python dependencies for preprocessing: ```bash pip install fastBPE sacremoses ``` #### Translation ```python import torch # English to German translation en2de = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', tokenizer='moses', bpe='fastbpe') en2de.translate("Machine learning is great!") # 'Maschinelles Lernen ist großartig!' # German to English translation de2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.de-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', tokenizer='moses', bpe='fastbpe') de2en.translate("Maschinelles Lernen ist großartig!") # 'Machine learning is great!' # English to Russian translation en2ru = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-ru', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', tokenizer='moses', bpe='fastbpe') en2ru.translate("Machine learning is great!") # 'Машинное обучение - это здорово!' # Russian to English translation ru2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.ru-en', checkpoint_file='model1.pt:model2.pt:model3.pt:model4.pt', tokenizer='moses', bpe='fastbpe') ru2en.translate("Машинное обучение - это здорово!") # 'Machine learning is great!' 
``` #### Language Modeling ```python # Sample from the English LM en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.en', tokenizer='moses', bpe='fastbpe') en_lm.sample("Machine learning is") # 'Machine learning is the future of computing, says Microsoft boss Satya Nadella ...' # Sample from the German LM de_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.de', tokenizer='moses', bpe='fastbpe') de_lm.sample("Maschinelles lernen ist") # 'Maschinelles lernen ist das A und O (neues-deutschland.de) Die Arbeitsbedingungen für Lehrerinnen und Lehrer sind seit Jahren verbesserungswürdig ...' # Sample from the Russian LM ru_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt19.ru', tokenizer='moses', bpe='fastbpe') ru_lm.sample("машинное обучение это") # 'машинное обучение это то, что мы называем "искусственным интеллектом".' ``` ## Citation ```bibtex @inproceedings{ng2019facebook, title = {Facebook FAIR's WMT19 News Translation Task Submission}, author = {Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey}, booktitle = {Proc. of WMT}, year = 2019, } ``` ================================================ FILE: examples/wmt20/README.md ================================================ # WMT 20 This page provides pointers to the models of Facebook-FAIR's WMT'20 news translation task submission [(Chen et al., 2020)](https://arxiv.org/abs/2011.08298).
## Single best MT models (after finetuning on part of WMT20 news dev set) Model | Description | Download ---|---|--- `transformer.wmt20.ta-en` | Ta->En | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta-en.single.tar.gz) `transformer.wmt20.en-ta` | En->Ta | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-ta.single.tar.gz) `transformer.wmt20.iu-en.news` | Iu->En (News domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.news.single.tar.gz) `transformer.wmt20.en-iu.news` | En->Iu (News domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.news.single.tar.gz) `transformer.wmt20.iu-en.nh` | Iu->En (Nunavut Hansard domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.nh.single.tar.gz) `transformer.wmt20.en-iu.nh` | En->Iu (Nunavut Hansard domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.nh.single.tar.gz) ## Language models Model | Description | Download ---|---|--- `transformer_lm.wmt20.en` | En Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en.tar.gz) `transformer_lm.wmt20.ta` | Ta Language Model | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta.tar.gz) `transformer_lm.wmt20.iu.news` | Iu Language Model (News domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu.news.tar.gz) `transformer_lm.wmt20.iu.nh` | Iu Language Model (Nunavut Hansard domain) | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu.nh.tar.gz) ## Example usage (torch.hub) #### Translation ```python import torch # English to Tamil translation en2ta = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.en-ta') en2ta.translate("Machine learning is great!") # 'இயந்திரக் கற்றல் அருமை!' 
# Tamil to English translation ta2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.ta-en') ta2en.translate("இயந்திரக் கற்றல் அருமை!") # 'Machine learning is great!' # English to Inuktitut translation en2iu = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.en-iu.news') en2iu.translate("machine learning is great!") # 'ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ ᐱᐅᔪᒻᒪᕆᒃ!' # Inuktitut to English translation iu2en = torch.hub.load('pytorch/fairseq', 'transformer.wmt20.iu-en.news') iu2en.translate("ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ ᐱᐅᔪᒻᒪᕆᒃ!") # 'Machine learning excellence!' ``` #### Language Modeling ```python # Sample from the English LM en_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt20.en') en_lm.sample("Machine learning is") # 'Machine learning is a type of artificial intelligence that uses machine learning to learn from data and make predictions.' # Sample from the Tamil LM ta_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt20.ta') ta_lm.sample("இயந்திரக் கற்றல் என்பது செயற்கை நுண்ணறிவின்") # 'இயந்திரக் கற்றல் என்பது செயற்கை நுண்ணறிவின் ஒரு பகுதியாகும்.' # Sample from the Inuktitut LM iu_lm = torch.hub.load('pytorch/fairseq', 'transformer_lm.wmt20.iu.news') iu_lm.sample("ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ") # 'ᖃᒧᑕᐅᔭᓄᑦ ᐃᓕᓐᓂᐊᕐᓂᖅ, ᐊᒻᒪᓗ ᓯᓚᐅᑉ ᐊᓯᙳᖅᐸᓪᓕᐊᓂᖓᓄᑦ ᖃᓄᐃᓕᐅᕈᑎᒃᓴᑦ, ᐃᓚᖃᖅᖢᑎᒃ ᐅᑯᓂᖓ:' ``` ## Citation ```bibtex @inproceedings{chen2020facebook, title={Facebook AI's WMT20 News Translation Task Submission}, author={Peng-Jen Chen and Ann Lee and Changhan Wang and Naman Goyal and Angela Fan and Mary Williamson and Jiatao Gu}, booktitle={Proc. of WMT}, year={2020}, } ``` ================================================ FILE: examples/wmt21/README.md ================================================ # WMT 21 This page provides pointers to the models of Facebook AI's WMT'21 news translation task submission [(Tran et al., 2021)](https://arxiv.org/abs/2108.03265).
## Single best dense models Model | Description | Download ---|---|--- `wmt21.dense-24-wide.X-En` | X-En | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt21.dense-24-wide.X-En.tar.gz) `wmt21.dense-24-wide.En-X` | En-X | [download (.tar.gz)](https://dl.fbaipublicfiles.com/fairseq/models/wmt21.dense-24-wide.En-X.tar.gz) ## Example usage See eval.sh ## Citation ```bibtex @inproceedings{tran2021facebook, title={Facebook AI’s WMT21 News Translation Task Submission}, author={Chau Tran and Shruti Bhosale and James Cross and Philipp Koehn and Sergey Edunov and Angela Fan}, booktitle={Proc. of WMT}, year={2021}, } ``` ================================================ FILE: examples/wmt21/eval.sh ================================================ #!/bin/bash # Copyright (c) Facebook, Inc. and its affiliates. # All rights reserved. # # This source code is licensed under the license found in the # LICENSE file in the root directory of this source tree. SRC=en TGT=is MODEL_NAME=wmt21.dense-24-wide.En-X PATH_TO_FAIRSEQ_PY=. TMP_DIR=generation_tmp mkdir -p $TMP_DIR REPLACE_UNICODE_PUNCT=$PATH_TO_FAIRSEQ_PY/examples/wmt21/scripts/replace-unicode-punctuation.perl NORM_PUNCT=$PATH_TO_FAIRSEQ_PY/examples/wmt21/scripts/normalize-punctuation.perl if [ ! -d "${TMP_DIR}/${MODEL_NAME}" ]; then wget https://dl.fbaipublicfiles.com/fairseq/models/${MODEL_NAME}.tar.gz -P $TMP_DIR/ tar -xvf $TMP_DIR/${MODEL_NAME}.tar.gz -C $TMP_DIR fi MODEL_DIR=$TMP_DIR/${MODEL_NAME} if [ !
-d "${TMP_DIR}/wmt21-news-systems" ]; then git clone https://github.com/wmt-conference/wmt21-news-systems $TMP_DIR/wmt21-news-systems fi DOMAIN_TAG="wmtdata newsdomain" INPUT_FILE=$TMP_DIR/wmt21-news-systems/txt/sources/newstest2021.${SRC}-${TGT}.src.${SRC} REF_FILE=$TMP_DIR/wmt21-news-systems/txt/references/newstest2021.${SRC}-${TGT}.ref.A.${TGT} # Translate cat ${INPUT_FILE} | sed "s/^/${DOMAIN_TAG} /" | $REPLACE_UNICODE_PUNCT | $NORM_PUNCT -l ${SRC} | python $PATH_TO_FAIRSEQ_PY/fairseq_cli/interactive.py $MODEL_DIR \ --path ${MODEL_DIR}/checkpoint.pt \ --task translation_multi_simple_epoch \ --langs "en,ha,is,ja,cs,ru,zh,de" \ --lang-pairs $SRC-$TGT \ --bpe "sentencepiece" \ --sentencepiece-model ${MODEL_DIR}/sentencepiece.model \ --buffer-size 1024 \ --batch-size 10 -s $SRC -t $TGT \ --decoder-langtok \ --encoder-langtok src \ --beam 5 \ --lenpen 1.0 \ --fp16 > $TMP_DIR/${SRC}-${TGT}.gen_log cat $TMP_DIR/$SRC-$TGT.gen_log | grep -P "^D-" | cut -f3 > $TMP_DIR/$SRC-$TGT.hyp # Calculate BLEU score sacrebleu -l $SRC-$TGT $REF_FILE < $TMP_DIR/$SRC-$TGT.hyp ================================================ FILE: examples/wmt21/scripts/normalize-punctuation.perl ================================================ #!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. 
use warnings; use strict; my $language = "en"; my $PENN = 0; while (@ARGV) { $_ = shift; /^-b$/ && ($| = 1, next); # not buffered (flush each line) /^-l$/ && ($language = shift, next); /^[^\-]/ && ($language = $_, next); /^-penn$/ && ($PENN = 1, next); } while(<STDIN>) { s/\r//g; # remove extra spaces s/\(/ \(/g; s/\)/\) /g; s/ +/ /g; s/\) ([\.\!\:\?\;\,])/\)$1/g; s/\( /\(/g; s/ \)/\)/g; s/(\d) \%/$1\%/g; s/ :/:/g; s/ ;/;/g; # normalize unicode punctuation if ($PENN == 0) { s/\`/\'/g; s/\'\'/ \" /g; } s/„/\"/g; s/“/\"/g; s/”/\"/g; s/–/-/g; s/—/ - /g; s/ +/ /g; s/´/\'/g; s/([a-z])‘([a-z])/$1\'$2/gi; s/([a-z])’([a-z])/$1\'$2/gi; s/‘/\'/g; s/‚/\'/g; s/’/\"/g; s/''/\"/g; s/´´/\"/g; s/…/.../g; # French quotes s/ « / \"/g; s/« /\"/g; s/«/\"/g; s/ » /\" /g; s/ »/\"/g; s/»/\"/g; # handle pseudo-spaces s/ \%/\%/g; s/nº /nº /g; s/ :/:/g; s/ ºC/ ºC/g; s/ cm/ cm/g; s/ \?/\?/g; s/ \!/\!/g; s/ ;/;/g; s/, /, /g; s/ +/ /g; # English "quotation," followed by comma, style if ($language eq "en") { s/\"([,\.]+)/$1\"/g; } # Czech is confused elsif ($language eq "cs" || $language eq "cz") { } # German/Spanish/French "quotation", followed by comma, style else { s/,\"/\",/g; s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence } if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") { s/(\d) (\d)/$1,$2/g; } else { s/(\d) (\d)/$1.$2/g; } print $_; } ================================================ FILE: examples/wmt21/scripts/replace-unicode-punctuation.perl ================================================ #!/usr/bin/env perl # # This file is part of moses. Its use is licensed under the GNU Lesser General # Public License version 2.1 or, at your option, any later version. use warnings; use strict; while (@ARGV) { $_ = shift; /^-b$/ && ($| = 1, next); # not buffered (flush each line) } #binmode(STDIN, ":utf8"); #binmode(STDOUT, ":utf8"); while(<STDIN>) { s/,/,/g; s/。 */. 
/g; s/、/,/g; s/”/"/g; s/“/"/g; s/∶/:/g; s/:/:/g; s/?/\?/g; s/《/"/g; s/》/"/g; s/)/\)/g; s/!/\!/g; s/(/\(/g; s/;/;/g; s/1/1/g; s/」/"/g; s/「/"/g; s/0/0/g; s/3/3/g; s/2/2/g; s/5/5/g; s/6/6/g; s/9/9/g; s/7/7/g; s/8/8/g; s/4/4/g; s/. */. /g; s/~/\~/g; s/’/\'/g; s/…/\.\.\./g; s/━/\-/g; s/〈/\</g; s/〉/\>/g; s/【/\[/g; s/】/\]/g; s/%/\%/g; print $_; } ================================================ FILE: examples/womens_bios/README.md ================================================ # Wikipedia Biographies of Women ## Training: The training dataset is created based on WikiSum, a dataset created from the paper [Generating Wikipedia by Summarizing Long Sequences](https://arxiv.org/pdf/1801.10198.pdf). The dataset needs to be generated following the instructions in this [Github Repository](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/wikisum). ### How is the WikiSum dataset structured? Overall, the task in WikiSum was to generate the entire Wikipedia article based on the contents of the top 10 Google Search Results. The authors provide a way for people to recreate their work. In the WikiSum Github, there are two options for the dataset recreation --- the first is to use CommonCrawl (a static, open source crawl of the web) and the second to do Live Web Fetches. The second has higher coverage, but the content is subject to change and difficult to fetch. We used the static, Commoncrawl version. This can be downloaded following the Github repo instructions, though note it will require usage of Google Cloud. Note: in our experience, it also requires requesting that the resource limit of the Google Cloud instance be raised, which requires emailing. Note: Having higher coverage in the training dataset would be expected to improve the model quality. There are many instances in the dataset where the training input (web evidence) does not contain sufficient content for producing the desired Wikipedia article. 
This may harm the model's ability to learn to retrieve, look at the input evidence, and overall could contribute to increased challenges in generating verifiable Wikipedia biographies. ### How do you go from WikiSum dataset to Biography dataset? The WikiSum dataset is for Wikipedia in general, not just biographies. We do this by querying WikiData to see if the Wikipedia article has an occupation, with the thought that all articles with occupations are probably biographies. ## Evaluation: You can download the dataset and baseline model with the following command: ``` wget -N 'https://dl.fbaipublicfiles.com/fairseq/womenbios_dataset.zip' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe' wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt' ``` We provide the full text Wikipedia articles split into four categories: - Women in Africa - Women in Asia - Women in Science - Women We note that these are not exhaustive intersectional categories and mainly stem from personal interest. We also provide the URL of the Wikipedia article. Note that Wikipedia articles are constantly being improved, edited, and changed. Thus, it's completely possible that the Wikipedia article on Wikipedia has been lovingly improved by other Wikipedia editors. To get the occupations of each biographical subject, we use WikiData. We provide a sample script to do this. We also provide the raw output of this query. The final part of the evaluation dataset is to query web evidence for each of the biographical subjects. This is the part of the evaluation dataset that requires the most improvement. As we discuss in our paper, one of the major reasons why it is difficult to write biographies for sometimes very well qualified women is that there is not information online about them. Further, the search engine may not find it. 
We encourage others to improve upon this part of the data, as even re-querying again on the internet may find new, updated sources of information as the web is constantly evolving. We use the search engine from [Internet-Augmented Dialogue Generation](https://arxiv.org/abs/2107.07566), see [project URL](https://parl.ai/projects/sea/) to do the search queries. Note: we remove wikipedia site sources from our query (or we'd query the data itself). However, it's possible Wikipedia information can be copied around in multiple forms on the web, linked with edits, etc. ## Section by Section Generation: Wikipedia articles are split into sections, which are usually separated by headings. These headings can be separated in the article text by looking for these equal signs (==), where the number of equal signs usually signals if you are looking at a toplevel heading or a subheading, etc. An example regex that you can use is: ` section_header_re = re.compile(r"(?<!=)==([^=]+)==(?!=)") ` ## List of Notes: - People can have multiple occupations, and we keep all occupations that we query from WikiData ## List of Possible Improvement Areas: Using a larger generative pre-trained model, larger-scale retrieval, a retrieval encoder specialized to Wikipedia (or biographies), tuning all of the training & generation parameters exhaustively --- and the like --- would most likely be very useful. Overall, we hope that this is a starting point for others who might be interested in focusing on how we can help address the gender gap on Wikipedia. ## Interested in Wikipedia and Gender Gap? 
You might want to check out: - https://humaniki.wmcloud.org/ - https://en.wikipedia.org/wiki/Wikipedia:WikiProject_Women_in_Red and https://wikimediafoundation.org/news/2018/10/18/women-in-red-wikiproject/ - https://meta.wikimedia.org/wiki/Whose_Knowledge%3F/VisibleWikiWomen - https://www.ted.com/talks/jess_wade_a_voice_for_diversity_in_science and thanks again to all of the Wikipedia editors and the entire community that is already working so hard to write amazing articles for diverse groups of people. # LICENSE This is licensed under CC-BY-NC, however portions of the dataset are available under separate license terms: text sourced from Wikipedia is licensed under CC-BY-SA. ================================================ FILE: examples/womens_bios/query_occupations_from_wikidata.py ================================================ import sys from SPARQLWrapper import SPARQLWrapper, JSON endpoint_url = "https://query.wikidata.org/sparql" with open("/your/urls/here") as f: data = f.readlines() urls = [i.strip() for i in data] def get_results(endpoint_url, URL): query = f"""SELECT ?uriLabel ?occupation ?occupationLabel ?dob ?dobLabel WHERE {{ <{URL}> schema:about ?uri . ?uri wdt:P106 ?occupation . 
SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en" }} }}""" user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1]) sparql = SPARQLWrapper(endpoint_url, agent=user_agent) sparql.setQuery(query) sparql.setReturnFormat(JSON) return sparql.query().convert() all_occupations = [] for URL in urls: results = get_results(endpoint_url, URL) occupations = [] for result in results["results"]["bindings"]: occupations.append(result['occupationLabel']['value']) all_occupations.append(result['uriLabel']['value'] + ", " + ", ".join(occupations)) assert(len(all_occupations) == len(urls)) with open("/your/file/output/here", "w") as o: for line in all_occupations: o.write(line.strip() + "\n") ================================================ FILE: examples/xformers/README.md ================================================ # Using xFormers with FairSeq [xFormers](https://github.com/facebookresearch/xformers) is a modular library for flexibly generating transformer architectures with interoperable and optimized building blocks. The current integration allows for FairSeq users to use an attention variant available in the xFormers repository. In order to enable xFormers, all that needs to be passed in is a string representing an [xFormers attention config](https://github.com/facebookresearch/xformers/blob/5f754129bfb1ea53747b1ab2077261ea762faa47/xformers/components/attention/base.py#L18). The various attention variants can be found [here](https://github.com/facebookresearch/xformers/tree/main/xformers/components/attention). These include sparse attention and blocksparse attention. For example, you could pass in the following args: ```python decoder_xformers_att_config = '{"name": "scaled_dot_product"}' encoder_xformers_att_config = '{"name": "linformer", "seq_len": "256"}' ``` In order to use blocksparse attention you would have to additionally pass in a blocksparse layout and blocksize.
For example: ```python xformers_att_config = '{"name": "scaled_dot_product"}' xformers_blocksparse_blocksize = 16 xformers_blocksparse_layout = torch.ones( seq_len // xformers_blocksparse_blocksize, seq_len // xformers_blocksparse_blocksize, ) xf_blocksparse_mha = ( MultiheadAttention( embedding, num_heads, dropout=0.0, add_zero_attn=add_zero_attn, xformers_att_config=xformers_att_config, xformers_blocksparse_layout=xformers_blocksparse_layout, xformers_blocksparse_blocksize=xformers_blocksparse_blocksize, ) ) ``` The xFormers repository currently has benchmarks on the [runtime](https://github.com/facebookresearch/xformers/blob/main/docs/plots/runtime_vs_attention.png) and [memory usage](https://github.com/facebookresearch/xformers/blob/main/docs/plots/memory_vs_attention.png) of the various attentions. ================================================ FILE: examples/xglm/README.md ================================================ # Few-shot Learning with Multilingual Language Models ## Introduction In this work, we train a family of multilingual generative language models, dubbed XGLM, on a balanced corpus covering a diverse set of languages, and study their few- and zero-shot learning capabilities in a wide range of tasks. Our largest model with 7.5 billion parameters sets a new state of the art in few-shot learning on more than 20 representative languages, outperforming GPT-3 of comparable size in multilingual commonsense reasoning (+7.4 accuracy points for 0-shot, +9.4 for 4-shot) and natural language inference (+5.4 for 0-shot, +5.4 for 4-shot). We have included a [model card](model_card.md) of XGLM for transparency and accountability.
## Data and Languages XGLM models are trained on a new multilingual corpus extracted from CommonCrawl (CC100-XL), a significantly larger multilingual dataset covering 68 Common Crawl (CC) snapshots (from [Summer 2013](http://commoncrawl.org/2013/11/new-crawl-data-available/) to [March/April 2020](https://commoncrawl.org/2020/04/march-april-2020-crawl-archive-now-available/)) consisting of 134 languages. The detailed languages and data statistics are reported in the paper (Table A.1). ## Pre-trained models Model | Layers | Model Dim | FFN Dim | Languages | Download ---|---|---|---|---|--- `XGLM 564M` | 24 | 1024 | 4096 | trained on 30 languages| [xglm.564M.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.564M.tar.gz) `XGLM 1.7B` | 24 | 2048 | 8192 | trained on 30 languages| [xglm.1.7B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.1.7B.tar.gz) `XGLM 2.9B` | 48 | 2048 | 8192 | trained on 30 languages| [xglm.2.9B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.2.9B.tar.gz) `XGLM 7.5B` | 32 | 4096 | 16384 | trained on 30 languages| [xglm.7.5B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.7.5B.tar.gz) `XGLM 4.5B` | 48 | 2048 | 16384 | trained on 134 languages| [xglm.4.5B.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xglm/xglm.4.5B.tar.gz) ## Pre-training Data Format Our models were pre-trained with data in the following format (i.e. paragraphs are separated with new lines and documents were separated with double new lines). ``` <doc0,para0,tok0> ... <doc0,para0,tokX0> # X0: number of tokens in para0 of doc0 <doc0,para1,tok0> ... <doc0,para1,tokY0> # Y0: number of tokens in para1 of doc0 <doc1,para0,tok0> ... <doc1,para0,tokX1> # X1: number of tokens in para0 of doc1 <doc1,para1,tok0> ... <doc1,para1,tokY1> # Y1: number of tokens in para1 of doc1 ... ``` Fairseq's preprocessing replaces newlines with the end-of-sentence symbol (`</s>`).
As a result, the models never saw newline characters during pretraining and the same preprocessing should be run prior to few-shot inference to maximize performance. For example, our language model scoring function has `replace_newlines_with_eos` argument to trigger this preprocessing: ```python from fairseq.models.transformer_lm import TransformerLanguageModel model_dir = 'path_to_decompressed_tar_gz_dir' lm = TransformerLanguageModel.from_pretrained(model_dir, bpe='sentencepiece') text = """First paragraph of the first document. Second paragraph of the first document. First paragraph of the second document. """ tokens = lm.score(text, replace_newlines_with_eos=True)['tokens'] assert '\n' not in lm.decode(tokens) # no newlines were encoded ``` ## Evaluation ### Example (COPA) The following snippet shows how to evaluate our models on the Choice of Plausible Alternatives (COPA) task, using examples in English, Chinese and Hindi. ```python data_samples = { 'en': [ { "premise": "I wanted to conserve energy.", "choice1": "I swept the floor in the unoccupied room.", "choice2": "I shut off the light in the unoccupied room.", "question": "effect", "label": "1" }, { "premise": "The flame on the candle went out.", "choice1": "I blew on the wick.", "choice2": "I put a match to the wick.", "question": "cause", "label": "0" } ], 'zh': [ { "premise": "我想节约能源。", "choice1": "我在空着的房间里扫了地板。", "choice2": "我把空房间里的灯关了。", "question": "effect", "label": "1" }, { "premise": "蜡烛上的火焰熄灭了。", "choice1": "我吹灭了灯芯。", "choice2": "我把一根火柴放在灯芯上。", "question": "cause", "label": "0" } ], 'hi': [ { "premise": "M te vle konsève enèji.", "choice1": "Mwen te fin baleye chanm lib la.", "choice2": "Mwen te femen limyè nan chanm lib la.", "question": "effect", "label": "1" }, { "premise": "Flam bouji a te etenn.", "choice1": "Mwen te soufle bouji a.", "choice2": "Mwen te limen mèch bouji a.", "question": "cause", "label": "0" } ] } ``` In this example, we format the examples using the non-verbal prompts
`{premise}\n{choice1}` and `{premise}\n{choice2}`, which are shared by all three languages. ```python from fairseq.models.transformer_lm import TransformerLanguageModel model_dir = 'path_to_decompressed_tar_gz_dir' lm = TransformerLanguageModel.from_pretrained(model_dir, bpe='sentencepiece') lm = lm.eval() lm = lm.half() lm = lm.cuda() def get_logprobs(prompt): import re prompt = re.sub('\n+' , '\n', prompt) # collapse repeated newlines, which indicate separate documents return lm.score(prompt, replace_newlines_with_eos=True)['positional_scores'] # Zero-shot evaluation for the Choice of Plausible Alternatives (COPA) task. # A return value of 0 indicates that the first alternative is more plausible, # while 1 indicates that the second alternative is more plausible. def COPA_eval(prompt, alternative1, alternative2): lprob1 = get_logprobs(prompt + "\n" + alternative1).sum() lprob2 = get_logprobs(prompt + "\n" + alternative2).sum() return 0 if lprob1 > lprob2 else 1 for lang in ['en', 'zh', 'hi']: for idx, example in enumerate(data_samples[lang]): predict = COPA_eval(example["premise"], example["choice1"], example["choice2"]) print(f'{lang}-{idx}', predict, example['label']) # en-0 1 1 # en-1 0 0 # zh-0 1 1 # zh-1 0 0 # hi-0 1 1 # hi-1 0 0 ``` ## XStoryCloze We release XStoryCloze, a new multilingual dataset intended for few-shot evaluation, alongside this paper. XStoryCloze consists of professional translation of the validation split of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 other languages. It is opensourced under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode), the same license as the English StoryCloze. You can download the dataset via [this link](https://dl.fbaipublicfiles.com/xstorycloze.zip). 
Language | ar | es | eu | hi | id | my | ru | sw | te | zh ---|---|---|---|---|---|---|---|---|---|--- Train size | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 Eval size | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 | 1511 Please refer to [the dataset doc](XStoryCloze.md) for more information. ## Publication [Few-shot Learning with Multilingual Generative Language Models](https://arxiv.org/abs/2112.10668). Xi Victoria Lin*, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li* (* Equal Contribution). EMNLP 2022. ## Citation ``` @article{DBLP:journals/corr/abs-2112-10668, author = {Xi Victoria Lin and Todor Mihaylov and Mikel Artetxe and Tianlu Wang and Shuohui Chen and Daniel Simig and Myle Ott and Naman Goyal and Shruti Bhosale and Jingfei Du and Ramakanth Pasunuru and Sam Shleifer and Punit Singh Koura and Vishrav Chaudhary and Brian O'Horo and Jeff Wang and Luke Zettlemoyer and Zornitsa Kozareva and Mona T. Diab and Veselin Stoyanov and Xian Li}, title = {Few-shot Learning with Multilingual Language Models}, journal = {CoRR}, volume = {abs/2112.10668}, year = {2021}, url = {https://arxiv.org/abs/2112.10668}, eprinttype = {arXiv}, eprint = {2112.10668}, timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} } ``` ================================================ FILE: examples/xglm/XStoryCloze.md ================================================ XStoryCloze consists of professional translation of the validation split of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 other languages. 
This dataset is released by FAIR (Fundamental Artificial Intelligence Research) alongside the paper [Few-shot Learning with Multilingual Generative Language Models. EMNLP 2022](https://arxiv.org/abs/2112.10668). # Languages ru, zh (Simplified), es (Latin America), ar, hi, id, te, sw, eu, my. # Data Splits This dataset is intended to be used for evaluating the zero- and few-shot learning capabilities of multilingual language models. We split the data for each language into train and test (360 vs. 1511 examples, respectively). The released data files for different languages maintain a line-by-line alignment. # Access English StoryCloze Please request the original English StoryCloze dataset through the [official website](https://cs.rochester.edu/nlp/rocstories/). You can create a split of the en data following our data split scheme using the following commands: ``` head -361 spring2016.val.tsv > spring2016.val.en.tsv.split_20_80_train.tsv head -1 spring2016.val.tsv > spring2016.val.en.tsv.split_20_80_eval.tsv # TSV header tail -1511 spring2016.val.tsv >> spring2016.val.en.tsv.split_20_80_eval.tsv ``` # Licence XStoryCloze is opensourced under [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0/legalcode), the same license as the original English StoryCloze. # Citation We hope this dataset is helpful for the research and wider NLP community. If you use XStoryCloze in your work, please cite ``` @article{DBLP:journals/corr/abs-2112-10668, author = {Xi Victoria Lin and Todor Mihaylov and Mikel Artetxe and Tianlu Wang and Shuohui Chen and Daniel Simig and Myle Ott and Naman Goyal and Shruti Bhosale and Jingfei Du and Ramakanth Pasunuru and Sam Shleifer and Punit Singh Koura and Vishrav Chaudhary and Brian O'Horo and Jeff Wang and Luke Zettlemoyer and Zornitsa Kozareva and Mona T.
Diab and Veselin Stoyanov and Xian Li}, title = {Few-shot Learning with Multilingual Language Models}, journal = {CoRR}, volume = {abs/2112.10668}, year = {2021}, url = {https://arxiv.org/abs/2112.10668}, eprinttype = {arXiv}, eprint = {2112.10668}, timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} } ``` ================================================ FILE: examples/xglm/model_card.md ================================================ # XGLM multilingual model ## Version 1.0.0 ### Model developer FAIR (Fundamental Artificial Intelligence Research) ### Model type A family of multilingual autoregressive language models (ranging from 564 million to 7.5 billion parameters) trained on a balanced corpus of a diverse set of languages. The language model can learn tasks from natural language descriptions and a few examples. ### Model Feedback Channel https://github.com/pytorch/fairseq ## Intended use ### Primary intended use For research purposes only, e.g. reproducing model evaluation results. Generation is only used in a limited capacity for explanation/justification or for prompting/probing/priming for class labels. ### Out of scope uses The primary purpose of the model is not to generate language, although the model is capable of doing that. ## Potential risks This section lists the potential risks associated with using the model. ### Relevant factors Based on known problems with NLP technology, potential relevant factors include output correctness, robustness, bias (gender, profession, race and religion), etc. ### Evaluation factors The model was evaluated on hate speech detection and occupation identification. * Hate speech detection (Huang et al. (2020)) - A safety task to test language models’ ability to identify hateful and offensive text. 
* Occupation identification (De-Arteaga et al., 2019), (Zhao et al., 2020) - A bias task to study language models’ performance divergence between different gender groups on the task of occupation identification. ## Metrics ### Model performance measures The XGLM model was primarily evaluated on 1. Zero shot and few shot learning by looking at per-language performance on tasks spanning commonsense reasoning (XCOPA, XWinograd), natural language inference (XNLI) and paraphrasing (PAWS-X). The model is also evaluated on XStoryCloze, a new dataset created by FAIR (Fundamental Artificial Intelligence Research). 2. Cross lingual transfer through templates and few-shot examples. 3. Knowledge probing - Evaluate to what extent the XGLM model can effectively store factual knowledge in different languages using the mLAMA benchmark. 4. Translation - We report machine translation results on WMT benchmarks and a subset of FLORES-101 in the main paper. The model was also evaluated on hate speech datasets introduced by Huang et al. (2020) and an occupation identification dataset by De-Arteaga et al. 2019 to identify bias in the model. ### Approaches to handle uncertainty Report confidence intervals, variance metrics for the model performance metrics. Few-shot evaluation was conducted with different sampling with 5 seeds. We reported statistical significance. ## Evaluation data ## Zero Shot and Few Shot evaluation ### XNLI (Conneau et al., 2018) #### Description The Cross-lingual Natural Language Inference (XNLI) corpus is the extension of the Multi-Genre NLI (MultiNLI) corpus to 15 languages. The dataset was created by manually translating the validation and test sets of MultiNLI into each of those 15 languages. 
### XStoryCloze #### Description A new dataset created by FAIR alongside this work by translating the validation split of the English StoryCloze dataset (Mostafazadeh et al., 2016) (Spring 2016 version) to 10 other typologically diverse languages (ru, zh Simplified, es Latin America, ar, hi, id, te, sw, eu, my). ### XCOPA (Ponti et al., 2020) #### Description The Cross-lingual Choice of Plausible Alternatives (XCOPA) dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages. The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe. ### XWinograd (Tikhonov and Ryabinin, 2021) #### Description XWinograd is a multilingual collection of Winograd Schemas in six languages that can be used for evaluation of cross-lingual commonsense reasoning capabilities. ### PAWS-X (Yang et al., 2019) #### Description PAWS-X contains 23,659 human translated PAWS evaluation pairs and 296,406 machine translated training pairs in six typologically distinct languages: French, Spanish, German, Chinese, Japanese, and Korean. All translated pairs are sourced from examples in PAWS-Wiki. ## Responsible AI (RAI) evaluation ### Hate speech (Huang et al. 2020) This is a multilingual Twitter corpus for the task of hate speech detection with four inferred author demographic factors: age, country, gender and race/ethnicity. The corpus covers five languages: English, Italian, Polish, Portuguese and Spanish. ### Bias dataset (De-Arteaga et al. 2019) The aim of this dataset is to study the gender bias of models that identify a person’s occupation from their bios.
---- ## Training data ### CC100-XL #### Description Following the recent success of multilingual self-supervised pre-training (Devlin et al., 2019; Lample and Conneau, 2019; Conneau et al., 2020; Xue et al., 2020; Goyal et al., 2021a; Liu et al., 2020), we train our language models on a mixture of monolingual text of different languages. We extended the pipeline used for mining the CC100 corpus to generate CC100-XL, a significantly larger multilingual dataset covering 68 Common Crawl snapshots (from Summer 2013 to March/April 2020) and 134 languages. More details on the CC100-XL dataset can be found in the Appendix section of the paper. ## RAI Dimensions ### Fairness (Bias and inclusion) The XGLM model was evaluated on Hate speech and bias identification datasets. For hate speech, we observe that across the 5 languages in the dataset, in-context learning results are only slightly better than random (50%). Another interesting observation is that most few shot results are worse than zero-shot, which indicates that the model is not able to utilize examples using the templates described in the paper. For bias identification, the XGLM (6.7B) English only model achieves the best performance on English and Spanish, while the GPT-3 model of comparable size (6.7B) achieves the best performance in French. On certain occupations (e.g. model and teacher), XGLM 6.7B En only model and GPT-3 (6.7B) have very significant bias while XGLM 7.5B is much less biased. ### Privacy and security The XGLM model did not have any special Privacy and Security considerations. The training data and evaluation data were both public and went through standard Meta privacy and licensing procedures. ### Transparency and control In the spirit of transparency and accountability we have created this model card and a data card for the CC100-XL which can be found in the Appendix section of the paper.
### Efficiency (Green AI) From an engineering perspective, XGLM pertains to a family of models that represent single unified models catering to many languages which have wide application across many applications. Such a unified single model saves on carbon footprint as well as energy consumption (comparing to the alternative: separate models for different languages) leading to more energy efficiency. A single model, despite having the risk of being a single point of failure, has the powerful incentive of being easier to maintain, access, distribute, and track. ## References Edoardo Maria Ponti, Goran Glavas, Olga Majewska, Qianchu Liu, Ivan Vulic, and Anna Korhonen. 2020. XCOPA: A multilingual dataset for causal commonsense reasoning. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, EMNLP 2020, Online, November 16-20, 2020, pages 2362–2376. Association for Computational Linguistics. XCOPA Dataset | Papers With Code Alexey Tikhonov and Max Ryabinin. 2021. It’s all in the heads: Using attention heads as a baseline for cross-lingual transfer in commonsense reasoning. In Findings of the Association for Computational Linguistics: ACL/IJCNLP 2021, Online Event, August 1-6, 2021, volume ACL/IJCNLP 2021 of Findings of ACL, pages 3534–3546. Association for Computational Linguistics. XWINO Dataset | Papers With Code (XWinograd) Yinfei Yang, Yuan Zhang, Chris Tar, and Jason Baldridge. 2019. PAWS-X: A cross-lingual adversarial dataset for paraphrase identification. CoRR, abs/1908.11828. PAWS-X Dataset | Papers With Code Alexis Conneau, Guillaume Lample, Ruty Rinott, Adina Williams, Samuel R. Bowman, Holger Schwenk, and Veselin Stoyanov. 2018. XNLI: evaluating cross-lingual sentence representations. CoRR, abs/1809.05053. XNLI Dataset | Papers With Code Xiaolei Huang, Linzi Xing, Franck Dernoncourt, and Michael Paul. 2020. Multilingual twitter corpus and baselines for evaluating demographic bias in hate speech recognition. 
In Proceedings of the 12th Language Resources and Evaluation Conference, pages 1440–1448. Maria De-Arteaga, Alexey Romanov, Hanna Wallach, Jennifer Chayes, Christian Borgs, Alexandra Chouldechova, Sahin Geyik, Krishnaram Kenthapadi, and Adam Tauman Kalai. 2019. Bias in bios: A case study of semantic representation bias in a high-stakes setting. In proceedings of the Conference on Fairness, Accountability, and Transparency, pages 120–128. Nasrin Mostafazadeh, Nathanael Chambers, Xiaodong He, Devi Parikh, Dhruv Batra, Lucy Vanderwende, Pushmeet Kohli, James F. Allen. A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories. CoRR abs/1604.01696. Jieyu Zhao, Subhabrata Mukherjee, Saghar Hosseini, Kai-Wei Chang, and Ahmed Hassan Awadallah. 2020. Gender bias in multilingual embeddings and crosslingual transfer. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 2896–2907. ## Citation details ``` @article{DBLP:journals/corr/abs-2112-10668, author = {Xi Victoria Lin and Todor Mihaylov and Mikel Artetxe and Tianlu Wang and Shuohui Chen and Daniel Simig and Myle Ott and Naman Goyal and Shruti Bhosale and Jingfei Du and Ramakanth Pasunuru and Sam Shleifer and Punit Singh Koura and Vishrav Chaudhary and Brian O'Horo and Jeff Wang and Luke Zettlemoyer and Zornitsa Kozareva and Mona T. 
Diab and Veselin Stoyanov and Xian Li}, title = {Few-shot Learning with Multilingual Language Models}, journal = {CoRR}, volume = {abs/2112.10668}, year = {2021}, url = {https://arxiv.org/abs/2112.10668}, eprinttype = {arXiv}, eprint = {2112.10668}, timestamp = {Tue, 04 Jan 2022 15:59:27 +0100}, biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} } ``` ================================================ FILE: examples/xlmr/README.md ================================================ # Unsupervised Cross-lingual Representation Learning at Scale (XLM-RoBERTa) https://arxiv.org/pdf/1911.02116.pdf # Larger-Scale Transformers for Multilingual Masked Language Modeling https://arxiv.org/pdf/2105.00572.pdf ## What's New: - June 2021: `XLMR-XL` AND `XLMR-XXL` models released. ## Introduction `XLM-R` (`XLM-RoBERTa`) is a generic cross lingual sentence encoder that obtains state-of-the-art results on many cross-lingual understanding (XLU) benchmarks. It is trained on `2.5T` of filtered CommonCrawl data in 100 languages (list below). 
Language | Language|Language |Language | Language ---|---|---|---|--- Afrikaans | Albanian | Amharic | Arabic | Armenian Assamese | Azerbaijani | Basque | Belarusian | Bengali Bengali Romanize | Bosnian | Breton | Bulgarian | Burmese Burmese zawgyi font | Catalan | Chinese (Simplified) | Chinese (Traditional) | Croatian Czech | Danish | Dutch | English | Esperanto Estonian | Filipino | Finnish | French | Galician Georgian | German | Greek | Gujarati | Hausa Hebrew | Hindi | Hindi Romanize | Hungarian | Icelandic Indonesian | Irish | Italian | Japanese | Javanese Kannada | Kazakh | Khmer | Korean | Kurdish (Kurmanji) Kyrgyz | Lao | Latin | Latvian | Lithuanian Macedonian | Malagasy | Malay | Malayalam | Marathi Mongolian | Nepali | Norwegian | Oriya | Oromo Pashto | Persian | Polish | Portuguese | Punjabi Romanian | Russian | Sanskrit | Scottish Gaelic | Serbian Sindhi | Sinhala | Slovak | Slovenian | Somali Spanish | Sundanese | Swahili | Swedish | Tamil Tamil Romanize | Telugu | Telugu Romanize | Thai | Turkish Ukrainian | Urdu | Urdu Romanize | Uyghur | Uzbek Vietnamese | Welsh | Western Frisian | Xhosa | Yiddish ## Pre-trained models Model | Description | #params | vocab size | Download ---|---|---|---|--- `xlmr.base` | XLM-R using the BERT-base architecture | 250M | 250k | [xlm.base.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr.base.tar.gz) `xlmr.large` | XLM-R using the BERT-large architecture | 560M | 250k | [xlm.large.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz) `xlmr.xl` | XLM-R (`layers=36, model_dim=2560`) | 3.5B | 250k | [xlm.xl.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xl.tar.gz) `xlmr.xxl` | XLM-R (`layers=48, model_dim=4096`) | 10.7B | 250k | [xlm.xxl.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xlmr/xlmr.xxl.tar.gz) ## Results **[XNLI (Conneau et al., 2018)](https://arxiv.org/abs/1809.05053)** Model | average | en | fr | es | de | el | bg | ru | tr | ar | vi | th | zh | hi | sw | 
ur ---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|--- `roberta.large.mnli` _(TRANSLATE-TEST)_ | 77.8 | 91.3 | 82.9 | 84.3 | 81.2 | 81.7 | 83.1 | 78.3 | 76.8 | 76.6 | 74.2 | 74.1 | 77.5 | 70.9 | 66.7 | 66.8 `xlmr.large` _(TRANSLATE-TRAIN-ALL)_ | 83.6 | 89.1 | 85.1 | 86.6 | 85.7 | 85.3 | 85.9 | 83.5 | 83.2 | 83.1 | 83.7 | 81.5 | 83.7 | 81.6 | 78.0 | 78.1 `xlmr.xl` _(TRANSLATE-TRAIN-ALL)_ | 85.4 | 91.1 | 87.2 | 88.1 | 87.0 | 87.4 | 87.8 | 85.3 | 85.2 | 85.3 | 86.2 | 83.8 | 85.3 | 83.1 | 79.8 | 78.2 | 85.4 `xlmr.xxl` _(TRANSLATE-TRAIN-ALL)_ | 86.0 | 91.5 | 87.6 | 88.7 | 87.8 | 87.4 | 88.2 | 85.6 | 85.1 | 85.8 | 86.3 | 83.9 | 85.6 | 84.6 | 81.7 | 80.6 **[MLQA (Lewis et al., 2018)](https://arxiv.org/abs/1910.07475)** Model | average | en | es | de | ar | hi | vi | zh ---|---|---|---|---|---|---|---|--- `BERT-large` | - | 80.2/67.4 | - | - | - | - | - | - `mBERT` | 57.7 / 41.6 | 77.7 / 65.2 | 64.3 / 46.6 | 57.9 / 44.3 | 45.7 / 29.8| 43.8 / 29.7 | 57.1 / 38.6 | 57.5 / 37.3 `xlmr.large` | 70.7 / 52.7 | 80.6 / 67.8 | 74.1 / 56.0 | 68.5 / 53.6 | 63.1 / 43.5 | 69.2 / 51.6 | 71.3 / 50.9 | 68.0 / 45.4 `xlmr.xl` | 73.4 / 55.3 | 85.1 / 72.6 | 66.7 / 46.2 | 70.5 / 55.5 | 74.3 / 56.9 | 72.2 / 54.7 | 74.4 / 52.9 | 70.9 / 48.5 `xlmr.xxl` | 74.8 / 56.6 | 85.5 / 72.4 | 68.6 / 48.4 | 72.7 / 57.8 | 75.4 / 57.6 | 73.7 / 55.8 | 76.0 / 55.0 | 71.7 / 48.9 ## Example usage ##### Load XLM-R from torch.hub (PyTorch >= 1.1): ```python import torch xlmr = torch.hub.load('pytorch/fairseq:main', 'xlmr.large') xlmr.eval() # disable dropout (or leave in train mode to finetune) ``` ##### Load XLM-R (for PyTorch 1.0 or custom models): ```python # Download xlmr.large model wget https://dl.fbaipublicfiles.com/fairseq/models/xlmr.large.tar.gz tar -xzvf xlmr.large.tar.gz # Load the model in fairseq from fairseq.models.roberta import XLMRModel xlmr = XLMRModel.from_pretrained('/path/to/xlmr.large', checkpoint_file='model.pt') xlmr.eval() # disable dropout (or leave in train mode to finetune) 
``` ##### Apply sentence-piece-model (SPM) encoding to input text: ```python en_tokens = xlmr.encode('Hello world!') assert en_tokens.tolist() == [0, 35378, 8999, 38, 2] xlmr.decode(en_tokens) # 'Hello world!' zh_tokens = xlmr.encode('你好,世界') assert zh_tokens.tolist() == [0, 6, 124084, 4, 3221, 2] xlmr.decode(zh_tokens) # '你好,世界' hi_tokens = xlmr.encode('नमस्ते दुनिया') assert hi_tokens.tolist() == [0, 68700, 97883, 29405, 2] xlmr.decode(hi_tokens) # 'नमस्ते दुनिया' ar_tokens = xlmr.encode('مرحبا بالعالم') assert ar_tokens.tolist() == [0, 665, 193478, 258, 1705, 77796, 2] xlmr.decode(ar_tokens) # 'مرحبا بالعالم' fr_tokens = xlmr.encode('Bonjour le monde') assert fr_tokens.tolist() == [0, 84602, 95, 11146, 2] xlmr.decode(fr_tokens) # 'Bonjour le monde' ``` ##### Extract features from XLM-R: ```python # Extract the last layer's features last_layer_features = xlmr.extract_features(zh_tokens) assert last_layer_features.size() == torch.Size([1, 6, 1024]) # Extract all layer's features (layer 0 is the embedding layer) all_layers = xlmr.extract_features(zh_tokens, return_all_hiddens=True) assert len(all_layers) == 25 assert torch.all(all_layers[-1] == last_layer_features) ``` ## Citation ```bibtex @article{conneau2019unsupervised, title={Unsupervised Cross-lingual Representation Learning at Scale}, author={Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'a}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin}, journal={arXiv preprint arXiv:1911.02116}, year={2019} } ``` ```bibtex @article{goyal2021larger, title={Larger-Scale Transformers for Multilingual Masked Language Modeling}, author={Goyal, Naman and Du, Jingfei and Ott, Myle and Anantharaman, Giri and Conneau, Alexis}, journal={arXiv preprint arXiv:2105.00572}, year={2021} } ``` ================================================ FILE: examples/xmod/README.md ================================================ # X-MOD: 
Lifting the Curse of Multilinguality by Pre-training Modular Transformers https://arxiv.org/abs/2205.06266 ## Introduction X-MOD extends multilingual masked language models like XLM-R to include language-specific modular components, introduced at each transformer layer. Each module is only used by one language. For fine-tuning, the modular components are frozen, and replaced with the target language in cross-lingual transfer settings. ## Pre-trained models Model | Size | # train steps | # langs | Download ---|---|---|---|--- `xmod.base.13.125k` | BERT-base | 125k | 13 | [xmod.base.13.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.13.125k.tar.gz) `xmod.base.30.125k` | BERT-base | 125k | 30 | [xmod.base.30.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.125k.tar.gz) `xmod.base.30.195k` | BERT-base | 195k | 30 | [xmod.base.30.195k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.195k.tar.gz) `xmod.base.60.125k` | BERT-base | 125k | 60 | [xmod.base.60.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.125k.tar.gz) `xmod.base.60.265k` | BERT-base | 265k | 60 | [xmod.base.60.265k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.265k.tar.gz) `xmod.base.75.125k` | BERT-base | 125k | 75 | [xmod.base.75.125k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.125k.tar.gz) `xmod.base.75.269k` | BERT-base | 269k | 75 | [xmod.base.75.269k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.269k.tar.gz) `xmod.base` | BERT-base | 1M | 81 | [xmod.base.81.1M.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.81.1M.tar.gz) `xmod.large.prenorm` | BERT-large | 500k | 81 | [xmod.large.prenorm.81.500k.tar.gz](https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.large.prenorm.81.500k.tar.gz) ## Fine-tuning on NLI We next provide an example of how to fine-tune the pre-trained models above on Natural 
Language Inference (NLI). We use MNLI for training in English, and show how to run inference in other languages. ### 1) Download a pre-trained model ```bash MODEL=xmod.base.81.1M wget https://dl.fbaipublicfiles.com/fairseq/models/xmod/$MODEL.tar.gz tar -xzf $MODEL.tar.gz ``` ### 2) Download and preprocess [MNLI](https://cims.nyu.edu/~sbowman/multinli/) ```bash wget https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip unzip multinli_1.0.zip python ./examples/xmod/preprocess_nli.py \ --sentencepiece-model $MODEL/sentencepiece.bpe.model \ --train multinli_1.0/multinli_1.0_train.jsonl \ --valid multinli_1.0/multinli_1.0_dev_matched.jsonl \ --destdir multinli_1.0/fairseq ``` ### 3) Fine-tune on MNLI: ```bash MAX_EPOCH=5 LR=1e-05 BATCH_SIZE=32 DATA_DIR=multinli_1.0/fairseq/bin CUDA_VISIBLE_DEVICES=0 fairseq-train $DATA_DIR \ --restore-file $MODEL/model.pt \ --save-dir $MODEL/nli \ --reset-optimizer \ --reset-dataloader \ --reset-meters \ --best-checkpoint-metric accuracy \ --maximize-best-checkpoint-metric \ --task sentence_prediction_adapters \ --num-classes 3 \ --init-token 0 \ --separator-token 2 \ --max-positions 512 \ --shorten-method "truncate" \ --arch xmod_base \ --dropout 0.1 \ --attention-dropout 0.1 \ --weight-decay 0.01 \ --criterion sentence_prediction_adapters \ --optimizer adam \ --adam-betas '(0.9, 0.98)' \ --adam-eps 1e-06 \ --clip-norm 0.0 \ --lr-scheduler fixed \ --lr $LR \ --fp16 \ --fp16-init-scale 4 \ --threshold-loss-scale 1 \ --fp16-scale-window 128 \ --batch-size $BATCH_SIZE \ --required-batch-size-multiple 1 \ --update-freq 1 \ --max-epoch $MAX_EPOCH ``` ### 4) Run inference After training the model, we can load it and run inference in our target language. The default language is set to English, which is why we were not required to pass a language ID to the model during fine-tuning. 
To run inference in a non-English language, we need to tell the model that the module of the target language should be used instead: ```python from fairseq.models.xmod import XMODModel MODEL='xmod.base.81.1M/nli' DATA='multinli_1.0/fairseq/bin' # Load model model = XMODModel.from_pretrained( model_name_or_path=MODEL, checkpoint_file='checkpoint_best.pt', data_name_or_path=DATA, suffix='', criterion='cross_entropy', bpe='sentencepiece', sentencepiece_model=DATA+'/input0/sentencepiece.bpe.model') model = model.eval(); # disable dropout model = model.half(); # use FP16 model = model.cuda(); # move to GPU def predict(premise, hypothesis, lang): tokens = model.encode(premise, hypothesis) idx = model.predict('sentence_classification_head', tokens, lang_id=[lang]).argmax().item() dictionary = model.task.label_dictionary return dictionary[idx + dictionary.nspecial] predict( premise='X-Mod hat spezifische Module die für jede Sprache existieren.', hypothesis='X-Mod hat Module.', lang='de_DE' ) # entailment predict( premise='Londres es la capital del Reino Unido.', hypothesis='Londres está en Francia.', lang='es_XX', ) # contradiction predict( premise='Patxik gogoko ditu babarrunak.', hypothesis='Patxik babarrunak bazkaldu zituen.', lang='eu_ES', ) # neutral ``` ## Citation ```bibtex @misc{pfeiffer2022xmod, doi = {10.48550/ARXIV.2205.06266}, url = {https://arxiv.org/abs/2205.06266}, title = {Lifting the Curse of Multilinguality by Pre-training Modular Transformers}, publisher = {arXiv}, year = {2022}, } ``` ================================================ FILE: examples/xmod/preprocess_nli.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def preprocess(spm_model_path, train_path, valid_path, test_path, dest_dir, remove_empty=False, output_format='piece', workers=20):
    """Tokenize text splits with SentencePiece and binarize them with fairseq.

    Args:
        spm_model_path: path to the SentencePiece model file.
        train_path: text file for the train split; ``None`` skips the split
            and ``'-'`` reads it from standard input.
        valid_path: same as ``train_path`` for the valid split.
        test_path: same as ``train_path`` for the test split.
        dest_dir: directory handed to ``fairseq_cli.preprocess`` as
            ``--destdir``; the SentencePiece model is copied into it as well.
        remove_empty: drop lines that are empty after stripping.
        output_format: ``'piece'`` writes SentencePiece pieces; any other
            value writes numeric ids.
        workers: number of processes used for tokenization and binarization.

    Raises:
        subprocess.CalledProcessError: if the binarization subprocess exits
            with a non-zero status.
    """
    splits = ('train', train_path), ('valid', valid_path), ('test', test_path)

    with tempfile.TemporaryDirectory() as tmp:
        # Tokenize with SentencePiece
        # The encoder only carries configuration, so one instance serves all splits.
        encoder = MultiprocessingEncoder(model=spm_model_path, remove_empty=remove_empty, output_format=output_format)
        for split, path in splits:
            if path is None:
                continue
            if path == '-':
                path = sys.stdin.fileno()
            with open(path, encoding='utf-8', errors='surrogateescape') as fin:
                with open(f'{tmp}/{split}', mode='w', encoding='utf-8', errors='surrogateescape') as fout:
                    # Use the pool as a context manager so its worker processes are
                    # torn down after each split instead of being leaked.
                    with Pool(workers, initializer=encoder.initializer) as pool:
                        encoded_lines = pool.imap(encoder.encode, fin, 10000)
                        for i, line in enumerate(encoded_lines, start=1):
                            if line is not None:
                                print(line, file=fout)
                            if i % 10000 == 0:
                                print("tokenized {} lines".format(i), file=sys.stderr)

        # Generate dictionary
        sp = spm.SentencePieceProcessor(model_file=spm_model_path)
        if output_format == 'piece':
            # The first three SentencePiece ids are left out of the dictionary.
            vocab = [sp.id_to_piece(i) for i in range(3, sp.vocab_size())]
        else:
            vocab = map(str, range(sp.vocab_size()))
        with open(f'{tmp}/dict.txt', mode='w', encoding='utf-8', errors='surrogateescape') as f:
            for word in vocab:
                print(word, 1, file=f)

        # Binarize
        command = [
            'python3', '-m', 'fairseq_cli.preprocess',
            '--only-source',
            '--thresholdsrc', '0',
            '--destdir', dest_dir,
            '--srcdict', f'{tmp}/dict.txt',
            # Honor the caller's worker count rather than a hard-coded 20.
            '--workers', str(workers),
        ]
        for split, path in splits:
            if path is not None:
                command += [f'--{split}pref', f'{tmp}/{split}']
        # check=True: fail loudly instead of continuing after a broken binarization.
        subprocess.run(command, check=True)

        # Copy SentencePiece model
        shutil.copyfile(spm_model_path, f'{dest_dir}/sentencepiece.bpe.model')
def write_lines(lines, path):
    """Write every item of *lines* to *path* as UTF-8, one item per line.

    The file is opened in exclusive-creation mode, so an existing file is
    never overwritten.

    Raises:
        FileExistsError: if *path* already exists.
    """
    with open(path, mode='x', encoding='utf-8') as f:
        for line in lines:
            print(line, file=f)


def read_jsonl(path):
    """Parse a UTF-8 JSON Lines file and return its records as a list.

    Records are split on real line endings only. ``str.splitlines`` would
    also break on characters such as U+2028, U+2029 and U+0085, which may
    appear unescaped inside JSON strings and would cut a valid record in two.
    Whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


def read_nli(path, langs=None):
    """Load NLI samples from a JSON Lines file and check language balance.

    Args:
        path: JSON Lines file with one sample per line.
        langs: optional collection of language codes; when given, only
            samples whose ``'language'`` field is in it are kept.

    Returns:
        The (possibly filtered) list of sample dicts.

    Raises:
        AssertionError: if a requested language is missing, no sample is
            left, or the languages do not all have the same sample count.
    """
    data = read_jsonl(path)
    if langs is not None:
        data = [sample for sample in data if sample.get('language') in langs]

    # Samples without a 'language' field are counted under the key None.
    lang2count = collections.Counter(sample.get('language') for sample in data)

    if langs:
        assert set(lang2count) == set(langs), 'some requested languages have no samples'
    nlangs = len(lang2count)
    assert nlangs > 0, 'no samples loaded'
    lens = list(lang2count.values())
    assert all(lens[0] == length for length in lens), 'languages have different sample counts'
    print(f'Loaded {lens[0]} samples in {nlangs} languages from {path}', file=sys.stderr)
    return data
original_size: print(f'Filtered {filtered_size}/{original_size} samples from {path}', file=sys.stderr) for name, field in ('input0', 'sentence1'), ('input1', 'sentence2'), ('label', 'gold_label'): write_lines([sample[field] for sample in data], f'{args.destdir}/raw/{split}.{name}.txt') # Tokenize and binarize input for field in 'input0', 'input1': preprocess( spm_model_path=args.sentencepiece_model, train_path=f'{args.destdir}/raw/train.{field}.txt', valid_path=f'{args.destdir}/raw/valid.{field}.txt', test_path=None, dest_dir=f'{args.destdir}/bin/{field}', workers=20, ) # Binarize labels subprocess.run([ 'python3', '-m', 'fairseq_cli.preprocess', '--trainpref', f'{args.destdir}/raw/train.label.txt', '--validpref', f'{args.destdir}/raw/valid.label.txt', '--only-source', '--thresholdsrc', '0', '--destdir', f'{args.destdir}/bin/label', '--workers', '20', ]) if __name__ == '__main__': main() ================================================ FILE: fairseq/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
"""isort:skip_file"""

import os
import sys

# Use the version module when it can be imported; otherwise fall back to the
# version.txt file that sits next to this package's __init__.py.
try:
    from .version import __version__  # noqa
except ImportError:
    version_txt = os.path.join(os.path.dirname(__file__), "version.txt")
    with open(version_txt) as f:
        __version__ = f.read().strip()

__all__ = ["pdb"]

# backwards compatibility to support `from fairseq.X import Y`
from fairseq.distributed import utils as distributed_utils
from fairseq.logging import meters, metrics, progress_bar  # noqa

# Register the relocated modules under their old dotted names so imports
# written against those names resolve to the same module objects.
sys.modules["fairseq.distributed_utils"] = distributed_utils
sys.modules["fairseq.meters"] = meters
sys.modules["fairseq.metrics"] = metrics
sys.modules["fairseq.progress_bar"] = progress_bar

# initialize hydra
from fairseq.dataclass.initialize import hydra_init

hydra_init()

# NOTE(review): the imports below are unused by name, so they are presumably
# imported for their side effects (registries); keep them after hydra_init()
# and in this order unless confirmed otherwise.
import fairseq.criterions  # noqa
import fairseq.distributed  # noqa
import fairseq.models  # noqa
import fairseq.modules  # noqa
import fairseq.optim  # noqa
import fairseq.optim.lr_scheduler  # noqa
import fairseq.pdb  # noqa
import fairseq.scoring  # noqa
import fairseq.tasks  # noqa
import fairseq.token_generation_constraints  # noqa

import fairseq.benchmark  # noqa
import fairseq.model_parallel  # noqa
import itertools import random import torch from torch.utils import benchmark from fairseq.modules.multihead_attention import MultiheadAttention BATCH = [20, 41, 97] SEQ = 64 EMB = 48 HEADS = 4 DROP = 0.1 DEVICE = torch.device("cuda") ATTN_MASK_DTYPE = [torch.uint8, torch.bool, torch.float] KEY_PADDING_MASK_DTYPE = [torch.uint8, torch.bool] def _reset_seeds(): torch.manual_seed(0) random.seed(0) def _get_mask(to_dtype: torch.dtype, dim0: int, dim1: int): if to_dtype == torch.float: mask = torch.randint(0, 2, (dim0, dim1)).to(dtype=torch.bool) return mask.to(dtype=to_dtype).masked_fill(mask, -float("inf")) return torch.randint(0, 2, (dim0, dim1)).to(dtype=to_dtype) def benchmark_multihead_attention( label="", attn_dtype=torch.uint8, key_padding_dtype=torch.uint8, add_bias_kv=False, add_zero_attn=False, static_kv=False, batch_size=20, embedding=EMB, seq_len=SEQ, num_heads=HEADS, ): results = [] # device = torch.device("cuda") xformers_att_config = '{"name": "scaled_dot_product"}' attn_mask = _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len) key_padding_mask = _get_mask( to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len ) q = torch.rand(seq_len, batch_size, embedding, requires_grad=True) k = torch.rand(seq_len, batch_size, embedding, requires_grad=True) v = torch.rand(seq_len, batch_size, embedding, requires_grad=True) _reset_seeds() original_mha = MultiheadAttention( embedding, num_heads, dropout=0.0, xformers_att_config=None, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) xformers_mha = MultiheadAttention( embedding, num_heads, dropout=0.0, xformers_att_config=xformers_att_config, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) def original_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv): original_mha( query=q, key=k, value=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask, static_kv=static_kv, ) def xformers_bench_fw(q, k, v, key_padding_mask, attn_mask, static_kv): xformers_mha( query=q, key=k, value=v, 
def run_benchmarks():
    """Benchmark every combination of mask dtypes and bias/zero-attn flags.

    Iterates over the product of ``ATTN_MASK_DTYPE``,
    ``KEY_PADDING_MASK_DTYPE`` and both boolean values of ``add_bias_kv`` and
    ``add_zero_attn``, calling ``benchmark_multihead_attention`` once per
    combination with a descriptive label.
    """
    for attn_dtype, key_padding_dtype, add_bias_kv, add_zero_attn in itertools.product(
        ATTN_MASK_DTYPE, KEY_PADDING_MASK_DTYPE, [True, False], [True, False]
    ):
        # Adjacent f-strings instead of a backslash continuation inside the
        # literal: the continuation made the next line's leading whitespace
        # part of the label text.
        label = (
            f"attn_dtype {attn_dtype}, key_padding_dtype {key_padding_dtype}, "
            f"add_bias_kv {add_bias_kv}, add_zero_attn {add_zero_attn}"
        )
        benchmark_multihead_attention(
            label=label,
            attn_dtype=attn_dtype,
            key_padding_dtype=key_padding_dtype,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )


# Run only when executed as a script, so importing this module does not
# start the whole benchmark sweep.
if __name__ == "__main__":
    run_benchmarks()
@register_task("dummy_lm", dataclass=DummyLMConfig)
class DummyLMTask(FairseqTask):
    """Language-modeling task that serves one fixed synthetic batch.

    The dictionary holds ``cfg.dict_size`` placeholder words and the source
    and target sequences are consecutive token ids shifted by one position.
    """

    def __init__(self, cfg: DummyLMConfig):
        super().__init__(cfg)

        # load dictionary
        vocab = self.dictionary = Dictionary()
        for word_id in range(cfg.dict_size):
            vocab.add_symbol(f"word{word_id}")
        vocab.pad_to_multiple_(8)  # often faster if divisible by 8
        logger.info(f"dictionary: {len(vocab)} types")

        # Consecutive ids starting right after the pad index; one extra token
        # so the target can be the source shifted left by one.
        first_id = vocab.pad() + 1
        tokens = torch.arange(cfg.tokens_per_sample + 1) + first_id

        self.dummy_src = tokens[:-1]
        self.dummy_tgt = tokens[1:]

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.
        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        cfg = self.cfg
        sample_len = cfg.tokens_per_sample

        # An explicit batch size wins; otherwise derive it from the token budget.
        bsz = cfg.batch_size
        if bsz is None:
            bsz = max(1, cfg.max_tokens // sample_len)

        net_input = {
            "src_tokens": torch.stack([self.dummy_src] * bsz),
            "src_lengths": torch.full((bsz,), sample_len, dtype=torch.long),
        }
        batch = {
            "id": 1,
            "net_input": net_input,
            "target": torch.stack([self.dummy_tgt] * bsz),
            "nsentences": bsz,
            "ntokens": bsz * sample_len,
        }
        self.datasets[split] = DummyDataset(
            batch,
            num_items=cfg.dataset_size,
            item_size=sample_len,
        )

    @property
    def source_dictionary(self):
        """Dictionary shared by source and target."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Dictionary shared by source and target."""
        return self.dictionary
@register_task("dummy_masked_lm", dataclass=DummyMaskedLMConfig)
class DummyMaskedLMTask(FairseqTask):
    """Benchmark masked-LM task that always serves one fixed batch.

    The source sequence has every 7th position (starting at index 2) replaced
    by a mask id; the target holds the original id at those positions and the
    pad id everywhere else.
    """

    def __init__(self, cfg: DummyMaskedLMConfig):
        super().__init__(cfg)

        vocab = self.dictionary = Dictionary()
        for word_id in range(cfg.dict_size):
            vocab.add_symbol("word{}".format(word_id))
        logger.info("dictionary: {} types".format(len(vocab)))
        # add mask token
        self.mask_idx = vocab.add_symbol("<mask>")
        vocab.pad_to_multiple_(8)  # often faster if divisible by 8

        # NOTE(review): the dummy batch uses these hard-coded ids, not
        # ``self.mask_idx`` / ``vocab.pad()`` -- confirm this is intentional.
        mask_idx = 0
        pad_idx = 1

        seq = torch.arange(cfg.tokens_per_sample) + pad_idx + 1
        masked_positions = torch.arange(2, cfg.tokens_per_sample, 7)  # ~15%

        tgt = torch.full_like(seq, pad_idx)
        tgt[masked_positions] = seq[masked_positions]

        src = seq.clone()
        src[masked_positions] = mask_idx

        self.dummy_src = src
        self.dummy_tgt = tgt

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Register a ``DummyDataset`` for *split* in ``self.datasets``.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        cfg = self.cfg
        seq_len = cfg.tokens_per_sample

        # An explicit batch size wins; otherwise derive it from the token budget.
        bsz = cfg.batch_size
        if bsz is None:
            bsz = max(1, cfg.max_tokens // seq_len)

        src_tokens = torch.stack([self.dummy_src] * bsz)
        src_lengths = torch.full((bsz,), seq_len, dtype=torch.long)
        target = torch.stack([self.dummy_tgt] * bsz)

        batch = {
            "id": 1,
            "net_input": {
                "src_tokens": src_tokens,
                "src_lengths": src_lengths,
            },
            "target": target,
            "nsentences": bsz,
            "ntokens": bsz * seq_len,
        }
        self.datasets[split] = DummyDataset(
            batch,
            num_items=cfg.dataset_size,
            item_size=seq_len,
        )

    @property
    def source_dictionary(self):
        """The synthetic dictionary (shared by source and target)."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """The synthetic dictionary (shared by source and target)."""
        return self.dictionary
class DummyEncoder(FairseqDecoder):
    """Stack of linear layers sized like a transformer, without attention.

    Each of the ``num_layers`` steps applies two residual sub-blocks: one with
    the projection shapes of self-attention (the attention itself is skipped)
    and one with the shapes of a feed-forward network.
    """

    def __init__(self, num_embed=50000, embed_dim=1024, num_layers=24):
        super().__init__(Dictionary())

        def attention_shaped_block():
            # Same projection sizes as self-attention, minus the attention op.
            return nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, 3 * embed_dim),  # q, k, v input projection
                nn.Linear(3 * embed_dim, embed_dim),  # skip self-attention
                nn.Linear(embed_dim, embed_dim),  # output projection
                nn.Dropout(),
            )

        def feed_forward_block():
            return nn.Sequential(
                nn.LayerNorm(embed_dim),
                nn.Linear(embed_dim, 4 * embed_dim),  # FFN
                nn.ReLU(),
                nn.Linear(4 * embed_dim, embed_dim),  # FFN
                nn.Dropout(0.1),
            )

        # Modules are created in the same order as before (embedding, all
        # "a" blocks, all "b" blocks, output projection) so that parameter
        # initialisation consumes the RNG identically.
        self.embed = nn.Embedding(
            num_embeddings=num_embed, embedding_dim=embed_dim, padding_idx=0
        )
        self.layers_a = nn.ModuleList(
            [attention_shaped_block() for _ in range(num_layers)]
        )
        self.layers_b = nn.ModuleList(
            [feed_forward_block() for _ in range(num_layers)]
        )
        self.out_proj = nn.Linear(embed_dim, num_embed)

    def forward(self, tokens, masked_tokens=None):
        """Return a 1-tuple with the output projection of the layer stack.

        When *masked_tokens* is given, the projected tensor is indexed with it
        before being returned.
        """
        hidden = self.embed(tokens)
        for block_a, block_b in zip(self.layers_a, self.layers_b):
            hidden = hidden + block_a(hidden)
            hidden = hidden + block_b(hidden)
        logits = self.out_proj(hidden)
        if masked_tokens is None:
            return (logits,)
        return (logits[masked_tokens],)

    def max_positions(self):
        """Fixed maximum number of positions reported by this dummy module."""
        return 1024

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        """Apply (log-)softmax over the last dim of ``net_output[0]`` as float."""
        logits = net_output[0].float()
        normalize = F.log_softmax if log_probs else F.softmax
        return normalize(logits, dim=-1)
class DummyDataset(FairseqDataset):
    """Dataset of ``num_items`` placeholder items that all collate to one batch.

    Items are just their own index; ``collater`` ignores the samples it is
    given and returns the pre-built ``batch``.
    """

    def __init__(self, batch, num_items, item_size):
        super().__init__()
        self.batch = batch
        self.num_items = num_items
        self.item_size = item_size

    def __getitem__(self, index):
        # The item carries no data of its own.
        return index

    def __len__(self):
        return self.num_items

    def collater(self, samples):
        # Always the same pre-built batch, whatever *samples* contains.
        return self.batch

    @property
    def sizes(self):
        """Array holding ``item_size`` once per item."""
        per_item = [self.item_size] * self.num_items
        return np.array(per_item)

    def num_tokens(self, index):
        """Every item reports the same token count."""
        return self.item_size

    def size(self, index):
        """Every item reports the same size."""
        return self.item_size

    def ordered_indices(self):
        """Indices in natural order, ``0 .. num_items - 1``."""
        return np.arange(self.num_items)

    @property
    def supports_prefetch(self):
        return False
@dataclass
class BinarizeSummary:
    """
    Keep track of what's going on in the binarizer
    """

    # Number of sequences (lines) binarized so far.
    num_seq: int = 0
    # Per-word replacement counts; ``None`` means replacements are not tracked.
    replaced: tp.Optional[Counter] = None
    # Total number of tokens produced so far.
    num_tok: int = 0

    @property
    def num_replaced(self) -> int:
        """Total number of replaced tokens (0 when nothing is tracked)."""
        if self.replaced is None:
            return 0
        return sum(self.replaced.values())

    @property
    def replaced_percent(self) -> float:
        """Replaced tokens as a percentage of all tokens.

        Returns 0.0 for an empty summary instead of dividing by zero, so that
        ``str()`` on a summary of an empty chunk/file does not raise.
        """
        if self.num_tok == 0:
            return 0.0
        return 100 * self.num_replaced / self.num_tok

    def __str__(self) -> str:
        base = f"{self.num_seq} sents, {self.num_tok} tokens"
        if self.replaced is None:
            return base

        return f"{base}, {self.replaced_percent:.3}% replaced"

    def merge(self, other: "BinarizeSummary"):
        """Accumulate *other* into this summary in place.

        *other* is never modified: when this summary has no counter yet, it
        takes a copy of ``other.replaced`` rather than sharing the object, so
        later merges cannot leak counts back into *other*.
        """
        if other.replaced is not None:
            if self.replaced is None:
                self.replaced = other.replaced.copy()
            else:
                self.replaced += other.replaced
        self.num_seq += other.num_seq
        self.num_tok += other.num_tok
class FileBinarizer:
    """
    A file binarizer can take a file, tokenize it, and binarize each line to a tensor
    """

    @classmethod
    def multiprocess_dataset(
        cls,
        input_file: str,
        dataset_impl: str,
        binarizer: Binarizer,
        output_prefix: str,
        vocab_size=None,
        num_workers=1,
    ) -> BinarizeSummary:
        """Binarize *input_file* into one indexed dataset at *output_prefix*.

        The file is cut into ``num_workers`` chunks. Chunks after the first
        are binarized by a process pool into per-worker files; the first chunk
        is binarized in this process, the worker files are merged into it and
        removed, and the result is finalized. Returns the merged summary.
        """
        final_summary = BinarizeSummary()

        # find_offsets yields positions [p1, p2, p3, ...]; pairing the list
        # with itself shifted by one gives the (start, end) of every chunk.
        offsets = find_offsets(input_file, num_workers)
        first_chunk, *more_chunks = zip(offsets, offsets[1:])

        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            pending = []
            for worker_id, (start_offset, end_offset) in enumerate(
                more_chunks, start=1
            ):
                worker_args = (
                    binarizer,
                    input_file,
                    start_offset,
                    end_offset,
                    _worker_prefix(output_prefix, worker_id),
                    dataset_impl,
                )
                worker_kwargs = (
                    {"vocab_size": vocab_size} if vocab_size is not None else {}
                )
                pending.append(
                    pool.apply_async(
                        cls._binarize_chunk_and_finalize,
                        args=worker_args,
                        kwds=worker_kwargs,
                    )
                )

            pool.close()
            pool.join()
            for result in pending:
                final_summary.merge(result.get())

        # The builder for the first chunk stays open: worker outputs still
        # have to be merged into it.
        start, stop = first_chunk[0], first_chunk[1]
        final_ds, first_summary = cls._binarize_file_chunk(
            binarizer,
            input_file,
            offset_start=start,
            offset_end=stop,
            output_prefix=output_prefix,
            dataset_impl=dataset_impl,
            vocab_size=vocab_size,
        )
        final_summary.merge(first_summary)

        # Empty range when num_workers <= 1, so nothing is merged or removed.
        for worker_id in range(1, num_workers):
            worker_output_prefix = _worker_prefix(output_prefix, worker_id)
            final_ds.merge_file_(worker_output_prefix)
            try:
                os.remove(indexed_dataset.data_file_path(worker_output_prefix))
                os.remove(indexed_dataset.index_file_path(worker_output_prefix))
            except Exception as e:
                # Best-effort cleanup: leftovers are reported, not fatal.
                logger.error(
                    f"couldn't remove {worker_output_prefix}.*", exc_info=e
                )

        # Everything is merged; the dataset can be closed now.
        final_ds.finalize(indexed_dataset.index_file_path(output_prefix))
        return final_summary

    @staticmethod
    def _binarize_file_chunk(
        binarizer: Binarizer,
        filename: str,
        offset_start: int,
        offset_end: int,
        output_prefix: str,
        dataset_impl: str,
        vocab_size=None,
    ) -> tp.Tuple[tp.Any, BinarizeSummary]:  # (dataset builder, BinarizeSummary)
        """
        Create a dataset builder and append the binarized lines of one chunk.

        The builder is returned un-finalized so the caller can keep appending
        or merging other files before closing it.
        """
        builder = indexed_dataset.make_builder(
            indexed_dataset.data_file_path(output_prefix),
            impl=dataset_impl,
            vocab_size=vocab_size,
        )
        summary = BinarizeSummary()

        local_file = PathManager.get_local_path(filename)
        with Chunker(local_file, offset_start, offset_end) as line_iterator:
            for line in line_iterator:
                builder.add_item(binarizer.binarize_line(line, summary))

        return builder, summary

    @classmethod
    def _binarize_chunk_and_finalize(
        cls,
        binarizer: Binarizer,
        filename: str,
        offset_start: int,
        offset_end: int,
        output_prefix: str,
        dataset_impl: str,
        vocab_size=None,
    ):
        """
        Binarize one chunk like ``_binarize_file_chunk`` and finalize the
        builder; only the summary is returned.
        """
        builder, summary = cls._binarize_file_chunk(
            binarizer,
            filename,
            offset_start,
            offset_end,
            output_prefix,
            dataset_impl,
            vocab_size=vocab_size,
        )
        builder.finalize(indexed_dataset.index_file_path(output_prefix))
        return summary
class LegacyBinarizer:
    """Callback-style entry points that return binarization stats as a dict.

    Each binarized line is handed to a caller-supplied ``consumer``.
    """

    @classmethod
    def binarize(
        cls,
        filename: str,
        dico: Dictionary,
        consumer: tp.Callable[[torch.IntTensor], None],
        tokenize: tp.Callable[[str], tp.List[str]] = tokenize_line,
        append_eos: bool = True,
        reverse_order: bool = False,
        offset: int = 0,
        end: int = -1,
        already_numberized: bool = False,
    ) -> tp.Dict[str, int]:
        """Binarize the ``[offset, end)`` chunk of *filename* with *dico*."""
        vocab_binarizer = VocabularyDatasetBinarizer(
            dict=dico,
            tokenize=tokenize,
            append_eos=append_eos,
            reverse_order=reverse_order,
            already_numberized=already_numberized,
        )
        return cls._consume_file(
            filename,
            vocab_binarizer,
            consumer,
            offset_start=offset,
            offset_end=end,
        )

    @classmethod
    def binarize_alignments(
        cls,
        filename: str,
        alignment_parser: tp.Callable[[str], torch.IntTensor],
        consumer: tp.Callable[[torch.IntTensor], None],
        offset: int = 0,
        end: int = -1,
    ) -> tp.Dict[str, int]:
        """Parse the ``[offset, end)`` chunk of *filename* as alignments."""
        alignment_binarizer = AlignmentDatasetBinarizer(alignment_parser)
        return cls._consume_file(
            filename,
            alignment_binarizer,
            consumer,
            offset_start=offset,
            offset_end=end,
        )

    @staticmethod
    def _consume_file(
        filename: str,
        binarizer: Binarizer,
        consumer: tp.Callable[[torch.IntTensor], None],
        offset_start: int,
        offset_end: int,
    ) -> tp.Dict[str, int]:
        """Feed every binarized line of the chunk to *consumer*; return stats."""
        stats = BinarizeSummary()

        local_file = PathManager.get_local_path(filename)
        with Chunker(local_file, offset_start, offset_end) as line_iterator:
            for line in line_iterator:
                consumer(binarizer.binarize_line(line, stats))

        return dict(
            nseq=stats.num_seq,
            nunk=stats.num_replaced,
            ntok=stats.num_tok,
            replaced=stats.replaced,
        )
def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
    """Write the checkpoint files due at this point and prune old ones.

    The best validation score seen so far is kept as a function attribute
    (``save_checkpoint.best``) and is updated even when saving is disabled.

    Args:
        cfg: checkpoint configuration (save dir, intervals, retention, ...).
        trainer: object providing rank info, ``save_checkpoint``, the update
            count and the checkpoint suffix.
        epoch_itr: training iterator; its state is stored in the checkpoint.
        val_loss: validation score for this point, or ``None`` if there is
            none.

    Returns:
        Whatever ``trainer.save_checkpoint`` returned for the first file
        written, or ``None`` when nothing was written (``cfg.no_save``, this
        rank does not save, or no checkpoint condition held).
    """
    from fairseq import meters

    # only one worker should attempt to create the required dir
    if trainer.data_parallel_rank == 0:
        os.makedirs(cfg.save_dir, exist_ok=True)

    # Track the running best score; on the first call ``prev_best`` falls
    # back to ``val_loss`` itself.
    prev_best = getattr(save_checkpoint, "best", val_loss)
    if val_loss is not None:
        best_function = max if cfg.maximize_best_checkpoint_metric else min
        save_checkpoint.best = best_function(val_loss, prev_best)

    if cfg.no_save:
        return None

    trainer.consolidate_optimizer()  # TODO(SS): do we need this if no_save_optimizer_state

    if not trainer.should_save_checkpoint_on_current_rank:
        if trainer.always_call_state_dict_during_save_checkpoint:
            trainer.state_dict()
        return None

    write_timer = meters.StopwatchMeter()
    write_timer.start()

    epoch = epoch_itr.epoch
    end_of_epoch = epoch_itr.end_of_epoch()
    updates = trainer.get_num_updates()

    logger.info(f"Preparing to save checkpoint for epoch {epoch} @ {updates} updates")

    def is_better(a, b):
        # Ties count as "better" in both directions.
        return a >= b if cfg.maximize_best_checkpoint_metric else a <= b

    suffix = trainer.checkpoint_suffix
    # Ordered map: candidate file name -> whether it should be written now.
    # The first name whose condition holds is the one actually saved; the
    # others are copies of it.
    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
        end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
    )
    checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
        not end_of_epoch
        and cfg.save_interval_updates > 0
        and updates % cfg.save_interval_updates == 0
    )
    checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
        not hasattr(save_checkpoint, "best")
        or is_better(val_loss, save_checkpoint.best)
    )
    if val_loss is not None and cfg.keep_best_checkpoints > 0:
        # Compare against the worst of the currently kept "best" files, read
        # back from its file name; fall back to the running best.
        worst_best = getattr(save_checkpoint, "best", None)
        chkpts = checkpoint_paths(
            cfg.save_dir,
            pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
                cfg.best_checkpoint_metric, suffix
            ),
        )
        if len(chkpts) > 0:
            p = chkpts[-1] if cfg.maximize_best_checkpoint_metric else chkpts[0]
            worst_best = float(p.rsplit("_")[-1].replace("{}.pt".format(suffix), ""))
        # add random digits to resolve ties
        with data_utils.numpy_seed(epoch, updates, val_loss):
            rand_sfx = np.random.randint(0, cfg.keep_best_checkpoints)

        checkpoint_conds[
            "checkpoint.best_{}_{:.3f}{}{}.pt".format(
                cfg.best_checkpoint_metric, val_loss, rand_sfx, suffix
            )
        ] = worst_best is None or is_better(val_loss, worst_best)
    checkpoint_conds[
        "checkpoint_last{}.pt".format(suffix)
    ] = not cfg.no_last_checkpoints

    extra_state = {
        "train_iterator": epoch_itr.state_dict(),
        "val_loss": val_loss,
    }

    # Going forward, different tasks could expose an API like this to dump all
    # the checkpoint worthy attributes in a dictionary which then will be
    # merged with the parent dictionary to create the "extra_state". This
    # allows for an extensible yet simple design to checkpoint task level
    # attributes
    if hasattr(trainer.task, "get_checkpoint_dict"):
        extra_state = {**extra_state, **trainer.task.get_checkpoint_dict()}
        logger.info(f"State of {trainer.task.__class__.__name__} is ready to be persisted with the checkpoint")

    if hasattr(save_checkpoint, "best"):
        extra_state.update({"best": save_checkpoint.best})

    checkpoints = [
        os.path.join(cfg.save_dir, fn) for fn, cond in checkpoint_conds.items() if cond
    ]
    saved_cp = None
    if len(checkpoints) > 0 and trainer.should_save_checkpoint_on_current_rank:
        # Serialize once, then copy that file to every other due name.
        saved_cp = trainer.save_checkpoint(checkpoints[0], extra_state)
        for cp in checkpoints[1:]:
            if cfg.write_checkpoints_asynchronously:
                # TODO[ioPath]: Need to implement a delayed asynchronous
                # file copying/moving feature.
                logger.warning(
                    f"ioPath is not copying {checkpoints[0]} to {cp} "
                    "since async write mode is on."
                )
            else:
                assert PathManager.copy(
                    checkpoints[0], cp, overwrite=True
                ), f"Failed to copy {checkpoints[0]} to {cp}"

        write_timer.stop()
        logger.info(
            "Saved checkpoint {} (epoch {} @ {} updates, score {}) (writing took {} seconds)".format(
                checkpoints[0], epoch, updates, val_loss, write_timer.sum
            )
        )

    if (
        not end_of_epoch
        and cfg.keep_interval_updates > 0
        and trainer.should_save_checkpoint_on_current_rank
    ):
        # remove old checkpoints; checkpoints are sorted in descending order
        if cfg.keep_interval_updates_pattern == -1:
            checkpoints = checkpoint_paths(
                cfg.save_dir, pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix)
            )
        else:
            # Files whose update count is a multiple of the pattern are
            # excluded from the removal candidates.
            checkpoints = checkpoint_paths(
                cfg.save_dir,
                pattern=r"checkpoint_\d+_(\d+){}\.pt".format(suffix),
                keep_match=True,
            )
            checkpoints = [
                x[0]
                for x in checkpoints
                if x[1] % cfg.keep_interval_updates_pattern != 0
            ]

        for old_chk in checkpoints[cfg.keep_interval_updates :]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)
            elif PathManager.exists(old_chk):
                PathManager.rm(old_chk)

    if cfg.keep_last_epochs > 0 and trainer.should_save_checkpoint_on_current_rank:
        # remove old epoch checkpoints; checkpoints are sorted in descending order
        checkpoints = checkpoint_paths(
            cfg.save_dir, pattern=r"checkpoint(\d+){}\.pt".format(suffix)
        )
        for old_chk in checkpoints[cfg.keep_last_epochs :]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)
            elif PathManager.exists(old_chk):
                PathManager.rm(old_chk)

    if cfg.keep_best_checkpoints > 0 and trainer.should_save_checkpoint_on_current_rank:
        # only keep the best N checkpoints according to validation metric
        checkpoints = checkpoint_paths(
            cfg.save_dir,
            pattern=r"checkpoint\.best_{}_(\d+\.?\d*){}\.pt".format(
                cfg.best_checkpoint_metric, suffix
            ),
        )
        # checkpoint_paths sorts descending by score; when lower is better
        # the list is reversed so the best files come first.
        if not cfg.maximize_best_checkpoint_metric:
            checkpoints = checkpoints[::-1]
        for old_chk in checkpoints[cfg.keep_best_checkpoints :]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)
            elif PathManager.exists(old_chk):
                PathManager.rm(old_chk)

    return saved_cp
""" reset_optimizer = cfg.reset_optimizer reset_lr_scheduler = cfg.reset_lr_scheduler optimizer_overrides = ast.literal_eval(cfg.optimizer_overrides) reset_meters = cfg.reset_meters reset_dataloader = cfg.reset_dataloader if cfg.finetune_from_model is not None and ( reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader ): raise ValueError( "--finetune-from-model can not be set together with either --reset-optimizer" " or reset_lr_scheduler or reset_meters or reset_dataloader" ) suffix = trainer.checkpoint_suffix if ( cfg.restore_file == "checkpoint_last.pt" ): # default value of restore_file is 'checkpoint_last.pt' checkpoint_path = os.path.join( cfg.save_dir, "checkpoint_last{}.pt".format(suffix) ) first_launch = not PathManager.exists(checkpoint_path) if first_launch and getattr(cfg, "continue_once", None) is not None: checkpoint_path = cfg.continue_once elif cfg.finetune_from_model is not None and first_launch: # if there is no last checkpoint to restore, start the finetune from pretrained model # else just use usual logic to load checkpoint, e.g. restart from last checkpoint and etc. 
def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):
    """Loads a checkpoint to CPU (with upgrading for backward compatibility).

    If doing single-GPU training or if the checkpoint is only being loaded by at
    most one process on each node (current default behavior is for only rank 0
    to read the checkpoint from disk), load_on_all_ranks should be False to
    avoid errors from torch.distributed not having been initialized or
    torch.distributed.barrier() hanging.

    If all processes on each node may be loading the checkpoint
    simultaneously, load_on_all_ranks should be set to True to avoid I/O
    conflicts.

    There's currently no support for > 1 but < all processes loading the
    checkpoint on each node.

    Args:
        path: checkpoint location; resolved to a local file via PathManager.
        arg_overrides (dict, optional): values written over the stored
            ``args`` namespace and/or ``cfg`` before the state is returned.
        load_on_all_ranks (bool): whether to synchronize processes before
            re-fetching a remote checkpoint.

    Returns:
        The checkpoint state after ``_upgrade_state_dict``.
    """
    local_path = PathManager.get_local_path(path)
    # The locally cached file returned by get_local_path() may be stale for
    # remote files that are periodically updated/overwritten (ex:
    # checkpoint_last.pt) - so we remove the local copy, sync across processes
    # (if needed), and then download a fresh copy.
    if local_path != path and PathManager.path_requires_pathmanager(path):
        try:
            os.remove(local_path)
        except FileNotFoundError:
            # With potentially multiple processes removing the same file, the
            # file being missing is benign (missing_ok isn't available until
            # Python 3.8).
            pass
        if load_on_all_ranks:
            torch.distributed.barrier()
        local_path = PathManager.get_local_path(path)

    # SECURITY NOTE: weights_only=False unpickles arbitrary Python objects and
    # can execute code embedded in the file. Only load checkpoints from
    # trusted sources.
    with open(local_path, "rb") as f:
        state = torch.load(f, map_location=torch.device("cpu"), weights_only=False)

    if "args" in state and state["args"] is not None and arg_overrides is not None:
        args = state["args"]
        for arg_name, arg_val in arg_overrides.items():
            setattr(args, arg_name, arg_val)

    if "cfg" in state and state["cfg"] is not None:

        # hack to be able to set Namespace in dict config. this should be removed when we update to newer
        # omegaconf version that supports object flags, or when we migrate all existing models
        from omegaconf import __version__ as oc_version
        from omegaconf import _utils

        # Compare (major, minor) numerically: a plain string comparison would
        # order e.g. "2.10" before "2.2". Fall back to the string comparison
        # only if the version does not start with "<int>.<int>".
        version_match = re.match(r"(\d+)\.(\d+)", oc_version)
        if version_match is not None:
            major, minor = (int(part) for part in version_match.groups())
            legacy_omegaconf = (major, minor) < (2, 2)
        else:
            legacy_omegaconf = oc_version < "2.2"

        if legacy_omegaconf:
            old_primitive = _utils.is_primitive_type
            _utils.is_primitive_type = lambda _: True
            try:
                state["cfg"] = OmegaConf.create(state["cfg"])
            finally:
                # Always undo the monkeypatch, even if create() raises, so
                # omegaconf is not left globally altered.
                _utils.is_primitive_type = old_primitive
            OmegaConf.set_struct(state["cfg"], True)
        else:
            state["cfg"] = OmegaConf.create(state["cfg"], flags={"allow_objects": True})

        if arg_overrides is not None:
            overwrite_args_by_name(state["cfg"], arg_overrides)

    state = _upgrade_state_dict(state)
    return state
not ( strict and num_shards > 1 ), "Cannot load state dict with strict=True and checkpoint shards > 1" ensemble = [] cfg = None for filename in filenames: orig_filename = filename model_shard_state = {"shard_weights": [], "shard_metadata": []} assert num_shards > 0 st = time.time() for shard_idx in range(num_shards): filename = get_maybe_sharded_checkpoint_filename( orig_filename, suffix, shard_idx, num_shards ) if not PathManager.exists(filename): raise IOError("Model file not found: {}".format(filename)) if state is None: state = load_checkpoint_to_cpu(filename, arg_overrides) if "args" in state and state["args"] is not None: cfg = convert_namespace_to_omegaconf(state["args"]) elif "cfg" in state and state["cfg"] is not None: cfg = state["cfg"] else: raise RuntimeError( f"Neither args nor cfg exist in state keys = {state.keys()}" ) if task is None: task = tasks.setup_task(cfg.task, from_checkpoint=True) if "task_state" in state: task.load_state_dict(state["task_state"]) argspec = inspect.getfullargspec(task.build_model) if "fsdp_metadata" in state and num_shards > 1: model_shard_state["shard_weights"].append(state["model"]) model_shard_state["shard_metadata"].append(state["fsdp_metadata"]) # check FSDP import before the code goes too far if not has_FSDP: raise ImportError( "Cannot find FullyShardedDataParallel. 
" "Please install fairscale with: pip install fairscale" ) if shard_idx == num_shards - 1: consolidated_model_state = FSDP.consolidate_shard_weights( shard_weights=model_shard_state["shard_weights"], shard_metadata=model_shard_state["shard_metadata"], ) if "from_checkpoint" in argspec.args: model = task.build_model(cfg.model, from_checkpoint=True) else: model = task.build_model(cfg.model) if ( "optimizer_history" in state and len(state["optimizer_history"]) > 0 and "num_updates" in state["optimizer_history"][-1] ): model.set_num_updates( state["optimizer_history"][-1]["num_updates"] ) model.load_state_dict( consolidated_model_state, strict=strict, model_cfg=cfg.model ) else: # model parallel checkpoint or unsharded checkpoint # support old external tasks if "from_checkpoint" in argspec.args: model = task.build_model(cfg.model, from_checkpoint=True) else: model = task.build_model(cfg.model) if ( "optimizer_history" in state and len(state["optimizer_history"]) > 0 and "num_updates" in state["optimizer_history"][-1] ): model.set_num_updates(state["optimizer_history"][-1]["num_updates"]) model.load_state_dict( state["model"], strict=strict, model_cfg=cfg.model ) # reset state so it gets loaded for the next model in ensemble state = None if shard_idx % 10 == 0 and shard_idx > 0: elapsed = time.time() - st logger.info( f"Loaded {shard_idx} shards in {elapsed:.2f}s, {elapsed / (shard_idx+1):.2f}s/shard" ) # build model for ensemble ensemble.append(model) return ensemble, cfg, task def load_model_ensemble_and_task_from_hf_hub( model_id, cache_dir: Optional[str] = None, arg_overrides: Optional[Dict[str, Any]] = None, **kwargs: Any, ): try: from huggingface_hub import snapshot_download except ImportError: raise ImportError( "You need to install huggingface_hub to use `load_from_hf_hub`. " "See https://pypi.org/project/huggingface-hub/ for installation." 
def checkpoint_paths(path, pattern=r"checkpoint(\d+)\.pt", keep_match=False):
    """Retrieve all checkpoints found in the `path` directory.

    Checkpoints are identified by matching each file name against `pattern`
    with ``fullmatch``. If the pattern contains groups, the result is sorted
    by the first group (converted to ``float``) in descending order;
    otherwise the position in the directory listing is used as the sort key.

    Args:
        path: directory to list (via ``PathManager.ls``).
        pattern: regular expression a file name must fully match.
        keep_match: when true, return ``(full_path, sort_key)`` tuples
            instead of bare paths.

    Returns:
        A list of paths (or ``(path, sort_key)`` tuples), highest key first.
    """
    pt_regexp = re.compile(pattern)
    files = PathManager.ls(path)

    entries = []
    for i, f in enumerate(files):
        m = pt_regexp.fullmatch(f)
        if m is not None:
            idx = float(m.group(1)) if len(m.groups()) > 0 else i
            entries.append((idx, m.group(0)))
    if keep_match:
        return [(os.path.join(path, x[1]), x[0]) for x in sorted(entries, reverse=True)]
    else:
        return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)]


def torch_persistent_save(obj, filename, async_write: bool = False):
    """Serialize `obj` to `filename`, atomically when the backend allows it.

    Args:
        obj: object handed to ``torch.save``.
        filename: destination path understood by ``PathManager``.
        async_write: write through ``PathManager.opena`` instead of a
            blocking open.
    """
    if async_write:
        with PathManager.opena(filename, "wb") as f:
            _torch_persistent_save(obj, f)
    else:
        if PathManager.supports_rename(filename):
            # do atomic save: write to a sibling temp file, then rename over
            # the destination so readers never observe a partial checkpoint
            with PathManager.open(filename + ".tmp", "wb") as f:
                _torch_persistent_save(obj, f)
            PathManager.rename(filename + ".tmp", filename)
        else:
            # fallback to non-atomic save
            with PathManager.open(filename, "wb") as f:
                _torch_persistent_save(obj, f)


def _torch_persistent_save(obj, f):
    """Write `obj` to `f` with ``torch.save``, retrying on failure.

    Args:
        obj: object to serialize.
        f: either a path string or an already opened binary file object.

    Raises:
        Exception: whatever ``torch.save`` raised on the third failed attempt
            (the traceback is logged first).
    """
    if isinstance(f, str):
        with PathManager.open(f, "wb") as h:
            # Recurse into this helper with the open handle. The public
            # torch_persistent_save() expects a *filename* and would try to
            # use the handle as a path.
            _torch_persistent_save(obj, h)
        return
    for i in range(3):
        try:
            return torch.save(obj, f)
        except Exception:
            if i == 2:
                logger.error(traceback.format_exc())
                raise
            else:
                # give transient I/O problems a moment to clear before retrying
                time.sleep(2.5)
in state: state["optimizer_history"] = [ {"criterion_name": "CrossEntropyCriterion", "best_loss": state["best_loss"]} ] state["last_optimizer_state"] = state["optimizer"] del state["optimizer"] del state["best_loss"] # move extra_state into sub-dictionary if "epoch" in state and "extra_state" not in state: state["extra_state"] = { "epoch": state["epoch"], "batch_offset": state["batch_offset"], "val_loss": state["val_loss"], } del state["epoch"] del state["batch_offset"] del state["val_loss"] # reduce optimizer history's memory usage (only keep the last state) if "optimizer" in state["optimizer_history"][-1]: state["last_optimizer_state"] = state["optimizer_history"][-1]["optimizer"] for optim_hist in state["optimizer_history"]: del optim_hist["optimizer"] # record the optimizer class name if "optimizer_name" not in state["optimizer_history"][-1]: state["optimizer_history"][-1]["optimizer_name"] = "FairseqNAG" # move best_loss into lr_scheduler_state if "lr_scheduler_state" not in state["optimizer_history"][-1]: state["optimizer_history"][-1]["lr_scheduler_state"] = { "best": state["optimizer_history"][-1]["best_loss"] } del state["optimizer_history"][-1]["best_loss"] # keep track of number of updates if "num_updates" not in state["optimizer_history"][-1]: state["optimizer_history"][-1]["num_updates"] = 0 # use stateful training data iterator if "train_iterator" not in state["extra_state"]: state["extra_state"]["train_iterator"] = { "epoch": state["extra_state"].get("epoch", 0), "iterations_in_epoch": state["extra_state"].get("batch_offset", 0), } # backward compatibility, cfg updates if "args" in state and state["args"] is not None: # old model checkpoints may not have separate source/target positions if hasattr(state["args"], "max_positions") and not hasattr( state["args"], "max_source_positions" ): state["args"].max_source_positions = state["args"].max_positions state["args"].max_target_positions = state["args"].max_positions # default to translation task if not 
hasattr(state["args"], "task"): state["args"].task = "translation" # --raw-text and --lazy-load are deprecated if getattr(state["args"], "raw_text", False): state["args"].dataset_impl = "raw" elif getattr(state["args"], "lazy_load", False): state["args"].dataset_impl = "lazy" # epochs start at 1 if state["extra_state"]["train_iterator"] is not None: state["extra_state"]["train_iterator"]["epoch"] = max( state["extra_state"]["train_iterator"].get("epoch", 1), 1 ) # --remove-bpe ==> --postprocess if hasattr(state["args"], "remove_bpe"): state["args"].post_process = state["args"].remove_bpe # --min-lr ==> --stop-min-lr if hasattr(state["args"], "min_lr"): state["args"].stop_min_lr = state["args"].min_lr del state["args"].min_lr # binary_cross_entropy / kd_binary_cross_entropy => wav2vec criterion if hasattr(state["args"], "criterion") and state["args"].criterion in [ "binary_cross_entropy", "kd_binary_cross_entropy", ]: state["args"].criterion = "wav2vec" # remove log_keys if it's None (criteria will supply a default value of []) if hasattr(state["args"], "log_keys") and state["args"].log_keys is None: delattr(state["args"], "log_keys") # speech_pretraining => audio pretraining if ( hasattr(state["args"], "task") and state["args"].task == "speech_pretraining" ): state["args"].task = "audio_pretraining" # audio_cpc => wav2vec if hasattr(state["args"], "arch") and state["args"].arch == "audio_cpc": state["args"].arch = "wav2vec" # convert legacy float learning rate to List[float] if hasattr(state["args"], "lr") and isinstance(state["args"].lr, float): state["args"].lr = [state["args"].lr] # convert task data arg to a string instead of List[string] if ( hasattr(state["args"], "data") and isinstance(state["args"].data, list) and len(state["args"].data) > 0 ): state["args"].data = state["args"].data[0] state["cfg"] = convert_namespace_to_omegaconf(state["args"]) if "cfg" in state and state["cfg"] is not None: cfg = state["cfg"] with open_dict(cfg): # any upgrades for 
def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
    """Prune the given state_dict if desired for LayerDrop
    (https://arxiv.org/abs/1909.11556).

    Training with LayerDrop allows models to be robust to pruning at inference
    time. This function prunes state_dict to allow smaller models to be loaded
    from a larger model and re-maps the existing state_dict for this to occur.

    It's called by functions that load models from checkpoints and does not
    need to be called directly.

    Args:
        state_dict: mapping from parameter name to value.
        model_cfg: model config (``DictConfig`` or namespace-like object). Its
            ``encoder_layers_to_keep`` / ``decoder_layers_to_keep`` attributes
            are comma-separated layer indices; both are reset to ``None``
            once pruning has been applied.

    Returns:
        `state_dict` itself when no pruning is requested, otherwise a new dict
        in which kept layers of a pruned component are renumbered from 0,
        dropped layers are removed, and every other entry is passed through
        unchanged.
    """
    arch = None
    if model_cfg is not None:
        arch = (
            model_cfg._name
            if isinstance(model_cfg, DictConfig)
            else getattr(model_cfg, "arch", None)
        )

    if not model_cfg or arch is None or arch == "ptt_transformer":
        # args should not be none, but don't crash if it is.
        return state_dict

    encoder_layers_to_keep = getattr(model_cfg, "encoder_layers_to_keep", None)
    decoder_layers_to_keep = getattr(model_cfg, "decoder_layers_to_keep", None)

    if not encoder_layers_to_keep and not decoder_layers_to_keep:
        return state_dict

    # apply pruning
    logger.info(
        "Pruning model to specified layer configuration - this works best if the model was trained with LayerDrop"
    )

    def create_pruning_pass(layers_to_keep, layer_name):
        """Build the old->new layer index map and key regex for one component."""
        keep_layers = sorted(
            int(layer_string) for layer_string in layers_to_keep.split(",")
        )
        mapping_dict = {
            str(old_index): str(new_index)
            for new_index, old_index in enumerate(keep_layers)
        }
        regex = re.compile(r"^{layer}.*\.layers\.(\d+)".format(layer=layer_name))
        return {"substitution_regex": regex, "mapping_dict": mapping_dict}

    pruning_passes = []
    if encoder_layers_to_keep:
        pruning_passes.append(create_pruning_pass(encoder_layers_to_keep, "encoder"))
    if decoder_layers_to_keep:
        pruning_passes.append(create_pruning_pass(decoder_layers_to_keep, "decoder"))

    new_state_dict = {}
    for layer_name, weight in state_dict.items():
        match = re.search(r"\.layers\.(\d+)\.", layer_name)
        # if layer has no number in it, it is a supporting layer, such as an
        # embedding
        if not match:
            new_state_dict[layer_name] = weight
            continue

        original_layer_number = match.group(1)
        in_pruned_component = False
        # figure out which mapping dict to replace from
        for pruning_pass in pruning_passes:
            substitution_match = pruning_pass["substitution_regex"].search(layer_name)
            if substitution_match is None:
                continue
            in_pruned_component = True
            new_layer_number = pruning_pass["mapping_dict"].get(original_layer_number)
            if new_layer_number is None:
                # this layer of a pruned component was not selected: drop it
                continue
            new_state_key = (
                layer_name[: substitution_match.start(1)]
                + new_layer_number
                + layer_name[substitution_match.end(1) :]
            )
            new_state_dict[new_state_key] = weight

        if not in_pruned_component:
            # A numbered layer of a component that is NOT being pruned (e.g.
            # decoder layers when only encoder_layers_to_keep is set) must be
            # kept as-is; previously such entries were silently discarded.
            new_state_dict[layer_name] = weight

    # Since layers are now pruned, *_layers_to_keep are no longer needed.
    # This is more of "It would make it work fix" rather than a proper fix.
    if isinstance(model_cfg, DictConfig):
        context = open_dict(model_cfg)
    else:
        context = contextlib.ExitStack()
    with context:
        if hasattr(model_cfg, "encoder_layers_to_keep"):
            model_cfg.encoder_layers_to_keep = None
        if hasattr(model_cfg, "decoder_layers_to_keep"):
            model_cfg.decoder_layers_to_keep = None

    return new_state_dict
def verify_checkpoint_directory(save_dir: str) -> None:
    """Ensure `save_dir` exists and is writable.

    The directory is created if missing, then a uniquely named probe file is
    created and removed inside it. A unique name is used so that an existing
    file (for instance one literally called ``dummy``) is never truncated or
    deleted, and so that concurrent processes probing the same directory do
    not remove each other's file.

    Raises:
        OSError: if the probe file cannot be created (a warning is logged
            first), or if creating the directory / removing the probe fails.
    """
    import tempfile

    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    try:
        fd, temp_file_path = tempfile.mkstemp(prefix="dummy", dir=save_dir)
    except OSError:
        logger.warning(
            "Unable to access checkpoint save directory: {}".format(save_dir)
        )
        raise
    # close before removing: required on platforms that lock open files
    os.close(fd)
    os.remove(temp_file_path)


def save_ema_as_checkpoint(src_path, dst_path):
    """Load the EMA weights stored in `src_path` and save them to `dst_path`
    as a regular checkpoint whose ``"model"`` entry holds the EMA parameters.
    """
    state = load_ema_from_checkpoint(src_path)
    torch_persistent_save(state, dst_path)


def load_ema_from_checkpoint(fpath):
    """Loads exponential moving averaged (EMA) checkpoint from input and
    returns a model with ema weights.

    Args:
      fpath: A string path of checkpoint to load from.

    Returns:
      A dict of string keys mapping to various values. The 'model' key
      from the returned dict should correspond to an OrderedDict mapping
      string parameter names to torch Tensors.

    Raises:
      ValueError: if the checkpoint holds no EMA parameters under
        ``state["extra_state"]["ema"]`` (missing or empty).
    """
    params_dict = collections.OrderedDict()

    with PathManager.open(fpath, "rb") as f:
        # NOTE: weights_only=False unpickles arbitrary objects; only load
        # checkpoints from trusted sources.
        new_state = torch.load(
            f,
            map_location=(
                lambda s, _: torch.serialization.default_restore_location(s, "cpu")
            ),
            weights_only=False,
        )

    # EMA model is stored in a separate "extra state". A checkpoint trained
    # without EMA has no such entry; treat that like an empty one so the
    # explanatory ValueError below is raised instead of a bare KeyError.
    model_params = (new_state.get("extra_state") or {}).get("ema") or {}

    for key, p in model_params.items():
        if isinstance(p, torch.HalfTensor):
            p = p.float()
        # NOTE: clone() is needed in case of p is a shared parameter
        params_dict[key] = p.clone()

    if len(params_dict) == 0:
        raise ValueError(
            f"Input checkpoint path '{fpath}' does not contain "
            "ema model weights, is this model trained with EMA?"
        )

    new_state["model"] = params_dict
    return new_state
*/ #include <torch/extension.h> #include <vector> /* CPP Binding for CUDA OP */ // CUDA forward declarations torch::Tensor ngram_repeat_block_cuda_forward( torch::Tensor tokens, torch::Tensor lprobs, int bsz, int step, int beam_size, int no_repeat_ngram_size); #define CHECK_CUDA(x) \ TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) // Input check and call to CUDA OP // Backward method not required torch::Tensor ngram_repeat_block_forward( torch::Tensor tokens, torch::Tensor lprobs, int bsz, int step, int beam_size, int no_repeat_ngram_size) { CHECK_INPUT(tokens); CHECK_INPUT(lprobs); assert(bsz > 0); assert(step >= 0); assert(beam_size > 0); assert(no_repeat_ngram_size > 0); return ngram_repeat_block_cuda_forward( tokens, lprobs, bsz, step, beam_size, no_repeat_ngram_size); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def( "forward", &ngram_repeat_block_forward, "No Repeat Ngram Block forward (CUDA)"); } ================================================ FILE: fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu ================================================ /* Copyright (c) Microsoft Corporation. Licensed under the MIT License. */ /* Kernel implementation for blocking repeated n-grams. 
/*
Kernel implementation for blocking repeated n-grams.
*/

#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <torch/extension.h>
#include <vector>

// Ban repeated ngrams of length = 'no_repeat_ngram_size'
//
// Launch layout (see ngram_repeat_block_cuda_forward below): one block per
// row of `tokens` / `lprobs`, one thread per candidate n-gram start position,
// with dynamic shared memory holding that row's tokens.
//
//   tokens           row-major token ids, `max_predict_len` entries per row
//   lprobs           row-major scores, `vocab_size` entries per row; entries
//                    for banned tokens are overwritten with -INFINITY
//   max_predict_len  row stride of `tokens`
//   vocab_size       row stride of `lprobs`
__global__ void banRepeatedTokens(
    long* __restrict__ tokens,
    float* __restrict__ lprobs,
    int max_predict_len,
    int vocab_size,
    int no_repeat_ngram_size) {
  auto row = blockIdx.x;
  auto col = threadIdx.x;
  auto start = row * (max_predict_len) + col;
  // Each thread compares ngram starting from
  // thread index with final ngram starting from
  // step - no_repeat_ngram_size +2
  // (that position equals blockDim.x, given the thread count chosen by the
  // launcher).
  auto check_start_pos = blockDim.x;
  auto lprob_start = row * vocab_size;
  bool is_banned = true;
  extern __shared__ long tokens_shm[];
  // Every thread stages the token at its own position ...
  tokens_shm[col] = tokens[start];
  // ... and the last thread additionally stages the trailing tokens that no
  // thread is positioned on (bounded by the row length).
  if (col == blockDim.x - 1) {
    for (int i = 1; i < no_repeat_ngram_size; i++) {
      if (col + i < max_predict_len) {
        tokens_shm[col + i] = tokens[start + i];
      }
    }
  }
  // All staged tokens must be visible before any thread starts comparing.
  __syncthreads();

  // Compare the first (n - 1) tokens of this thread's n-gram with the (n - 1)
  // tokens starting at check_start_pos.
  for (int k = 0; k < no_repeat_ngram_size - 1; k++) {
    if (tokens_shm[col + k] != tokens_shm[check_start_pos + k]) {
      is_banned = false;
    }
  }
  if (is_banned == true) {
    // The prefix matched: the token that completed this earlier n-gram would
    // repeat it, so its score is set to -INFINITY.
    auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
    lprobs[lprob_start + token_to_be_banned] = -INFINITY;
  }
}

// Allocate blocks and threads based on
// batch size and sequence length and launch
// kernel
//
// Returns `lprobs`, which the kernel modifies in place. When fewer than
// `no_repeat_ngram_size - 1` steps have been taken (threads <= 0) nothing is
// launched and `lprobs` is returned untouched.
// `tokens` is read through data_ptr<long>() and `lprobs` through
// data_ptr<float>().
torch::Tensor ngram_repeat_block_cuda_forward(
    const torch::Tensor tokens,
    torch::Tensor lprobs,
    int bsz,
    int step,
    int beam_size,
    int no_repeat_ngram_size) {
  int threads = step - no_repeat_ngram_size + 2;
  if (threads <= 0)
    return lprobs;
  int max_predict_len = tokens.size(1);
  int vocab_size = lprobs.size(1);
  auto token_ptr = tokens.data_ptr<long>();
  auto lprob_ptr = lprobs.data_ptr<float>();
  int blocks = bsz * beam_size;
  // One shared-memory slot per token position 0..step.
  int shared_mem_size = (step + 1) * sizeof(long);

  // Launching N blocks where N is number of samples in a batch (beams*bsz)
  // Launching T threads where T is number of previous ngrams in a sample
  // Allocating shared mem per block for faster access of input tokens since
  // each token will be accessed N times to compare with current Ngram where
  // N is Ngram size.
  banRepeatedTokens<<<blocks, threads, shared_mem_size>>>(
      token_ptr, lprob_ptr, max_predict_len, vocab_size, no_repeat_ngram_size);
  return lprobs;
}
worker_and_job_to_score.new_empty({num_jobs}); torch::Tensor top_index = top_values.to(torch::kLong); torch::Tensor high_bidders = top_index.new_empty({num_jobs}); torch::Tensor have_bids = high_bidders.to(torch::kBool); torch::Tensor jobs_indices = torch::arange({num_jobs}, torch::dtype(torch::kLong).device(device)); torch::Tensor true_tensor = torch::ones({1}, torch::dtype(torch::kBool).device(device)); while (true) { bids.zero_(); torch::topk_out(top_values, top_index, value, jobs_per_worker + 1, 1); // Each worker bids the difference in value between that job and the k+1th // job torch::sub_out( bid_increments, top_values.index({Slice(None, None), Slice(0, jobs_per_worker)}), top_values.index({Slice(None, None), jobs_per_worker}).unsqueeze(1)); bid_increments.add_(epsilon); bids.scatter_( 1, top_index.index({Slice(None, None), Slice(0, jobs_per_worker)}), bid_increments); if (counter < max_iterations && counter > 0) { // Put in a minimal bid to retain items from the last round if no-one else // bids for them this round bids.view(-1).index_put_({bid_indices}, epsilon); } // Find the highest bidding worker per job torch::max_out(high_bids, high_bidders, bids, 0); torch::gt_out(have_bids, high_bids, 0); if (have_bids.all().item<bool>()) { // All jobs were bid for break; } // Make popular items more expensive cost.add_(high_bids); torch::sub_out(value, worker_and_job_to_score, cost); bid_indices = ((high_bidders * num_jobs) + jobs_indices).index({have_bids}); if (counter < max_iterations) { // Make sure that this item will be in the winning worker's top-k next // time. 
#include <array>
#include <cstdio>
#include <cstring>
#include <map>

// Accumulated BLEU statistics: total reference / prediction lengths and, for
// each n-gram order 1..4, the number of matched and of predicted n-grams.
// NOLINTNEXTLINE
typedef struct {
  size_t reflen;
  size_t predlen;
  size_t match1;
  size_t count1;
  size_t match2;
  size_t count2;
  size_t match3;
  size_t count3;
  size_t match4;
  size_t count4;
} bleu_stat;

// left trim (remove pad): advance *sent past leading `pad` tokens and shrink
// *len accordingly.
void bleu_ltrim(size_t* len, int** sent, int pad) {
  size_t start = 0;
  while (start < *len) {
    if (*(*sent + start) != pad) {
      break;
    }
    start++;
  }
  *sent += start;
  *len -= start;
}

// right trim (remove eos / pad): shrink *len so trailing `eos` and `pad`
// tokens are excluded. The token at index 0 is never removed.
void bleu_rtrim(size_t* len, int** sent, int pad, int eos) {
  if (*len == 0) {
    // Nothing to trim. Without this guard `*len - 1` wraps around (size_t is
    // unsigned) and the loop below reads outside the sentence, e.g. for a
    // sentence made only of padding after bleu_ltrim.
    return;
  }
  size_t end = *len - 1;
  while (end > 0) {
    if (*(*sent + end) != eos && *(*sent + end) != pad) {
      break;
    }
    end--;
  }
  *len = end + 1;
}

// left and right trim
void bleu_trim(size_t* len, int** sent, int pad, int eos) {
  bleu_ltrim(len, sent, pad);
  bleu_rtrim(len, sent, pad, eos);
}

// Hash `len` consecutive ints starting at `data`, byte by byte, with an
// FNV-1a style xor-then-multiply loop.
size_t bleu_hash(int len, int* data) {
  size_t h = 14695981039346656037ul;
  size_t prime = 0x100000001b3;
  char* b = (char*)data;
  size_t blen = sizeof(int) * len;

  while (blen-- > 0) {
    h ^= *b++;
    h *= prime;
  }

  return h;
}

// Add the order-`n` statistics of one (reference, prediction) pair:
// *ntotal grows by the number of n-grams in the prediction, *nmatch by the
// number of reference n-grams that can be paired with a not-yet-used
// prediction n-gram (clipped matching, compared by hash).
void bleu_addngram(
    size_t* ntotal,
    size_t* nmatch,
    size_t n,
    size_t reflen,
    int* ref,
    size_t predlen,
    int* pred) {
  if (predlen < n) {
    return;
  }

  // from here on predlen / reflen count n-gram start positions
  predlen = predlen - n + 1;
  (*ntotal) += predlen;

  if (reflen < n) {
    return;
  }

  reflen = reflen - n + 1;

  std::map<size_t, size_t> count;
  while (predlen > 0) {
    size_t w = bleu_hash(n, pred++);
    count[w]++;
    predlen--;
  }

  while (reflen > 0) {
    size_t w = bleu_hash(n, ref++);
    if (count[w] > 0) {
      (*nmatch)++;
      count[w] -= 1;
    }
    reflen--;
  }
}

extern "C" {

// Reset every counter to zero.
#ifdef _WIN64
__declspec(dllexport)
#endif
    void bleu_zero_init(bleu_stat* stat) {
  std::memset(stat, 0, sizeof(bleu_stat));
}

// Reset the statistics, starting the order 2-4 match/count counters at 1
// while lengths and the unigram counters start at 0.
#ifdef _WIN64
__declspec(dllexport)
#endif
    void bleu_one_init(bleu_stat* stat) {
  bleu_zero_init(stat);
  stat->count1 = 0;
  stat->count2 = 1;
  stat->count3 = 1;
  stat->count4 = 1;
  stat->match1 = 0;
  stat->match2 = 1;
  stat->match3 = 1;
  stat->match4 = 1;
}

// Trim pad / eos from both sentences and accumulate their lengths and the
// n-gram statistics for orders 1..4 into `stat`.
#ifdef _WIN64
__declspec(dllexport)
#endif
    void bleu_add(
        bleu_stat* stat,
        size_t reflen,
        int* ref,
        size_t predlen,
        int* pred,
        int pad,
        int eos) {
  bleu_trim(&reflen, &ref, pad, eos);
  bleu_trim(&predlen, &pred, pad, eos);
  stat->reflen += reflen;
  stat->predlen += predlen;

  bleu_addngram(&stat->count1, &stat->match1, 1, reflen, ref, predlen, pred);
  bleu_addngram(&stat->count2, &stat->match2, 2, reflen, ref, predlen, pred);
  bleu_addngram(&stat->count3, &stat->match3, 3, reflen, ref, predlen, pred);
  bleu_addngram(&stat->count4, &stat->match4, 4, reflen, ref, predlen, pred);
}
}
// Fill the dynamic-programming table for the edit distance between `x` and
// `y` where an insertion or a deletion costs 1 and a substitution costs 2.
// Returns a (|x| + 1) x (|y| + 1) table; d[i][j] is the distance between the
// first i tokens of x and the first j tokens of y.
std::vector<std::vector<uint32_t>> edit_distance2_with_dp(
    std::vector<uint32_t>& x,
    std::vector<uint32_t>& y) {
  const uint32_t lx = static_cast<uint32_t>(x.size());
  const uint32_t ly = static_cast<uint32_t>(y.size());
  std::vector<std::vector<uint32_t>> d(lx + 1, std::vector<uint32_t>(ly + 1));
  for (uint32_t i = 0; i < lx + 1; i++) {
    d[i][0] = i;
  }
  for (uint32_t j = 0; j < ly + 1; j++) {
    d[0][j] = j;
  }
  for (uint32_t i = 1; i < lx + 1; i++) {
    for (uint32_t j = 1; j < ly + 1; j++) {
      const uint32_t substitution = (x.at(i - 1) == y.at(j - 1)) ? 0u : 2u;
      const uint32_t via_gap = std::min(d[i - 1][j], d[i][j - 1]) + 1u;
      const uint32_t via_diagonal = d[i - 1][j - 1] + substitution;
      d[i][j] = std::min(via_gap, via_diagonal);
    }
  }
  return d;
}

// Walk the table `d` (as produced by edit_distance2_with_dp for x and y) back
// from the bottom-right corner and turn the path into edit sequences.
//
// The result has x.size() + 2 cells:
//   cells 0 .. x.size()  tokens of y inserted at that slot of x
//   last cell            one flag per token of x: 1 = delete, 0 = keep
// Cells that end up empty receive a single `terminal_symbol`.
// Special case: when x is empty the result is {y, {}} with no terminal
// symbols added.
std::vector<std::vector<uint32_t>> edit_distance2_backtracking(
    std::vector<std::vector<uint32_t>>& d,
    std::vector<uint32_t>& x,
    std::vector<uint32_t>& y,
    uint32_t terminal_symbol) {
  std::vector<uint32_t> seq;
  std::vector<std::vector<uint32_t>> edit_seqs(
      x.size() + 2, std::vector<uint32_t>());

  if (x.size() == 0) {
    edit_seqs.at(0) = y;
    return edit_seqs;
  }

  uint32_t i = static_cast<uint32_t>(d.size() - 1);
  uint32_t j = static_cast<uint32_t>(d.at(0).size() - 1);

  // Stop at the top-left corner. (The indices are unsigned, so a test of the
  // form `i >= 0 && j >= 0` would always be true and express nothing.)
  // `seq` collects (op, token) pairs in reverse path order.
  while (i > 0 || j > 0) {
    if ((j > 0) && (d.at(i).at(j - 1) < d.at(i).at(j))) {
      seq.push_back(1); // insert
      seq.push_back(y.at(j - 1));
      j--;
    } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) {
      seq.push_back(2); // delete
      seq.push_back(x.at(i - 1));
      i--;
    } else {
      seq.push_back(3); // keep
      seq.push_back(x.at(i - 1));
      i--;
      j--;
    }
  }

  // Replay the path front to back. `s` is the 1-based insertion slot; it
  // advances on every step unless the previous step was an insertion, so
  // consecutive insertions land in the same slot.
  uint32_t prev_op = 0;
  uint32_t s = 0;
  const size_t num_steps = seq.size() / 2;
  for (size_t k = 0; k < num_steps; k++) {
    const uint32_t op = seq.at(seq.size() - 2 * k - 2);
    const uint32_t word = seq.at(seq.size() - 2 * k - 1);
    if (prev_op != 1) {
      s++;
    }
    if (op == 1) { // insert
      edit_seqs.at(s - 1).push_back(word);
    } else if (op == 2) { // delete
      edit_seqs.at(x.size() + 1).push_back(1);
    } else { // keep
      edit_seqs.at(x.size() + 1).push_back(0);
    }
    prev_op = op;
  }

  for (std::vector<uint32_t>& cell : edit_seqs) {
    if (cell.empty()) {
      cell.push_back(terminal_symbol);
    }
  }
  return edit_seqs;
}
(d.at(i).at(j - 1) < d.at(i).at(j))) { seq.push_back(1); // insert seq.push_back(y.at(j - 1)); j--; } else if ((i > 0) && (d.at(i - 1).at(j) < d.at(i).at(j))) { seq.push_back(2); // delete seq.push_back(x.at(i - 1)); i--; } else { seq.push_back(3); // keep seq.push_back(x.at(i - 1)); i--; j--; } } uint32_t prev_op, op, s, word; prev_op = 0, s = 0; for (uint32_t k = 0; k < seq.size() / 2; k++) { op = seq.at(seq.size() - 2 * k - 2); word = seq.at(seq.size() - 2 * k - 1); if (prev_op != 1) { s++; } if (op == 1) // insert { edit_seqs.at(s - 1).push_back(word); } else if (op == 2) // delete { edit_seqs.at(s - 1).push_back(deletion_symbol); } prev_op = op; } for (uint32_t k = 0; k < edit_seqs.size(); k++) { if (edit_seqs.at(k).size() == 0) { edit_seqs.at(k).push_back(terminal_symbol); } } return edit_seqs; } vector<uint32_t> compute_ed2( vector<vector<uint32_t>>& xs, vector<vector<uint32_t>>& ys) { vector<uint32_t> distances(xs.size()); for (uint32_t i = 0; i < xs.size(); i++) { vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); distances.at(i) = d.at(xs.at(i).size()).at(ys.at(i).size()); } return distances; } vector<vector<vector<uint32_t>>> suggested_ed2_path( vector<vector<uint32_t>>& xs, vector<vector<uint32_t>>& ys, uint32_t terminal_symbol) { vector<vector<vector<uint32_t>>> seq(xs.size()); for (uint32_t i = 0; i < xs.size(); i++) { vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); seq.at(i) = edit_distance2_backtracking(d, xs.at(i), ys.at(i), terminal_symbol); } return seq; } vector<vector<vector<uint32_t>>> suggested_ed2_path_with_delete( vector<vector<uint32_t>>& xs, vector<vector<uint32_t>>& ys, uint32_t terminal_symbol, uint32_t deletion_symbol) { vector<vector<vector<uint32_t>>> seq(xs.size()); for (uint32_t i = 0; i < xs.size(); i++) { vector<vector<uint32_t>> d = edit_distance2_with_dp(xs.at(i), ys.at(i)); seq.at(i) = edit_distance2_backtracking_with_delete( d, xs.at(i), ys.at(i), terminal_symbol, 
deletion_symbol); } return seq; } PYBIND11_MODULE(libnat, m) { m.def("compute_ed2", &compute_ed2, "compute_ed2"); m.def("suggested_ed2_path", &suggested_ed2_path, "suggested_ed2_path"); m.def( "suggested_ed2_path_with_delete", &suggested_ed2_path_with_delete, "suggested_ed2_path_with_delete"); } ================================================ FILE: fairseq/clib/libnat_cuda/binding.cpp ================================================ /** * Copyright 2017-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the license found in the * LICENSE file in the root directory of this source tree. */ /* This code is partially adpoted from https://github.com/1ytic/pytorch-edit-distance */ #include <torch/types.h> #include "edit_dist.h" #ifndef TORCH_CHECK #define TORCH_CHECK AT_CHECK #endif #define CHECK_CUDA(x) \ TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) \ TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) torch::Tensor LevenshteinDistance( torch::Tensor source, torch::Tensor target, torch::Tensor source_length, torch::Tensor target_length) { CHECK_INPUT(source); CHECK_INPUT(target); CHECK_INPUT(source_length); CHECK_INPUT(target_length); return LevenshteinDistanceCuda(source, target, source_length, target_length); } torch::Tensor GenerateDeletionLabel( torch::Tensor source, torch::Tensor operations) { CHECK_INPUT(source); CHECK_INPUT(operations); return GenerateDeletionLabelCuda(source, operations); } std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabel( torch::Tensor target, torch::Tensor operations) { CHECK_INPUT(target); CHECK_INPUT(operations); return GenerateInsertionLabelCuda(target, operations); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("levenshtein_distance", &LevenshteinDistance, "Levenshtein distance"); m.def( "generate_deletion_labels", &GenerateDeletionLabel, "Generate Deletion Label"); m.def( 
"generate_insertion_labels", &GenerateInsertionLabel, "Generate Insertion Label"); } ================================================ FILE: fairseq/clib/libnat_cuda/edit_dist.cu ================================================ /** * Copyright 2017-present, Facebook, Inc. * All rights reserved. * * This source code is licensed under the license found in the * LICENSE file in the root directory of this source tree. */ #include "edit_dist.h" #include <c10/cuda/CUDAStream.h> #include <cuda.h> #include <cuda_runtime.h> #include <device_launch_parameters.h> #include <utility> // std::pair template <typename scalar_t> __global__ void generate_deletion_label_kernel( const scalar_t* __restrict__ source, const size_t source_size, const size_t operation_size, int* __restrict__ operations, int* __restrict__ labels) { const int index = blockIdx.x; const int offset = index * operation_size; const int offset_label = index * source_size; for (int i = 0; i < source_size; i++) { labels[offset_label + i] = 0; } int k = 0; for (int i = 0; i < operation_size; i++) { if (operations[offset + i] == 0) { break; } else if (operations[offset + i] == 1) { continue; } else { labels[offset_label + k] = 3 - operations[offset + i]; k++; } } } template <typename scalar_t> __global__ void generate_insertion_label_kernel( const scalar_t* __restrict__ target, const size_t target_size, const size_t operation_size, int* __restrict__ operations, int* __restrict__ labels, int* __restrict__ masks) { const int index = blockIdx.x; const int offset = index * operation_size; const int offset_label = index * target_size; int k = 0; int u = 0; int m = 0; for (int i = 0; i < target_size; i++) { labels[offset_label + i] = 0; masks[offset_label + i] = 0; } for (int i = 0; i < operation_size - 1; i++) { if (operations[offset + i] == 0) { break; } else if (operations[offset + i] == 2) { continue; } else if (operations[offset + i] == 1) { masks[offset_label + m] = 1; u++; m++; } else { labels[offset_label + k] = u; 
// Computes, for one batch row per CUDA block (row = blockIdx.x), the edit
// path between a source ("hyp") and a target ("ref") row and writes it to
// `operations` as a left-aligned, zero-padded sequence of codes:
//   1 = insertion, 2 = deletion, 3 = diagonal move, 0 = padding.
//
// `errors_curr` is a caller-provided scratch buffer; row `index` uses the
// (source_size + 1) * (target_size + 1) ints starting at offset `d`.
// `source_length` / `target_length` give the number of valid tokens per row.
template <typename scalar_t>
__global__ void levenshtein_distance_kernel(
    const scalar_t* __restrict__ source,
    const scalar_t* __restrict__ target,
    const int* __restrict__ source_length,
    const int* __restrict__ target_length,
    const size_t source_size,
    const size_t target_size,
    int* __restrict__ operations,
    int* __restrict__ errors_curr) {
  const int index = blockIdx.x;
  // Start of this row inside `operations` (row width: source + target size).
  const int offset = index * (source_size + target_size);
  // Start of this row's DP table inside `errors_curr`.
  const int d = index * (source_size + 1) * (target_size + 1);
  // Row stride of the DP table.
  const int t = target_size + 1;

  // Flat-index helpers for the DP table and the operation row.
  auto err_idx = [d, t](int i, int j) { return d + i * t + j; };
  auto opt_idx = [offset](int k) { return offset + k; };

  const int hyp_len = source_length[index];
  const int ref_len = target_length[index];
  const scalar_t* hyp_begin = source + index * source_size;
  const scalar_t* ref_begin = target + index * target_size;

  // dynamic programming
  // First column / first row: cost of deleting i or inserting j tokens.
  for (int i = 0; i <= hyp_len; i++) {
    errors_curr[err_idx(i, 0)] = i;
  }
  for (int j = 0; j <= ref_len; j++) {
    errors_curr[err_idx(0, j)] = j;
  }
  // Insertion and deletion cost 1; a diagonal step costs 0 on a token match
  // and 2 on a mismatch.
  for (int i = 1; i <= hyp_len; i++) {
    for (int j = 1; j <= ref_len; j++) {
      errors_curr[err_idx(i, j)] = min(
          min(errors_curr[err_idx(i - 1, j)], errors_curr[err_idx(i, j - 1)]) +
              1,
          errors_curr[err_idx(i - 1, j - 1)] +
              2 * (*(hyp_begin + i - 1) == *(ref_begin + j - 1) ? 0 : 1));
    }
  }

  // back-tracing
  // Walk from (hyp_len, ref_len) back to (0, 0), writing codes from the end
  // of the first hyp_len + ref_len slots towards the front.
  int i = hyp_len;
  int j = ref_len;
  int o = hyp_len + ref_len;

  // Clear the whole operation row so unused slots read as padding.
  for (int k = 0; k < source_size + target_size; k++) {
    operations[opt_idx(k)] = 0;
  }

  while ((i >= 0) && (j >= 0)) {
    if ((i == 0) && (j == 0)) {
      break;
    }

    // Prefer an insertion, then a deletion, whenever that neighbour has a
    // strictly smaller cost; otherwise take the diagonal.
    if ((j > 0) &&
        (errors_curr[err_idx(i, j - 1)] < errors_curr[err_idx(i, j)])) {
      o--;
      operations[opt_idx(o)] = 1;
      j--; // insertion
    } else if (
        (i > 0) &&
        (errors_curr[err_idx(i - 1, j)] < errors_curr[err_idx(i, j)])) {
      o--;
      operations[opt_idx(o)] = 2;
      i--; // deletion
    } else {
      o--;
      operations[opt_idx(o)] = 3;
      i--;
      j--; // do nothing
    }
  }

  // moving to the left
  // The path occupies slots [o, hyp_len + ref_len); shift it to start at 0
  // and zero-fill what remains.
  for (int k = 0; k < hyp_len + ref_len; k++) {
    if (k + o < hyp_len + ref_len) {
      operations[opt_idx(k)] = operations[opt_idx(k + o)];
    } else {
      operations[opt_idx(k)] = 0; // padding
    }
  }
}
// Launches generate_deletion_label_kernel with one block per row of `source`.
//
// Args:
//   source:     2-D tensor; only its shape, dtype and device are used here.
//   operations: 2-D int tensor of edit-operation codes, one row per source
//               row (read through data_ptr<int>()).
// Returns:
//   An int tensor with the same shape and device as `source`, holding the
//   labels written by the kernel.
torch::Tensor GenerateDeletionLabelCuda(
    torch::Tensor source,
    torch::Tensor operations) {
  const auto batch_size = source.size(0);
  at::TensorOptions options(source.device());
  options = options.dtype(at::ScalarType::Int);
  auto labels = torch::empty({batch_size, source.size(1)}, options);

  // A grid with zero blocks is an invalid launch configuration, and the
  // resulting CUDA error would only surface in some later, unrelated call.
  // With an empty batch there is nothing to compute, so return right away.
  if (batch_size == 0) {
    return labels;
  }

  auto stream = at::cuda::getCurrentCUDAStream(source.device().index());

  AT_DISPATCH_ALL_TYPES(source.scalar_type(), "generate_deletion_labels", ([&] {
                          generate_deletion_label_kernel<scalar_t>
                              <<<batch_size, 1, 0, stream>>>(
                                  source.data_ptr<scalar_t>(),
                                  source.size(1),
                                  operations.size(1),
                                  operations.data_ptr<int>(),
                                  labels.data_ptr<int>());
                        }));

  return labels;
}
// Computes the edit-operation sequence for every (source, target) row pair.
//
// Args:
//   source, target:               2-D token tensors of the same dtype.
//   source_length, target_length: int tensors with the valid length per row.
// Returns:
//   An int tensor of shape {batch, source.size(1) + target.size(1)} holding
//   the operation codes written by the kernel.
//
// Two kernels are available: when the per-row DP table of `short` cells fits
// within kMaxSharedTableBytes it is kept in dynamic shared memory (the
// "faster" kernel); otherwise a global int scratch tensor is allocated.
torch::Tensor LevenshteinDistanceCuda(
    torch::Tensor source,
    torch::Tensor target,
    torch::Tensor source_length,
    torch::Tensor target_length) {
  // Upper bound for the shared-memory DP table, in bytes.
  // NOTE(review): presumably chosen to stay under the default per-block
  // shared memory limit -- confirm before raising it.
  constexpr size_t kMaxSharedTableBytes = 40000;

  const auto batch_size = source.size(0);
  const auto shared_size =
      (source.size(1) + 1) * (target.size(1) + 1) * sizeof(short);

  at::TensorOptions options(source.device());
  options = options.dtype(at::ScalarType::Int);
  auto operations =
      torch::empty({batch_size, source.size(1) + target.size(1)}, options);

  // A grid with zero blocks is an invalid launch configuration; an empty
  // batch has no rows to process, so hand back the (empty) result directly.
  if (batch_size == 0) {
    return operations;
  }

  auto stream = at::cuda::getCurrentCUDAStream(source.device().index());

  if (shared_size > kMaxSharedTableBytes) {
    // Table too large for shared memory: use a global scratch buffer.
    auto distances = torch::empty(
        {batch_size, (source.size(1) + 1) * (target.size(1) + 1)}, options);
    AT_DISPATCH_ALL_TYPES(source.scalar_type(), "levenshtein_distance", ([&] {
                            levenshtein_distance_kernel<scalar_t>
                                <<<batch_size, 1, 0, stream>>>(
                                    source.data_ptr<scalar_t>(),
                                    target.data_ptr<scalar_t>(),
                                    source_length.data_ptr<int>(),
                                    target_length.data_ptr<int>(),
                                    source.size(1),
                                    target.size(1),
                                    operations.data_ptr<int>(),
                                    distances.data_ptr<int>());
                          }));
  } else {
    AT_DISPATCH_ALL_TYPES(
        source.scalar_type(), "faster_levenshtein_distance", ([&] {
          faster_levenshtein_distance_kernel<scalar_t>
              <<<batch_size, 1, shared_size, stream>>>(
                  source.data_ptr<scalar_t>(),
                  target.data_ptr<scalar_t>(),
                  source_length.data_ptr<int>(),
                  target_length.data_ptr<int>(),
                  source.size(1),
                  target.size(1),
                  operations.data_ptr<int>());
        }));
  }

  return operations;
}
*/ #pragma once #include <torch/extension.h> torch::Tensor LevenshteinDistanceCuda( torch::Tensor source, torch::Tensor target, torch::Tensor source_length, torch::Tensor target_length); torch::Tensor GenerateDeletionLabelCuda( torch::Tensor source, torch::Tensor operations); std::pair<torch::Tensor, torch::Tensor> GenerateInsertionLabelCuda( torch::Tensor source, torch::Tensor operations); ================================================ FILE: fairseq/config/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. ================================================ FILE: fairseq/config/config.yaml ================================================ # @package _group_ hydra: run: dir: . defaults: - _self_ - task: null - model: null - criterion: cross_entropy - optimizer: null - lr_scheduler: fixed - bpe: null - tokenizer: null - scoring: null - generation: null - common_eval: null - eval_lm: null ================================================ FILE: fairseq/config/fb_run_config/slurm.yaml ================================================ # @package _global_ hydra: job: config: override_dirname: kv_sep: ':' item_sep: '__' exclude_keys: - fb_run_config - distributed_training.distributed_port sweep: dir: /checkpoint/${env:USER}/${env:PREFIX}/${hydra.job.config_name}_${hydra.launcher.gpus_per_node}/${hydra.job.override_dirname} launcher: cpus_per_task: 60 gpus_per_node: ??? tasks_per_node: 1 nodes: 1 partition: learnfair mem_gb: 400 timeout_min: 4320 max_num_timeout: 10 name: ${env:PREFIX}_${hydra.job.config_name} submitit_folder: ${hydra.sweep.dir} distributed_training: ddp_backend: c10d distributed_world_size: ??? distributed_port: ??? 
# @package _group_
# Transformer language-model preset: 12 decoder layers, 512-dim embeddings,
# 4096-dim feed-forward, 16 attention heads, ReLU activations.

# Activation and dropout
activation_fn: "relu"
dropout: 0.1
attention_dropout: 0.1
activation_dropout: 0.0
relu_dropout: 0.0

# Decoder size
decoder_embed_dim: 512
decoder_output_dim: 512
decoder_input_dim: 512
decoder_ffn_embed_dim: 4096
decoder_layers: 12
decoder_attention_heads: 16

# Layer-norm placement
decoder_normalize_before: true
no_decoder_final_norm: true

# Adaptive softmax (no cutoff set in this preset)
adaptive_softmax_cutoff: null
adaptive_softmax_dropout: 0
adaptive_softmax_factor: 4

# Embeddings
no_token_positional_embeddings: false
share_decoder_input_output_embed: false

# Character-level input (turned off here; filter settings kept for reference)
character_embeddings: false
character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
character_embedding_dim: 4
char_embedder_highway_layers: 2

# Adaptive input (turned off here)
adaptive_input: false
adaptive_input_factor: 4
adaptive_input_cutoff: null
tie_adaptive_weights: false
tie_adaptive_proj: false

# Remaining decoder options
decoder_learned_pos: false
decoder_layerdrop: 0
decoder_layers_to_keep: null
layernorm_embedding: false
no_scale_embedding: false

# Quantization noise (all zero in this preset)
quant_noise_pq: 0
quant_noise_pq_block_size: 8
quant_noise_scalar: 0
# @package _group_
# Transformer language-model preset: 12 decoder layers, 1024-dim embeddings,
# 4096-dim feed-forward, 16 attention heads, ReLU activations.

# Activation and dropout
activation_fn: "relu"
dropout: 0.1
attention_dropout: 0.0
activation_dropout: 0.0
relu_dropout: 0.0

# Decoder size
decoder_embed_dim: 1024
decoder_output_dim: 1024
decoder_input_dim: 1024
decoder_ffn_embed_dim: 4096
decoder_layers: 12
decoder_attention_heads: 16

# Layer-norm placement
decoder_normalize_before: true
no_decoder_final_norm: false

# Adaptive softmax (no cutoff set in this preset)
adaptive_softmax_cutoff: null
adaptive_softmax_dropout: 0
adaptive_softmax_factor: 4

# Embeddings
no_token_positional_embeddings: false
share_decoder_input_output_embed: false

# Character-level input (turned off here; filter settings kept for reference)
character_embeddings: false
character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
character_embedding_dim: 4
char_embedder_highway_layers: 2

# Adaptive input (turned off here)
adaptive_input: false
adaptive_input_factor: 4
adaptive_input_cutoff: null
tie_adaptive_weights: false
tie_adaptive_proj: false

# Remaining decoder options
decoder_learned_pos: false
decoder_layerdrop: 0
decoder_layers_to_keep: null
layernorm_embedding: false
no_scale_embedding: false

# Quantization noise (all zero in this preset)
quant_noise_pq: 0
quant_noise_pq_block_size: 8
quant_noise_scalar: 0
# @package _group_
# Transformer language-model preset: 12 decoder layers, 768-dim embeddings,
# 3072-dim feed-forward, 12 attention heads, GELU activations.

# Activation and dropout
activation_fn: "gelu"
dropout: 0.1
attention_dropout: 0.1
activation_dropout: 0.0
relu_dropout: 0.0

# Decoder size
decoder_embed_dim: 768
decoder_output_dim: 768
decoder_input_dim: 768
decoder_ffn_embed_dim: 3072
decoder_layers: 12
decoder_attention_heads: 12

# Layer-norm placement
decoder_normalize_before: true
no_decoder_final_norm: false

# Adaptive softmax (no cutoff set in this preset)
adaptive_softmax_cutoff: null
adaptive_softmax_dropout: 0
adaptive_softmax_factor: 4

# Embeddings
no_token_positional_embeddings: false
share_decoder_input_output_embed: false

# Character-level input (turned off here; filter settings kept for reference)
character_embeddings: false
character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
character_embedding_dim: 4
char_embedder_highway_layers: 2

# Adaptive input (turned off here)
adaptive_input: false
adaptive_input_factor: 4
adaptive_input_cutoff: null
tie_adaptive_weights: false
tie_adaptive_proj: false

# Remaining decoder options
decoder_learned_pos: false
decoder_layerdrop: 0
decoder_layers_to_keep: null
layernorm_embedding: false
no_scale_embedding: false

# Quantization noise (all zero in this preset)
quant_noise_pq: 0
quant_noise_pq_block_size: 8
quant_noise_scalar: 0
# @package _group_
# Transformer language-model preset: 48 decoder layers, 1600-dim embeddings,
# 6400-dim feed-forward, 25 attention heads, GELU activations.

# Activation and dropout
activation_fn: "gelu"
dropout: 0.1
attention_dropout: 0.1
activation_dropout: 0.0
relu_dropout: 0.0

# Decoder size
decoder_embed_dim: 1600
decoder_output_dim: 1600
decoder_input_dim: 1600
decoder_ffn_embed_dim: 6400
decoder_layers: 48
decoder_attention_heads: 25

# Layer-norm placement
decoder_normalize_before: true
no_decoder_final_norm: false

# Adaptive softmax (no cutoff set in this preset)
adaptive_softmax_cutoff: null
adaptive_softmax_dropout: 0
adaptive_softmax_factor: 4

# Embeddings
no_token_positional_embeddings: false
share_decoder_input_output_embed: false

# Character-level input (turned off here; filter settings kept for reference)
character_embeddings: false
character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
character_embedding_dim: 4
char_embedder_highway_layers: 2

# Adaptive input (turned off here)
adaptive_input: false
adaptive_input_factor: 4
adaptive_input_cutoff: null
tie_adaptive_weights: false
tie_adaptive_proj: false

# Remaining decoder options
decoder_learned_pos: false
decoder_layerdrop: 0
decoder_layers_to_keep: null
layernorm_embedding: false
no_scale_embedding: false

# Quantization noise (all zero in this preset)
quant_noise_pq: 0
quant_noise_pq_block_size: 8
quant_noise_scalar: 0
# @package _group_
# Transformer language-model preset: 24 decoder layers, 1024-dim embeddings,
# 4096-dim feed-forward, 16 attention heads, GELU activations.

# Activation and dropout
activation_fn: "gelu"
dropout: 0.1
attention_dropout: 0.1
activation_dropout: 0.0
relu_dropout: 0.0

# Decoder size
decoder_embed_dim: 1024
decoder_output_dim: 1024
decoder_input_dim: 1024
decoder_ffn_embed_dim: 4096
decoder_layers: 24
decoder_attention_heads: 16

# Layer-norm placement
decoder_normalize_before: true
no_decoder_final_norm: false

# Adaptive softmax (no cutoff set in this preset)
adaptive_softmax_cutoff: null
adaptive_softmax_dropout: 0
adaptive_softmax_factor: 4

# Embeddings
no_token_positional_embeddings: false
share_decoder_input_output_embed: false

# Character-level input (turned off here; filter settings kept for reference)
character_embeddings: false
character_filters: "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]"
character_embedding_dim: 4
char_embedder_highway_layers: 2

# Adaptive input (turned off here)
adaptive_input: false
adaptive_input_factor: 4
adaptive_input_cutoff: null
tie_adaptive_weights: false
tie_adaptive_proj: false

# Remaining decoder options
decoder_learned_pos: false
decoder_layerdrop: 0
decoder_layers_to_keep: null
layernorm_embedding: false
no_scale_embedding: false

# Quantization noise (all zero in this preset)
quant_noise_pq: 0
quant_noise_pq_block_size: 8
quant_noise_scalar: 0
# @package _group_
# wav2vec 2.0 "large" preset: 24 encoder layers, 1024-dim embeddings,
# 4096-dim feed-forward, 16 attention heads, with every dropout and
# layerdrop value set to zero.

# Quantization / feature extractor
quantize_targets: true
extractor_mode: layer_norm
layer_norm_first: true
final_dim: 768
latent_temp: [2.0,0.1,0.999995]

# Regularization (all disabled in this preset)
encoder_layerdrop: 0.0
dropout_input: 0.0
dropout_features: 0.0
dropout: 0.0
attention_dropout: 0.0

# Encoder size
conv_bias: true
encoder_layers: 24
encoder_embed_dim: 1024
encoder_ffn_embed_dim: 4096
encoder_attention_heads: 16

feature_grad_mult: 1.0
# Import every public Python module that lives next to this file, in sorted
# order, so that importing the package also imports each criterion module.
# Files whose name starts with "_" (such as this one) are skipped.
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if file.startswith("_") or not file.endswith(".py"):
        continue
    # Everything before the first ".py" is used as the module name.
    file_name = file.partition(".py")[0]
    importlib.import_module("fairseq.criterions." + file_name)
@register_criterion("adaptive_loss", dataclass=AdaptiveLossConfig)
class AdaptiveLoss(FairseqCriterion):
    """Loss for decoders that expose an ``adaptive_softmax`` module.

    This is an implementation of the loss function accompanying the adaptive
    softmax approximation for graphical processing units (GPU), described in
    the paper "Efficient softmax approximation for GPUs"
    (http://arxiv.org/abs/1609.04309).
    """

    def __init__(self, task, sentence_avg):
        super().__init__(task)
        self.sentence_avg = sentence_avg

    @classmethod
    def build_criterion(cls, cfg: AdaptiveLossConfig, task):
        """Build the criterion from its config.

        Raises:
            Exception: if ``cfg.ddp_backend`` is ``"c10d"`` or
                ``"pytorch_ddp"``, which this criterion does not support.
        """
        if cfg.ddp_backend in {"c10d", "pytorch_ddp"}:
            raise Exception(
                "AdaptiveLoss is not compatible with the PyTorch "
                "version of DistributedDataParallel. Please use "
                "`--ddp-backend=legacy_ddp` instead."
            )
        return cls(task, cfg.sentence_avg)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """

        assert (
            hasattr(model.decoder, "adaptive_softmax")
            and model.decoder.adaptive_softmax is not None
        )
        adaptive_softmax = model.decoder.adaptive_softmax

        net_output = model(**sample["net_input"])
        orig_target = model.get_targets(sample, net_output)

        nsentences = orig_target.size(0)
        orig_target = orig_target.view(-1)

        bsz = orig_target.size(0)

        # The adaptive softmax returns parallel lists of logits and targets;
        # entries whose target is None are skipped below.
        logits, target = adaptive_softmax(net_output[0], orig_target)
        assert len(target) == len(logits)

        loss = net_output[0].new(1 if reduce else bsz).zero_()

        for i in range(len(target)):
            if target[i] is not None:
                # Valid class indices are 0 .. size(1) - 1, so the upper bound
                # must be strict; an index equal to size(1) is out of range
                # for the cross-entropy call below.
                assert target[i].min() >= 0 and target[i].max() < logits[i].size(1)
                loss += F.cross_entropy(
                    logits[i],
                    target[i],
                    ignore_index=self.padding_idx,
                    reduction="sum" if reduce else "none",
                )

        # Token count excludes padding positions.
        orig = utils.strip_pad(orig_target, self.padding_idx)
        ntokens = orig.numel()
        sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
        logging_output = {
            "loss": loss.data,
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""
        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
        sample_size = utils.item(
            sum(log.get("sample_size", 0) for log in logging_outputs)
        )

        # Dividing by log(2) converts the loss from base e to base 2.
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        if sample_size != ntokens:
            metrics.log_scalar(
                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
            )
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
            )
        else:
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
            )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True improves distributed training speed.
        """
        return True
log_probs, sample=sample ) def get_targets(self, *unused): return self.target @property def decoder(self): return self.model.decoder class _CompositeLoss(LegacyFairseqCriterion): def __init__(self, args, task, underlying_criterion): super().__init__(args, task) self.underlying_criterion = underlying_criterion def forward(self, model, sample, reduce=True): net_outputs = model(**sample["net_input"]) targets = sample["target"] bsz = targets[0].size(0) loss = net_outputs[0][0].new(1 if reduce else bsz).float().zero_() sample_size = 0 logging_output = {} for o, t in zip(net_outputs[0], targets): m = FakeModel(model, (o, net_outputs[1]), t) sample["target"] = t l, ss, logging_output = self.underlying_criterion(m, sample, reduce) loss += l sample_size += ss loss.div_(len(targets)) sample_size /= len(targets) logging_output["loss"] = utils.item(loss.data) if reduce else loss.data return loss, sample_size, logging_output @staticmethod def aggregate_logging_outputs(logging_outputs): return underlying_criterion.__class__.aggregate_logging_outputs( logging_outputs ) @staticmethod def reduce_metrics(logging_outputs) -> None: underlying_criterion.__class__.reduce_metrics(logging_outputs) return _CompositeLoss(args, task, underlying_criterion) ================================================ FILE: fairseq/criterions/cross_entropy.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@register_criterion("cross_entropy", dataclass=CrossEntropyCriterionConfig)
class CrossEntropyCriterion(FairseqCriterion):
    """Negative log-likelihood criterion computed with ``F.nll_loss``."""

    def __init__(self, task, sentence_avg):
        super().__init__(task)
        self.sentence_avg = sentence_avg

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample["net_input"])
        loss, _ = self.compute_loss(model, net_output, sample, reduce=reduce)

        token_count = sample["ntokens"]
        sentence_count = sample["target"].size(0)
        # Normalise by sentences or by tokens depending on the configuration.
        sample_size = sentence_count if self.sentence_avg else token_count

        logging_output = {
            "loss": loss.data,
            "ntokens": token_count,
            "nsentences": sentence_count,
            "sample_size": sample_size,
        }
        return loss, sample_size, logging_output

    def compute_loss(self, model, net_output, sample, reduce=True):
        """Return the NLL loss twice, as a ``(loss, nll_loss)`` pair."""
        log_probs = model.get_normalized_probs(net_output, log_probs=True)
        flat_log_probs = log_probs.view(-1, log_probs.size(-1))
        flat_target = model.get_targets(sample, net_output).view(-1)
        reduction = "sum" if reduce else "none"
        loss = F.nll_loss(
            flat_log_probs,
            flat_target,
            ignore_index=self.padding_idx,
            reduction=reduction,
        )
        return loss, loss

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        def total(key):
            return sum(log.get(key, 0) for log in logging_outputs)

        loss_sum = total("loss")
        ntokens = total("ntokens")
        sample_size = total("sample_size")

        # we divide by log(2) to convert the loss from base e to base 2
        ln2 = math.log(2)
        metrics.log_scalar("loss", loss_sum / sample_size / ln2, sample_size, round=3)
        if sample_size != ntokens:
            # The loss was not normalised per token, so log a per-token value
            # and derive the perplexity from it.
            metrics.log_scalar("nll_loss", loss_sum / ntokens / ln2, ntokens, round=3)
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
            )
        else:
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
            )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return True
@dataclass
class CtcCriterionConfig(FairseqDataclass):
    """Configuration options for the ``ctc`` criterion."""

    zero_infinity: bool = field(
        default=False,
        metadata={"help": "zero inf loss when source length <= target length"},
    )
    sentence_avg: bool = II("optimization.sentence_avg")
    post_process: str = field(
        default="letter",
        metadata={
            "help": "how to post process predictions into words. can be letter, "
            "wordpiece, BPE symbols, etc. "
            "See fairseq.data.data_utils.post_process() for full list of options"
        },
    )
    wer_kenlm_model: Optional[str] = field(
        default=None,
        metadata={
            "help": "if this is provided, use kenlm to compute wer (along with other wer_* args)"
        },
    )
    wer_lexicon: Optional[str] = field(
        default=None,
        metadata={"help": "lexicon to use with wer_kenlm_model"},
    )
    wer_lm_weight: float = field(
        default=2.0,
        metadata={"help": "lm weight to use with wer_kenlm_model"},
    )
    wer_word_score: float = field(
        default=-1.0,
        metadata={"help": "lm word score to use with wer_kenlm_model"},
    )
    # The help text used to be a copy of the wer_word_score help.
    wer_sil_weight: float = field(
        default=0.0,
        metadata={"help": "silence weight to use with wer_kenlm_model"},
    )

    wer_args: Optional[str] = field(
        default=None,
        metadata={
            "help": "DEPRECATED: tuple of (wer_kenlm_model, wer_lexicon, wer_lm_weight, wer_word_score)"
        },
    )
@register_criterion("ctc", dataclass=CtcCriterionConfig)
class CtcCriterion(FairseqCriterion):
    """CTC loss criterion that also reports unit and word error counts.

    The loss is computed with ``F.ctc_loss``. When the model is not in
    training mode, ``forward`` additionally decodes the predictions (greedy,
    and optionally with a KenLM decoder) and logs edit-distance statistics.
    """

    def __init__(
        self, cfg: CtcCriterionConfig, task: FairseqTask, rdrop_alpha: float = 0.0
    ):
        super().__init__(task)
        self.blank_idx = (
            task.target_dictionary.index(task.blank_symbol)
            if hasattr(task, "blank_symbol")
            else 0
        )
        self.pad_idx = task.target_dictionary.pad()
        self.eos_idx = task.target_dictionary.eos()
        self.post_process = cfg.post_process

        self.rdrop_alpha = rdrop_alpha

        if cfg.wer_args is not None:
            # NOTE(security): the deprecated wer_args string is passed to
            # eval(); it must only ever come from trusted configuration.
            (
                cfg.wer_kenlm_model,
                cfg.wer_lexicon,
                cfg.wer_lm_weight,
                cfg.wer_word_score,
            ) = eval(cfg.wer_args)

        if cfg.wer_kenlm_model is not None and cfg.wer_kenlm_model != "":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            dec_args = Namespace()
            dec_args.nbest = 1
            dec_args.criterion = "ctc"
            dec_args.kenlm_model = cfg.wer_kenlm_model
            dec_args.lexicon = cfg.wer_lexicon
            dec_args.beam = 50
            dec_args.beam_size_token = min(50, len(task.target_dictionary))
            dec_args.beam_threshold = min(50, len(task.target_dictionary))
            dec_args.lm_weight = cfg.wer_lm_weight
            dec_args.word_score = cfg.wer_word_score
            # Honour the configured silence weight; it used to be overwritten
            # with a hard-coded 0 right after being assigned.
            dec_args.sil_weight = cfg.wer_sil_weight
            dec_args.unk_weight = -math.inf

            self.w2l_decoder = W2lKenLMDecoder(dec_args, task.target_dictionary)
        else:
            self.w2l_decoder = None

        self.zero_infinity = cfg.zero_infinity
        self.sentence_avg = cfg.sentence_avg

    def forward(self, model, sample, reduce=True, **kwargs):
        """Compute the summed CTC loss for ``sample``.

        Returns a tuple ``(loss, sample_size, logging_output)``. The loss is
        always reduced with ``"sum"``; ``reduce`` and ``kwargs`` are accepted
        but not used. When ``rdrop_alpha > 0`` the targets (and, if needed,
        the source lengths) stored in ``sample`` are duplicated in place.
        """
        net_output = model(**sample["net_input"])
        lprobs = model.get_normalized_probs(
            net_output, log_probs=True
        ).contiguous()  # (T, B, C) from the encoder

        # CTC loss is calculated over duplicated inputs
        # sample is already duplicated for R-Drop
        if self.rdrop_alpha > 0:
            for k, v in sample.items():
                if k in ["target", "target_lengths"]:
                    sample[k] = torch.cat([v, v.clone()], dim=0)
                elif k == "net_input":
                    if sample[k]["src_tokens"].size(1) != sample[k]["src_lengths"].size(
                        0
                    ):
                        # for decoder CTC loss
                        sample[k]["src_lengths"] = torch.cat(
                            [
                                sample[k]["src_lengths"],
                                sample[k]["src_lengths"].clone(),
                            ],
                            dim=0,
                        )

        if "src_lengths" in sample["net_input"]:
            input_lengths = sample["net_input"]["src_lengths"]
        else:
            if net_output["padding_mask"] is not None:
                non_padding_mask = ~net_output["padding_mask"]
                input_lengths = non_padding_mask.long().sum(-1)
            else:
                # No padding information: every sequence uses all time steps.
                input_lengths = lprobs.new_full(
                    (lprobs.size(1),), lprobs.size(0), dtype=torch.long
                )

        # Padding and end-of-sentence symbols are not CTC targets.
        pad_mask = (sample["target"] != self.pad_idx) & (
            sample["target"] != self.eos_idx
        )
        targets_flat = sample["target"].masked_select(pad_mask)
        if "target_lengths" in sample:
            target_lengths = sample["target_lengths"]
        else:
            target_lengths = pad_mask.sum(-1)

        with torch.backends.cudnn.flags(enabled=False):
            loss = F.ctc_loss(
                lprobs,
                targets_flat,
                input_lengths,
                target_lengths,
                blank=self.blank_idx,
                reduction="sum",
                zero_infinity=self.zero_infinity,
            )

        ntokens = (
            sample["ntokens"] if "ntokens" in sample else target_lengths.sum().item()
        )

        sample_size = sample["target"].size(0) if self.sentence_avg else ntokens
        logging_output = {
            "loss": utils.item(loss.data),  # * sample['ntokens'],
            "ntokens": ntokens,
            "nsentences": sample["id"].numel(),
            "sample_size": sample_size,
        }

        if not model.training:
            import editdistance

            with torch.no_grad():
                lprobs_t = lprobs.transpose(0, 1).float().contiguous().cpu()

                c_err = 0
                c_len = 0
                w_errs = 0
                w_len = 0
                wv_errs = 0
                for lp, t, inp_l in zip(
                    lprobs_t,
                    sample["target_label"]
                    if "target_label" in sample
                    else sample["target"],
                    input_lengths,
                ):
                    lp = lp[:inp_l].unsqueeze(0)

                    # Take the first hypothesis of the first utterance, or
                    # None when the decoder returned nothing.
                    decoded = None
                    if self.w2l_decoder is not None:
                        decoded = self.w2l_decoder.decode(lp)
                        if len(decoded) < 1:
                            decoded = None
                        else:
                            decoded = decoded[0]
                            if len(decoded) < 1:
                                decoded = None
                            else:
                                decoded = decoded[0]

                    p = (t != self.task.target_dictionary.pad()) & (
                        t != self.task.target_dictionary.eos()
                    )
                    targ = t[p]
                    targ_units = self.task.target_dictionary.string(targ)
                    targ_units_arr = targ.tolist()

                    # Greedy CTC decoding: collapse repeats, then drop blanks.
                    toks = lp.argmax(dim=-1).unique_consecutive()
                    pred_units_arr = toks[toks != self.blank_idx].tolist()

                    c_err += editdistance.eval(pred_units_arr, targ_units_arr)
                    c_len += len(targ_units_arr)

                    targ_words = post_process(targ_units, self.post_process).split()

                    pred_units = self.task.target_dictionary.string(pred_units_arr)
                    pred_words_raw = post_process(pred_units, self.post_process).split()

                    if decoded is not None and "words" in decoded:
                        pred_words = decoded["words"]
                        w_errs += editdistance.eval(pred_words, targ_words)
                        wv_errs += editdistance.eval(pred_words_raw, targ_words)
                    else:
                        dist = editdistance.eval(pred_words_raw, targ_words)
                        w_errs += dist
                        wv_errs += dist

                    w_len += len(targ_words)

                logging_output["wv_errors"] = wv_errs
                logging_output["w_errors"] = w_errs
                logging_output["w_total"] = w_len
                logging_output["c_errors"] = c_err
                logging_output["c_total"] = c_len

        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
        nsentences = utils.item(
            sum(log.get("nsentences", 0) for log in logging_outputs)
        )
        sample_size = utils.item(
            sum(log.get("sample_size", 0) for log in logging_outputs)
        )

        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        metrics.log_scalar("ntokens", ntokens)
        metrics.log_scalar("nsentences", nsentences)
        if sample_size != ntokens:
            metrics.log_scalar(
                "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
            )

        # Raw error counts are logged under underscore-prefixed names and
        # turned into rates by the derived meters below.
        c_errors = sum(log.get("c_errors", 0) for log in logging_outputs)
        metrics.log_scalar("_c_errors", c_errors)
        c_total = sum(log.get("c_total", 0) for log in logging_outputs)
        metrics.log_scalar("_c_total", c_total)
        w_errors = sum(log.get("w_errors", 0) for log in logging_outputs)
        metrics.log_scalar("_w_errors", w_errors)
        wv_errors = sum(log.get("wv_errors", 0) for log in logging_outputs)
        metrics.log_scalar("_wv_errors", wv_errors)
        w_total = sum(log.get("w_total", 0) for log in logging_outputs)
        metrics.log_scalar("_w_total", w_total)

        if c_total > 0:
            metrics.log_derived(
                "uer",
                lambda meters: safe_round(
                    meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
                )
                if meters["_c_total"].sum > 0
                else float("nan"),
            )
        if w_total > 0:
            metrics.log_derived(
                "wer",
                lambda meters: safe_round(
                    meters["_w_errors"].sum * 100.0 / meters["_w_total"].sum, 3
                )
                if meters["_w_total"].sum > 0
                else float("nan"),
            )
            metrics.log_derived(
                "raw_wer",
                lambda meters: safe_round(
                    meters["_wv_errors"].sum * 100.0 / meters["_w_total"].sum, 3
                )
                if meters["_w_total"].sum > 0
                else float("nan"),
            )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return True
class FairseqCriterion(_Loss):
    """Base class for criterions (loss functions)."""

    def __init__(self, task):
        super().__init__()
        self.task = task
        # padding_idx is only defined when the task exposes a
        # target_dictionary attribute; -100 is used when that is None.
        if hasattr(task, "target_dictionary"):
            tgt_dict = task.target_dictionary
            self.padding_idx = tgt_dict.pad() if tgt_dict is not None else -100

    @classmethod
    def add_args(cls, parser):
        """Add criterion-specific arguments to the parser."""
        dc = getattr(cls, "__dataclass", None)
        if dc is not None:
            gen_parser_from_dataclass(parser, dc())

    @classmethod
    def build_criterion(cls, cfg: "FairseqDataclass", task):
        """Construct a criterion from command-line args.

        The constructor arguments are inferred from the signature of ``cls``:
        ``task`` and ``cfg`` are passed through, any other parameter is read
        from the attribute of the same name on ``cfg``, and parameters that
        are missing from ``cfg`` fall back to their declared default.

        Raises:
            NotImplementedError: if the signature uses positional-only,
                ``*args`` or ``**kwargs`` parameters, or if a parameter has
                neither a ``cfg`` attribute nor a default value.
        """
        # arguments in the __init__.
        init_args = {}
        for p in inspect.signature(cls).parameters.values():
            if p.kind in {p.POSITIONAL_ONLY, p.VAR_POSITIONAL, p.VAR_KEYWORD}:
                # we haven't implemented inference for these argument types,
                # but PRs welcome :)
                raise NotImplementedError("{} not supported".format(p.kind))

            assert p.kind in {p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY}

            if p.name == "task":
                init_args["task"] = task
            elif p.name == "cfg":
                init_args["cfg"] = cfg
            elif hasattr(cfg, p.name):
                init_args[p.name] = getattr(cfg, p.name)
            elif p.default is not p.empty:
                # ``empty`` is a sentinel, so it is compared by identity; ``!=``
                # would call the default value's own comparison operator.
                pass  # we'll use the default value
            else:
                raise NotImplementedError(
                    "Unable to infer Criterion arguments, please implement "
                    "{}.build_criterion".format(cls.__name__)
                )
        return cls(**init_args)

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        raise NotImplementedError

    @staticmethod
    def aggregate_logging_outputs(
        logging_outputs: List[Dict[str, Any]]
    ) -> Dict[str, Any]:
        """Aggregate logging outputs from data parallel training."""
        utils.deprecation_warning(
            "The aggregate_logging_outputs API is deprecated. "
            "Please use the reduce_metrics API instead."
        )
        raise NotImplementedError

    @classmethod
    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
        """Aggregate logging outputs from data parallel training."""
        utils.deprecation_warning(
            "Criterions should implement the reduce_metrics API. "
            "Falling back to deprecated aggregate_logging_outputs API."
        )
        agg_logging_outputs = cls.aggregate_logging_outputs(logging_outputs)
        for k, v in agg_logging_outputs.items():
            # These bookkeeping counters are not logged as scalars here.
            if k in {"nsentences", "ntokens", "sample_size"}:
                continue
            metrics.log_scalar(k, v)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return False
class LegacyFairseqCriterion(FairseqCriterion):
    """Criterion base class that keeps the whole ``args`` namespace.

    Constructing an instance emits a deprecation warning asking authors to
    extend ``FairseqCriterion`` with explicit arguments instead.
    """

    def __init__(self, args, task):
        super().__init__(task=task)
        self.args = args

        warning = (
            "Criterions should take explicit arguments instead of an "
            "argparse.Namespace object, please update your criterion by "
            "extending FairseqCriterion instead of LegacyFairseqCriterion."
        )
        utils.deprecation_warning(warning)

    @classmethod
    def build_criterion(cls, args, task):
        """Construct a criterion from command-line args."""
        criterion = cls(args, task)
        return criterion
@register_criterion("fastspeech2", dataclass=FastSpeech2CriterionConfig)
class FastSpeech2Loss(FairseqCriterion):
    """Sum of feature L1, duration, pitch and energy losses, plus optional CTC."""

    def __init__(self, task, ctc_weight):
        super().__init__(task)
        self.ctc_weight = ctc_weight

    def forward(self, model: FairseqEncoderModel, sample, reduction="mean"):
        """Compute the combined loss for ``sample``.

        Returns ``(loss, sample_size, logging_output)`` where ``sample_size``
        is ``sample["nsentences"]`` and ``logging_output`` carries each loss
        component as a Python number.
        """
        src_tokens = sample["net_input"]["src_tokens"]
        src_lens = sample["net_input"]["src_lengths"]
        tgt_lens = sample["target_lengths"]
        _feat_out, _feat_out_post, _, log_dur_out, pitch_out, energy_out = model(
            src_tokens=src_tokens,
            src_lengths=src_lens,
            prev_output_tokens=sample["net_input"]["prev_output_tokens"],
            incremental_state=None,
            target_lengths=tgt_lens,
            speaker=sample["speaker"],
            durations=sample["durations"],
            pitches=sample["pitches"],
            energies=sample["energies"],
        )

        # Masks selecting the non-padded source and target positions.
        src_mask = lengths_to_mask(src_lens)
        tgt_mask = lengths_to_mask(tgt_lens)

        pitches, energies = sample["pitches"], sample["energies"]
        pitch_out, pitches = pitch_out[src_mask], pitches[src_mask]
        energy_out, energies = energy_out[src_mask], energies[src_mask]

        feat_out, feat = _feat_out[tgt_mask], sample["target"][tgt_mask]
        l1_loss = F.l1_loss(feat_out, feat, reduction=reduction)
        if _feat_out_post is not None:
            l1_loss += F.l1_loss(_feat_out_post[tgt_mask], feat, reduction=reduction)

        pitch_loss = F.mse_loss(pitch_out, pitches, reduction=reduction)
        energy_loss = F.mse_loss(energy_out, energies, reduction=reduction)

        log_dur_out = log_dur_out[src_mask]
        # Match the predictor's floating dtype directly instead of testing the
        # type name for "HalfTensor", which only covered fp16.
        dur = sample["durations"].float().to(log_dur_out.dtype)
        log_dur = torch.log(dur + 1)[src_mask]
        dur_loss = F.mse_loss(log_dur_out, log_dur, reduction=reduction)

        ctc_loss = torch.tensor(0.0).type_as(l1_loss)
        if self.ctc_weight > 0.0:
            lprobs = model.get_normalized_probs((_feat_out,), log_probs=True)
            lprobs = lprobs.transpose(0, 1)  # T x B x C
            src_tokens_flat = src_tokens.masked_select(src_mask)
            # Output frames are the CTC inputs; source tokens are the labels.
            ctc_loss = (
                F.ctc_loss(
                    lprobs,
                    src_tokens_flat,
                    tgt_lens,
                    src_lens,
                    reduction=reduction,
                    zero_infinity=True,
                )
                * self.ctc_weight
            )

        loss = l1_loss + dur_loss + pitch_loss + energy_loss + ctc_loss

        sample_size = sample["nsentences"]
        logging_output = {
            "loss": utils.item(loss.data),
            "ntokens": sample["ntokens"],
            "nsentences": sample["nsentences"],
            "sample_size": sample_size,
            "l1_loss": utils.item(l1_loss.data),
            "dur_loss": utils.item(dur_loss.data),
            "pitch_loss": utils.item(pitch_loss.data),
            "energy_loss": utils.item(energy_loss.data),
            "ctc_loss": utils.item(ctc_loss.data),
        }
        return loss, sample_size, logging_output

    @classmethod
    def reduce_metrics(cls, logging_outputs: List[Dict[str, Any]]) -> None:
        """Log sample-size-weighted averages of every loss component.

        If the first logging output contains ``"targ_frames"``, the inference
        statistics are also logged, normalised by the total target frames.
        """
        ns = [log.get("sample_size", 0) for log in logging_outputs]
        ntot = sum(ns)
        # The small epsilon keeps the weights finite when ntot is zero.
        ws = [n / (ntot + 1e-8) for n in ns]
        for key in [
            "loss",
            "l1_loss",
            "dur_loss",
            "pitch_loss",
            "energy_loss",
            "ctc_loss",
        ]:
            vals = [log.get(key, 0) for log in logging_outputs]
            val = sum(val * w for val, w in zip(vals, ws))
            metrics.log_scalar(key, val, ntot, round=3)
        metrics.log_scalar("sample_size", ntot, len(logging_outputs))

        # inference metrics
        if not logging_outputs or "targ_frames" not in logging_outputs[0]:
            return
        n = sum(log.get("targ_frames", 0) for log in logging_outputs)
        if n == 0:
            # No target frames: the per-frame ratios are undefined.
            return
        for key, new_key in [
            ("mcd_loss", "mcd_loss"),
            ("pred_frames", "pred_ratio"),
            ("nins", "ins_rate"),
            ("ndel", "del_rate"),
        ]:
            val = sum(log.get(key, 0) for log in logging_outputs)
            metrics.log_scalar(new_key, val / n, n, round=3)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """Logging outputs must not be summed across workers beforehand."""
        return False
@register_criterion("hubert", dataclass=HubertCriterionConfig)
class HubertCriterion(FairseqCriterion):
    """Cross-entropy over masked and unmasked predictions plus extra losses."""

    def __init__(
        self,
        task,
        pred_masked_weight,
        pred_nomask_weight,
        loss_weights=None,
        log_keys=None,
    ):
        super().__init__(task)
        self.pred_masked_weight = pred_masked_weight
        self.pred_nomask_weight = pred_nomask_weight
        self.loss_weights = loss_weights
        self.log_keys = [] if log_keys is None else log_keys

    @staticmethod
    def _count_correct(logits):
        """Return ``(correct, count)`` for one set of logits.

        A row counts as correct when index 0 is its argmax, unless index 0 is
        also its argmin.
        """
        if logits.numel() == 0:
            return 0, 0
        assert logits.dim() > 1, logits.shape
        zero_is_top = logits.argmax(-1) == 0
        zero_is_bottom = logits.argmin(-1) == 0
        tied = zero_is_top & zero_is_bottom
        correct = zero_is_top.long().sum().item() - tied.long().sum().item()
        return correct, zero_is_top.numel()

    def forward(self, model, sample, reduce=True, log_pred=False):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(target_list=sample["target_list"], **sample["net_input"])
        loss = 0.0
        sample_size = 0
        component_logs = {}
        reduction = "sum" if reduce else "none"

        # The masked ("m") and unmasked ("u") predictions are handled the same
        # way; only the selector flag and the weight differ.
        branches = (
            ("m", True, self.pred_masked_weight),
            ("u", False, self.pred_nomask_weight),
        )
        logits_by_tag = {}
        for tag, is_masked, weight in branches:
            logits_list = model.get_logits(net_output, is_masked)
            targets_list = model.get_targets(net_output, is_masked)
            assert weight == 0 or len(logits_list) > 0

            branch_losses = []
            for idx, (logits, targets) in enumerate(zip(logits_list, targets_list)):
                branch_loss = F.cross_entropy(logits, targets, reduction=reduction)
                branch_losses.append(branch_loss)
                component_logs[f"loss_{tag}_{idx}"] = branch_loss.detach().item()

            if weight > 0:
                loss += weight * sum(branch_losses)
                sample_size += targets_list[0].numel()
            logits_by_tag[tag] = logits_list

        if self.loss_weights is not None:
            assert hasattr(model, "get_extra_losses")
            extra_losses, names = model.get_extra_losses(net_output)
            if torch.is_tensor(extra_losses):
                extra_losses = [extra_losses]
                names = [names]
            # A single weight is shared by all extra losses.
            if len(self.loss_weights) == 1 and len(extra_losses) != 1:
                self.loss_weights = [self.loss_weights[0]] * len(extra_losses)
            assert len(extra_losses) == len(
                self.loss_weights
            ), f"{len(extra_losses)}, {len(self.loss_weights)}"
            for extra, name, coef in zip(extra_losses, names, self.loss_weights):
                if coef != 0 and extra is not None:
                    scaled = coef * extra.float() * sample_size
                    loss += scaled
                    component_logs[f"loss_{name}"] = scaled.item()

        logging_output = {
            "loss": loss.item() if reduce else loss,
            "ntokens": sample_size,
            "nsentences": sample["id"].numel(),
            "sample_size": sample_size,
            **component_logs,
        }

        for key in self.log_keys:
            if key in net_output:
                logging_output[key] = float(net_output[key])

        with torch.no_grad():
            for tag in ("m", "u"):
                for idx, logits in enumerate(logits_by_tag[tag]):
                    correct, count = self._count_correct(logits)
                    logging_output[f"correct_{tag}_{idx}"] = correct
                    logging_output[f"count_{tag}_{idx}"] = count

        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training (copied from normal cross entropy)."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)

        ln2 = math.log(2)
        metrics.log_scalar("loss", loss_sum / sample_size / ln2, sample_size, round=3)
        if sample_size != ntokens:
            metrics.log_scalar("nll_loss", loss_sum / ntokens / ln2, ntokens, round=3)
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
            )
        else:
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
            )

        def summed(key):
            return sum(log[key] for log in logging_outputs)

        # The keys of the first worker's output decide what gets aggregated.
        first_keys = list(logging_outputs[0].keys())

        counts = {}
        for key in first_keys:
            if key.startswith("count_"):
                value = summed(key)
                metrics.log_scalar(key, value)
                counts[key] = value

        for key in first_keys:
            if key.startswith("loss_"):
                metrics.log_scalar(key, summed(key) / sample_size / ln2, round=3)
            elif key.startswith("correct_"):
                # Each correct_* key is normalised by its matching count_* key.
                count_key = re.sub("correct", "count", key)
                metrics.log_scalar(key, summed(key) / counts[count_key])

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        raise NotImplementedError()

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return False
def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=None, reduce=True):
    """Return ``(loss, nll_loss)`` for label-smoothed negative log-likelihood.

    ``nll_loss`` is the negated log-probability gathered at ``target``. The
    smoothing term is the negated sum of ``lprobs`` over the last dimension.
    Positions whose target equals ``ignore_index`` contribute zero to both.
    With ``reduce`` set, both terms are summed to scalars.
    """
    # Give the target a trailing dimension so it can index the last axis.
    if target.dim() == lprobs.dim() - 1:
        target = target.unsqueeze(-1)

    nll_loss = lprobs.gather(dim=-1, index=target).neg()
    smooth_loss = lprobs.sum(dim=-1, keepdim=True).neg()

    if ignore_index is None:
        nll_loss, smooth_loss = nll_loss.squeeze(-1), smooth_loss.squeeze(-1)
    else:
        ignored = target.eq(ignore_index)
        for term in (nll_loss, smooth_loss):
            term.masked_fill_(ignored, 0.0)

    if reduce:
        nll_loss, smooth_loss = nll_loss.sum(), smooth_loss.sum()

    # epsilon is spread over the classes other than the target one.
    eps_i = epsilon / (lprobs.size(-1) - 1)
    loss = (1.0 - epsilon - eps_i) * nll_loss + eps_i * smooth_loss
    return loss, nll_loss
@register_criterion(
    "label_smoothed_cross_entropy", dataclass=LabelSmoothedCrossEntropyCriterionConfig
)
class LabelSmoothedCrossEntropyCriterion(FairseqCriterion):
    """Cross-entropy criterion with label smoothing and optional accuracy."""

    def __init__(
        self,
        task,
        sentence_avg,
        label_smoothing,
        ignore_prefix_size=0,
        report_accuracy=False,
    ):
        super().__init__(task)
        self.sentence_avg = sentence_avg
        self.eps = label_smoothing
        self.ignore_prefix_size = ignore_prefix_size
        self.report_accuracy = report_accuracy

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample["net_input"])
        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)

        token_count = sample["ntokens"]
        sentence_count = sample["target"].size(0)
        sample_size = sentence_count if self.sentence_avg else token_count

        logging_output = {
            "loss": loss.data,
            "nll_loss": nll_loss.data,
            "ntokens": token_count,
            "nsentences": sentence_count,
            "sample_size": sample_size,
        }
        if self.report_accuracy:
            n_correct, total = self.compute_accuracy(model, net_output, sample)
            logging_output["n_correct"] = utils.item(n_correct.data)
            logging_output["total"] = utils.item(total.data)
        return loss, sample_size, logging_output

    def get_lprobs_and_target(self, model, net_output, sample):
        """Return log-probabilities and targets flattened over all positions.

        When ``ignore_prefix_size`` is positive, that many leading positions
        along the second dimension are dropped from both tensors first.
        """
        lprobs = model.get_normalized_probs(net_output, log_probs=True)
        target = model.get_targets(sample, net_output)
        skip = self.ignore_prefix_size
        if skip > 0:
            # lprobs: B x T x C
            lprobs = lprobs[:, skip:, :].contiguous()
            target = target[:, skip:].contiguous()
        flat_lprobs = lprobs.view(-1, lprobs.size(-1))
        flat_target = target.view(-1)
        return flat_lprobs, flat_target

    def compute_loss(self, model, net_output, sample, reduce=True):
        """Return ``(loss, nll_loss)`` from ``label_smoothed_nll_loss``."""
        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
        return label_smoothed_nll_loss(
            lprobs,
            target,
            self.eps,
            ignore_index=self.padding_idx,
            reduce=reduce,
        )

    def compute_accuracy(self, model, net_output, sample):
        """Return the number of correct argmax predictions and of non-pad targets."""
        lprobs, target = self.get_lprobs_and_target(model, net_output, sample)
        keep = target.ne(self.padding_idx)
        predicted = lprobs.argmax(1).masked_select(keep)
        expected = target.masked_select(keep)
        n_correct = torch.sum(predicted.eq(expected))
        total = torch.sum(keep)
        return n_correct, total

    @classmethod
    def reduce_metrics(cls, logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        def summed(key):
            return sum(log.get(key, 0) for log in logging_outputs)

        loss_sum = summed("loss")
        nll_loss_sum = summed("nll_loss")
        ntokens = summed("ntokens")
        sample_size = summed("sample_size")

        # Dividing by log(2) converts the losses from base e to base 2.
        ln2 = math.log(2)
        metrics.log_scalar("loss", loss_sum / sample_size / ln2, sample_size, round=3)
        metrics.log_scalar("nll_loss", nll_loss_sum / ntokens / ln2, ntokens, round=3)
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
        )

        total = utils.item(summed("total"))
        if total > 0:
            metrics.log_scalar("total", total)
            n_correct = utils.item(summed("n_correct"))
            metrics.log_scalar("n_correct", n_correct)

            def accuracy(meters):
                seen = meters["total"].sum
                if seen > 0:
                    return round(meters["n_correct"].sum * 100.0 / seen, 3)
                return float("nan")

            metrics.log_derived("accuracy", accuracy)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return True
""" return True ================================================ FILE: fairseq/criterions/label_smoothed_cross_entropy_latency_augmented.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from dataclasses import dataclass, field import torch from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import register_criterion from fairseq.criterions.label_smoothed_cross_entropy import ( LabelSmoothedCrossEntropyCriterion, LabelSmoothedCrossEntropyCriterionConfig, ) try: from simuleval.metrics.latency import ( AverageLagging, AverageProportion, DifferentiableAverageLagging, ) LATENCY_METRICS = { "average_lagging": AverageLagging, "average_proportion": AverageProportion, "differentiable_average_lagging": DifferentiableAverageLagging, } except ImportError: LATENCY_METRICS = None @dataclass class LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig( LabelSmoothedCrossEntropyCriterionConfig ): latency_avg_weight: float = field( default=0.0, metadata={"help": "weight fot average latency loss."}, ) latency_var_weight: float = field( default=0.0, metadata={"help": "weight fot variance latency loss."}, ) latency_avg_type: str = field( default="differentiable_average_lagging", metadata={"help": "latency type for average loss"}, ) latency_var_type: str = field( default="variance_delay", metadata={"help": "latency typ for variance loss"}, ) latency_gather_method: str = field( default="weighted_average", metadata={"help": "method to gather latency loss for all heads"}, ) latency_update_after: int = field( default=0, metadata={"help": "Add latency loss after certain steps"}, ) @register_criterion( "latency_augmented_label_smoothed_cross_entropy", dataclass=LabelSmoothedCrossEntropyCriterionLatencyAugmentConfig, ) class LatencyAugmentedLabelSmoothedCrossEntropyCriterion( 
LabelSmoothedCrossEntropyCriterion ): def __init__( self, task, sentence_avg, label_smoothing, ignore_prefix_size, report_accuracy, latency_avg_weight, latency_var_weight, latency_avg_type, latency_var_type, latency_gather_method, latency_update_after, ): super().__init__( task, sentence_avg, label_smoothing, ignore_prefix_size, report_accuracy ) assert LATENCY_METRICS is not None, "Please make sure SimulEval is installed." self.latency_avg_weight = latency_avg_weight self.latency_var_weight = latency_var_weight self.latency_avg_type = latency_avg_type self.latency_var_type = latency_var_type self.latency_gather_method = latency_gather_method self.latency_update_after = latency_update_after def forward(self, model, sample, reduce=True): net_output = model(**sample["net_input"]) # 1. Compute cross entropy loss loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce) # 2. Compute cross latency loss latency_loss, expected_latency, expected_delays_var = self.compute_latency_loss( model, sample, net_output ) if self.latency_update_after > 0: num_updates = getattr(model.decoder, "num_updates", None) assert ( num_updates is not None ), "model.decoder doesn't have attribute 'num_updates'" if num_updates <= self.latency_update_after: latency_loss = 0 loss += latency_loss sample_size = ( sample["target"].size(0) if self.sentence_avg else sample["ntokens"] ) logging_output = { "loss": loss.data, "nll_loss": nll_loss.data, "ntokens": sample["ntokens"], "nsentences": sample["target"].size(0), "sample_size": sample_size, "latency": expected_latency, "delays_var": expected_delays_var, "latency_loss": latency_loss, } if self.report_accuracy: n_correct, total = self.compute_accuracy(model, net_output, sample) logging_output["n_correct"] = utils.item(n_correct.data) logging_output["total"] = utils.item(total.data) return loss, sample_size, logging_output def compute_latency_loss(self, model, sample, net_output): assert ( net_output[-1].encoder_padding_mask is 
None or not net_output[-1].encoder_padding_mask[:, 0].any() ), "Only right padding on source is supported." # 1. Obtain the expected alignment alpha_list = [item["alpha"] for item in net_output[1].attn_list] num_layers = len(alpha_list) bsz, num_heads, tgt_len, src_len = alpha_list[0].size() # bsz * num_layers * num_heads, tgt_len, src_len alpha_all = torch.cat(alpha_list, dim=1).view(-1, tgt_len, src_len) # 2 compute expected delays # bsz * num_heads * num_layers, tgt_len, src_len for MMA steps = ( torch.arange(1, 1 + src_len) .unsqueeze(0) .unsqueeze(1) .expand_as(alpha_all) .type_as(alpha_all) ) expected_delays = torch.sum(steps * alpha_all, dim=-1) target_padding_mask = ( model.get_targets(sample, net_output) .eq(self.padding_idx) .unsqueeze(1) .expand(bsz, num_layers * num_heads, tgt_len) .contiguous() .view(-1, tgt_len) ) src_lengths = ( sample["net_input"]["src_lengths"] .unsqueeze(1) .expand(bsz, num_layers * num_heads) .contiguous() .view(-1) ) expected_latency = LATENCY_METRICS[self.latency_avg_type]( expected_delays, src_lengths, None, target_padding_mask=target_padding_mask ) # 2.1 average expected latency of heads # bsz, num_layers * num_heads expected_latency = expected_latency.view(bsz, -1) if self.latency_gather_method == "average": # bsz * tgt_len expected_latency = expected_delays.mean(dim=1) elif self.latency_gather_method == "weighted_average": weights = torch.nn.functional.softmax(expected_latency, dim=1) expected_latency = torch.sum(expected_latency * weights, dim=1) elif self.latency_gather_method == "max": expected_latency = expected_latency.max(dim=1)[0] else: raise NotImplementedError expected_latency = expected_latency.sum() avg_loss = self.latency_avg_weight * expected_latency # 2.2 variance of expected delays expected_delays_var = ( expected_delays.view(bsz, -1, tgt_len).var(dim=1).mean(dim=1) ) expected_delays_var = expected_delays_var.sum() var_loss = self.latency_avg_weight * expected_delays_var # 3. 
Final loss latency_loss = avg_loss + var_loss return latency_loss, expected_latency, expected_delays_var @classmethod def reduce_metrics(cls, logging_outputs) -> None: super().reduce_metrics(logging_outputs) latency = sum(log.get("latency", 0) for log in logging_outputs) delays_var = sum(log.get("delays_var", 0) for log in logging_outputs) latency_loss = sum(log.get("latency_loss", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) metrics.log_scalar("latency", latency.float() / nsentences, nsentences, round=3) metrics.log_scalar("delays_var", delays_var / nsentences, nsentences, round=3) metrics.log_scalar( "latency_loss", latency_loss / nsentences, nsentences, round=3 ) ================================================ FILE: fairseq/criterions/label_smoothed_cross_entropy_with_alignment.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@register_criterion(
    "label_smoothed_cross_entropy_with_alignment",
    dataclass=LabelSmoothedCrossEntropyCriterionWithAlignmentConfig,
)
class LabelSmoothedCrossEntropyCriterionWithAlignment(
    LabelSmoothedCrossEntropyCriterion
):
    """Label-smoothed cross entropy with an additional attention-alignment loss."""

    def __init__(self, task, sentence_avg, label_smoothing, alignment_lambda):
        super().__init__(task, sentence_avg, label_smoothing)
        # Weight applied to the alignment loss before it is added to the loss.
        self.alignment_lambda = alignment_lambda

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample["net_input"])
        loss, nll_loss = self.compute_loss(model, net_output, sample, reduce=reduce)

        if self.sentence_avg:
            sample_size = sample["target"].size(0)
        else:
            sample_size = sample["ntokens"]

        # The logged "loss" is captured before the alignment term is added.
        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "nll_loss": utils.item(nll_loss.data) if reduce else nll_loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
        }

        # The alignment loss is only computed when the sample carries alignments.
        has_alignments = "alignments" in sample and sample["alignments"] is not None
        alignment_loss = (
            self.compute_alignment_loss(sample, net_output) if has_alignments else None
        )
        if alignment_loss is None:
            return loss, sample_size, logging_output

        logging_output["alignment_loss"] = utils.item(alignment_loss.data)
        loss += self.alignment_lambda * alignment_loss
        return loss, sample_size, logging_output

    def compute_alignment_loss(self, sample, net_output):
        """Return the weighted negative log attention mass on aligned pairs.

        Returns ``None`` when ``sample["alignments"]`` is empty.
        """
        attn_prob = net_output[1]["attn"][0]
        bsz, tgt_sz, src_sz = attn_prob.shape
        # Flatten batch and target positions into a single row index.
        flat_attn = attn_prob.view(bsz * tgt_sz, src_sz)

        align = sample["alignments"]
        align_weights = sample["align_weights"].float()

        if len(align) == 0:
            return None

        # align (shape [:, 2]) contains the src-tgt index pairs corresponding to
        # the alignments. align_weights (shape [:]) contains the 1 / frequency
        # of a tgt index for normalizing.
        tgt_rows = align[:, 1][:, None]
        src_cols = align[:, 0][:, None]
        weighted_log_attn = flat_attn[tgt_rows, src_cols].log() * align_weights[:, None]
        return -weighted_log_attn.sum()

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        def _total(key):
            # Missing keys contribute 0; the sum is reduced to a Python scalar.
            return utils.item(sum(entry.get(key, 0) for entry in logging_outputs))

        loss_sum = _total("loss")
        nll_loss_sum = _total("nll_loss")
        alignment_loss_sum = _total("alignment_loss")
        ntokens = _total("ntokens")
        sample_size = _total("sample_size")

        # Dividing by ln(2) converts the natural-log losses to base 2.
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        metrics.log_scalar(
            "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
        )
        metrics.log_scalar(
            "alignment_loss",
            alignment_loss_sum / sample_size / math.log(2),
            sample_size,
            round=3,
        )

        def _perplexity(meters):
            return utils.get_perplexity(meters["nll_loss"].avg)

        metrics.log_derived("ppl", _perplexity)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Report whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return True
@classmethod
def reduce_metrics(cls, logging_outputs) -> None:
    """Aggregate logging outputs and additionally log the CTC loss in base 2."""
    super().reduce_metrics(logging_outputs)

    # Accumulate both totals in one pass; missing keys contribute 0.
    ctc_total = 0
    size_total = 0
    for entry in logging_outputs:
        ctc_total = ctc_total + entry.get("ctc_loss", 0)
        size_total = size_total + entry.get("sample_size", 0)

    ctc_per_sample = ctc_total / size_total / math.log(2)
    metrics.log_scalar("ctc_loss", ctc_per_sample, size_total, round=3)
def forward(self, model, sample, reduce=True, net_output=None):
    """Compute the loss for the given sample.

    When ``net_output`` is not supplied the model is run here; with R-Drop
    enabled (``rdrop_alpha > 0``) the input is duplicated first, unless its
    batch size already differs from the target's.

    Returns a tuple with three elements:
    1) the loss
    2) the sample size, which is used as the denominator for the gradient
    3) logging outputs to display while training
    """
    if net_output is None:
        needs_duplication = self.rdrop_alpha > 0 and (
            sample["net_input"]["src_tokens"].size(0) == sample["target"].size(0)
        )
        if needs_duplication:
            sample = duplicate_input(sample)
        net_output = model(**sample["net_input"])

    loss, nll_loss, rdrop_kl_loss = self.compute_loss(
        model, net_output, sample, reduce=reduce
    )

    if self.sentence_avg:
        sample_size = sample["target"].size(0)
    else:
        sample_size = sample["ntokens"]

    logging_output = {
        "loss": loss.data,
        "nll_loss": nll_loss.data,
        "ntokens": sample["ntokens"],
        "nsentences": sample["target"].size(0),
        "sample_size": sample_size,
    }

    if self.report_accuracy:
        n_correct, total = self.compute_accuracy(model, net_output, sample)
        logging_output["n_correct"] = utils.item(n_correct.data)
        logging_output["total"] = utils.item(total.data)

    # The KL term is only logged when R-Drop is active.
    if self.rdrop_alpha > 0:
        logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)

    return loss, sample_size, logging_output
def duplicate_input(sample):
    """Double every tensor of the model input along dimension 0.

    If ``sample`` has a ``"net_input"`` entry, that nested mapping is
    processed; otherwise ``sample`` itself is. Each tensor value is replaced
    by the concatenation of itself and a clone of itself; non-tensor values
    are left untouched. The mapping is modified in place and the same
    ``sample`` object is returned.
    """
    nested = "net_input" in sample.keys()
    batch = sample["net_input"] if nested else sample

    # Snapshot the keys first; only existing entries are reassigned.
    for key in list(batch.keys()):
        value = batch[key]
        if isinstance(value, torch.Tensor):
            batch[key] = torch.cat([value, value.clone()], dim=0)

    if nested:
        sample["net_input"] = batch
    return sample
def compute_cross_entropy_loss(logits, targets, ignore_index=-100):
    """
    Return the summed cross entropy between ``logits`` and ``targets``.

    The log-softmax is taken over the last dimension in float32 and the
    negative log-likelihood is summed over all targets, skipping those equal
    to ``ignore_index``. The default value of ignore_index is the same as the
    default value for F.cross_entropy in pytorch.
    """
    num_rows = logits.size(0)
    num_targets = targets.size(-1)
    assert num_rows == num_targets, "Logits and Targets tensor shapes don't match up"

    log_probs = F.log_softmax(logits, -1, dtype=torch.float32)
    return F.nll_loss(
        log_probs,
        targets,
        reduction="sum",
        ignore_index=ignore_index,
    )
""" def __init__(self, task, masked_lm_only, nsp_loss_weight): super().__init__(task) self.masked_lm_only = masked_lm_only self.nsp_loss_weight = nsp_loss_weight @staticmethod def add_args(parser): """Args for MaskedLM Loss""" # Default for masked_lm_only is False so as to not break BERT training parser.add_argument( "--masked-lm-only", default=False, action="store_true", help="compute MLM loss only", ) parser.add_argument( "--nsp-loss-weight", default=1.0, type=float, help="weight for next sentence prediction" " loss (default 1)", ) def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ lm_logits, output_metadata = model(**sample["net_input"]) # reshape lm_logits from (N,T,C) to (N*T,C) lm_logits = lm_logits.view(-1, lm_logits.size(-1)) lm_targets = sample["lm_target"].view(-1) lm_loss = compute_cross_entropy_loss(lm_logits, lm_targets, self.padding_idx) # compute the number of tokens for which loss is computed. This is used # to normalize the loss ntokens = utils.strip_pad(lm_targets, self.padding_idx).numel() loss = lm_loss / ntokens nsentences = sample["nsentences"] # nsentences = 0 # Compute sentence loss if masked_lm_only is False sentence_loss = None if not self.masked_lm_only: sentence_logits = output_metadata["sentence_logits"] sentence_targets = sample["sentence_target"].view(-1) # This needs to be recomputed due to some differences between # TokenBlock and BlockPair dataset. This can be resolved with a # refactor of BERTModel which we will do in the future. # TODO: Remove this after refactor of BERTModel nsentences = sentence_targets.size(0) # Check for logits being none which can happen when remove_heads # is set to true in the BERT model. Ideally we should set # masked_lm_only to true in this case, but that requires some # refactor in the BERT model. 
if sentence_logits is not None: sentence_loss = compute_cross_entropy_loss( sentence_logits, sentence_targets ) loss += self.nsp_loss_weight * (sentence_loss / nsentences) # NOTE: as we are summing up per token mlm loss and per sentence nsp loss # we don't need to use sample_size as denominator for the gradient # here sample_size is just used for logging sample_size = 1 logging_output = { "loss": utils.item(loss.data) if reduce else loss.data, "lm_loss": utils.item(lm_loss.data) if reduce else lm_loss.data, # sentence loss is not always computed "sentence_loss": ( (utils.item(sentence_loss.data) if reduce else sentence_loss.data) if sentence_loss is not None else 0.0 ), "ntokens": ntokens, "nsentences": nsentences, "sample_size": sample_size, } return loss, sample_size, logging_output @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" lm_loss_sum = sum(log.get("lm_loss", 0) for log in logging_outputs) sentence_loss_sum = sum(log.get("sentence_loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) agg_loss = sum(log.get("loss", 0) for log in logging_outputs) metrics.log_scalar( "loss", agg_loss / sample_size / math.log(2) if sample_size > 0 else 0.0, sample_size, round=3, ) metrics.log_scalar( "lm_loss", lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0, ntokens, round=3, ) metrics.log_scalar( "sentence_loss", sentence_loss_sum / nsentences / math.log(2) if nsentences > 0 else 0.0, nsentences, round=3, ) metrics.log_scalar( "nll_loss", lm_loss_sum / ntokens / math.log(2) if ntokens > 0 else 0.0, ntokens, round=3, ) @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. 
@register_criterion("masked_lm", dataclass=MaskedLmConfig)
class MaskedLmLoss(FairseqCriterion):
    """
    Loss used for masked language model (MLM) training.
    """

    def __init__(self, cfg: MaskedLmConfig, task):
        super().__init__(task)
        self.tpu = cfg.tpu

    def _select_projection_mask(self, target_mask):
        """Return the mask handed to the model, or None to project all tokens."""
        if self.tpu:
            # always project all tokens on TPU
            return None
        if target_mask.device == torch.device("cpu"):
            # On CPU the empty-mask case is checked directly.
            return target_mask if target_mask.any() else None
        # Elsewhere torch.where keeps the decision on the device: an empty
        # mask is swapped for a single True value.
        return torch.where(
            target_mask.any(),
            target_mask,
            target_mask.new([True]),
        )

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        target_mask = sample["target"].ne(self.padding_idx)
        # Number of non-padding targets, counted before the mask is adjusted.
        sample_size = target_mask.int().sum()
        masked_tokens = self._select_projection_mask(target_mask)

        logits = model(**sample["net_input"], masked_tokens=masked_tokens)[0]
        targets = model.get_targets(sample, [logits])
        if masked_tokens is not None:
            targets = targets[masked_tokens]

        flat_logits = logits.view(-1, logits.size(-1))
        loss = modules.cross_entropy(
            flat_logits,
            targets.view(-1),
            reduction="sum",
            ignore_index=self.padding_idx,
        )

        logging_output = {
            "loss": loss if self.tpu else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["nsentences"],
            "sample_size": sample_size,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""
        loss_sum = 0
        sample_size = 0
        for entry in logging_outputs:
            loss_sum = loss_sum + entry.get("loss", 0)
            sample_size = sample_size + entry.get("sample_size", 0)

        # Dividing by ln(2) converts the natural-log loss to base 2.
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )

        def _perplexity(meters):
            return utils.get_perplexity(meters["loss"].avg)

        metrics.log_derived("ppl", _perplexity)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Report whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returning True
        improves distributed training speed.
        """
        return True
@register_criterion("model", dataclass=ModelCriterionConfig)
class ModelCriterion(FairseqCriterion):
    """
    This criterion relies on the model to supply losses.
    The losses should be a dictionary of name -> scalar returned by
    the model either by including it in the net_output dict or by
    implementing a get_losses(net_output, sample) method. The final loss is
    a scaled sum of all losses according to weights in loss_weights.
    If no weights are provided, then all losses are scaled by 1.0.

    The losses will be automatically logged. Additional keys from
    net_output dict can be logged via the log_keys parameter.
    """

    def __init__(self, task, loss_weights=None, log_keys=None, can_sum=True):
        super().__init__(task)
        # forward() calls len() on loss_weights and iterates log_keys, so the
        # None defaults are normalized to empty containers here.
        self.loss_weights = {} if loss_weights is None else loss_weights
        self.log_keys = [] if log_keys is None else log_keys
        self.can_sum = can_sum

    def forward(self, model, sample, reduce=True):
        """Run the model and combine the losses it supplies.

        Returns a tuple ``(loss, sample_size, logging_output)``.

        Raises:
            Exception: if the model has no ``get_losses`` method and the
                output is not a dict containing ``"losses"``.
            KeyError: if ``loss_weights`` is non-empty but lacks a weight for
                one of the supplied losses.
        """
        net_output = model(**sample["net_input"])

        scaled_losses = {}

        if hasattr(model, "get_losses"):
            losses = model.get_losses(net_output, sample)
        elif isinstance(net_output, dict) and "losses" in net_output:
            losses = net_output["losses"]
        else:
            raise Exception("Could not retrieve losses")

        for lk, p in losses.items():
            try:
                # With no configured weights every loss is scaled by 1.0.
                coef = 1.0 if len(self.loss_weights) == 0 else self.loss_weights[lk]
            except KeyError:
                logger.error(
                    f"weight for loss {lk} is not in loss_weights ({self.loss_weights})"
                )
                raise
            # Losses with zero weight or a None value are dropped entirely.
            if coef != 0 and p is not None:
                scaled_losses[lk] = coef * p.float().sum()

        loss = sum(scaled_losses.values())

        if "sample_size" in net_output:
            sample_size = net_output["sample_size"]
        else:
            sample_size = loss.numel()

        if reduce and loss.numel() > 1:
            loss = loss.sum()

        logging_output = {
            "loss": loss.data,
            "ntokens": sample_size,
            "nsentences": sample["id"].numel(),
            "sample_size": sample_size,
            "_world_size": 1,
        }

        for lk in self.log_keys:
            if lk in net_output and net_output[lk] is not None:
                if not torch.is_tensor(net_output[lk]) or net_output[lk].numel() == 1:
                    logging_output[lk] = float(net_output[lk])
                elif lk.startswith("_"):
                    # Underscore-prefixed tensors are passed through unchanged.
                    logging_output[lk] = net_output[lk]
                else:
                    # Multi-element tensors are logged one entry per element.
                    for i, v in enumerate(net_output[lk]):
                        logging_output[f"{lk}_{i}"] = float(v)

        # Individual terms are only logged when there is more than one.
        if len(scaled_losses) > 1:
            for lk, scaled in scaled_losses.items():
                if scaled.numel() > 1:
                    scaled = scaled.sum()
                logging_output[f"loss_{lk}"] = scaled.item()

        if "logs" in net_output:
            for lgw in net_output["logs"]:
                logging_output[lgw] = net_output["logs"][lgw]

        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""
        loss_sum = utils.item(sum(log.get("loss", 0) for log in logging_outputs))
        ntokens = utils.item(sum(log.get("ntokens", 0) for log in logging_outputs))
        nsentences = utils.item(
            sum(log.get("nsentences", 0) for log in logging_outputs)
        )
        sample_size = utils.item(
            sum(log.get("sample_size", 0) for log in logging_outputs)
        )

        metrics.log_scalar("loss", loss_sum / sample_size, sample_size, round=3)
        metrics.log_scalar("ntokens", ntokens)
        metrics.log_scalar("nsentences", nsentences)
        metrics.log_scalar("sample_size", sample_size)

        builtin_keys = {
            "loss",
            "ntokens",
            "nsentences",
            "sample_size",
            "_world_size",
        }

        world_size = utils.item(
            sum(log.get("_world_size", 0) for log in logging_outputs)
        )

        # Extra keys are discovered from the first logging output only.
        for k in logging_outputs[0]:
            if k not in builtin_keys and not k.startswith("_"):
                val = sum(log.get(k, 0) for log in logging_outputs)
                if k.startswith("loss_"):
                    # Per-term losses are normalized like the main loss.
                    metrics.log_scalar(k, val / sample_size, sample_size, round=3)
                else:
                    # Everything else is averaged over the summed _world_size.
                    metrics.log_scalar(k, val / world_size, round=3)

        correct = sum(log.get("correct", 0) for log in logging_outputs)
        total = sum(log.get("count", 0) for log in logging_outputs)

        if total > 0:
            metrics.log_scalar("_correct", correct)
            metrics.log_scalar("_total", total)

            metrics.log_derived(
                "accuracy",
                lambda meters: safe_round(
                    meters["_correct"].sum / meters["_total"].sum, 5
                )
                if meters["_total"].sum > 0
                else float("nan"),
            )

    def logging_outputs_can_be_summed(self) -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Returns the
        ``can_sum`` value this criterion was constructed with.
        """
        return self.can_sum
import math import torch import torch.nn.functional as F from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import FairseqDataclass from torch import Tensor from dataclasses import dataclass, field @dataclass class LabelSmoothedDualImitationCriterionConfig(FairseqDataclass): label_smoothing: float = field( default=0.0, metadata={"help": "epsilon for label smoothing, 0 means no label smoothing"}, ) @register_criterion("nat_loss", dataclass=LabelSmoothedDualImitationCriterionConfig) class LabelSmoothedDualImitationCriterion(FairseqCriterion): def __init__(self, task, label_smoothing): super().__init__(task) self.label_smoothing = label_smoothing def _compute_loss( self, outputs, targets, masks=None, label_smoothing=0.0, name="loss", factor=1.0 ): """ outputs: batch x len x d_model targets: batch x len masks: batch x len policy_logprob: if there is some policy depends on the likelihood score as rewards. 
""" def mean_ds(x: Tensor, dim=None) -> Tensor: return ( x.float().mean().type_as(x) if dim is None else x.float().mean(dim).type_as(x) ) if masks is not None: outputs, targets = outputs[masks], targets[masks] if masks is not None and not masks.any(): nll_loss = torch.tensor(0) loss = nll_loss else: logits = F.log_softmax(outputs, dim=-1) if targets.dim() == 1: losses = F.nll_loss(logits, targets.to(logits.device), reduction="none") else: # soft-labels losses = F.kl_div(logits, targets.to(logits.device), reduction="none") losses = losses.sum(-1) nll_loss = mean_ds(losses) if label_smoothing > 0: loss = ( nll_loss * (1 - label_smoothing) - mean_ds(logits) * label_smoothing ) else: loss = nll_loss loss = loss * factor return {"name": name, "loss": loss, "nll_loss": nll_loss, "factor": factor} def _custom_loss(self, loss, name="loss", factor=1.0): return {"name": name, "loss": loss, "factor": factor} def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ nsentences, ntokens = sample["nsentences"], sample["ntokens"] # B x T src_tokens, src_lengths = ( sample["net_input"]["src_tokens"], sample["net_input"]["src_lengths"], ) tgt_tokens, prev_output_tokens = sample["target"], sample["prev_target"] outputs = model(src_tokens, src_lengths, prev_output_tokens, tgt_tokens) losses, nll_loss = [], [] for obj in outputs: if outputs[obj].get("loss", None) is None: _losses = self._compute_loss( outputs[obj].get("out"), outputs[obj].get("tgt"), outputs[obj].get("mask", None), outputs[obj].get("ls", 0.0), name=obj + "-loss", factor=outputs[obj].get("factor", 1.0), ) else: _losses = self._custom_loss( outputs[obj].get("loss"), name=obj + "-loss", factor=outputs[obj].get("factor", 1.0), ) losses += [_losses] if outputs[obj].get("nll_loss", False): nll_loss += [_losses.get("nll_loss", 
0.0)] loss = sum(l["loss"] for l in losses) nll_loss = sum(l for l in nll_loss) if len(nll_loss) > 0 else loss.new_tensor(0) # NOTE: # we don't need to use sample_size as denominator for the gradient # here sample_size is just used for logging sample_size = 1 logging_output = { "loss": loss.data, "nll_loss": nll_loss.data, "ntokens": ntokens, "nsentences": nsentences, "sample_size": sample_size, } for l in losses: logging_output[l["name"]] = ( utils.item(l["loss"].data / l["factor"]) if reduce else l[["loss"]].data / l["factor"] ) return loss, sample_size, logging_output @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" sample_size = utils.item( sum(log.get("sample_size", 0) for log in logging_outputs) ) loss = utils.item(sum(log.get("loss", 0) for log in logging_outputs)) nll_loss = utils.item(sum(log.get("nll_loss", 0) for log in logging_outputs)) metrics.log_scalar( "loss", loss / sample_size / math.log(2), sample_size, round=3 ) metrics.log_scalar( "nll_loss", nll_loss / sample_size / math.log(2), sample_size, round=3 ) metrics.log_derived( "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg) ) for key in logging_outputs[0]: if key[-5:] == "-loss": val = sum(log.get(key, 0) for log in logging_outputs) metrics.log_scalar( key[:-5], val / sample_size / math.log(2) if sample_size > 0 else 0.0, sample_size, round=3, ) @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True ================================================ FILE: fairseq/criterions/sentence_prediction.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import math from dataclasses import dataclass, field from itertools import chain import numpy as np import torch import torch.nn.functional as F from sklearn.metrics import f1_score from sklearn.metrics import matthews_corrcoef as _matthews_corrcoef from scipy.stats import pearsonr, spearmanr from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import FairseqDataclass from fairseq.logging.meters import safe_round def simple_accuracy(preds, labels): return (preds == labels).mean() def acc_and_f1(preds, labels): acc = simple_accuracy(preds, labels) f1 = f1_score(y_true=labels, y_pred=preds) return { "acc": acc, "f1": f1, "acc_and_f1": (acc + f1) / 2, } def pearson_and_spearman(preds, labels): pearson_corr = pearsonr(preds, labels)[0] spearman_corr = spearmanr(preds, labels)[0] return { "pearson": pearson_corr, "spearmanr": spearman_corr, "corr": (pearson_corr + spearman_corr) / 2, } def matthews_corrcoef(preds, labels): # make it consistent with other metrics taking (preds, labels) as input mcc = _matthews_corrcoef(labels, preds) return mcc @dataclass class SentencePredictionConfig(FairseqDataclass): classification_head_name: str = field( default="sentence_classification_head", metadata={"help": "name of the classification head to use"}, ) regression_target: bool = field( default=False, ) report_mcc: bool = False report_acc_and_f1: bool = False report_pearson_and_spearman: bool = False @register_criterion("sentence_prediction", dataclass=SentencePredictionConfig) class SentencePredictionCriterion(FairseqCriterion): def __init__(self, cfg: SentencePredictionConfig, task): super().__init__(task) self.classification_head_name = cfg.classification_head_name self.regression_target = cfg.regression_target self.keep_pred_and_targ = ( cfg.report_mcc or cfg.report_acc_and_f1 or cfg.report_pearson_and_spearman ) self.report_mcc = cfg.report_mcc self.report_acc_and_f1 = cfg.report_acc_and_f1 
self.report_pearson_and_spearman = cfg.report_pearson_and_spearman self.label_dict = task.label_dictionary def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ assert ( hasattr(model, "classification_heads") and self.classification_head_name in model.classification_heads ), "model must provide sentence classification head for --criterion=sentence_prediction" logits, _ = model( **sample["net_input"], features_only=True, classification_head_name=self.classification_head_name, ) targets = model.get_targets(sample, [logits]).view(-1) sample_size = targets.numel() if not self.regression_target: lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) task_loss = F.nll_loss(lprobs, targets, reduction="sum") else: logits = logits.view(-1).float() targets = targets.float() task_loss = F.mse_loss(logits, targets, reduction="sum") logging_output = {} loss = task_loss # mha & ffn regularization update if ( hasattr(model, "args") and hasattr(model.args, "mha_reg_scale_factor") and model.args.mha_reg_scale_factor != 0.0 ): mha_reg_loss = model._get_adaptive_head_loss() loss += mha_reg_loss logging_output.update({"mha_reg_loss": mha_reg_loss}) if ( hasattr(model, "args") and hasattr(model.args, "ffn_reg_scale_factor") and model.args.ffn_reg_scale_factor != 0.0 ): ffn_reg_loss = model._get_adaptive_ffn_loss() loss += ffn_reg_loss logging_output.update({"ffn_reg_loss": ffn_reg_loss}) logging_output.update( { "loss": loss.data, "ntokens": sample["ntokens"], "nsentences": sample_size, "sample_size": sample_size, } ) if not self.regression_target: preds = logits.argmax(dim=1) logging_output["ncorrect"] = (preds == targets).sum() if self.keep_pred_and_targ and not model.training: if self.regression_target: logging_output["pred"] = logits.detach().cpu().tolist() 
logging_output["targ"] = targets.detach().cpu().tolist() else: # remove offset `self.label_dict.nspecial` from OffsetTokensDataset preds = self.label_dict.string(preds + self.label_dict.nspecial).split() targets = self.label_dict.string( targets + self.label_dict.nspecial ).split() logging_output["pred"] = list(map(int, preds)) logging_output["targ"] = list(map(int, targets)) if self.report_mcc: logging_output["report_mcc"] = True if self.report_acc_and_f1: logging_output["report_acc_and_f1"] = True if self.report_pearson_and_spearman: logging_output["report_pearson_and_spearman"] = True return loss, sample_size, logging_output @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" loss_sum = sum(log.get("loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) mha_reg_loss_sum = sum(log.get("mha_reg_loss", 0) for log in logging_outputs) ffn_reg_loss_sum = sum(log.get("ffn_reg_loss", 0) for log in logging_outputs) metrics.log_scalar( "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) if mha_reg_loss_sum: metrics.log_scalar( "mha_reg_loss", mha_reg_loss_sum / sample_size / math.log(2), sample_size, round=3, ) if ffn_reg_loss_sum: metrics.log_scalar( "ffn_reg_loss", ffn_reg_loss_sum / sample_size / math.log(2), sample_size, round=3, ) if sample_size != ntokens: metrics.log_scalar( "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3 ) if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) metrics.log_scalar( "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 ) # Metrics used by GLUE pred = np.array( list(chain.from_iterable(log.get("pred", []) for log in logging_outputs)) ) targ = np.array( 
list(chain.from_iterable(log.get("targ", []) for log in logging_outputs)) ) if len(pred): metrics.log_concat_tensor("pred", torch.from_numpy(pred), dim=0) metrics.log_concat_tensor("targ", torch.from_numpy(targ), dim=0) if any("report_mcc" in log for log in logging_outputs): metrics.log_derived( "mcc", lambda meters: safe_round( matthews_corrcoef( meters["pred"].tensor.numpy(), meters["targ"].tensor.numpy(), ) * 100, 1, ), ) if any("report_acc_and_f1" in log for log in logging_outputs): metrics.log_derived( "acc_and_f1", lambda meters: safe_round( acc_and_f1( meters["pred"].tensor.numpy(), meters["targ"].tensor.numpy(), )["acc_and_f1"] * 100, 1, ), ) metrics.log_derived( "f1", lambda meters: safe_round( acc_and_f1( meters["pred"].tensor.numpy(), meters["targ"].tensor.numpy(), )["f1"] * 100, 1, ), ) if any("report_pearson_and_spearman" in log for log in logging_outputs): metrics.log_derived( "pearson_and_spearman", lambda meters: safe_round( pearson_and_spearman( meters["pred"].tensor.numpy(), meters["targ"].tensor.numpy(), )["corr"] * 100, 1, ), ) metrics.log_derived( "pearson", lambda meters: safe_round( pearson_and_spearman( meters["pred"].tensor.numpy(), meters["targ"].tensor.numpy(), )["pearson"] * 100, 1, ), ) metrics.log_derived( "spearman", lambda meters: safe_round( pearson_and_spearman( meters["pred"].tensor.numpy(), meters["targ"].tensor.numpy(), )["spearmanr"] * 100, 1, ), ) @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True ================================================ FILE: fairseq/criterions/sentence_prediction_adapters.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch import torch.nn.functional as F from fairseq.criterions import register_criterion from fairseq.criterions.sentence_prediction import ( SentencePredictionCriterion, SentencePredictionConfig, ) @register_criterion("sentence_prediction_adapters", dataclass=SentencePredictionConfig) class SentencePredictionCriterionAdapters(SentencePredictionCriterion): def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ assert ( hasattr(model, "classification_heads") and self.classification_head_name in model.classification_heads ), "model must provide sentence classification head for --criterion=sentence_prediction" if not hasattr(sample, "lang_id"): # If no language ID is given, we fall back to English lang_id = ["en_XX"] * sample["nsentences"] else: lang_id = sample["lang_id"] logits, _ = model( **sample["net_input"], features_only=True, classification_head_name=self.classification_head_name, lang_id=lang_id, ) targets = model.get_targets(sample, [logits]).view(-1) sample_size = targets.numel() if not self.regression_target: lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) loss = F.nll_loss(lprobs, targets, reduction="sum") else: logits = logits.view(-1).float() targets = targets.float() loss = F.mse_loss(logits, targets, reduction="sum") logging_output = { "loss": loss.data, "ntokens": sample["ntokens"], "nsentences": sample_size, "sample_size": sample_size, } if not self.regression_target: preds = logits.argmax(dim=1) logging_output["ncorrect"] = (preds == targets).sum() return loss, sample_size, logging_output ================================================ FILE: fairseq/criterions/sentence_ranking.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math import torch import torch.nn.functional as F from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion @register_criterion("sentence_ranking") class SentenceRankingCriterion(FairseqCriterion): def __init__(self, task, ranking_head_name, save_predictions, num_classes): super().__init__(task) self.ranking_head_name = ranking_head_name if save_predictions is not None: self.prediction_h = open(save_predictions, "w") else: self.prediction_h = None self.num_classes = num_classes def __del__(self): if self.prediction_h is not None: self.prediction_h.close() @staticmethod def add_args(parser): # fmt: off parser.add_argument('--save-predictions', metavar='FILE', help='file to save predictions to') parser.add_argument('--ranking-head-name', default='sentence_classification_head', help='name of the ranking head to use') # fmt: on def forward(self, model, sample, reduce=True): """Compute ranking loss for the given sample. 
Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ assert ( hasattr(model, "classification_heads") and self.ranking_head_name in model.classification_heads ), "model must provide sentence ranking head for --criterion=sentence_ranking" scores = [] for idx in range(self.num_classes): score, _ = model( **sample["net_input{idx}".format(idx=idx + 1)], classification_head_name=self.ranking_head_name, ) scores.append(score) logits = torch.cat(scores, dim=1) sample_size = logits.size(0) if "target" in sample: targets = model.get_targets(sample, [logits]).view(-1) lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32) loss = F.nll_loss(lprobs, targets, reduction="sum") else: targets = None loss = torch.tensor(0.0, requires_grad=True) if self.prediction_h is not None: preds = logits.argmax(dim=1) for i, (id, pred) in enumerate(zip(sample["id"].tolist(), preds.tolist())): if targets is not None: label = targets[i].item() print("{}\t{}\t{}".format(id, pred, label), file=self.prediction_h) else: print("{}\t{}".format(id, pred), file=self.prediction_h) logging_output = { "loss": loss.data, "ntokens": sample["ntokens"], "nsentences": sample_size, "sample_size": sample_size, } if targets is not None: logging_output["ncorrect"] = (logits.argmax(dim=1) == targets).sum() return loss, sample_size, logging_output @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" loss_sum = sum(log.get("loss", 0) for log in logging_outputs) ntokens = sum(log.get("ntokens", 0) for log in logging_outputs) nsentences = sum(log.get("nsentences", 0) for log in logging_outputs) sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) metrics.log_scalar( "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) if sample_size != ntokens: metrics.log_scalar( "nll_loss", loss_sum / ntokens / 
math.log(2), ntokens, round=3 ) if len(logging_outputs) > 0 and "ncorrect" in logging_outputs[0]: ncorrect = sum(log.get("ncorrect", 0) for log in logging_outputs) metrics.log_scalar( "accuracy", 100.0 * ncorrect / nsentences, nsentences, round=1 ) @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True ================================================ FILE: fairseq/criterions/speech_dlm_criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math from dataclasses import dataclass, field from typing import Optional import torch.nn.functional as F from fairseq import metrics, utils from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import FairseqDataclass from omegaconf import II @dataclass class SpeechDLMCriterionConfig(FairseqDataclass): sentence_avg: bool = II("optimization.sentence_avg") main_and_cross_weights: Optional[str] = field( default="1,0", metadata={ "help": "Comma-separated list of weights of Main-channel vs Cross-channel Prediction Losses" "(default: 1,0)" }, ) general_unit_loss_weight: float = field( default=0, metadata={ "help": "The weight of the General Prediction Loss (Next-step Unit Prediction Loss)" "(default: 0)" }, ) edge_unit_loss_weight: float = field( default=1, metadata={"help": "The weight of the Edge Unit Prediction Loss" "(default: 1)"}, ) duration_loss_weight: float = field( default=1, metadata={ "help": "The weight of the Edge Unit Duration Prediction Loss" "(default: 1)" }, ) @register_criterion("speech_dlm_criterion", dataclass=SpeechDLMCriterionConfig) class SpeechDLMCriterion(FairseqCriterion): """Criteron 
for the SpeechDLM model as described in the paper: https://arxiv.org/pdf/2203.16502.pdf There are 3 possible losses depending on the targets of the model: - general_unit_loss : The next unit prediction loss, corresponding to 'next' target - edge_unit_loss : The edge unit prediction loss, corresponding to 'edge' target - duration_loss : The duration prediction loss, corresponding to 'duration' target """ def __init__( self, task, sentence_avg, main_and_cross_weights, general_unit_loss_weight, edge_unit_loss_weight, duration_loss_weight, ): super().__init__(task) self.sentence_avg = sentence_avg self.channels = task.channels self.targets = task.targets self.delayed_duration_target = task.delayed_duration_target self.main_channel_weight = float(main_and_cross_weights.split(",")[0]) self.cross_channel_weight = float(main_and_cross_weights.split(",")[1]) assert self.main_channel_weight >= 0 and self.cross_channel_weight >= 0 self.channel_weights = { channel: weight for channel, weight in zip(self.channels, task.channel_weights) } self.target_weights = {} for t in self.targets: if t == "next": self.target_weights[t] = general_unit_loss_weight assert ( general_unit_loss_weight > 0 ), "Expect a positive --general-unit-loss-weight for next unit prediction" elif t == "edge": self.target_weights[t] = edge_unit_loss_weight assert ( edge_unit_loss_weight > 0 ), "Expect a positive --edge-unit-loss-weight for edge unit prediction" elif t == "duration": self.target_weights[t] = duration_loss_weight assert ( duration_loss_weight > 0 ), "Expect a positive --duration-loss-weight for duration prediction" def forward(self, model, sample, reduce=True): """Compute the loss for the given sample. 
Returns a tuple with three elements: 1) the loss 2) the sample size, which is used as the denominator for the gradient 3) logging outputs to display while training """ net_output = model(**sample["net_input"]) loss_dict, stats_dict = self.compute_loss( model, net_output, sample, reduce=reduce ) nsentences = sample["net_input"]["src_tokens"][self.channels[0]].size(0) logging_output = { "nsentences": nsentences, } logging_output["nsentences"] = nsentences loss_all = {t: 0 for t in self.targets} correct_all = {t: 0 for t in self.targets} count_all = {t: 0 for t in self.targets} ntokens_all = 0 sample_size_all = 0 for channel in loss_dict: for pred_channel in loss_dict[channel]: # Get ntokens & sample_size ntokens = sample["net_input"]["src_tokens"][channel].numel() sample_size = nsentences if self.sentence_avg else ntokens prefix = "[{}-{}]".format(channel, pred_channel) log_keys = { "next": "general_token", "edge": "edge_token", "duration": "edge_duration", } # Log & Update the sizes logging_output["{}ntokens".format(prefix)] = ntokens logging_output["{}sample_size".format(prefix)] = sample_size ntokens_all += ntokens sample_size_all += sample_size for t in self.targets: log_key = log_keys[t] loss = loss_dict[channel][pred_channel][t] correct, count = stats_dict[channel][pred_channel][t] # Log the statistics logging_output["{}{}_loss".format(prefix, log_key)] = loss.data logging_output["{}{}_correct".format(prefix, log_key)] = correct logging_output["{}{}_count".format(prefix, log_key)] = count # Scale the training loss by weights target_loss = loss * self.channel_weights[channel] if pred_channel == channel: target_loss = target_loss * self.main_channel_weight else: target_loss = target_loss * self.cross_channel_weight # Normalize the losses in the training by the number of edges if t in ["edge", "duration"]: target_loss = target_loss / count * sample_size # Update the statistics loss_all[t] += target_loss correct_all[t] += correct count_all[t] += count # Logging the 
average statistics logging_output["ntokens"] = ntokens_all logging_output["sample_size"] = sample_size_all for t in self.targets: log_key = { "next": "general_token", "edge": "edge_token", "duration": "edge_duration", }[t] logging_output["{}_loss".format(log_key)] = loss_all[t].data logging_output["{}_correct".format(log_key)] = correct_all[t] logging_output["{}_count".format(log_key)] = count_all[t] # Define the training loss training_loss = 0 for t in self.targets: training_loss += loss_all[t] * self.target_weights[t] logging_output["loss"] = training_loss.data return training_loss, sample_size_all, logging_output def compute_loss(self, model, net_output, sample, reduce=True): # Get the model outputs and target lprobs_dict = model.get_normalized_probs(net_output, log_probs=True) target_dict = model.get_targets(sample, net_output) # Init the dictionaries loss_dict, stats_dict = {}, {} for channel in lprobs_dict: # Init the dictionaries loss_dict[channel], stats_dict[channel] = {}, {} for pred_channel in lprobs_dict[channel]: # Init the dictionaries loss_dict[channel][pred_channel] = {} stats_dict[channel][pred_channel] = {} # Get token & duration predictions outputs = lprobs_dict[channel][pred_channel] if not isinstance(outputs, dict): token_lprobs = outputs else: token_lprobs = outputs["pred_token"] dur_preds = outputs["pred_duration"] dur_preds = dur_preds.view(-1) token_lprobs = token_lprobs.view(-1, token_lprobs.size(-1)) token_preds = token_lprobs.argmax(dim=-1) # Get edge indices if "edge" in self.targets or "duration" in self.targets: edge_indices = target_dict["edge_indices"][pred_channel] # Compute loss and statistics for t in self.targets: if t in ["next", "edge"]: if t == "next": target = target_dict["next"][pred_channel].view(-1) lprobs = token_lprobs preds = token_preds elif t == "edge": target = target_dict["edge"][pred_channel] lprobs = token_lprobs[edge_indices] preds = token_preds[edge_indices] loss = F.nll_loss( lprobs, target, 
ignore_index=self.padding_idx, reduction="sum" if reduce else "none", ) elif t == "duration": target = target_dict["duration"][pred_channel] if self.delayed_duration_target: duration_indices = edge_indices + 1 if duration_indices[-1] == len(dur_preds): duration_indices = duration_indices[:-1] target = target[:-1] else: duration_indices = edge_indices preds = dur_preds[duration_indices] loss = F.l1_loss( preds, target, reduction="sum" if reduce else "none", ) preds = preds.round() correct = (preds == target).sum().float().cpu().item() count = float(target.size(0)) loss_dict[channel][pred_channel][t] = loss stats_dict[channel][pred_channel][t] = (correct, count) return loss_dict, stats_dict @staticmethod def reduce_metrics(logging_outputs) -> None: """Aggregate logging outputs from data parallel training.""" logging_keys = next(iter(logging_outputs)).keys() channels = [item[:-7] for item in logging_keys if item.endswith("ntokens")] target_prefixes = set( [ item[:-5].split("]")[-1] for item in logging_keys if item.endswith("_loss") ] ) for channel_prefix in channels: for target_prefix in target_prefixes: prefix = "{}{}".format(channel_prefix, target_prefix) count_sum = sum( log.get("{}_count".format(prefix), 0) for log in logging_outputs ) correct_sum = sum( log.get("{}_correct".format(prefix), 0) for log in logging_outputs ) loss_sum = sum( log.get("{}_loss".format(prefix), 0) for log in logging_outputs ) if "duration" not in target_prefix: # we divide by log(2) to convert the loss from base e to base 2 metrics.log_scalar( "{}_loss".format(prefix), loss_sum / count_sum / math.log(2), count_sum, round=3, ) metrics.log_derived( "{}_ppl".format(prefix), lambda meters, prefix=prefix: utils.get_perplexity( meters["{}_loss".format(prefix)].avg ), ) else: # for duration we don't need to divide by log(2) metrics.log_scalar( "{}_loss".format(prefix), loss_sum / count_sum, count_sum, round=3, ) accuracy = 100 * correct_sum / count_sum 
metrics.log_scalar("{}_pred_acc".format(prefix), accuracy, round=3) # Logging training loss sample_size = sum(log.get("sample_size", 0) for log in logging_outputs) loss_sum = sum(log.get("loss", 0) for log in logging_outputs) # we divide by log(2) to convert the loss from base e to base 2 metrics.log_scalar( "loss", loss_sum / sample_size / math.log(2), sample_size, round=3 ) @staticmethod def logging_outputs_can_be_summed() -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ return True ================================================ FILE: fairseq/criterions/speech_to_speech_criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import math from collections import OrderedDict import torch from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import register_criterion from fairseq.criterions.ctc import CtcCriterion from fairseq.criterions.label_smoothed_cross_entropy_with_rdrop import ( RdropLabelSmoothedCrossEntropyCriterion, RdropLabelSmoothedCrossEntropyCriterionConfig, duplicate_input, ) from fairseq.criterions.tacotron2_loss import ( Tacotron2Criterion, Tacotron2CriterionConfig, ) logger = logging.getLogger(__name__) class MultitaskCriterion: def __init__(self, multitask_tasks, rdrop_alpha=0.0): self.rdrop_alpha = rdrop_alpha self.rdrop_alpha_mtl = rdrop_alpha self.multitask_criterion = OrderedDict() self.multitask_loss_weight = OrderedDict() for task_name, task_obj in multitask_tasks.items(): if task_obj.args.get_loss_weight(0) == 0: logger.info(f"Skip {task_name} loss criterion") continue rdrop_alpha_task = task_obj.args.rdrop_alpha if rdrop_alpha_task is None: rdrop_alpha_task = rdrop_alpha 
self.rdrop_alpha_mtl = rdrop_alpha_task logger.info(f"rdrop_alpha is set to {rdrop_alpha_task} for {task_name}") if task_obj.args.decoder_type == "ctc": self.multitask_criterion[task_name] = CtcCriterion( task_obj.args.criterion_cfg, task_obj, rdrop_alpha=rdrop_alpha_task, ) else: self.multitask_criterion[ task_name ] = RdropLabelSmoothedCrossEntropyCriterion( task_obj, task_obj.args.criterion_cfg.sentence_avg, label_smoothing=task_obj.args.criterion_cfg.label_smoothing, rdrop_alpha=rdrop_alpha_task, ) def set_multitask_loss_weight(self, task_name, weight=0.0): self.multitask_loss_weight[task_name] = weight def get_multitask_loss(self, model, sample, model_out): logging_output = {} loss = 0.0 for task_name, task_criterion in self.multitask_criterion.items(): layer_id = task_criterion.task.args.input_layer if isinstance(task_criterion, CtcCriterion): if task_criterion.task.args.input_from == "encoder": if len(model_out["encoder_padding_mask"]) > 0: non_padding_mask = ~model_out["encoder_padding_mask"][0] input_lengths = non_padding_mask.long().sum(-1) else: out = model_out["encoder_states"][layer_id] input_lengths = out.new_full( (out.shape[1],), out.shape[0] ).long() task_sample = { "net_input": { "src_tokens": model_out["encoder_states"][ layer_id ], # check batch idx "src_lengths": input_lengths, }, "id": sample["id"], } else: task_sample = { "net_input": { "src_tokens": model_out["inner_states"][layer_id], "src_lengths": sample["target_lengths"], }, "id": sample["id"], } else: task_sample = { "net_input": { "src_tokens": sample["multitask"][task_name]["net_input"][ "prev_output_tokens" ], "encoder_out": { "encoder_out": [model_out["encoder_states"][layer_id]], "encoder_padding_mask": model_out["encoder_padding_mask"], }, } } for key in ["target", "target_lengths", "ntokens"]: task_sample[key] = sample["multitask"][task_name][key] if task_name == getattr(model, "mt_task_name", None): decoder_out = model_out["mt_decoder_out"] else: decoder_out = None task_loss, 
@register_criterion(
    "speech_to_unit", dataclass=RdropLabelSmoothedCrossEntropyCriterionConfig
)
class SpeechToUnitMultitaskTaskCriterion(
    RdropLabelSmoothedCrossEntropyCriterion, MultitaskCriterion
):
    """Speech-to-unit criterion: the R-Drop label-smoothed cross-entropy loss
    plus the weighted auxiliary losses provided by ``MultitaskCriterion``.

    Registered under the name ``"speech_to_unit"``.
    """

    def __init__(
        self,
        task,
        sentence_avg,
        label_smoothing,
        ignore_prefix_size=0,
        report_accuracy=False,
        rdrop_alpha=0.0,
    ):
        # First initialise the cross-entropy side through the MRO ...
        super().__init__(
            task,
            sentence_avg,
            label_smoothing,
            ignore_prefix_size,
            report_accuracy,
            rdrop_alpha,
        )
        # ... then initialise the multitask mixin explicitly, because it takes
        # a different argument list (the task's auxiliary tasks + rdrop_alpha).
        MultitaskCriterion.__init__(self, task.multitask_tasks, rdrop_alpha)

    def forward(self, model, sample, reduce=True):
        """Compute the main loss and, if any are configured, the multitask losses.

        Returns a ``(loss, sample_size, logging_output)`` tuple.
        """
        net_input_concat = {
            "src_tokens": sample["net_input"]["src_tokens"],
            "src_lengths": sample["net_input"]["src_lengths"],
            "prev_output_tokens": sample["net_input"]["prev_output_tokens"],
            "tgt_speaker": sample["net_input"].get("tgt_speaker", None),
            "return_all_hiddens": True,
        }

        # NOTE(review): `rdrop_alpha_mtl` is not set in this class; presumably
        # MultitaskCriterion.__init__ defines it -- confirm.
        if self.rdrop_alpha > 0 or self.rdrop_alpha_mtl > 0:
            net_input_concat = duplicate_input(net_input_concat)

        net_output, extra = model(**net_input_concat)
        loss, nll_loss, rdrop_kl_loss = self.compute_loss(
            model, [net_output], sample, reduce=reduce
        )
        # Normalise by sentences or by tokens depending on `sentence_avg`.
        sample_size = (
            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
        )
        # NOTE(review): "loss" holds `loss.data`, which looks like it shares
        # storage with `loss`; if so the in-place `loss +=` further down also
        # changes the value logged here -- confirm this is intended.
        logging_output = {
            "loss": loss.data,
            "nll_loss": nll_loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
        }
        if self.report_accuracy:
            n_correct, total = self.compute_accuracy(model, [net_output], sample)
            logging_output["n_correct"] = utils.item(n_correct.data)
            logging_output["total"] = utils.item(total.data)
        if self.rdrop_alpha > 0:
            logging_output["rdrop_kl_loss"] = utils.item(rdrop_kl_loss.data)

        # No auxiliary tasks configured: return the main loss only.
        if len(self.multitask_criterion) == 0:
            return loss, sample_size, logging_output

        # multitask
        multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra)
        loss += multitask_loss
        logging_output["multitask"] = multitask_log

        return loss, sample_size, logging_output

    @classmethod
    def reduce_metrics(cls, logging_outputs) -> None:
        """Aggregate logging outputs: parent metrics, inference metrics, multitask metrics."""
        super().reduce_metrics(logging_outputs)

        # inference metrics
        # NOTE(review): presence is tested on "targ_frames" but the normaliser
        # is the sum of "norm_frames" -- confirm both keys are always logged
        # together (and that the sum is non-zero).
        if "targ_frames" in logging_outputs[0]:
            n = sum(log.get("norm_frames", 0) for log in logging_outputs)
            for key, new_key in [
                ("mcd_loss", "mcd_loss"),
                ("pred_frames", "pred_ratio"),
                ("nins", "ins_rate"),
                ("ndel", "del_rate"),
            ]:
                val = sum(log.get(key, 0) for log in logging_outputs)
                metrics.log_scalar(new_key, val / n, n, round=3)

        # Only the first worker's output is inspected to decide whether
        # multitask statistics were logged at all.
        if "multitask" not in logging_outputs[0]:
            return

        MultitaskCriterion.reduce_metrics(logging_outputs)

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True will improve distributed training speed.
        """
        return False
@register_criterion("speech_to_spectrogram", dataclass=Tacotron2CriterionConfig)
class SpeechToSpectrogramMultitaskTaskCriterion(Tacotron2Criterion, MultitaskCriterion):
    """Spectrogram-prediction criterion (Tacotron2 losses) with optional
    auxiliary multitask losses added on top."""

    def __init__(
        self,
        task,
        sentence_avg,
        use_guided_attention_loss,
        guided_attention_loss_sigma,
        bce_pos_weight,
        ctc_weight,
    ):
        # Tacotron2 side first (through the MRO), multitask mixin second.
        super().__init__(
            task, sentence_avg, use_guided_attention_loss,
            guided_attention_loss_sigma, bce_pos_weight, ctc_weight,
        )
        MultitaskCriterion.__init__(self, task.multitask_tasks)

    def forward(self, model, sample, reduction="mean"):
        """Return ``(loss, sample_size, logging_output)`` for one batch."""
        target = sample["target"]
        bsz, max_len, _ = target.size()
        tgt_lens = sample["target_lengths"]
        net_input = sample["net_input"]

        # EOS target: 1.0 at the last valid frame of every sequence, else 0.0.
        frame_idx = (
            torch.arange(max_len).to(target.device).view(1, max_len).expand(bsz, -1)
        )
        last_idx = tgt_lens.view(bsz, 1).expand(-1, max_len) - 1
        eos_tgt = (frame_idx == last_idx).float()

        feat_out, eos_out, extra = model(
            src_tokens=net_input["src_tokens"],
            src_lengths=net_input["src_lengths"],
            prev_output_tokens=net_input["prev_output_tokens"],
            tgt_speaker=net_input["tgt_speaker"],
            target_lengths=tgt_lens,
            return_all_hiddens=True,
        )

        l1_loss, mse_loss, eos_loss = self.compute_loss(
            extra["feature_out"],
            feat_out,
            eos_out,
            target,
            eos_tgt,
            tgt_lens,
            reduction,
        )

        if self.guided_attn is None:
            attn_loss = torch.tensor(0.0).type_as(l1_loss)
        else:
            attn_loss = self.guided_attn(
                extra["attn"], net_input["src_lengths"], tgt_lens, reduction
            )

        # do not include ctc loss as there's no text target
        loss = l1_loss + mse_loss + eos_loss + attn_loss

        if self.sentence_avg:
            sample_size = sample["nsentences"]
        else:
            sample_size = sample["ntokens"]

        logging_output = {
            "loss": utils.item(loss.data),
            "ntokens": sample["ntokens"],
            "nsentences": sample["nsentences"],
            "sample_size": sample_size,
            "l1_loss": utils.item(l1_loss.data),
            "mse_loss": utils.item(mse_loss.data),
            "eos_loss": utils.item(eos_loss.data),
            "attn_loss": utils.item(attn_loss.data),
        }

        # Auxiliary tasks, when configured, are accumulated in place on `loss`.
        if len(self.multitask_criterion) != 0:
            multitask_loss, multitask_log = self.get_multitask_loss(
                model, sample, extra
            )
            loss += multitask_loss
            logging_output["multitask"] = multitask_log

        return loss, sample_size, logging_output

    @classmethod
    def reduce_metrics(cls, logging_outputs) -> None:
        """Aggregate parent metrics, then inference and multitask metrics."""
        super().reduce_metrics(logging_outputs)

        first = logging_outputs[0]

        # inference metrics
        if "targ_frames" in first:
            frame_total = sum(log.get("norm_frames", 0) for log in logging_outputs)
            renamed = {
                "mcd_loss": "mcd_loss",
                "pred_frames": "pred_ratio",
                "nins": "ins_rate",
                "ndel": "del_rate",
            }
            for source_key, metric_name in renamed.items():
                summed = sum(log.get(source_key, 0) for log in logging_outputs)
                metrics.log_scalar(
                    metric_name, summed / frame_total, frame_total, round=3
                )

        if "multitask" in first:
            MultitaskCriterion.reduce_metrics(logging_outputs)
tgt_speaker=sample["net_input"]["tgt_speaker"], target_lengths=sample["target_lengths"], return_all_hiddens=True, ) l1_loss, mse_loss, eos_loss = self.compute_loss( extra["feature_out"], feat_out, eos_out, feat_tgt, eos_tgt, sample["target_lengths"], reduction, ) attn_loss = torch.tensor(0.0).type_as(l1_loss) if self.guided_attn is not None: attn_loss = self.guided_attn( extra["attn"], sample["net_input"]["src_lengths"], sample["target_lengths"], reduction, ) loss = ( l1_loss + mse_loss + eos_loss + attn_loss ) # do not include ctc loss as there's no text target sample_size = sample["nsentences"] if self.sentence_avg else sample["ntokens"] logging_output = { "loss": utils.item(loss.data), "ntokens": sample["ntokens"], "nsentences": sample["nsentences"], "sample_size": sample_size, "l1_loss": utils.item(l1_loss.data), "mse_loss": utils.item(mse_loss.data), "eos_loss": utils.item(eos_loss.data), "attn_loss": utils.item(attn_loss.data), } if len(self.multitask_criterion) == 0: return loss, sample_size, logging_output # multitask multitask_loss, multitask_log = self.get_multitask_loss(model, sample, extra) loss += multitask_loss logging_output["multitask"] = multitask_log return loss, sample_size, logging_output ================================================ FILE: fairseq/criterions/speech_ulm_criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def mae_loss(pred, targ, mask, reduce=True):
    """Masked absolute-error loss.

    ``pred`` is either 2-D or 3-D; a 3-D input has its dim 2 squeezed.
    Positions where ``mask`` is True contribute zero. Returns the summed
    loss when ``reduce`` is true, otherwise the flattened per-position loss.
    """
    if pred.ndim != 3:
        assert pred.ndim == 2
    else:
        pred = pred.squeeze(2)

    keep = (~mask).float()
    abs_err = (pred.float() - targ.float()).abs() * keep
    if reduce:
        return abs_err.sum()
    return abs_err.view(-1)


def nll_loss(pred, targ, mask, reduce=True):
    """Masked negative log-likelihood over the last dimension of ``pred``.

    ``pred`` holds unnormalised scores; positions where ``mask`` is True
    contribute zero. Returns the summed loss when ``reduce`` is true,
    otherwise the flattened per-position loss.
    """
    log_probs = F.log_softmax(pred, dim=-1)
    n_classes = log_probs.size(-1)
    per_position = F.nll_loss(
        log_probs.view(-1, n_classes), targ.view(-1), reduction="none"
    )
    per_position = per_position * (~mask).float().view(-1)
    if reduce:
        return per_position.sum()
    return per_position.view(-1)
@staticmethod
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""

    def _total(key):
        # Sum one entry across all workers, treating a missing key as 0.
        return sum(log.get(key, 0) for log in logging_outputs)

    sample_size = _total("sample_size")
    # Each loss stream is logged as a per-sample average, weighted by sample_size.
    for name in ("loss", "token_loss", "dur_loss", "f0_loss"):
        metrics.log_scalar(name, _total(name) / sample_size, sample_size, round=3)
class GuidedAttentionLoss(torch.nn.Module):
    """
    Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention (https://arxiv.org/abs/1710.08969)

    The loss weights every attention entry by ``1 - exp(-d^2 / (2 * sigma^2))``
    where ``d`` is the difference between the normalised source position and
    the normalised target position.
    """

    def __init__(self, sigma):
        super().__init__()
        self.sigma = sigma

    @staticmethod
    @lru_cache(maxsize=8)
    def _get_weight(s_len, t_len, sigma):
        """Return the ``(t_len, s_len)`` penalty matrix for one sequence pair.

        ``s_len`` must expose ``.device`` (i.e. be a tensor); the result is
        placed on that device.
        """
        # NOTE(review): the cache key is built from the arguments themselves;
        # when they are tensor objects two equal lengths are presumably
        # different keys, so this cache may rarely hit -- confirm.
        # `indexing="ij"` spells out the layout this code relies on
        # (rows follow t_len, columns follow s_len) instead of depending on
        # the implicit default, which PyTorch warns about.
        grid_x, grid_y = torch.meshgrid(
            torch.arange(t_len), torch.arange(s_len), indexing="ij"
        )
        grid_x = grid_x.to(s_len.device)
        grid_y = grid_y.to(s_len.device)
        w = (grid_y.float() / s_len - grid_x.float() / t_len) ** 2
        return 1.0 - torch.exp(-w / (2 * (sigma**2)))

    def _get_weights(self, src_lens, tgt_lens):
        """Stack per-sequence penalty matrices into a zero-padded
        ``(bsz, max_t_len, max_s_len)`` tensor."""
        bsz, max_s_len, max_t_len = len(src_lens), max(src_lens), max(tgt_lens)
        weights = torch.zeros((bsz, max_t_len, max_s_len))
        for i, (s_len, t_len) in enumerate(zip(src_lens, tgt_lens)):
            weights[i, :t_len, :s_len] = self._get_weight(s_len, t_len, self.sigma)
        return weights

    @staticmethod
    def _get_masks(src_lens, tgt_lens):
        """Combine the target and source length masks into one boolean mask
        laid out like the weights (target dim before source dim)."""
        in_masks = lengths_to_mask(src_lens)
        out_masks = lengths_to_mask(tgt_lens)
        return out_masks.unsqueeze(2) & in_masks.unsqueeze(1)

    def forward(self, attn, src_lens, tgt_lens, reduction="mean"):
        """Return the guided-attention penalty over the unpadded entries.

        ``attn`` is transposed on dims 1 and 2 before being multiplied with
        the weights. ``reduction == "sum"`` sums the selected entries; any
        other value averages them.
        """
        weights = self._get_weights(src_lens, tgt_lens).to(attn.device)
        masks = self._get_masks(src_lens, tgt_lens).to(attn.device)
        loss = (weights * attn.transpose(1, 2)).masked_select(masks)
        loss = torch.sum(loss) if reduction == "sum" else torch.mean(loss)
        return loss
def compute_loss(
    self,
    feat_out,
    feat_out_post,
    eos_out,
    feat_tgt,
    eos_tgt,
    tgt_lens,
    reduction="mean",
):
    """Compute the L1, MSE and EOS losses over the valid (unpadded) frames.

    Both ``feat_out`` and ``feat_out_post`` are compared against
    ``feat_tgt``; their L1 terms are added together and so are their MSE
    terms. The EOS loss is a binary cross-entropy on logits with
    ``self.bce_pos_weight`` as the positive-class weight.

    Returns:
        ``(l1_loss, mse_loss, eos_loss)``
    """
    mask = lengths_to_mask(tgt_lens)
    # Flatten explicitly instead of a bare `.squeeze()`: when the mask selects
    # exactly one frame, `.squeeze()` would also drop the frame dimension and
    # yield a 0-dim tensor that no longer matches the 1-D `_eos_tgt`.
    # NOTE(review): assumes eos_out carries at most one trailing singleton
    # dimension after masking -- confirm against the model output.
    _eos_out = eos_out[mask].reshape(-1)
    _eos_tgt = eos_tgt[mask]
    _feat_tgt = feat_tgt[mask]
    _feat_out = feat_out[mask]
    _feat_out_post = feat_out_post[mask]

    l1_loss = F.l1_loss(_feat_out, _feat_tgt, reduction=reduction) + F.l1_loss(
        _feat_out_post, _feat_tgt, reduction=reduction
    )
    mse_loss = F.mse_loss(_feat_out, _feat_tgt, reduction=reduction) + F.mse_loss(
        _feat_out_post, _feat_tgt, reduction=reduction
    )
    eos_loss = F.binary_cross_entropy_with_logits(
        _eos_out,
        _eos_tgt,
        pos_weight=torch.tensor(self.bce_pos_weight),
        reduction=reduction,
    )
    return l1_loss, mse_loss, eos_loss
None: ns = [log.get("sample_size", 0) for log in logging_outputs] ntot = sum(ns) ws = [n / (ntot + 1e-8) for n in ns] for key in ["loss", "l1_loss", "mse_loss", "eos_loss", "attn_loss", "ctc_loss"]: vals = [log.get(key, 0) for log in logging_outputs] val = sum(val * w for val, w in zip(vals, ws)) metrics.log_scalar(key, val, ntot, round=3) metrics.log_scalar("sample_size", ntot, len(logging_outputs)) # inference metrics if "targ_frames" not in logging_outputs[0]: return n = sum(log.get("targ_frames", 0) for log in logging_outputs) for key, new_key in [ ("mcd_loss", "mcd_loss"), ("pred_frames", "pred_ratio"), ("nins", "ins_rate"), ("ndel", "del_rate"), ]: val = sum(log.get(key, 0) for log in logging_outputs) metrics.log_scalar(new_key, val / n, n, round=3) @staticmethod def logging_outputs_can_be_summed() -> bool: return False ================================================ FILE: fairseq/criterions/wav2vec_criterion.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math from dataclasses import dataclass, field from typing import List, Optional import torch import torch.nn.functional as F from fairseq import utils from fairseq.logging import metrics from fairseq.criterions import FairseqCriterion, register_criterion from fairseq.dataclass import FairseqDataclass from fairseq.logging.meters import safe_round from fairseq.utils import is_xla_tensor @dataclass class Wav2VecCriterionConfig(FairseqDataclass): infonce: bool = field( default=False, metadata={ "help": "if set, uses cross entropy instead of binary cross entropy (i.e. 
def __init__(self, task, infonce=False, loss_weights=None, log_keys=None):
    """Store the criterion options.

    Args:
        task: task object forwarded to the base criterion.
        infonce: if true, ``forward`` uses cross entropy instead of binary
            cross entropy with logits.
        loss_weights: weights for the additional loss terms, or ``None``.
        log_keys: extra output keys to log; ``None`` means no extra keys.
    """
    super().__init__(task)
    self.infonce = infonce
    self.loss_weights = loss_weights
    self.log_keys = [] if log_keys is None else log_keys
    # `forward` overwrites this on every batch. Give it a default so that
    # `logging_outputs_can_be_summed()` (which returns `self.xla`) does not
    # raise AttributeError when called before the first forward pass.
    self.xla = False
mi = ( sample["net_input"]["mask_indices"] .transpose(0, 1) # logits are transposed in `model.get_logits` .reshape(logits.size(0)) ) loss = (loss * mi).sum() if reduce else (loss * mi) if "sample_size" in sample: sample_size = sample["sample_size"] elif "mask_indices" in sample["net_input"]: sample_size = sample["net_input"]["mask_indices"].sum() else: sample_size = target.numel() if self.infonce else target.long().sum().item() losses.append(loss.detach().clone()) if self.loss_weights is not None: assert hasattr(model, "get_extra_losses") extra_losses = model.get_extra_losses(net_output) if torch.is_tensor(extra_losses): extra_losses = [extra_losses] if len(self.loss_weights) == 1 and len(extra_losses) != 1: self.loss_weights = [self.loss_weights[0]] * len(extra_losses) assert len(extra_losses) == len( self.loss_weights ), f"{len(extra_losses)}, {len(self.loss_weights)}" for p, coef in zip(extra_losses, self.loss_weights): if coef != 0 and p is not None: p = coef * p.float() * sample_size loss += p losses.append(p) logging_output = { "loss": loss.item() if (reduce and not self.xla) else loss.detach(), "ntokens": sample_size, "nsentences": sample["id"].numel(), "sample_size": sample_size, } for lk in self.log_keys: # Only store "logits" and "target" for computing MAP and MAUC # during validation if lk == "logits": if not self.training: logging_output["logits"] = logits.cpu().numpy() elif lk == "target": if not self.training: # If the targets have been mixed with the predictions of # teacher models, find the original targets if hasattr(model, "get_original_targets"): original_target = model.get_original_targets(sample, net_output) else: original_target = target logging_output["target"] = original_target.cpu().numpy() elif lk in net_output: value = net_output[lk] if not is_xla_tensor(value): value = float(value) logging_output[lk] = value if len(losses) > 1: for i, l in enumerate(losses): logging_output[f"loss_{i}"] = l.item() if not self.xla else l.detach() if 
@staticmethod
def reduce_metrics(logging_outputs) -> None:
    """Aggregate logging outputs from data parallel training."""

    def _summed(key):
        # Sum one entry across all workers, treating a missing key as 0.
        return sum(log.get(key, 0) for log in logging_outputs)

    loss_sum = utils.item(_summed("loss"))
    ntokens = utils.item(_summed("ntokens"))
    nsentences = utils.item(_summed("nsentences"))
    sample_size = utils.item(_summed("sample_size"))

    # Guard against a zero sample size and convert from nats to bits.
    denom = sample_size or 1
    metrics.log_scalar(
        "loss", loss_sum / denom / math.log(2), sample_size, round=3
    )
    metrics.log_scalar("ntokens", ntokens)
    metrics.log_scalar("nsentences", nsentences)

    correct = _summed("correct")
    metrics.log_scalar("_correct", correct)

    total = _summed("count")
    metrics.log_scalar("_total", total)

    if total > 0:

        def _accuracy(meters):
            # Derived from the running meters rather than this call's totals.
            if meters["_total"].sum > 0:
                return safe_round(
                    meters["_correct"].sum / meters["_total"].sum, 5
                )
            return float("nan")

        metrics.log_derived("accuracy", _accuracy)

    builtin_keys = {
        "loss",
        "ntokens",
        "nsentences",
        "sample_size",
        "correct",
        "count",
    }

    # Any remaining key of the first output is either an extra loss term
    # (normalised like "loss") or a plain value averaged over workers.
    for key in logging_outputs[0]:
        if key in builtin_keys:
            continue
        value = _summed(key)
        if key.startswith("loss"):
            metrics.log_scalar(
                key, value / denom / math.log(2), sample_size, round=3
            )
        else:
            metrics.log_scalar(key, value / len(logging_outputs), round=3)
logging_outputs_can_be_summed() -> bool: def logging_outputs_can_be_summed(self) -> bool: """ Whether the logging outputs returned by `forward` can be summed across workers prior to calling `reduce_metrics`. Setting this to True will improves distributed training speed. """ # XXX: Gather based reduction not implemented for xla yet. # So we fall to sum based reduction for xla. return self.xla ================================================ FILE: fairseq/data/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """isort:skip_file""" from .dictionary import Dictionary, TruncatedDictionary from .fairseq_dataset import FairseqDataset, FairseqIterableDataset from .base_wrapper_dataset import BaseWrapperDataset from .add_target_dataset import AddTargetDataset from .append_token_dataset import AppendTokenDataset from .audio.raw_audio_dataset import BinarizedAudioDataset, FileAudioDataset from .audio.hubert_dataset import HubertDataset from .backtranslation_dataset import BacktranslationDataset from .bucket_pad_length_dataset import BucketPadLengthDataset from .colorize_dataset import ColorizeDataset from .concat_dataset import ConcatDataset from .concat_sentences_dataset import ConcatSentencesDataset from .denoising_dataset import DenoisingDataset from .id_dataset import IdDataset from .indexed_dataset import ( IndexedCachedDataset, IndexedDataset, IndexedRawTextDataset, MMapIndexedDataset, ) from .language_pair_dataset import LanguagePairDataset from .list_dataset import ListDataset from .lm_context_window_dataset import LMContextWindowDataset from .lru_cache_dataset import LRUCacheDataset from .mask_tokens_dataset import MaskTokensDataset from .monolingual_dataset import MonolingualDataset from .multi_corpus_sampled_dataset import MultiCorpusSampledDataset from .nested_dictionary_dataset 
import NestedDictionaryDataset from .noising import NoisingDataset from .numel_dataset import NumelDataset from .num_samples_dataset import NumSamplesDataset from .offset_tokens_dataset import OffsetTokensDataset from .padding_mask_dataset import ( LeftPaddingMaskDataset, PaddingMaskDataset, RightPaddingMaskDataset, ) from .pad_dataset import LeftPadDataset, PadDataset, RightPadDataset from .prepend_dataset import PrependDataset from .prepend_token_dataset import PrependTokenDataset from .raw_label_dataset import RawLabelDataset from .replace_dataset import ReplaceDataset from .resampling_dataset import ResamplingDataset from .roll_dataset import RollDataset from .round_robin_zip_datasets import RoundRobinZipDatasets from .sort_dataset import SortDataset from .speech_dlm_dataset import SpeechDLMDataset from .strip_token_dataset import StripTokenDataset from .subsample_dataset import SubsampleDataset from .token_block_dataset import TokenBlockDataset from .transform_eos_dataset import TransformEosDataset from .transform_eos_lang_pair_dataset import TransformEosLangPairDataset from .shorten_dataset import TruncateDataset, RandomCropDataset from .multilingual.sampled_multi_dataset import SampledMultiDataset from .multilingual.sampled_multi_epoch_dataset import SampledMultiEpochDataset from .fasta_dataset import FastaDataset, EncodedFastaDataset from .transform_eos_concat_langpair_dataset import TransformEosConcatLangPairDataset from .iterators import ( CountingIterator, EpochBatchIterator, GroupedIterator, ShardedIterator, ) __all__ = [ "AddTargetDataset", "AppendTokenDataset", "BacktranslationDataset", "BaseWrapperDataset", "BinarizedAudioDataset", "BucketPadLengthDataset", "ColorizeDataset", "ConcatDataset", "ConcatSentencesDataset", "CountingIterator", "DenoisingDataset", "Dictionary", "EncodedFastaDataset", "EpochBatchIterator", "FairseqDataset", "FairseqIterableDataset", "FastaDataset", "FileAudioDataset", "GroupedIterator", "HubertDataset", "IdDataset", 
"IndexedCachedDataset", "IndexedDataset", "IndexedRawTextDataset", "LanguagePairDataset", "LeftPadDataset", "ListDataset", "LMContextWindowDataset", "LRUCacheDataset", "MaskTokensDataset", "MMapIndexedDataset", "MonolingualDataset", "MultiCorpusSampledDataset", "NestedDictionaryDataset", "NoisingDataset", "NumelDataset", "NumSamplesDataset", "OffsetTokensDataset", "PadDataset", "PrependDataset", "PrependTokenDataset", "RandomCropDataset", "RawLabelDataset", "ResamplingDataset", "ReplaceDataset", "RightPadDataset", "RollDataset", "RoundRobinZipDatasets", "SampledMultiDataset", "SampledMultiEpochDataset", "ShardedIterator", "SortDataset", "SpeechDLMDataset", "StripTokenDataset", "SubsampleDataset", "TokenBlockDataset", "TransformEosDataset", "TransformEosLangPairDataset", "TransformEosConcatLangPairDataset", "TruncateDataset", "TruncatedDictionary", ] ================================================ FILE: fairseq/data/add_class_target_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from . 
def collater(self, samples):
    """Collate ``samples`` with the wrapped dataset and attach their labels.

    The wrapped dataset's collater runs first; an empty result is returned
    untouched. Only the labels of samples whose ``"id"`` appears in the
    collated ``"id"`` entry are kept.

    Adds ``"target"`` and ``"ntokens"`` to the batch, plus
    ``"target_lengths"`` when ``batch_targets`` is set. When
    ``add_to_input`` is set, an EOS column is appended to the target and
    prepended to ``net_input["prev_output_tokens"]``.
    """
    batch = self.dataset.collater(samples)
    if len(batch) == 0:
        return batch

    kept_ids = set(batch["id"].tolist())
    target = [sample["label"] for sample in samples if sample["id"] in kept_ids]
    label_lengths = [len(label) for label in target]

    if not self.batch_targets:
        batch["ntokens"] = sum(label_lengths)
    else:
        batch["target_lengths"] = torch.LongTensor(label_lengths)
        target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
        batch["ntokens"] = batch["target_lengths"].sum().item()

    batch["target"] = target

    if not self.add_to_input:
        return batch

    # One EOS per row: it closes the target and opens the decoder input.
    eos_column = target.new_full((target.size(0), 1), self.eos)
    batch["target"] = torch.cat([target, eos_column], dim=-1).long()
    batch["net_input"]["prev_output_tokens"] = torch.cat(
        [eos_column, target], dim=-1
    ).long()
    batch["ntokens"] += target.size(0)
    return batch
class AddTargetDataset(BaseWrapperDataset):
    """Wrap a dataset so each item carries a ``"label"`` and batches carry targets.

    Args:
        dataset: the wrapped dataset; its items are indexed by key and its
            collater must return a mapping containing ``"id"``.
        labels: indexable collection of (possibly compressed) labels.
        pad: padding index used when targets are batched.
        eos: end-of-sentence index used when ``add_to_input`` is set.
        batch_targets: pad the targets into a single tensor when True,
            otherwise keep them as a list.
        process_label: optional callable applied to each decompressed label
            in ``__getitem__``.
        label_len_fn: callable returning the length of a label; used by
            ``size``.
        add_to_input: also build ``net_input["prev_output_tokens"]`` from
            the targets.
        text_compression_level: level handed to ``TextCompressor``.
    """

    def __init__(
        self,
        dataset,
        labels,
        pad,
        eos,
        batch_targets,
        process_label=None,
        label_len_fn=None,
        add_to_input=False,
        text_compression_level=TextCompressionLevel.none,
    ):
        super().__init__(dataset)
        self.labels = labels
        self.batch_targets = batch_targets
        self.pad = pad
        self.eos = eos
        self.process_label = process_label
        self.label_len_fn = label_len_fn
        self.add_to_input = add_to_input
        self.text_compressor = TextCompressor(level=text_compression_level)

    def get_label(self, index, process_fn=None):
        """Return the decompressed label at ``index``, optionally passed through ``process_fn``."""
        lbl = self.labels[index]
        lbl = self.text_compressor.decompress(lbl)
        return lbl if process_fn is None else process_fn(lbl)

    def __getitem__(self, index):
        """Return the wrapped item with its processed label stored under ``"label"``."""
        item = self.dataset[index]
        item["label"] = self.get_label(index, process_fn=self.process_label)
        return item

    def size(self, index):
        """Return ``(wrapped size, label length)`` for the example at ``index``."""
        sz = self.dataset.size(index)
        own_sz = self.label_len_fn(self.get_label(index))
        return sz, own_sz

    def collater(self, samples):
        """Collate ``samples`` and attach targets to the resulting batch.

        Adds ``"target"`` and ``"ntokens"``, plus ``"target_lengths"`` when
        ``batch_targets`` is set. With ``add_to_input``, EOS is appended to
        every target and prepended to every decoder input stored in
        ``net_input["prev_output_tokens"]``; those inputs are padded into a
        tensor whenever the targets are.
        """
        collated = self.dataset.collater(samples)
        if len(collated) == 0:
            return collated
        indices = set(collated["id"].tolist())
        target = [s["label"] for s in samples if s["id"] in indices]

        prev_output_tokens = None
        if self.add_to_input:
            eos = torch.LongTensor([self.eos])
            prev_output_tokens = [torch.cat([eos, t], dim=-1) for t in target]
            target = [torch.cat([t, eos], dim=-1) for t in target]
            collated["net_input"]["prev_output_tokens"] = prev_output_tokens

        if self.batch_targets:
            collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
            target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
            collated["ntokens"] = collated["target_lengths"].sum().item()
            # The previous check used getattr() on the net_input dict, which
            # never finds a key, so the decoder inputs stayed an unpadded
            # list. Track the list locally and pad it alongside the targets.
            if prev_output_tokens is not None:
                collated["net_input"]["prev_output_tokens"] = data_utils.collate_tokens(
                    prev_output_tokens,
                    pad_idx=self.pad,
                    left_pad=False,
                )
        else:
            collated["ntokens"] = sum(len(t) for t in target)

        collated["target"] = target
        return collated

    def filter_indices_by_size(self, indices, max_sizes):
        """Filter ``indices`` by ``max_sizes`` using this dataset's ``size``.

        Returns the ``(kept indices, ignored indices)`` pair produced by
        ``data_utils._filter_by_size_dynamic``.
        """
        indices, ignored = data_utils._filter_by_size_dynamic(
            indices, self.size, max_sizes
        )
        return indices, ignored
def register_audio_transform(name, cls_type, registry, class_names):
    """Build a class decorator that records a transform under ``name``.

    Args:
        name: key under which the decorated class is stored in ``registry``.
        cls_type: base class the decorated class must extend.
        registry: dict mapping transform names to classes; updated in place.
        class_names: set of already registered class names; updated in place.

    Returns:
        A decorator that validates and registers a class, then returns it
        unchanged.

    Raises:
        ValueError: (from the decorator) if ``name`` is already registered,
            the class does not extend ``cls_type``, or a class with the same
            name is already registered.
    """

    def register_audio_transform_cls(cls):
        if name in registry:
            message = f"Cannot register duplicate transform ({name})"
            raise ValueError(message)

        if not issubclass(cls, cls_type):
            message = (
                f"Transform ({name}: {cls.__name__}) must extend "
                f"{cls_type.__name__}"
            )
            raise ValueError(message)

        cls_name = cls.__name__
        if cls_name in class_names:
            message = (
                f"Cannot register audio transform with duplicate "
                f"class name ({cls_name})"
            )
            raise ValueError(message)

        class_names.add(cls_name)
        registry[name] = cls
        return cls

    return register_audio_transform_cls
def convert_waveform(
    waveform: Union[np.ndarray, torch.Tensor],
    sample_rate: int,
    normalize_volume: bool = False,
    to_mono: bool = False,
    to_sample_rate: Optional[int] = None,
) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
    """Convert a waveform: resample, downmix to mono and/or normalize volume.

    Args:
        waveform (numpy.ndarray or torch.Tensor): 2D original waveform
            (channels x length)
        sample_rate (int): original sample rate
        normalize_volume (bool): perform volume normalization
        to_mono (bool): convert to mono channel if having multiple channels
        to_sample_rate (Optional[int]): target sample rate

    Returns:
        waveform: converted 2D waveform (channels x length), of the same
            kind (NumPy array or tensor) as the input
        sample_rate: sample rate of the returned waveform

    Raises:
        ImportError: if a conversion is required and torchaudio is missing.
    """
    effects = []
    if normalize_volume:
        effects.append(["gain", "-n"])
    if to_sample_rate is not None and to_sample_rate != sample_rate:
        effects.append(["rate", f"{to_sample_rate}"])
    if to_mono and waveform.shape[0] > 1:
        effects.append(["channels", "1"])

    if not effects:
        # Nothing to do: hand the input back without requiring torchaudio.
        return waveform, sample_rate

    # Imported lazily so the pass-through case above works without torchaudio.
    try:
        import torchaudio.sox_effects as ta_sox
    except ImportError as err:
        raise ImportError("Please install torchaudio: pip install torchaudio") from err

    is_np_input = isinstance(waveform, np.ndarray)
    _waveform = torch.from_numpy(waveform) if is_np_input else waveform
    converted, converted_sample_rate = ta_sox.apply_effects_tensor(
        _waveform, sample_rate, effects
    )
    if is_np_input:
        converted = converted.numpy()
    return converted, converted_sample_rate
def get_features_from_npy_or_audio(path, waveform_transforms=None):
    """Load features from a ``.npy`` file or compute them from an audio file.

    Args:
        path: file path; its suffix must be listed in
            ``FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS``.
        waveform_transforms: forwarded to ``get_fbank`` for audio files.

    Returns:
        The array stored in the ``.npy`` file, or the result of
        ``get_fbank`` for any other supported suffix.

    Raises:
        ValueError: if the suffix is not supported.
    """
    suffix = Path(path).suffix
    if suffix not in FEATURE_OR_SF_AUDIO_FILE_EXTENSIONS:
        raise ValueError(f'Unsupported file format for "{path}"')
    if suffix == ".npy":
        return np.load(path)
    return get_fbank(path, waveform_transforms=waveform_transforms)
""" _path, slice_ptr = parse_path(path) if len(slice_ptr) == 0: if need_waveform: return get_waveform( _path, always_2d=False, output_sample_rate=use_sample_rate, waveform_transforms=waveform_transforms, )[0] return get_features_from_npy_or_audio( _path, waveform_transforms=waveform_transforms ) elif len(slice_ptr) == 2: features_or_waveform = get_features_or_waveform_from_stored_zip( _path, slice_ptr[0], slice_ptr[1], need_waveform=need_waveform, use_sample_rate=use_sample_rate, waveform_transforms=waveform_transforms, ) else: raise ValueError(f"Invalid path: {path}") return features_or_waveform def _get_kaldi_fbank( waveform: np.ndarray, sample_rate: int, n_bins=80 ) -> Optional[np.ndarray]: """Get mel-filter bank features via PyKaldi.""" try: from kaldi.feat.fbank import Fbank, FbankOptions from kaldi.feat.mel import MelBanksOptions from kaldi.feat.window import FrameExtractionOptions from kaldi.matrix import Vector mel_opts = MelBanksOptions() mel_opts.num_bins = n_bins frame_opts = FrameExtractionOptions() frame_opts.samp_freq = sample_rate opts = FbankOptions() opts.mel_opts = mel_opts opts.frame_opts = frame_opts fbank = Fbank(opts=opts) features = fbank.compute(Vector(waveform.squeeze()), 1.0).numpy() return features except ImportError: return None def _get_torchaudio_fbank( waveform: np.ndarray, sample_rate, n_bins=80 ) -> Optional[np.ndarray]: """Get mel-filter bank features via TorchAudio.""" try: import torchaudio.compliance.kaldi as ta_kaldi waveform = torch.from_numpy(waveform) features = ta_kaldi.fbank( waveform, num_mel_bins=n_bins, sample_frequency=sample_rate ) return features.numpy() except ImportError: return None def get_fbank( path_or_fp: Union[str, BinaryIO], n_bins=80, waveform_transforms=None ) -> np.ndarray: """Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi (faster CPP implementation) to TorchAudio (Python implementation). 
def is_npy_data(data: bytes) -> bool:
    """Return True if ``data`` starts with the bytes ``0x93 'N'``.

    Slicing is used instead of indexing so that empty or one-byte input
    yields ``False`` rather than raising ``IndexError``.
    """
    return bytes(data[:2]) == b"\x93N"


def is_sf_audio_data(data: bytes) -> bool:
    """Return True if ``data`` starts with ``RIF``, ``fLa`` or ``Ogg``.

    These are the three-byte prefixes the original code tested for WAV,
    FLAC and OGG data. Input shorter than three bytes yields ``False``
    rather than raising ``IndexError``.
    """
    return bytes(data[:3]) in (b"RIF", b"fLa", b"Ogg")
def get_mel_filters(
    sample_rate: int, n_fft: int, n_mels: int, f_min: float, f_max: float
) -> torch.Tensor:
    """Build a mel filter bank with librosa and return it as a float tensor.

    Args:
        sample_rate: sampling rate passed to librosa as ``sr``.
        n_fft: FFT size.
        n_mels: number of mel bands.
        f_min: lowest frequency, passed as ``fmin``.
        f_max: highest frequency, passed as ``fmax``.

    Raises:
        ImportError: if librosa is not installed.
    """
    try:
        import librosa
    except ImportError as err:
        raise ImportError("Please install librosa: pip install librosa") from err

    # Keyword arguments work on both older librosa releases and the newer
    # ones where these parameters are keyword-only.
    basis = librosa.filters.mel(
        sr=sample_rate, n_fft=n_fft, n_mels=n_mels, fmin=f_min, fmax=f_max
    )
    return torch.from_numpy(basis).float()
def get_config_from_yaml(yaml_path: Path):
    """Load and return the configuration stored in a YAML file.

    Args:
        yaml_path: path of the YAML file to read.

    Returns:
        The object produced by ``yaml.load`` for the file contents.

    Raises:
        FileNotFoundError: if ``yaml_path`` is not an existing file.
        ImportError: if PyYAML is not installed.
        Exception: if the file cannot be opened or parsed.
    """
    if not yaml_path.is_file():
        raise FileNotFoundError(f"{yaml_path.as_posix()} not found")

    # Previously a missing PyYAML was only printed and the function went on
    # to fail with an unrelated "Failed to load config" error.
    try:
        import yaml
    except ImportError as err:
        raise ImportError("Please install PyYAML: pip install PyYAML") from err

    try:
        with open(yaml_path) as f:
            # NOTE(review): FullLoader is kept from the original code; only
            # use this on config files you trust.
            config = yaml.load(f, Loader=yaml.FullLoader)
    except Exception as e:
        raise Exception(
            f"Failed to load config from {yaml_path.as_posix()}: {e}"
        ) from e
    return config
@property
def pre_tokenizer(self) -> Dict:
    """Settings of the pre-tokenizer applied before subword tokenization.

    The dictionary holds the tokenizer name under `tokenizer` and the
    tokenizer-specific arguments as the other items; it defaults to
    ``{"tokenizer": None}``. Tokenizers are defined in
    `fairseq.data.encoders.*`. The value is passed through
    ``_auto_convert_to_abs_path`` before being returned.
    """
    return self._auto_convert_to_abs_path(
        self.config.get("pre_tokenizer", {"tokenizer": None})
    )
def get_transforms(self, transform_type, split, is_train):
    """Look up the split-specific transforms of one transform type.

    The section ``f"{transform_type}transforms"`` of a deep copy of the
    config is searched for, in order: the exact ``split`` name, the train
    set wildcard `_train` (when ``is_train``) or the evaluation set
    wildcard `_eval` (otherwise), and finally the general wildcard `*`.
    Returns the first entry that is not ``None``, or ``None`` if there is
    no match.
    """
    from copy import deepcopy

    per_split = deepcopy(self.config).get(f"{transform_type}transforms", {})
    wildcard = "_train" if is_train else "_eval"
    for key in (split, wildcard, "*"):
        match = per_split.get(key)
        if match is not None:
            return match
    return None
@property
def input_transformed_channels(self):
    """The number of channels in the audio after feature transforms.

    The legacy ``transforms`` section and the ``feature_transforms``
    section are merged (entries of ``feature_transforms`` win). If the
    merged ``_train`` list contains ``delta_deltas``, the channel count is
    tripled. The merge is done on a fresh dict so the stored config is
    never modified, and a config that only has the legacy ``transforms``
    section is honoured as well.
    """
    # TODO: move this into individual transforms
    # TODO: deprecate transforms
    legacy = self.config.get("transforms") or {}
    current = self.config.get("feature_transforms") or {}
    merged = {**legacy, **current}

    channels = self.input_channels
    if "delta_deltas" in merged.get("_train", []):
        channels *= 3
    return channels
def get_loss_weight(self, num_updates):
    """Return the loss weight of this task after ``num_updates`` updates.

    With the "fixed" schedule the configured ``loss_weight`` (default 1.0)
    is returned. Otherwise the weight starts at ``loss_weight_max`` and
    drops linearly over ``loss_weight_decay_steps`` updates, never going
    below ``loss_weight_min`` (default 0.0001).
    """
    cfg = self.config
    if self.loss_weight_schedule == "fixed":
        return cfg.get("loss_weight", 1.0)

    # "decay"
    assert (
        cfg.get("loss_weight_decay_steps", 0) > 0
    ), "loss_weight_decay_steps must be greater than 0 for a decay schedule"
    floor = cfg.get("loss_weight_min", 0.0001)
    drop_per_update = (cfg["loss_weight_max"] - floor) / cfg[
        "loss_weight_decay_steps"
    ]
    decayed = cfg["loss_weight_max"] - drop_per_update * num_updates
    return max(decayed, floor)
class CompositeAudioDatasetTransform(CompositeAudioTransform):
    """Composite of dataset-level audio transforms."""

    @classmethod
    def from_config_dict(cls, config=None):
        """Build the composite from the ``dataset_transforms`` entry of ``config``.

        ``return_empty=True`` is passed, so a composite with no transforms
        is returned when that entry is missing.
        """
        return super()._from_config_dict(
            cls,
            "dataset",
            get_audio_dataset_transform,
            CompositeAudioDatasetTransform,
            config=config,
            return_empty=True,
        )

    def get_transform(self, cls):
        """Return the first contained transform that is an instance of ``cls``, or ``None``."""
        return next((t for t in self.transforms if isinstance(t, cls)), None)

    def has_transform(self, cls):
        """Return True if a contained transform is an instance of ``cls``."""
        found = self.get_transform(cls)
        return found is not None
def find_indices(self, index: int, n_frames: List[int], n_samples: int):
    """Choose which dataset indices to concatenate for sample ``index``.

    Args:
        index: index of the primary sample.
        n_frames: per-sample lengths, indexed by dataset index.
        n_samples: exclusive upper bound for the randomly drawn partner index.

    Returns:
        ``[index]`` when the augmentation is skipped or no partner is found
        within ``self.attempts`` draws, otherwise ``[index, partner]``.
    """
    # Skip when the random draw exceeds the application rate, or when the
    # primary sample alone is already longer than the token budget.
    if np.random.random() > self.rate or (
        self.max_tokens and n_frames[index] > self.max_tokens
    ):
        return [index]

    # Draw candidate partners; reject the sample itself and any pair whose
    # combined length does not stay strictly below the budget.
    for _attempt in range(self.attempts):
        partner = np.random.randint(0, n_samples)
        if partner == index:
            continue
        if (
            self.max_tokens
            and n_frames[index] + n_frames[partner] >= self.max_tokens
        ):
            continue
        return [index, partner]

    return [index]
def __call__(self, sources):
    """Overlap a random segment of some waveforms with speech or noise.

    Each entry of ``sources`` is selected with probability ``self.rate``
    (approximately; a uniform draw is compared against the rate). A selected
    waveform is mixed either with another entry of ``sources`` or with a
    sample from ``self.noise_shaper``, scaled to a randomly drawn SNR.

    Args:
        sources: list of ``torch`` tensors; it is updated in place.
            NOTE(review): the slicing below indexes the first axis while the
            lengths come from the last axis, so 1-D waveforms are assumed --
            confirm against callers.

    Returns:
        The same ``sources`` list, with augmented entries replaced by
        float tensors.
    """
    for i, source in enumerate(sources):
        if np.random.random() > self.rate:
            continue

        # Tensor.numpy() shares memory with the tensor, so the mix below is
        # written into the caller's waveform.
        pri = source.numpy()

        if np.random.random() > self.mixing_noise_rate:
            # Secondary signal is another utterance of the same batch (it may
            # be the primary itself, the draw does not exclude index i).
            sec = sources[np.random.randint(0, len(sources))].numpy()
            snr = rand_uniform(self.utterance_snr_min, self.utterance_snr_max)
        else:
            sec = self.noise_shaper.pick_sample(source.shape)
            snr = rand_uniform(self.noise_snr_min, self.noise_snr_max)

        pri_len = pri.shape[-1]
        sec_len = sec.shape[-1]

        # The overlap is at most half of the primary and shorter than the
        # secondary. np.random.randint(0, 0) raises ValueError, so leave the
        # sample untouched when there is no room for an overlap at all.
        max_mix_len = min(round(pri_len / 2), sec_len)
        if max_mix_len < 1:
            continue

        mix_len = np.random.randint(0, max_mix_len)
        pri_start = np.random.randint(0, pri_len - mix_len)
        sec_start = np.random.randint(0, sec_len - mix_len)

        sec_power = np.mean(sec**2)
        if sec_power == 0:
            # A silent secondary cannot be scaled to any SNR.
            continue
        pri_power = np.mean(pri**2)

        # Scale the secondary so that pri_power / (scl**2 * sec_power)
        # equals the requested SNR in dB.
        scl = np.sqrt(pri_power / (np.power(10, snr / 10) * sec_power))

        pri[pri_start : pri_start + mix_len] = np.add(
            pri[pri_start : pri_start + mix_len],
            np.multiply(scl, sec[sec_start : sec_start + mix_len]),
        )
        sources[i] = torch.from_numpy(pri).float()

    return sources
def __call__(self, spectrogram):
    """Append delta and delta-delta features to a spectrogram.

    Args:
        spectrogram: 2-D NumPy array laid out as (frames, features).

    Returns:
        NumPy array of shape (frames, 3 * features): the input followed by
        its deltas and delta-deltas along the feature axis.
    """
    from torchaudio.functional import compute_deltas

    assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor."
    # spectrogram is T x F, while compute_deltas takes (..., F, T)
    spectrogram = torch.from_numpy(spectrogram).transpose(0, 1)
    # Honour the configured window; it was previously stored but never used,
    # so compute_deltas always ran with its default win_length.
    delta = compute_deltas(spectrogram, win_length=self.win_length)
    delta_delta = compute_deltas(delta, win_length=self.win_length)

    out_feat = np.concatenate(
        [spectrogram.numpy(), delta.numpy(), delta_delta.numpy()], axis=0
    )
    # Back to the caller's T x (3F) layout.
    out_feat = np.transpose(out_feat)
    return out_feat
_config.get("freq_mask_N", 0), _config.get("freq_mask_F", 0), _config.get("time_mask_N", 0), _config.get("time_mask_T", 0), _config.get("time_mask_p", 0.0), _config.get("mask_value", None), ) def __init__( self, time_warp_w: int = 0, freq_mask_n: int = 0, freq_mask_f: int = 0, time_mask_n: int = 0, time_mask_t: int = 0, time_mask_p: float = 0.0, mask_value: Optional[float] = 0.0, ): # Sanity checks assert mask_value is None or isinstance( mask_value, numbers.Number ), f"mask_value (type: {type(mask_value)}) must be None or a number" if freq_mask_n > 0: assert freq_mask_f > 0, ( f"freq_mask_F ({freq_mask_f}) " f"must be larger than 0 when doing freq masking." ) if time_mask_n > 0: assert time_mask_t > 0, ( f"time_mask_T ({time_mask_t}) must be larger than 0 when " f"doing time masking." ) self.time_warp_w = time_warp_w self.freq_mask_n = freq_mask_n self.freq_mask_f = freq_mask_f self.time_mask_n = time_mask_n self.time_mask_t = time_mask_t self.time_mask_p = time_mask_p self.mask_value = mask_value def __repr__(self): return ( self.__class__.__name__ + "(" + ", ".join( [ f"time_warp_w={self.time_warp_w}", f"freq_mask_n={self.freq_mask_n}", f"freq_mask_f={self.freq_mask_f}", f"time_mask_n={self.time_mask_n}", f"time_mask_t={self.time_mask_t}", f"time_mask_p={self.time_mask_p}", ] ) + ")" ) def __call__(self, spectrogram): assert len(spectrogram.shape) == 2, "spectrogram must be a 2-D tensor." distorted = spectrogram.copy() # make a copy of input spectrogram. num_frames = spectrogram.shape[0] # or 'tau' in the paper. num_freqs = spectrogram.shape[1] # or 'miu' in the paper. mask_value = self.mask_value if mask_value is None: # if no value was specified, use local mean. 
mask_value = spectrogram.mean() if num_frames == 0: return spectrogram if num_freqs < self.freq_mask_f: return spectrogram if self.time_warp_w > 0: if 2 * self.time_warp_w < num_frames: import cv2 w0 = np.random.randint(self.time_warp_w, num_frames - self.time_warp_w) w = np.random.randint(-self.time_warp_w + 1, self.time_warp_w) upper, lower = distorted[:w0, :], distorted[w0:, :] upper = cv2.resize( upper, dsize=(num_freqs, w0 + w), interpolation=cv2.INTER_LINEAR ) lower = cv2.resize( lower, dsize=(num_freqs, num_frames - w0 - w), interpolation=cv2.INTER_LINEAR, ) distorted = np.concatenate((upper, lower), axis=0) for _i in range(self.freq_mask_n): f = np.random.randint(0, self.freq_mask_f) f0 = np.random.randint(0, num_freqs - f) if f != 0: distorted[:, f0 : f0 + f] = mask_value max_time_mask_t = min( self.time_mask_t, math.floor(num_frames * self.time_mask_p) ) if max_time_mask_t < 1: return distorted for _i in range(self.time_mask_n): t = np.random.randint(0, max_time_mask_t) t0 = np.random.randint(0, num_frames - t) if t != 0: distorted[t0 : t0 + t, :] = mask_value return distorted ================================================ FILE: fairseq/data/audio/feature_transforms/utterance_cmvn.py ================================================ import numpy as np from fairseq.data.audio.feature_transforms import ( AudioFeatureTransform, register_audio_feature_transform, ) @register_audio_feature_transform("utterance_cmvn") class UtteranceCMVN(AudioFeatureTransform): """Utterance-level CMVN (cepstral mean and variance normalization)""" @classmethod def from_config_dict(cls, config=None): _config = {} if config is None else config return UtteranceCMVN( _config.get("norm_means", True), _config.get("norm_vars", True), ) def __init__(self, norm_means=True, norm_vars=True): self.norm_means, self.norm_vars = norm_means, norm_vars def __repr__(self): return ( self.__class__.__name__ + f"(norm_means={self.norm_means}, norm_vars={self.norm_vars})" ) def __call__(self, x): 
mean = x.mean(axis=0) square_sums = (x**2).sum(axis=0) if self.norm_means: x = np.subtract(x, mean) if self.norm_vars: var = square_sums / x.shape[0] - mean**2 std = np.sqrt(np.maximum(var, 1e-10)) x = np.divide(x, std) return x ================================================ FILE: fairseq/data/audio/frm_text_to_speech_dataset.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory.abs import csv import logging import os.path as op from typing import List, Optional import numpy as np import torch from fairseq.data import Dictionary from fairseq.data.audio.speech_to_text_dataset import S2TDataConfig from fairseq.data.audio.text_to_speech_dataset import ( TextToSpeechDataset, TextToSpeechDatasetCreator, ) logger = logging.getLogger(__name__) class FrmTextToSpeechDataset(TextToSpeechDataset): def __init__( self, split: str, is_train_split: bool, data_cfg: S2TDataConfig, audio_paths: List[str], n_frames: List[int], src_texts: Optional[List[str]] = None, tgt_texts: Optional[List[str]] = None, speakers: Optional[List[str]] = None, src_langs: Optional[List[str]] = None, tgt_langs: Optional[List[str]] = None, ids: Optional[List[str]] = None, tgt_dict: Optional[Dictionary] = None, pre_tokenizer=None, bpe_tokenizer=None, n_frames_per_step=1, speaker_to_id=None, do_chunk=False, chunk_bound=-1, chunk_init=50, chunk_incr=5, add_eos=True, dedup=True, ref_fpu=-1, ): # It assumes texts are encoded at a fixed frame-rate super().__init__( split=split, is_train_split=is_train_split, data_cfg=data_cfg, audio_paths=audio_paths, n_frames=n_frames, src_texts=src_texts, tgt_texts=tgt_texts, speakers=speakers, src_langs=src_langs, tgt_langs=tgt_langs, ids=ids, tgt_dict=tgt_dict, pre_tokenizer=pre_tokenizer, 
def __getitem__(self, index):
    """Fetch one example, optionally cropped to a random aligned chunk.

    The parent item is taken as is, a trailing EOS is stripped from the
    target, and, for training splits with chunking enabled, matching random
    windows are cut from the target units and the source frames.

    Returns:
        Tuple ``(index, source, target, speaker_id)``.
    """
    index, source, target, speaker_id, _, _, _ = super().__getitem__(index)

    eos_idx = self.tgt_dict.eos_index
    if target[-1].item() == eos_idx:
        target = target[:-1]

    fpu = source.size(0) / target.size(0)  # frame-per-unit
    fps = self.n_frames_per_step
    if self.ref_fpu != -1:
        # Observed frames-per-unit must stay within 10% of the reference.
        assert (
            abs((fpu * fps - self.ref_fpu) / self.ref_fpu) < 0.1
        ), f"{fpu*fps} != {self.ref_fpu}"

    # only chunk training split
    chunking_active = self.is_train_split and self.do_chunk and self.chunk_size > 0
    if chunking_active:
        # The optional language tag stays in front and is never cropped.
        n_lang = int(self.data_cfg.prepend_tgt_lang_tag)
        lang, text = target[:n_lang], target[n_lang:]

        n_units = len(text)
        chunk_size = min(self.chunk_size, n_units)
        chunk_start = np.random.randint(n_units - chunk_size + 1)
        target = torch.cat((lang, text[chunk_start : chunk_start + chunk_size]), 0)

        # Convert the unit window into the corresponding frame window.
        f_size = int(np.floor(chunk_size * fpu))
        f_start = int(np.floor(chunk_start * fpu))
        assert f_size > 0
        source = source[f_start : f_start + f_size, :]

    if self.dedup:
        target = torch.unique_consecutive(target)

    if self.add_eos:
        target = torch.cat((target, torch.LongTensor([eos_idx])), 0)

    return index, source, target, speaker_id
split: str, tgt_dict, pre_tokenizer, bpe_tokenizer, is_train_split: bool, n_frames_per_step: int, speaker_to_id, do_chunk: bool = False, chunk_bound: int = -1, chunk_init: int = 50, chunk_incr: int = 5, add_eos: bool = True, dedup: bool = True, ref_fpu: float = -1, ) -> FrmTextToSpeechDataset: tsv_path = op.join(root, f"{split}.tsv") if not op.isfile(tsv_path): raise FileNotFoundError(f"Dataset not found: {tsv_path}") with open(tsv_path) as f: reader = csv.DictReader( f, delimiter="\t", quotechar=None, doublequote=False, lineterminator="\n", quoting=csv.QUOTE_NONE, ) s = [dict(e) for e in reader] assert len(s) > 0 ids = [ss[cls.KEY_ID] for ss in s] audio_paths = [op.join(data_cfg.audio_root, ss[cls.KEY_AUDIO]) for ss in s] n_frames = [int(ss[cls.KEY_N_FRAMES]) for ss in s] tgt_texts = [ss[cls.KEY_TGT_TEXT] for ss in s] src_texts = [ss.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for ss in s] speakers = [ss.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for ss in s] src_langs = [ss.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for ss in s] tgt_langs = [ss.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for ss in s] return FrmTextToSpeechDataset( split=split, is_train_split=is_train_split, data_cfg=data_cfg, audio_paths=audio_paths, n_frames=n_frames, src_texts=src_texts, tgt_texts=tgt_texts, speakers=speakers, src_langs=src_langs, tgt_langs=tgt_langs, ids=ids, tgt_dict=tgt_dict, pre_tokenizer=pre_tokenizer, bpe_tokenizer=bpe_tokenizer, n_frames_per_step=n_frames_per_step, speaker_to_id=speaker_to_id, do_chunk=do_chunk, chunk_bound=chunk_bound, chunk_init=chunk_init, chunk_incr=chunk_incr, add_eos=add_eos, dedup=dedup, ref_fpu=ref_fpu, ) ================================================ FILE: fairseq/data/audio/hubert_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def load_audio(manifest_path, max_keep, min_keep):
    """Read an audio manifest and keep the entries within the size bounds.

    The first line of the manifest is the audio root; every following line
    is ``<relative path>\\t<number of samples>``.

    Args:
        manifest_path: path of the manifest file.
        max_keep: entries with more samples are skipped (``None`` disables).
        min_keep: entries with fewer samples are skipped (``None`` disables).

    Returns:
        Tuple ``(root, names, inds, tot, sizes)``: the audio root, the kept
        relative paths, their 0-based line indices among the entry lines,
        the total number of entry lines, and the kept sizes.

    Raises:
        AssertionError: if an entry line does not have exactly two
            tab-separated fields.
    """
    n_long, n_short = 0, 0
    names, inds, sizes = [], [], []
    # Stays 0 for a manifest that only contains the root line; previously
    # the loop variable was read after the loop and raised NameError.
    tot = 0
    with open(manifest_path) as f:
        root = f.readline().strip()
        for ind, line in enumerate(f):
            items = line.strip().split("\t")
            assert len(items) == 2, line
            sz = int(items[1])
            if min_keep is not None and sz < min_keep:
                n_short += 1
            elif max_keep is not None and sz > max_keep:
                n_long += 1
            else:
                names.append(items[0])
                inds.append(ind)
                sizes.append(sz)
            tot = ind + 1
    # default=None keeps the summary from crashing when nothing was kept.
    longest = max(sizes, default=None)
    shortest = min(sizes, default=None)
    logger.info(
        (
            f"max_keep={max_keep}, min_keep={min_keep}, "
            f"loaded {len(names)}, skipped {n_short} short and {n_long} long, "
            f"longest-loaded={longest}, shortest-loaded={shortest}"
        )
    )
    return root, names, inds, tot, sizes
skipped") return with open(label_path) as f: lengths = [len(line.rstrip().split()) for line in f] assert len(lengths) == tot lengths = [lengths[i] for i in inds] num_invalid = 0 for i, ind in enumerate(inds): dur_from_audio = audio_sizes[i] / audio_rate dur_from_label = lengths[i] / label_rate if abs(dur_from_audio - dur_from_label) > tol: logger.warning( ( f"audio and label duration differ too much " f"(|{dur_from_audio} - {dur_from_label}| > {tol}) " f"in line {ind+1} of {label_path}. Check if `label_rate` " f"is correctly set (currently {label_rate}). " f"num. of samples = {audio_sizes[i]}; " f"label length = {lengths[i]}" ) ) num_invalid += 1 if num_invalid > 0: logger.warning( f"total {num_invalid} (audio, label) pairs with mismatched lengths" ) class HubertDataset(FairseqDataset): def __init__( self, manifest_path: str, sample_rate: float, label_paths: List[str], label_rates: Union[List[float], float], # -1 for sequence labels pad_list: List[str], eos_list: List[str], label_processors: Optional[List[Any]] = None, max_keep_sample_size: Optional[int] = None, min_keep_sample_size: Optional[int] = None, max_sample_size: Optional[int] = None, shuffle: bool = True, pad_audio: bool = False, normalize: bool = False, store_labels: bool = True, random_crop: bool = False, single_target: bool = False, ): self.audio_root, self.audio_names, inds, tot, self.sizes = load_audio( manifest_path, max_keep_sample_size, min_keep_sample_size ) self.sample_rate = sample_rate self.shuffle = shuffle self.random_crop = random_crop self.num_labels = len(label_paths) self.pad_list = pad_list self.eos_list = eos_list self.label_processors = label_processors self.single_target = single_target self.label_rates = ( [label_rates for _ in range(len(label_paths))] if isinstance(label_rates, float) else label_rates ) self.store_labels = store_labels if store_labels: self.label_list = [load_label(p, inds, tot) for p in label_paths] else: self.label_paths = label_paths self.label_offsets_list = [ 
def get_label(self, index, label_idx):
    """Return the label of sample ``index`` from label set ``label_idx``.

    With ``store_labels`` the label comes from the in-memory list; otherwise
    it is read from the label file using the stored ``(start, end)`` offsets.
    The configured label processor, if any, is applied to the raw text.

    Args:
        index: sample index within the filtered dataset.
        label_idx: which label file / label set to use.

    Returns:
        The raw label string, or the output of the matching processor.
    """
    if self.store_labels:
        label = self.label_list[label_idx][index]
    else:
        offset_s, offset_e = self.label_offsets_list[label_idx][index]
        # load_label_offset measures every line in UTF-8 *bytes*, so the
        # file has to be read as bytes too: a text-mode read() counts
        # characters and would run past the line for non-ASCII labels.
        with open(self.label_paths[label_idx], "rb") as f:
            f.seek(offset_s)
            label = f.read(offset_e - offset_s).decode("utf-8")

    if self.label_processors is not None:
        label = self.label_processors[label_idx](label)
    return label
def collater_audio(self, audios, audio_size):
    """Stack 1-D waveforms into a ``(len(audios), audio_size)`` batch.

    Shorter waveforms are zero-padded on the right (only allowed when
    ``self.pad_audio`` is set); longer ones are cropped through
    ``self.crop_to_max_size``.

    Returns:
        Tuple ``(batch, padding_mask, audio_starts)`` where ``padding_mask``
        is a bool tensor that is ``True`` on padded positions and
        ``audio_starts`` holds the crop offset of every waveform (0 when the
        waveform was not cropped).
    """
    batch = audios[0].new_zeros(len(audios), audio_size)
    # The mask is always materialised, even when pad_audio is off.
    padding_mask = torch.BoolTensor(batch.shape).fill_(False)
    audio_starts = [0] * len(audios)

    for row, wav in enumerate(audios):
        n_missing = audio_size - len(wav)
        if n_missing > 0:
            assert self.pad_audio
            batch[row] = torch.cat([wav, wav.new_full((n_missing,), 0.0)])
            padding_mask[row, -n_missing:] = True
        elif n_missing < 0:
            batch[row], audio_starts[row] = self.crop_to_max_size(wav, audio_size)
        else:
            batch[row] = wav

    return batch, padding_mask, audio_starts
def postprocess(self, wav, cur_sample_rate):
    """Down-mix, validate and optionally normalise a loaded waveform.

    Args:
        wav: 1-D tensor, or 2-D tensor whose last axis is averaged away.
        cur_sample_rate: sample rate the audio was read with.

    Returns:
        1-D tensor; layer-normalised over its full length when
        ``self.normalize`` is set.

    Raises:
        Exception: if ``cur_sample_rate`` differs from ``self.sample_rate``.
    """
    mono = wav.mean(-1) if wav.dim() == 2 else wav
    n_dims = mono.dim()
    assert n_dims == 1, n_dims

    if cur_sample_rate != self.sample_rate:
        raise Exception(f"sr {cur_sample_rate} != {self.sample_rate}")

    if not self.normalize:
        return mono
    with torch.no_grad():
        return F.layer_norm(mono, mono.shape)
class ModalityDatasetItem(NamedTuple):
    """One sub-dataset plus its batching limits, as consumed by
    ``MultiModalityDataset``."""

    # Name of the modality; MultiModalityDataset stores it as the batch "mode".
    datasetname: str
    # The wrapped dataset. The annotation used to be the builtin function
    # ``any``, which is not a type. MultiModalityDataset calls
    # ``ordered_indices``, ``filter_indices_by_size``, ``batch_by_size`` and
    # ``collater`` on it, hence the FairseqDataset forward reference.
    dataset: "FairseqDataset"
    # Passed to ``filter_indices_by_size`` for this dataset.
    max_positions: List[int]
    # Passed to ``batch_by_size`` for this dataset.
    max_tokens: Optional[int] = None
    # Passed to ``batch_by_size`` for this dataset.
    max_sentences: Optional[int] = None
def ordered_indices(self):
    """
    Collect the ordered indices of every sub-dataset.

    Returns a list with one entry per sub-dataset (in dataset order), each
    entry being whatever that dataset's own ``ordered_indices()`` returns.
    The indices are therefore local to each sub-dataset.
    """
    if len(self.datasets) == 1:
        only = self.datasets[0]
        return [only.ordered_indices()]

    grouped = []
    previous_total = 0
    for position, ds in enumerate(self.datasets):
        running_total = self.cumulative_sizes[position]
        # Sanity check: the cumulative sizes must agree with the datasets.
        assert running_total - previous_total == len(ds)
        grouped.append(ds.ordered_indices())
        previous_total = running_total
    return grouped
def mask_src_tokens(self, sample):
    """Replace part of the source tokens of ``sample`` with ``noise_id``.

    With ``mask_type == "random"`` every position is masked when a uniform
    draw is at most ``mask_ratio``; otherwise the trailing ``mask_ratio``
    share of the sequence is masked. A leading ``src_bos`` and a trailing
    ``src_eos`` token are never masked.

    Returns:
        A new dict with the keys ``"id"``, ``"source"`` (the masked copy)
        and ``"target"``; the input sample is left untouched.
    """
    tokens = sample["source"]
    n_tokens = len(tokens)

    if self.mask_type == "random":
        mask = torch.rand(n_tokens).le(self.mask_ratio)
    else:
        # "tail": keep the first (1 - mask_ratio) share, mask the rest.
        n_keep = int(n_tokens * (1 - self.mask_ratio))
        mask = torch.ones(n_tokens)
        mask[:n_keep] = 0
        mask = mask.eq(1)

    # Protect the sentence boundary markers.
    if tokens[0] == self.src_bos:
        mask[0] = False
    if tokens[-1] == self.src_eos:
        mask[-1] = False

    noisy_tokens = tokens.masked_fill(mask, self.noise_id)
    return {"id": sample["id"], "source": noisy_tokens, "target": sample["target"]}
class FileAudioDatasetWrapper(FileAudioDataset):
    """``FileAudioDataset`` whose batches expose the audio as ``src_tokens``."""

    def collater(self, samples):
        """Collate with the parent class, then rename the network inputs.

        ``net_input["source"]`` is moved to ``net_input["src_tokens"]`` and
        the keys ``prev_output_tokens``, ``src_lengths`` and ``alignment``
        are added with value ``None``. An empty parent batch yields ``{}``.
        """
        batch = super().collater(samples)
        if len(batch) == 0:
            return {}

        net_input = batch["net_input"]
        net_input["src_tokens"] = net_input.pop("source")
        net_input.update(
            prev_output_tokens=None,
            src_lengths=None,
            alignment=None,
        )
        return batch
def crop_to_max_size(self, t, target_size, dim=0):
    """Randomly crop ``t`` along ``dim`` so that it has at most ``target_size`` entries.

    Args:
        t: tensor to crop.
        target_size: maximum allowed size along ``dim``.
        dim: dimension to crop; all preceding dimensions are kept whole.

    Returns:
        ``t`` itself when it already fits, otherwise a contiguous window of
        length ``target_size`` whose start offset is drawn with
        ``np.random.randint`` from ``[0, size - target_size]``.
    """
    size = t.size(dim)
    diff = size - target_size
    if diff <= 0:
        return t

    start = np.random.randint(0, diff + 1)
    end = size - diff + start

    # Index with a *tuple* of slices: indexing a tensor with a list of slices
    # is the deprecated "non-tuple sequence" form of multidimensional indexing.
    index = (slice(None),) * dim + (slice(start, end),)
    return t[index]
bucket = max(self._bucketed_sizes[s["id"]] for s in samples) num_pad = bucket - collated_sources.size(-1) if num_pad: input["source"] = self._bucket_tensor(collated_sources, num_pad, 0) input["padding_mask"] = self._bucket_tensor(padding_mask, num_pad, True) if "precomputed_mask" in samples[0]: target_size = self._get_mask_indices_dims(target_size) collated_mask = torch.cat( [ self.crop_to_max_size(s["precomputed_mask"], target_size, dim=1) for s in samples ], dim=0, ) input["precomputed_mask"] = collated_mask out["net_input"] = input return out def _get_mask_indices_dims(self, size, padding=0, dilation=1): if size not in self.feature_encoder_spec: L_in = size for (_, kernel_size, stride) in self.feature_encoder_spec: L_out = L_in + 2 * padding - dilation * (kernel_size - 1) - 1 L_out = 1 + L_out // stride L_in = L_out self._features_size_map[size] = L_out return self._features_size_map[size] def num_tokens(self, index): return self.size(index) def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``.""" if self.pad: return self.sizes[index] return min(self.sizes[index], self.max_sample_size) def ordered_indices(self): """Return an ordered list of indices. 
class FileAudioDataset(RawAudioDataset):
    """Raw-audio dataset driven by a tab-separated manifest file.

    The first manifest line is the root directory; every following line is
    ``<relative path>\\t<size>``. Entries smaller than ``min_sample_size`` are
    dropped and their manifest row numbers are kept in ``skipped_indices``.
    """

    def __init__(
        self,
        manifest_path,
        sample_rate,
        max_sample_size=None,
        min_sample_size=0,
        shuffle=True,
        pad=False,
        normalize=False,
        num_buckets=0,
        compute_mask=False,
        text_compression_level=TextCompressionLevel.none,
        **mask_compute_kwargs,
    ):
        """Read the manifest and set up sizes, file names and buckets.

        Args:
            manifest_path: path of the manifest described in the class docstring.
            sample_rate, max_sample_size, min_sample_size, shuffle, pad,
            normalize, compute_mask, **mask_compute_kwargs: forwarded to
                ``RawAudioDataset.__init__``.
            num_buckets: passed to ``set_bucket_info``.
            text_compression_level: level used by the ``TextCompressor`` that
                stores the file names.
        """
        super().__init__(
            sample_rate=sample_rate,
            max_sample_size=max_sample_size,
            min_sample_size=min_sample_size,
            shuffle=shuffle,
            pad=pad,
            normalize=normalize,
            compute_mask=compute_mask,
            **mask_compute_kwargs,
        )

        self.text_compressor = TextCompressor(level=text_compression_level)

        skipped = 0
        self.fnames = []
        sizes = []
        self.skipped_indices = set()

        with open(manifest_path, "r") as f:
            self.root_dir = f.readline().strip()
            for i, line in enumerate(f):
                items = line.strip().split("\t")
                assert len(items) == 2, line
                sz = int(items[1])
                if min_sample_size is not None and sz < min_sample_size:
                    skipped += 1
                    self.skipped_indices.add(i)
                    continue
                self.fnames.append(self.text_compressor.compress(items[0]))
                sizes.append(sz)
        logger.info(f"loaded {len(self.fnames)}, skipped {skipped} samples")

        self.sizes = np.array(sizes, dtype=np.int64)

        # Best effort: keep the names in a pyarrow array when pyarrow is usable,
        # otherwise stay with the plain list. ``except Exception`` (instead of a
        # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
        try:
            import pyarrow

            self.fnames = pyarrow.array(self.fnames)
        except Exception:
            logger.debug(
                "Could not create a pyarrow array. Please install pyarrow for better performance"
            )

        self.set_bucket_info(num_buckets)

    def __getitem__(self, index):
        """Load one audio file and return ``{"id", "source"[, "precomputed_mask"]}``.

        The read is attempted up to three times. ``Exception`` is raised (chained
        to the last read error) when every attempt fails.
        """
        import soundfile as sf

        fn = self.fnames[index]
        # pyarrow scalars need converting back to a Python object
        fn = fn if isinstance(self.fnames, list) else fn.as_py()
        fn = self.text_compressor.decompress(fn)
        path_or_fp = os.path.join(self.root_dir, fn)
        _path, slice_ptr = parse_path(path_or_fp)
        if len(slice_ptr) == 2:
            # two slice values: the audio bytes are read out of a stored zip
            byte_data = read_from_stored_zip(_path, slice_ptr[0], slice_ptr[1])
            assert is_sf_audio_data(byte_data)
            path_or_fp = io.BytesIO(byte_data)

        retry = 3
        wav = None
        last_error = None
        for i in range(retry):
            try:
                if isinstance(path_or_fp, io.BytesIO):
                    # a failed attempt may have advanced the in-memory stream
                    path_or_fp.seek(0)
                wav, curr_sample_rate = sf.read(path_or_fp, dtype="float32")
                break
            except Exception as e:
                last_error = e
                logger.warning(
                    f"Failed to read {path_or_fp}: {e}. Sleeping for {1 * i}"
                )
                time.sleep(1 * i)

        if wav is None:
            raise Exception(f"Failed to load {path_or_fp}") from last_error

        feats = torch.from_numpy(wav).float()
        feats = self.postprocess(feats, curr_sample_rate)

        v = {"id": index, "source": feats}

        if self.is_compute_mask:
            T = self._get_mask_indices_dims(feats.size(-1))
            # NOTE(review): require_same_masks is fixed to True here although the
            # dataset stores self.require_same_masks — confirm this is intended.
            mask = compute_block_mask_1d(
                shape=(self.clone_batch, T),
                mask_prob=self.mask_prob,
                mask_length=self.mask_length,
                mask_prob_adjust=self.mask_prob_adjust,
                inverse_mask=self.inverse_mask,
                require_same_masks=True,
                expand_adjcent=self.expand_adjacent,
                mask_dropout=self.mask_dropout,
                non_overlapping=self.non_overlapping,
            )

            v["precomputed_mask"] = mask

        return v
def __getitem__(self, index):
    """Read the audio file for ``index`` and return it as a sample dict.

    The file name is decoded from ``self.fnames`` via ``self.fnames_dict`` and
    joined with ``self.root_dir`` when one is set. The result always has
    ``"id"`` and ``"source"``; ``"precomputed_mask"`` is added when
    ``self.is_compute_mask`` is true.
    """
    import soundfile as sf

    rel_name = self.fnames_dict.string(self.fnames[index], separator="")
    audio_path = os.path.join(self.root_dir, rel_name) if self.root_dir else rel_name

    wav, curr_sample_rate = sf.read(audio_path)
    feats = self.postprocess(torch.from_numpy(wav).float(), curr_sample_rate)
    item = {"id": index, "source": feats}

    if not self.is_compute_mask:
        return item

    n_positions = self._get_mask_indices_dims(feats.size(-1))
    item["precomputed_mask"] = compute_block_mask_1d(
        shape=(self.clone_batch, n_positions),
        mask_prob=self.mask_prob,
        mask_length=self.mask_length,
        mask_prob_adjust=self.mask_prob_adjust,
        inverse_mask=self.inverse_mask,
        require_same_masks=True,
        expand_adjcent=self.expand_adjacent,
        mask_dropout=self.mask_dropout,
        non_overlapping=self.non_overlapping,
    )
    return item
@dataclass
class SpeechToSpeechDatasetItem(object):
    """A single example as produced by ``SpeechToSpeechDataset.__getitem__``."""

    # position of the example inside the dataset
    index: int
    # source audio returned by ``_get_source_audio``
    source: torch.Tensor
    # target frames, or encoded target unit ids when the target is code
    target: Optional[torch.Tensor] = None
    # target speaker features; an empty float tensor when no speakers are configured
    target_speaker: Optional[torch.Tensor] = None
    # dictionary index of the target-language tag, when it is used as BOS
    tgt_lang_tag: Optional[int] = None
def pack_units(self, input: torch.Tensor) -> torch.Tensor:
    """Merge every ``n_frames_per_step`` consecutive unit ids into one id.

    ``input`` is a 1-D id sequence whose last element is kept as the final
    element of the result. The remaining ids are grouped, shifted down by the
    4 special-symbol slots, read as digits of a base-``vocab_size`` number
    (most significant first) and shifted back up by 4. With
    ``n_frames_per_step <= 1`` the input is returned untouched.
    """
    step = self.n_frames_per_step
    if step <= 1:
        return input

    offset = 4
    # ids below the offset are <bos>, <pad>, <eos>, <unk> in a fairseq dictionary
    vocab_size = len(self.tgt_dict) - offset
    assert input.dim() == 1

    # drop the trailing <eos> and lay the units out one group per row
    groups = input[:-1].view(-1, step) - offset
    weights = torch.LongTensor([vocab_size ** (step - 1 - k) for k in range(step)])

    n_groups = (len(input) - 1) // step
    packed = input.new_empty(n_groups + 1)
    packed.fill_(input[-1])
    packed[:-1] = (groups * weights).sum(dim=1) + offset
    return packed
def collater(
    self, samples: List[SpeechToSpeechDatasetItem], return_order: bool = False
) -> Dict:
    """Batch ``samples``, ordered by descending number of source frames.

    Returns ``{}`` for an empty list. Otherwise the batch holds ``id``,
    ``net_input`` (``src_tokens``, ``src_lengths``, ``prev_output_tokens``,
    ``tgt_speaker``), ``speaker``, ``target``, ``target_lengths``, ``ntokens``
    and ``nsentences``; ``order`` (the sort permutation) is included when
    ``return_order`` is true.
    """
    if len(samples) == 0:
        return {}

    indices = torch.tensor([item.index for item in samples], dtype=torch.long)
    frames = _collate_frames(
        [item.source for item in samples], self.cfg.use_audio_input
    )
    # longest source first
    src_lengths = torch.tensor(
        [item.source.size(0) for item in samples], dtype=torch.long
    )
    src_lengths, order = src_lengths.sort(descending=True)

    def reorder(batch_tensor):
        # apply the length-sort permutation along the batch dimension
        return batch_tensor.index_select(0, order)

    indices = reorder(indices)
    frames = reorder(frames)

    target, prev_output_tokens, target_lengths = self._collate_target(samples)
    target = reorder(target)
    target_lengths = reorder(target_lengths)
    prev_output_tokens = reorder(prev_output_tokens)
    ntokens = sum(item.target.size(0) for item in samples)

    tgt_speakers = None
    if self.cfg.target_speaker_embed:
        tgt_speakers = reorder(
            _collate_frames(
                [item.target_speaker for item in samples], is_audio_input=True
            )
        )

    net_input = {
        "src_tokens": frames,
        "src_lengths": src_lengths,
        "prev_output_tokens": prev_output_tokens,
        "tgt_speaker": tgt_speakers,  # TODO: unify "speaker" and "tgt_speaker"
    }
    if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
        # the language tag takes the first decoder-input position of every row
        for row, sample_pos in enumerate(order.tolist()):
            prev_output_tokens[row][0] = samples[sample_pos].tgt_lang_tag

    out = {
        "id": indices,
        "net_input": net_input,
        "speaker": tgt_speakers,  # to support Tacotron2 loss for speech-to-spectrogram model
        "target": target,
        "target_lengths": target_lengths,
        "ntokens": ntokens,
        "nsentences": len(samples),
    }
    if return_order:
        out["order"] = order
    return out
@classmethod
def _from_list(
    cls,
    split_name: str,
    is_train_split,
    samples: List[Dict],
    data_cfg: S2SDataConfig,
    target_is_code: bool = False,
    tgt_dict: Dictionary = None,
    n_frames_per_step: int = 1,
    multitask: Optional[Dict] = None,
) -> SpeechToSpeechDataset:
    """Build a dataset for one split from already-loaded manifest rows.

    Source audio paths are resolved against ``data_cfg.audio_root``; target
    entries are resolved the same way unless ``target_is_code`` is set, in
    which case they are used verbatim. A ``SpeechToSpeechMultitaskDataset`` is
    created (and one ``TextTargetMultitaskData`` registered per task) when
    ``multitask`` is a non-empty mapping.
    """
    audio_root = Path(data_cfg.audio_root)

    def under_root(relative):
        return (audio_root / relative).as_posix()

    ids = [row[cls.KEY_ID] for row in samples]
    src_audio_paths = [under_root(row[cls.KEY_SRC_AUDIO]) for row in samples]
    if target_is_code:
        tgt_audio_paths = [row[cls.KEY_TGT_AUDIO] for row in samples]
    else:
        tgt_audio_paths = [under_root(row[cls.KEY_TGT_AUDIO]) for row in samples]
    src_n_frames = [int(row[cls.KEY_SRC_N_FRAMES]) for row in samples]
    tgt_n_frames = [int(row[cls.KEY_TGT_N_FRAMES]) for row in samples]
    src_langs = [row.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for row in samples]
    tgt_langs = [row.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for row in samples]

    has_multitask = multitask is not None and len(multitask.keys()) > 0
    if has_multitask:
        dataset_cls = SpeechToSpeechMultitaskDataset
    else:
        dataset_cls = SpeechToSpeechDataset

    ds = dataset_cls(
        split=split_name,
        is_train_split=is_train_split,
        data_cfg=data_cfg,
        src_audio_paths=src_audio_paths,
        src_n_frames=src_n_frames,
        tgt_audio_paths=tgt_audio_paths,
        tgt_n_frames=tgt_n_frames,
        src_langs=src_langs,
        tgt_langs=tgt_langs,
        ids=ids,
        target_is_code=target_is_code,
        tgt_dict=tgt_dict,
        n_frames_per_step=n_frames_per_step,
    )

    if not has_multitask:
        return ds

    for task_name, task_obj in multitask.items():
        ds.add_multitask_dataset(
            task_name,
            TextTargetMultitaskData(
                task_obj.args, split_name, task_obj.target_dictionary
            ),
        )
    return ds
int = 1, multitask: Optional[Dict] = None, ) -> SpeechToSpeechDataset: datasets = [] for split in splits.split(","): samples = SpeechToTextDatasetCreator._load_samples_from_tsv(root, split) ds = cls._from_list( split_name=split, is_train_split=is_train_split, samples=samples, data_cfg=data_cfg, target_is_code=target_is_code, tgt_dict=tgt_dict, n_frames_per_step=n_frames_per_step, multitask=multitask, ) datasets.append(ds) return ConcatDataset(datasets) if len(datasets) > 1 else datasets[0] ================================================ FILE: fairseq/data/audio/speech_to_text_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import csv import logging import re from argparse import Namespace from collections import defaultdict from dataclasses import dataclass from pathlib import Path from typing import Dict, List, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F from fairseq.data import ConcatDataset, Dictionary, FairseqDataset, ResamplingDataset from fairseq.data import data_utils as fairseq_data_utils from fairseq.data import encoders from fairseq.data.audio.audio_utils import get_features_or_waveform from fairseq.data.audio.data_cfg import S2TDataConfig from fairseq.data.audio.dataset_transforms import CompositeAudioDatasetTransform from fairseq.data.audio.dataset_transforms.concataugment import ConcatAugment from fairseq.data.audio.dataset_transforms.noisyoverlapaugment import ( NoisyOverlapAugment, ) from fairseq.data.audio.feature_transforms import CompositeAudioFeatureTransform from fairseq.data.audio.waveform_transforms import CompositeAudioWaveformTransform logger = logging.getLogger(__name__) def _collate_frames( frames: List[torch.Tensor], is_audio_input: bool = False ) -> torch.Tensor: """ Convert a list of 2D frames into a padded 
3D tensor Args: frames (list): list of 2D frames of size L[i]*f_dim. Where L[i] is length of i-th frame and f_dim is static dimension of features Returns: 3D tensor of size len(frames)*len_max*f_dim where len_max is max of L[i] """ max_len = max(frame.size(0) for frame in frames) if is_audio_input: out = frames[0].new_zeros((len(frames), max_len)) else: out = frames[0].new_zeros((len(frames), max_len, frames[0].size(1))) for i, v in enumerate(frames): out[i, : v.size(0)] = v return out def _is_int_or_np_int(n): return isinstance(n, int) or ( isinstance(n, np.generic) and isinstance(n.item(), int) ) @dataclass class SpeechToTextDatasetItem(object): index: int source: torch.Tensor target: Optional[torch.Tensor] = None speaker_id: Optional[int] = None class SpeechToTextDataset(FairseqDataset): LANG_TAG_TEMPLATE = "<lang:{}>" def __init__( self, split: str, is_train_split: bool, cfg: S2TDataConfig, audio_paths: List[str], n_frames: List[int], src_texts: Optional[List[str]] = None, tgt_texts: Optional[List[str]] = None, speakers: Optional[List[str]] = None, src_langs: Optional[List[str]] = None, tgt_langs: Optional[List[str]] = None, ids: Optional[List[str]] = None, tgt_dict: Optional[Dictionary] = None, pre_tokenizer=None, bpe_tokenizer=None, n_frames_per_step=1, speaker_to_id=None, append_eos=True, ): self.split, self.is_train_split = split, is_train_split self.cfg = cfg self.audio_paths, self.n_frames = audio_paths, n_frames self.n_samples = len(audio_paths) assert len(n_frames) == self.n_samples > 0 assert src_texts is None or len(src_texts) == self.n_samples assert tgt_texts is None or len(tgt_texts) == self.n_samples assert speakers is None or len(speakers) == self.n_samples assert src_langs is None or len(src_langs) == self.n_samples assert tgt_langs is None or len(tgt_langs) == self.n_samples assert ids is None or len(ids) == self.n_samples assert (tgt_dict is None and tgt_texts is None) or ( tgt_dict is not None and tgt_texts is not None ) self.src_texts, 
def get_tgt_lens_and_check_oov(self):
    """Return the tokenized target length of every sample and log the OOV rate.

    Without target texts every length is ``0`` and nothing is logged. A token
    counts as out-of-vocabulary when ``self.tgt_dict.index`` maps it to
    ``self.tgt_dict.unk_index``.
    """
    if self.tgt_texts is None:
        return [0] * self.n_samples

    tgt_lens = []
    n_oov_tokens = 0
    for sample_idx in range(self.n_samples):
        pieces = self.get_tokenized_tgt_text(sample_idx).split(" ")
        n_oov_tokens += sum(
            1
            for piece in pieces
            if self.tgt_dict.index(piece) == self.tgt_dict.unk_index
        )
        tgt_lens.append(len(pieces))
    n_tokens = sum(tgt_lens)

    logger.info(f"'{self.split}' has {n_oov_tokens / n_tokens * 100:.2f}% OOV")
    return tgt_lens
def pack_frames(self, feature: torch.Tensor):
    """Stack every ``n_frames_per_step`` consecutive frames into one row.

    Trailing frames that do not fill a complete group are dropped. With
    ``n_frames_per_step == 1`` the input is returned as is.
    """
    step = self.n_frames_per_step
    if step == 1:
        return feature

    n_groups = feature.shape[0] // step
    usable = feature[: step * n_groups]
    return usable.reshape(n_groups, -1)
def __getitem__(self, index: int) -> SpeechToTextDatasetItem:
    """Build the ``SpeechToTextDatasetItem`` for ``index``.

    The source comes from ``_get_source_audio`` followed by ``pack_frames``.
    When target texts exist, the tokenized text is encoded with ``tgt_dict``
    and optionally wrapped with language-tag / BOS ids according to the data
    config. ``speaker_id`` is looked up through ``speaker_to_id`` when set.
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed dump — confirm against the repository.
    has_concat = self.dataset_transforms.has_transform(ConcatAugment)
    if has_concat:
        # ConcatAugment picks the indices whose audio and text are concatenated
        concat = self.dataset_transforms.get_transform(ConcatAugment)
        indices = concat.find_indices(index, self.n_frames, self.n_samples)

    source = self._get_source_audio(indices if has_concat else index)
    source = self.pack_frames(source)

    target = None
    if self.tgt_texts is not None:
        tokenized = self.get_tokenized_tgt_text(indices if has_concat else index)
        target = self.tgt_dict.encode_line(
            tokenized, add_if_not_exist=False, append_eos=self.append_eos
        ).long()
        if self.cfg.prepend_tgt_lang_tag:
            # language tag goes in front of the encoded target
            lang_tag_idx = self.get_lang_tag_idx(
                self.tgt_langs[index], self.tgt_dict
            )
            target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0)

    if self.cfg.prepend_bos_and_append_tgt_lang_tag:
        # layout becomes: <bos> target... <lang tag>
        bos = torch.LongTensor([self.tgt_dict.bos()])
        lang_tag_idx = self.get_lang_tag_idx(self.tgt_langs[index], self.tgt_dict)
        assert lang_tag_idx != self.tgt_dict.unk()
        lang_tag_idx = torch.LongTensor([lang_tag_idx])
        target = torch.cat((bos, target, lang_tag_idx), 0)

    speaker_id = None
    if self.speaker_to_id is not None:
        speaker_id = self.speaker_to_id[self.speakers[index]]
    return SpeechToTextDatasetItem(
        index=index, source=source, target=target, speaker_id=speaker_id
    )
def prefetch(self, indices):
    """Prefetching is not implemented for this dataset.

    Raises:
        NotImplementedError: always. The previous ``raise False`` is not a
            valid raise target and surfaced as an unrelated ``TypeError``.
    """
    raise NotImplementedError("prefetch is not supported by this dataset")
def get(self, sample_id, tgt_lang=None):
    """Return the encoded auxiliary-task target for one sample.

    Args:
        sample_id: key into ``self.data`` (the manifest ``id`` column).
        tgt_lang: language used to look up the language tag; only read when
            ``prepend_bos_and_append_tgt_lang_tag`` is enabled.

    Returns:
        The tensor produced by ``self.dict.encode_line``; when
        ``prepend_bos_and_append_tgt_lang_tag`` is set it is wrapped as
        ``[bos, *tokens, lang_tag]``. If the sample has no entry, a warning is
        logged and an empty ``torch.IntTensor`` is returned.
    """
    # Guard clause: samples missing from the manifest get an empty target.
    if sample_id not in self.data:
        logger.warning(f"no target for {sample_id}")
        return torch.IntTensor([])

    encoded = self.dict.encode_line(
        self.get_tokenized_tgt_text(sample_id),
        add_if_not_exist=False,
        append_eos=self.append_eos,
    )
    if not self.prepend_bos_and_append_tgt_lang_tag:
        return encoded

    bos_tensor = torch.LongTensor([self.dict.bos()])
    tag_idx = self.get_lang_tag_idx(tgt_lang, self.dict)
    # The tag must be a known dictionary symbol.
    assert tag_idx != self.dict.unk()
    tag_tensor = torch.LongTensor([tag_idx])
    return torch.cat((bos_tensor, encoded, tag_tensor), 0)
def collater(
    self, samples: List[Tuple[SpeechToTextDatasetItem, Dict[str, torch.Tensor]]]
) -> Dict:
    """Collate speech items together with their per-task targets.

    Args:
        samples: ``(speech_item, {task_name: target})`` pairs as produced by
            ``__getitem__``.

    Returns:
        The parent collater's batch (without its ``"order"`` entry). When at
        least one auxiliary task is registered it also carries
        ``batch["multitask"][task_name]`` with ``target``, ``target_lengths``,
        ``ntokens`` and ``net_input["prev_output_tokens"]``; the tensors are
        reordered with the same permutation the parent applied. An empty
        ``samples`` list yields ``{}``.
    """
    if len(samples) == 0:
        return {}

    batch = super().collater([item for item, _ in samples], return_order=True)
    # The permutation is only needed here; it is not part of the final batch.
    order = batch.pop("order")

    for task_name, task_dataset in self.multitask_data.items():
        per_task = batch.setdefault("multitask", {})
        task_batch = task_dataset.collater(
            [targets[task_name] for _, targets in samples]
        )
        per_task[task_name] = {
            "target": task_batch["target"].index_select(0, order),
            "target_lengths": task_batch["target_lengths"].index_select(0, order),
            "ntokens": task_batch["ntokens"],
            "net_input": {
                "prev_output_tokens": task_batch[
                    "prev_output_tokens"
                ].index_select(0, order),
            },
        }

    return batch
[s[cls.KEY_TGT_TEXT] for s in samples] src_texts = [s.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for s in samples] speakers = [s.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for s in samples] src_langs = [s.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for s in samples] tgt_langs = [s.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for s in samples] has_multitask = multitask is not None and len(multitask.keys()) > 0 dataset_cls = ( SpeechToTextMultitaskDataset if has_multitask else SpeechToTextDataset ) ds = dataset_cls( split=split_name, is_train_split=is_train_split, cfg=cfg, audio_paths=audio_paths, n_frames=n_frames, src_texts=src_texts, tgt_texts=tgt_texts, speakers=speakers, src_langs=src_langs, tgt_langs=tgt_langs, ids=ids, tgt_dict=tgt_dict, pre_tokenizer=pre_tokenizer, bpe_tokenizer=bpe_tokenizer, n_frames_per_step=n_frames_per_step, speaker_to_id=speaker_to_id, ) if has_multitask: for task_name, task_obj in multitask.items(): task_data = TextTargetMultitaskData( task_obj.args, split_name, task_obj.target_dictionary ) ds.add_multitask_dataset(task_name, task_data) return ds @classmethod def get_size_ratios( cls, datasets: List[SpeechToTextDataset], alpha: float = 1.0 ) -> List[float]: """Size ratios for temperature-based sampling (https://arxiv.org/abs/1907.05019)""" id_to_lp, lp_to_sz = {}, defaultdict(int) for ds in datasets: lang_pairs = {f"{s}->{t}" for s, t in zip(ds.src_langs, ds.tgt_langs)} assert len(lang_pairs) == 1 lang_pair = list(lang_pairs)[0] id_to_lp[ds.split] = lang_pair lp_to_sz[lang_pair] += sum(ds.n_frames) sz_sum = sum(v for v in lp_to_sz.values()) lp_to_prob = {k: v / sz_sum for k, v in lp_to_sz.items()} lp_to_tgt_prob = {k: v**alpha for k, v in lp_to_prob.items()} prob_sum = sum(v for v in lp_to_tgt_prob.values()) lp_to_tgt_prob = {k: v / prob_sum for k, v in lp_to_tgt_prob.items()} lp_to_sz_ratio = { k: (lp_to_tgt_prob[k] * sz_sum) / v for k, v in lp_to_sz.items() } size_ratio = [lp_to_sz_ratio[id_to_lp[ds.split]] for ds in datasets] p_formatted = { 
@classmethod
def _load_samples_from_tsv(cls, root: str, split: str):
    """Read the manifest ``<root>/<split>.tsv`` into a list of row dicts.

    The file is parsed as tab-separated values with quoting disabled, so
    quote characters inside a field are kept literally.

    Args:
        root: directory that contains the manifest.
        split: manifest name without the ``.tsv`` suffix.

    Returns:
        One ``dict`` per data row, keyed by the header columns.

    Raises:
        FileNotFoundError: if the manifest file does not exist.
        ValueError: if the manifest has no data rows.
    """
    tsv_path = Path(root) / f"{split}.tsv"
    if not tsv_path.is_file():
        raise FileNotFoundError(f"Dataset not found: {tsv_path}")
    # Manifests carry free-form text columns; decode them as UTF-8 explicitly
    # instead of depending on the platform's locale default.
    with open(tsv_path, encoding="utf-8") as f:
        reader = csv.DictReader(
            f,
            delimiter="\t",
            quotechar=None,
            doublequote=False,
            lineterminator="\n",
            quoting=csv.QUOTE_NONE,
        )
        samples = [dict(e) for e in reader]
    if len(samples) == 0:
        raise ValueError(f"Empty manifest: {tsv_path}")
    return samples
class S2TJointDataConfig(S2TDataConfig):
    """Data-config YAML wrapper with the extra keys used for joint training."""

    @property
    def src_vocab_filename(self):
        """Name of the source-side fairseq vocabulary file under the data root.

        Falls back to ``"src_dict.txt"`` when the key is absent.
        """
        default_name = "src_dict.txt"
        return self.config.get("src_vocab_filename", default_name)

    @property
    def src_pre_tokenizer(self) -> Dict:
        """Pre-tokenizer applied to source text before subword tokenization.

        The returned dictionary names the tokenizer under ``tokenizer``; any
        other items are tokenizer-specific arguments. Tokenizers are defined
        in `fairseq.data.encoders.*`. Defaults to ``{"tokenizer": None}``.
        """
        disabled = {"tokenizer": None}
        return self.config.get("src_pre_tokenizer", disabled)

    @property
    def src_bpe_tokenizer(self) -> Dict:
        """Subword tokenizer applied to source text after pre-tokenization.

        The returned dictionary names the tokenizer under ``bpe``; any other
        items are tokenizer-specific arguments. Tokenizers are defined in
        `fairseq.data.encoders.*`. Defaults to ``{"bpe": None}``.
        """
        disabled = {"bpe": None}
        return self.config.get("src_bpe_tokenizer", disabled)

    @property
    def prepend_tgt_lang_tag_no_change(self) -> bool:
        """Whether the target language tag replaces BOS in prev_output_tokens.

        Used e.g. for the to-many multilingual setting; nothing has to change
        at inference time. The key ``prepend_tgt_lang_tag_no_change`` is
        deprecated in favour of ``prepend_tgt_lang_tag_as_bos``: the old key
        wins when it is present, otherwise the new key is read (default
        ``False``).
        """
        legacy = self.config.get("prepend_tgt_lang_tag_no_change", None)
        if legacy is not None:
            return legacy
        return self.config.get("prepend_tgt_lang_tag_as_bos", False)

    @property
    def sampling_text_alpha(self):
        """Alpha = 1/T for temperature-based resampling of text-only input.

        ``1.0`` (the default) means no resampling.
        """
        return self.config.get("sampling_text_alpha", 1.0)
def collater(self, samples: List[SpeechToTextJointDatasetItem]) -> Dict:
    """Batch joint speech/text items on top of the parent collater.

    Args:
        samples: items produced by ``__getitem__``.

    Returns:
        ``{}`` when the parent collater returns an empty batch. Otherwise a
        dict with ``id``, ``net_input``, ``target``, ``target_lengths``,
        ``ntokens`` and ``nsentences``. ``net_input`` additionally gets
        ``src_txt_tokens`` / ``src_txt_lengths`` (when source text and a
        source dictionary are available) and ``alignment`` (``None`` when the
        dataset has no alignment).

    Raises:
        NotImplementedError: if ``use_src_lang_id`` is greater than 1.
    """
    base_batch = super().collater(samples, return_order=True)
    if base_batch == {}:
        return base_batch

    net_input = base_batch["net_input"]
    order = base_batch["order"]

    if self.src_texts is not None and self.src_dict is not None:
        src_txt_tokens = fairseq_data_utils.collate_tokens(
            [item.src_txt_tokens for item in samples],
            self.src_dict.pad(),
            self.src_dict.eos(),
            left_pad=False,
            move_eos_to_beginning=False,
        )
        src_txt_lengths = torch.tensor(
            [item.src_txt_tokens.size()[0] for item in samples], dtype=torch.long
        )
        if self.use_src_lang_id > 0:
            lang_ids = torch.tensor(
                [item.src_lang_tag for item in samples], dtype=src_txt_tokens.dtype
            )
            if self.use_src_lang_id != 1:
                raise NotImplementedError("Implementation is required")
            # replace eos with lang_id: overwrite the last real token per row.
            # This happens before the rows are reordered, so lengths and rows
            # still line up with ``samples``.
            last_pos = (src_txt_lengths - 1).view(-1, 1)
            src_txt_tokens.scatter_(1, last_pos, lang_ids.view(-1, 1))
        net_input["src_txt_tokens"] = src_txt_tokens.index_select(0, order)
        net_input["src_txt_lengths"] = src_txt_lengths.index_select(0, order)

    net_input["alignment"] = None
    if self.alignment is not None:
        longest = max(item.tgt_alignment.size(0) for item in samples)
        # Rows shorter than the longest alignment keep 1.0 in the padding.
        padded = torch.ones(len(samples), longest).float()
        for row, item in enumerate(samples):
            n_values = item.tgt_alignment.size(0)
            padded[row][:n_values].copy_(item.tgt_alignment)
        net_input["alignment"] = padded.index_select(0, order)

    if self.tgt_texts is not None and samples[0].tgt_lang_tag is not None:
        # prev_output_tokens is already sorted, so map each row back to its
        # original sample through ``order``.
        prev_tokens = net_input["prev_output_tokens"]
        for row in range(len(samples)):
            prev_tokens[row][0] = samples[order[row]].tgt_lang_tag

    return {
        "id": base_batch["id"],
        "net_input": net_input,
        "target": base_batch["target"],
        "target_lengths": base_batch["target_lengths"],
        "ntokens": base_batch["ntokens"],
        "nsentences": len(samples),
    }
@classmethod
def from_tsv(
    cls,
    root: str,
    cfg: S2TJointDataConfig,
    splits: str,
    tgt_dict,
    src_dict,
    pre_tokenizer,
    bpe_tokenizer,
    src_pre_tokenizer,
    src_bpe_tokenizer,
    is_train_split: bool,
    epoch: int,
    seed: int,
    append_eos: Optional[bool] = True,
    use_src_lang_id: Optional[int] = 0,
) -> SpeechToTextJointDataset:
    """Build one dataset from a comma-separated list of manifest splits.

    Each split is loaded through ``cls._from_tsv``. With a single split that
    dataset is returned directly. With several splits they are concatenated;
    for training with ``cfg.sampling_alpha != 1.0`` each one is first wrapped
    in a ``ResamplingDataset`` using the ratios from ``cls.get_size_ratios``
    (temperature-based sampling).
    """
    per_split = []
    for split_name in splits.split(","):
        per_split.append(
            cls._from_tsv(
                root,
                cfg,
                split_name,
                tgt_dict,
                src_dict,
                is_train_split,
                pre_tokenizer,
                bpe_tokenizer,
                src_pre_tokenizer,
                src_bpe_tokenizer,
                append_eos=append_eos,
                use_src_lang_id=use_src_lang_id,
            )
        )

    # A single split needs neither resampling nor concatenation.
    if len(per_split) <= 1:
        return per_split[0]

    if is_train_split and cfg.sampling_alpha != 1.0:
        # temperature-based sampling
        ratios = cls.get_size_ratios(per_split, alpha=cfg.sampling_alpha)
        per_split = [
            ResamplingDataset(
                ds, size_ratio=ratio, seed=seed, epoch=epoch, replace=(ratio >= 1.0)
            )
            for ratio, ds in zip(ratios, per_split)
        ]

    return ConcatDataset(per_split)
@dataclass
class TextToSpeechDatasetItem:
    """One sample as returned by ``TextToSpeechDataset.__getitem__``."""

    # Position of the sample inside the dataset.
    index: int
    # ``source`` of the parent dataset item; the collater batches it as the
    # model ``target``.
    source: torch.Tensor
    # ``target`` of the parent dataset item; the collater batches it as
    # ``src_tokens``.
    target: Optional[torch.Tensor] = None
    # Speaker id copied from the parent dataset item.
    speaker_id: Optional[int] = None
    # Long tensor of durations with one trailing 0 (padding for EOS).
    duration: Optional[torch.Tensor] = None
    # Float tensor of pitch values with one trailing 0 (padding for EOS).
    pitch: Optional[torch.Tensor] = None
    # Float tensor of energy values with one trailing 0 (padding for EOS).
    energy: Optional[torch.Tensor] = None
def __getitem__(self, index: int) -> TextToSpeechDatasetItem:
    """Return the parent item extended with duration, pitch and energy.

    Each optional feature is ``None`` when the dataset was built without it.
    Durations come from ``self.durations``; pitch and energy are loaded with
    ``get_features_or_waveform``. Every present feature gets one trailing 0
    appended (padding for EOS).
    """
    base_item = super().__getitem__(index)

    def load_with_eos_pad(path):
        # Load a 1-D feature track and append a single 0 for EOS.
        values = get_features_or_waveform(path)
        return torch.from_numpy(np.concatenate((values, [0]))).float()

    duration = None
    if self.durations is not None:
        duration = torch.tensor(
            self.durations[index] + [0], dtype=torch.long  # pad 0 for EOS
        )

    pitch = None
    if self.pitches is not None:
        pitch = load_with_eos_pad(self.pitches[index])

    energy = None
    if self.energies is not None:
        energy = load_with_eos_pad(self.energies[index])

    return TextToSpeechDatasetItem(
        index=index,
        source=base_item.source,
        target=base_item.target,
        speaker_id=base_item.speaker_id,
        duration=duration,
        pitch=pitch,
        energy=energy,
    )
@classmethod
def _from_list(
    cls,
    split_name: str,
    is_train_split,
    samples: List[Dict],
    cfg: S2TDataConfig,
    tgt_dict,
    pre_tokenizer,
    bpe_tokenizer,
    n_frames_per_step,
    speaker_to_id,
    multitask=None,
) -> TextToSpeechDataset:
    """Build a ``TextToSpeechDataset`` from manifest rows.

    ``id``, ``audio``, ``n_frames`` and ``tgt_text`` are read with plain
    indexing, so a missing column raises ``KeyError``; the other columns fall
    back to the class defaults. ``duration`` / ``pitch`` / ``energy`` are
    optional: a feature is passed on only if every row provides it, otherwise
    the whole feature is ``None``. ``multitask`` is accepted but not used.
    """
    audio_root = Path(cfg.audio_root)

    def all_or_nothing(values):
        # A feature is only usable when no row is missing it.
        return None if any(v is None for v in values) else values

    def under_audio_root(rel_path):
        return None if rel_path is None else (audio_root / rel_path).as_posix()

    def parse_durations(raw):
        return None if raw is None else [int(tok) for tok in raw.split(" ")]

    ids = [row[cls.KEY_ID] for row in samples]
    audio_paths = [(audio_root / row[cls.KEY_AUDIO]).as_posix() for row in samples]
    n_frames = [int(row[cls.KEY_N_FRAMES]) for row in samples]
    tgt_texts = [row[cls.KEY_TGT_TEXT] for row in samples]
    src_texts = [row.get(cls.KEY_SRC_TEXT, cls.DEFAULT_SRC_TEXT) for row in samples]
    speakers = [row.get(cls.KEY_SPEAKER, cls.DEFAULT_SPEAKER) for row in samples]
    src_langs = [row.get(cls.KEY_SRC_LANG, cls.DEFAULT_LANG) for row in samples]
    tgt_langs = [row.get(cls.KEY_TGT_LANG, cls.DEFAULT_LANG) for row in samples]

    durations = all_or_nothing(
        [parse_durations(row.get(cls.KEY_DURATION, None)) for row in samples]
    )
    pitches = all_or_nothing(
        [under_audio_root(row.get(cls.KEY_PITCH, None)) for row in samples]
    )
    energies = all_or_nothing(
        [under_audio_root(row.get(cls.KEY_ENERGY, None)) for row in samples]
    )

    return TextToSpeechDataset(
        split_name,
        is_train_split,
        cfg,
        audio_paths,
        n_frames,
        src_texts=src_texts,
        tgt_texts=tgt_texts,
        speakers=speakers,
        src_langs=src_langs,
        tgt_langs=tgt_langs,
        ids=ids,
        tgt_dict=tgt_dict,
        pre_tokenizer=pre_tokenizer,
        bpe_tokenizer=bpe_tokenizer,
        n_frames_per_step=n_frames_per_step,
        speaker_to_id=speaker_to_id,
        durations=durations,
        pitches=pitches,
        energies=energies,
    )
def __repr__(self):
    """Describe the transform as ``Class(n_samples=…, snr=min-maxdB, rate=…)``."""
    class_name = self.__class__.__name__
    snr_range = f"{self.snr_min}-{self.snr_max}dB"
    return f"{class_name}(n_samples={self.n_samples}, snr={snr_range}, rate={self.rate})"
def _mix(self, source, noise, snr):
    """Add ``noise`` to ``source`` at the requested signal-to-noise ratio.

    Power is the mean of the squared samples. The noise is scaled by
    ``sqrt(P_source / (10 ** (snr / 10) * P_noise))``; noise with zero power
    is not scaled in (scale 0), which also avoids dividing by zero.

    Args:
        source: array holding the clean signal.
        noise: array holding the noise signal.
        snr: target signal-to-noise ratio in dB.

    Returns:
        ``source + scale * noise``.
    """
    noise_power = np.mean(noise**2)
    if not noise_power:
        scale = 0
    else:
        source_power = np.mean(source**2)
        scale = np.sqrt(source_power / (np.power(10, snr / 10) * noise_power))
    return 1 * source + scale * noise
def _get_noise(self, goal_shape, always_2d=False, use_sample_rate=None):
    """Build a noise track made of short bursts at random positions.

    The number of bursts is ``round(noise_rate * length / use_sample_rate)``
    where ``length`` is the last dimension of ``goal_shape``. Each burst gets
    a start drawn with ``rand_uniform`` and a duration in seconds drawn from
    a normal distribution (negative draws are clamped to 0). A burst that
    would reach or pass the end of the track is dropped.

    Args:
        goal_shape: shape of the array to return.
        always_2d: forwarded to ``pick_sample``.
        use_sample_rate: sample rate used to convert seconds to samples.

    Returns:
        Array of shape ``goal_shape`` with the accepted bursts summed in and
        zeros elsewhere.
    """
    time_axis = len(goal_shape) - 1
    two_dim = len(goal_shape) == 2
    mixed = np.zeros(goal_shape)

    burst_count = round(self.noise_rate * goal_shape[time_axis] / use_sample_rate)
    # All start offsets are drawn before any burst length is drawn.
    starts = [
        round(rand_uniform(0, goal_shape[time_axis])) for _ in range(burst_count)
    ]

    for begin in starts:
        burst_shape = list(goal_shape)
        seconds = np.random.normal(self.noise_len_mean, self.noise_len_std)
        burst_shape[time_axis] = round(max(0, seconds) * use_sample_rate)
        finish = begin + burst_shape[time_axis]
        if finish >= goal_shape[time_axis]:
            continue

        burst = self.pick_sample(burst_shape, always_2d, use_sample_rate)
        # Select the same time window for both the 1-D and the 2-D layout.
        window = (slice(None), slice(begin, finish)) if two_dim else slice(begin, finish)
        mixed[window] = mixed[window] + burst

    return mixed
def backtranslate_samples(samples, collate_fn, generate_fn, cuda=True):
    """Backtranslate a list of samples.

    Given an input (*samples*) of the form:

        [{'id': 1, 'source': 'hallo welt'}]

    this will return:

        [{'id': 1, 'source': 'hello world', 'target': 'hallo welt'}]

    Args:
        samples (List[dict]): samples to backtranslate. Individual samples are
            expected to have a 'source' key, which will become the 'target'
            after backtranslation.
        collate_fn (callable): function to collate samples into a mini-batch
        generate_fn (callable): function to generate backtranslations
        cuda (bool): use GPU for generation (default: ``True``)

    Returns:
        List[dict]: an updated list of samples with a backtranslated source
    """
    batch = collate_fn(samples)
    model_input = utils.move_to_cuda(batch) if cuda else batch
    hypotheses = generate_fn(model_input)

    original_source = {sample["id"]: sample["source"] for sample in samples}

    # Pair every id in the collated batch with its best (first) hypothesis:
    # the generated tokens become the new source and the original source
    # becomes the target.
    pairs = []
    for sample_id, hypos in zip(batch["id"], hypotheses):
        key = sample_id.item()
        pairs.append(
            {
                "id": key,
                "target": original_source[key],
                "source": hypos[0]["tokens"].cpu(),
            }
        )
    return pairs
src_dict (~fairseq.data.Dictionary): the dictionary of backtranslated sentences. tgt_dict (~fairseq.data.Dictionary, optional): the dictionary of sentences to be backtranslated. backtranslation_fn (callable, optional): function to call to generate backtranslations. This is typically the `generate` method of a :class:`~fairseq.sequence_generator.SequenceGenerator` object. Pass in None when it is not available at initialization time, and use set_backtranslation_fn function to set it when available. output_collater (callable, optional): function to call on the backtranslated samples to create the final batch (default: ``tgt_dataset.collater``). cuda: use GPU for generation """ def __init__( self, tgt_dataset, src_dict, tgt_dict=None, backtranslation_fn=None, output_collater=None, cuda=True, **kwargs ): self.tgt_dataset = tgt_dataset self.backtranslation_fn = backtranslation_fn self.output_collater = ( output_collater if output_collater is not None else tgt_dataset.collater ) self.cuda = cuda if torch.cuda.is_available() else False self.src_dict = src_dict self.tgt_dict = tgt_dict def __getitem__(self, index): """ Returns a single sample from *tgt_dataset*. Note that backtranslation is not applied in this step; use :func:`collater` instead to backtranslate a batch of samples. """ return self.tgt_dataset[index] def __len__(self): return len(self.tgt_dataset) def set_backtranslation_fn(self, backtranslation_fn): self.backtranslation_fn = backtranslation_fn def collater(self, samples): """Merge and backtranslate a list of samples to form a mini-batch. Using the samples from *tgt_dataset*, load a collated target sample to feed to the backtranslation model. Then take the backtranslation with the best score as the source and the original input as the target. Note: we expect *tgt_dataset* to provide a function `collater()` that will collate samples into the format expected by *backtranslation_fn*. 
After backtranslation, we will feed the new list of samples (i.e., the `(backtranslated source, original source)` pairs) to *output_collater* and return the result. Args: samples (List[dict]): samples to backtranslate and collate Returns: dict: a mini-batch with keys coming from *output_collater* """ if samples[0].get("is_dummy", False): return samples samples = backtranslate_samples( samples=samples, collate_fn=self.tgt_dataset.collater, generate_fn=(lambda net_input: self.backtranslation_fn(net_input)), cuda=self.cuda, ) return self.output_collater(samples) def num_tokens(self, index): """Just use the tgt dataset num_tokens""" return self.tgt_dataset.num_tokens(index) def ordered_indices(self): """Just use the tgt dataset ordered_indices""" return self.tgt_dataset.ordered_indices() def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``. Note: we use *tgt_dataset* to approximate the length of the source sentence, since we do not know the actual length until after backtranslation. """ tgt_size = self.tgt_dataset.size(index)[0] return (tgt_size, tgt_size) @property def supports_prefetch(self): return getattr(self.tgt_dataset, "supports_prefetch", False) def prefetch(self, indices): return self.tgt_dataset.prefetch(indices) ================================================ FILE: fairseq/data/base_wrapper_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from torch.utils.data.dataloader import default_collate from . 
class BaseWrapperDataset(FairseqDataset):
    """Thin wrapper that forwards the dataset interface to ``self.dataset``.

    Subclasses override only the methods whose behaviour they change.
    """

    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def __getitem__(self, index):
        return self.dataset[index]

    def __len__(self):
        return len(self.dataset)

    def collater(self, samples):
        """Collate with the wrapped dataset's collater, else torch's default."""
        if not hasattr(self.dataset, "collater"):
            return default_collate(samples)
        return self.dataset.collater(samples)

    @property
    def sizes(self):
        return self.dataset.sizes

    def num_tokens(self, index):
        return self.dataset.num_tokens(index)

    def size(self, index):
        return self.dataset.size(index)

    def ordered_indices(self):
        return self.dataset.ordered_indices()

    @property
    def supports_prefetch(self):
        # Wrapped datasets without the attribute are treated as non-prefetching.
        return getattr(self.dataset, "supports_prefetch", False)

    def attr(self, attr: str, index: int):
        return self.dataset.attr(attr, index)

    def prefetch(self, indices):
        self.dataset.prefetch(indices)

    def get_batch_shapes(self):
        return self.dataset.get_batch_shapes()

    def batch_by_size(
        self,
        indices,
        max_tokens=None,
        max_sentences=None,
        required_batch_size_multiple=1,
    ):
        """Forward batching to the wrapped dataset with the same options."""
        batching_options = {
            "max_tokens": max_tokens,
            "max_sentences": max_sentences,
            "required_batch_size_multiple": required_batch_size_multiple,
        }
        return self.dataset.batch_by_size(indices, **batching_options)

    def filter_indices_by_size(self, indices, max_sizes):
        return self.dataset.filter_indices_by_size(indices, max_sizes)

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        return self.dataset.can_reuse_epoch_itr_across_epochs

    def set_epoch(self, epoch):
        """Propagate the epoch to the base class and, if supported, the wrapped dataset."""
        super().set_epoch(epoch)
        inner = self.dataset
        if hasattr(inner, "set_epoch"):
            inner.set_epoch(epoch)
class BucketPadLengthDataset(BaseWrapperDataset):
    """
    Bucket and pad item lengths to the nearest bucket size. This can be used to
    reduce the number of unique batch shapes, which is important on TPUs since
    each new batch shape requires a recompilation.

    Args:
        dataset (FairseqDataset): dataset to bucket
        sizes (List[int]): all item sizes
        num_buckets (int): number of buckets to create
        pad_idx (int): padding symbol
        left_pad (bool): if True, pad on the left; otherwise right pad
        tensor_key (str, optional): when set, each item is indexed with this
            key to obtain the tensor to pad, and the padded tensor is written
            back under the same key; when ``None`` the item itself is padded.
    """

    def __init__(
        self,
        dataset,
        sizes,
        num_buckets,
        pad_idx,
        left_pad,
        tensor_key=None,
    ):
        super().__init__(dataset)
        self.pad_idx = pad_idx
        self.left_pad = left_pad

        assert num_buckets > 0
        self.buckets = get_buckets(sizes, num_buckets)
        self._bucketed_sizes = get_bucketed_sizes(sizes, self.buckets)
        self._tensor_key = tensor_key

    def _set_tensor(self, item, val):
        """Return *val* directly, or store it in *item* under ``tensor_key``."""
        if self._tensor_key is None:
            return val
        item[self._tensor_key] = val
        return item

    def _get_tensor(self, item):
        """Return the tensor to pad: *item* itself or ``item[tensor_key]``."""
        if self._tensor_key is None:
            return item
        return item[self._tensor_key]

    def _pad(self, tensor, bucket_size, dim=-1):
        """Pad *tensor* along *dim* with ``pad_idx`` up to *bucket_size*.

        ``F.pad`` consumes ``(left, right)`` pairs starting from the LAST
        dimension, so zero pairs are emitted for every dimension after *dim*.
        For the default ``dim=-1`` the padding spec is just ``(left, right)``.
        """
        num_pad = bucket_size - tensor.size(dim)
        trailing_dims = (tensor.dim() - 1 - dim) if dim >= 0 else (-dim - 1)
        pad_spec = [0, 0] * trailing_dims
        if self.left_pad:
            pad_spec += [num_pad, 0]
        else:
            pad_spec += [0, num_pad]
        return F.pad(tensor, pad_spec, value=self.pad_idx)

    def __getitem__(self, index):
        item = self.dataset[index]
        bucket_size = self._bucketed_sizes[index]
        tensor = self._get_tensor(item)
        padded = self._pad(tensor, bucket_size)
        return self._set_tensor(item, padded)

    @property
    def sizes(self):
        return self._bucketed_sizes

    def num_tokens(self, index):
        return self._bucketed_sizes[index]

    def size(self, index):
        return self._bucketed_sizes[index]
F0_FRAME_SPACE = 0.005  # sec

logger = logging.getLogger(__name__)


class ExpressiveCodeDataConfig(object):
    """Read-only view over a JSON configuration file for expressive code data."""

    def __init__(self, json_path):
        with open(json_path, "r") as f:
            self.config = json.load(f)
        self._manifests = self.config["manifests"]

    @property
    def manifests(self):
        return self._manifests

    @property
    def n_units(self):
        return self.config["n_units"]

    @property
    def sampling_rate(self):
        return self.config["sampling_rate"]

    @property
    def code_hop_size(self):
        return self.config["code_hop_size"]

    @property
    def f0_stats(self):
        """pre-computed f0 statistics path"""
        return self.config.get("f0_stats", None)

    @property
    def f0_vq_type(self):
        """naive or precomp"""
        return self.config["f0_vq_type"]

    @property
    def f0_vq_name(self):
        return self.config["f0_vq_name"]

    def get_f0_vq_naive_quantizer(self, log, norm_mean, norm_std):
        """Look up the naive quantizer entry matching the f0 preprocessing flags."""
        key = "log" if log else "linear"
        if norm_mean and norm_std:
            key += "_mean_std_norm"
        elif norm_mean:
            key += "_mean_norm"
        else:
            key += "_none_norm"
        return self.config["f0_vq_naive_quantizer"][key]

    @property
    def f0_vq_n_units(self):
        return self.config["f0_vq_n_units"]

    @property
    def multispkr(self):
        """how to parse speaker label from audio path"""
        return self.config.get("multispkr", None)


def get_f0(audio, rate=16000):
    """Extract an f0 track from a 1-D waveform with pYAAPT.

    Raises:
        ImportError: if ``amfm_decompy`` or ``librosa`` is not installed.
    """
    try:
        import amfm_decompy.basic_tools as basic
        import amfm_decompy.pYAAPT as pYAAPT
        from librosa.util import normalize
    except ImportError as e:
        # Raising a plain string is itself a TypeError in Python 3.
        raise ImportError(
            "Please install amfm_decompy (`pip install AMFM-decompy`) and librosa (`pip install librosa`)."
        ) from e

    assert audio.ndim == 1
    frame_length = 20.0  # ms
    to_pad = int(frame_length / 1000 * rate) // 2

    audio = normalize(audio) * 0.95
    audio = np.pad(audio, (to_pad, to_pad), "constant", constant_values=0)
    audio = basic.SignalObj(audio, rate)
    pitch = pYAAPT.yaapt(
        audio,
        frame_length=frame_length,
        frame_space=F0_FRAME_SPACE * 1000,
        nccf_thresh1=0.25,
        tda_frame_length=25.0,
    )
    f0 = pitch.samp_values
    return f0


def interpolate_f0(f0):
    """Linearly interpolate across zero-valued frames of *f0*.

    Frames outside the first/last non-zero frame are filled with 0. The input
    is returned as-is when it has fewer than two non-zero frames.

    Raises:
        ImportError: if ``scipy`` is not installed.
    """
    try:
        from scipy.interpolate import interp1d
    except ImportError as e:
        raise ImportError("Please install scipy (`pip install scipy`)") from e

    orig_t = np.arange(f0.shape[0])
    f0_interp = f0[:]
    ii = f0_interp != 0
    if ii.sum() > 1:
        f0_interp = interp1d(
            orig_t[ii], f0_interp[ii], bounds_error=False, kind="linear", fill_value=0
        )(orig_t)
        f0_interp = torch.Tensor(f0_interp).type_as(f0).to(f0.device)
    return f0_interp


def naive_quantize(x, edges):
    """Return, for each element of *x*, the number of *edges* it exceeds."""
    bin_idx = (x.view(-1, 1) > edges.view(1, -1)).long().sum(dim=1)
    return bin_idx


def load_wav(full_path):
    """Read an audio file with ``soundfile``; returns ``(data, sampling_rate)``.

    Raises:
        ImportError: if ``soundfile`` is not installed.
    """
    try:
        import soundfile as sf
    except ImportError as e:
        raise ImportError("Please install soundfile (`pip install SoundFile`)") from e
    data, sampling_rate = sf.read(full_path)
    return data, sampling_rate


def parse_code(code_str, dictionary, append_eos):
    """Collapse consecutive repeated codes and return ``(code, duration)``.

    *duration* holds the run length of each collapsed code; a trailing 0 is
    appended for the eos symbol when *append_eos* is set.
    """
    code, duration = torch.unique_consecutive(
        torch.ShortTensor(list(map(int, code_str.split()))), return_counts=True
    )
    code = " ".join(map(str, code.tolist()))
    code = dictionary.encode_line(code, append_eos).short()

    if append_eos:
        duration = torch.cat((duration, duration.new_zeros((1,))), dim=0)  # eos
    duration = duration.short()
    return code, duration


def parse_manifest(manifest, dictionary):
    """Parse a manifest with one dict literal per line.

    Returns:
        tuple: ``(audio_files, codes, durations, speakers)`` lists.
    """
    audio_files = []
    codes = []
    durations = []
    speakers = []

    with open(manifest) as info:
        for line in info:
            # SECURITY NOTE(review): eval() executes arbitrary expressions from
            # the manifest file. Only use with trusted manifests; consider
            # ast.literal_eval if every line is a plain literal.
            sample = eval(line.strip())
            if "cpc_km100" in sample:
                k = "cpc_km100"
            elif "hubert_km100" in sample:
                k = "hubert_km100"
            elif "phone" in sample:
                k = "phone"
            else:
                # Explicit raise so the check survives ``python -O``.
                raise AssertionError("unknown format")
            code = sample[k]
            code, duration = parse_code(code, dictionary, append_eos=True)

            codes.append(code)
            durations.append(duration)
            audio_files.append(sample["audio"])
            speakers.append(sample.get("speaker", None))

    return audio_files, codes, durations, speakers


def parse_speaker(path, method):
    """Derive a speaker label from an audio *path* according to *method*.

    *method* is one of ``"parent_name"``, ``"parent_parent_name"``, ``"_"``
    (prefix of the file name before the first underscore), ``"single"``
    (constant ``"A"``), or a callable applied to the path.

    Raises:
        NotImplementedError: for any other *method*.
    """
    if isinstance(path, str):
        path = Path(path)

    if method == "parent_name":
        return path.parent.name
    elif method == "parent_parent_name":
        return path.parent.parent.name
    elif method == "_":
        return path.name.split("_")[0]
    elif method == "single":
        return "A"
    elif callable(method):
        return method(path)
    else:
        raise NotImplementedError()


def get_f0_by_filename(filename, tgt_sampling_rate):
    """Load *filename* and return its un-interpolated f0 as a float32 tensor.

    Raises:
        ValueError: if the file's sampling rate differs from *tgt_sampling_rate*.
    """
    audio, sampling_rate = load_wav(filename)
    if sampling_rate != tgt_sampling_rate:
        raise ValueError(
            "{} SR doesn't match target {} SR".format(sampling_rate, tgt_sampling_rate)
        )

    # compute un-interpolated f0, and use Ann's interp in __getitem__ if set
    f0 = get_f0(audio, rate=tgt_sampling_rate)
    f0 = torch.from_numpy(f0.astype(np.float32))
    return f0


def align_f0_to_durations(f0, durations, f0_code_ratio, tol=1):
    """Average the non-zero f0 frames that fall under each code segment.

    *f0* is first trimmed, or extended by repeating its last value, so its
    length equals ``int(f0_code_ratio * durations.sum())``; a mismatch larger
    than *tol* frames fails an assertion. Segments with no non-zero frame get 0.
    """
    code_len = durations.sum()
    targ_len = int(f0_code_ratio * code_len)
    diff = f0.size(0) - targ_len
    assert abs(diff) <= tol, (
        f"Cannot subsample F0: |{f0.size(0)} - {f0_code_ratio}*{code_len}|"
        f" > {tol} (dur=\n{durations})"
    )
    if diff > 0:
        f0 = f0[:targ_len]
    elif diff < 0:
        f0 = torch.cat((f0, f0.new_full((-diff,), f0[-1])), 0)

    f0_offset = 0.0
    seg_f0s = []
    for dur in durations:
        f0_dur = dur.item() * f0_code_ratio
        seg_f0 = f0[int(f0_offset) : int(f0_offset + f0_dur)]
        seg_f0 = seg_f0[seg_f0 != 0]
        if len(seg_f0) == 0:
            seg_f0 = torch.tensor(0).type(seg_f0.type())
        else:
            seg_f0 = seg_f0.mean()
        seg_f0s.append(seg_f0)
        f0_offset += f0_dur

    assert int(f0_offset) == f0.size(0), f"{f0_offset} {f0.size()} {durations.sum()}"
    return torch.tensor(seg_f0s)


class Paddings(object):
    """Padding values for the code, duration and f0 streams."""

    def __init__(self, code_val, dur_val=0, f0_val=-2.0):
        self.code = code_val
        self.dur = dur_val
        self.f0 = f0_val


class Shifts(object):
    """Delay the duration and f0 streams relative to the code stream.

    *shifts_str* is ``"<dur_shift>,<f0_shift>"`` with non-negative integers.
    Every stream is padded to the same final length (original length plus the
    larger of the two shifts).
    """

    def __init__(self, shifts_str, pads):
        self._shifts = list(map(int, shifts_str.split(",")))
        assert len(self._shifts) == 2, self._shifts
        assert all(s >= 0 for s in self._shifts)
        self.extra_length = max(s for s in self._shifts)
        self.pads = pads

    @property
    def dur(self):
        return self._shifts[0]

    @property
    def f0(self):
        return self._shifts[1]

    @staticmethod
    def shift_one(seq, left_pad_num, right_pad_num, pad):
        """Pad a 1-D *seq* on both sides; the returned mask is True on padding."""
        assert seq.ndim == 1
        bos = seq.new_full((left_pad_num,), pad)
        eos = seq.new_full((right_pad_num,), pad)
        seq = torch.cat([bos, seq, eos])
        mask = torch.ones_like(seq).bool()
        mask[left_pad_num : len(seq) - right_pad_num] = 0
        return seq, mask

    def __call__(self, code, dur, f0):
        """Return ``(code, code_mask, dur, dur_mask, f0, f0_mask)`` after shifting."""
        if self.extra_length == 0:
            code_mask = torch.zeros_like(code).bool()
            dur_mask = torch.zeros_like(dur).bool()
            f0_mask = torch.zeros_like(f0).bool()
            return code, code_mask, dur, dur_mask, f0, f0_mask

        code, code_mask = self.shift_one(code, 0, self.extra_length, self.pads.code)
        dur, dur_mask = self.shift_one(
            dur, self.dur, self.extra_length - self.dur, self.pads.dur
        )
        f0, f0_mask = self.shift_one(
            f0, self.f0, self.extra_length - self.f0, self.pads.f0
        )
        return code, code_mask, dur, dur_mask, f0, f0_mask


class CodeDataset(FairseqDataset):
    """Dataset of unit codes with aligned duration and f0 streams.

    Data is read from files that share the *manifest* prefix:
    ``.leng.txt``, ``.path.txt``, ``.code.npy``, ``.dur.npy``, an f0 ``.npy``
    file, and optionally ``.f0_stat.pt`` and ``.speaker.txt``.
    """

    def __init__(
        self,
        manifest,
        dictionary,
        dur_dictionary,
        f0_dictionary,
        config,
        discrete_dur,
        discrete_f0,
        log_f0,
        normalize_f0_mean,
        normalize_f0_std,
        interpolate_f0,
        return_filename=False,
        strip_filename=True,
        shifts="0,0",
        return_continuous_f0=False,
    ):
        # NOTE: this reseeds the process-wide ``random`` module.
        random.seed(1234)
        self.dictionary = dictionary
        self.dur_dictionary = dur_dictionary
        self.f0_dictionary = f0_dictionary
        self.config = config

        # duration config
        self.discrete_dur = discrete_dur

        # pitch config
        self.discrete_f0 = discrete_f0
        self.log_f0 = log_f0
        self.normalize_f0_mean = normalize_f0_mean
        self.normalize_f0_std = normalize_f0_std
        self.interpolate_f0 = interpolate_f0

        self.return_filename = return_filename
        self.strip_filename = strip_filename
        self.f0_code_ratio = config.code_hop_size / (
            config.sampling_rate * F0_FRAME_SPACE
        )

        # use lazy loading to avoid sharing file handlers across workers
        self.manifest = manifest
        self._codes = None
        self._durs = None
        self._f0s = None
        with open(f"{manifest}.leng.txt", "r") as f:
            lengs = [int(line.rstrip()) for line in f]
            edges = np.cumsum([0] + lengs)
            self.starts, self.ends = edges[:-1], edges[1:]
        with open(f"{manifest}.path.txt", "r") as f:
            self.file_names = [line.rstrip() for line in f]
        logger.info(f"num entries: {len(self.starts)}")

        # NOTE: ``self.f0_stats`` stays undefined when neither source exists.
        if os.path.exists(f"{manifest}.f0_stat.pt"):
            self.f0_stats = torch.load(f"{manifest}.f0_stat.pt")
        elif config.f0_stats:
            self.f0_stats = torch.load(config.f0_stats)

        self.multispkr = config.multispkr
        if config.multispkr:
            with open(f"{manifest}.speaker.txt", "r") as f:
                self.spkrs = [line.rstrip() for line in f]
            self.id_to_spkr = sorted(self.spkrs)
            self.spkr_to_id = {k: v for v, k in enumerate(self.id_to_spkr)}

        self.pads = Paddings(
            dictionary.pad(),
            0,  # use 0 for duration padding
            f0_dictionary.pad() if discrete_f0 else -5.0,
        )
        self.shifts = Shifts(shifts, pads=self.pads)
        self.return_continuous_f0 = return_continuous_f0

    def get_data_handlers(self):
        """Memory-map the code, duration and f0 arrays (done lazily per worker)."""
        # Use the module logger, consistent with the rest of this file.
        logger.info(f"loading data for {self.manifest}")
        self._codes = np.load(f"{self.manifest}.code.npy", mmap_mode="r")
        self._durs = np.load(f"{self.manifest}.dur.npy", mmap_mode="r")

        if self.discrete_f0:
            if self.config.f0_vq_type == "precomp":
                self._f0s = np.load(
                    f"{self.manifest}.{self.config.f0_vq_name}.npy", mmap_mode="r"
                )
            elif self.config.f0_vq_type == "naive":
                self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")
                quantizers_path = self.config.get_f0_vq_naive_quantizer(
                    self.log_f0, self.normalize_f0_mean, self.normalize_f0_std
                )
                quantizers = torch.load(quantizers_path)
                n_units = self.config.f0_vq_n_units
                self._f0_quantizer = torch.from_numpy(quantizers[n_units])
            else:
                raise ValueError(f"f0_vq_type {self.config.f0_vq_type} not supported")
        else:
            self._f0s = np.load(f"{self.manifest}.f0.npy", mmap_mode="r")

    def preprocess_f0(self, f0, stats):
        """
        1. interpolate
        2. log transform (keep unvoiced frame 0)
        3. optional mean / std normalisation of voiced frames using *stats*
        """
        # TODO: change this to be dependent on config for naive quantizer
        f0 = f0.clone()
        if self.interpolate_f0:
            f0 = interpolate_f0(f0)

        mask = f0 != 0  # only process voiced frames
        if self.log_f0:
            f0[mask] = f0[mask].log()
        if self.normalize_f0_mean:
            mean = stats["logf0_mean"] if self.log_f0 else stats["f0_mean"]
            f0[mask] = f0[mask] - mean
        if self.normalize_f0_std:
            std = stats["logf0_std"] if self.log_f0 else stats["f0_std"]
            f0[mask] = f0[mask] / std
        return f0

    def _f0_stats_for(self, index):
        """Return per-speaker stats in multi-speaker mode, else the global stats."""
        if self.multispkr:
            return self.f0_stats[self.spkrs[index]]
        return self.f0_stats

    def _get_raw_item(self, index):
        start, end = self.starts[index], self.ends[index]
        if self._codes is None:
            self.get_data_handlers()
        code = torch.from_numpy(np.array(self._codes[start:end])).long()
        dur = torch.from_numpy(np.array(self._durs[start:end]))
        f0 = torch.from_numpy(np.array(self._f0s[start:end]))
        return code, dur, f0

    def __getitem__(self, index):
        code, dur, f0 = self._get_raw_item(index)
        code = torch.cat([code.new([self.dictionary.bos()]), code])

        # use 0 for eos and bos
        dur = torch.cat([dur.new([0]), dur])
        if self.discrete_dur:
            dur = self.dur_dictionary.encode_line(
                " ".join(map(str, dur.tolist())), append_eos=False
            ).long()
        else:
            dur = dur.float()

        # TODO: find a more elegant approach
        raw_f0 = None
        if self.discrete_f0:
            if self.config.f0_vq_type == "precomp":
                f0 = self.f0_dictionary.encode_line(
                    " ".join(map(str, f0.tolist())), append_eos=False
                ).long()
            else:
                f0 = f0.float()
                # ``self.spkrs`` only exists in multi-speaker mode, so pick
                # the stats the same way the continuous branch does.
                f0 = self.preprocess_f0(f0, self._f0_stats_for(index))
                if self.return_continuous_f0:
                    raw_f0 = f0
                    raw_f0 = torch.cat([raw_f0.new([self.f0_dictionary.bos()]), raw_f0])
                f0 = naive_quantize(f0, self._f0_quantizer)
            f0 = torch.cat([f0.new([self.f0_dictionary.bos()]), f0])
        else:
            f0 = f0.float()
            f0 = self.preprocess_f0(f0, self._f0_stats_for(index))
            f0 = torch.cat([f0.new([0]), f0])

        if raw_f0 is not None:
            *_, raw_f0, raw_f0_mask = self.shifts(code, dur, raw_f0)
        else:
            raw_f0_mask = None

        code, code_mask, dur, dur_mask, f0, f0_mask = self.shifts(code, dur, f0)
        if raw_f0_mask is not None:
            assert (raw_f0_mask == f0_mask).all()

        # is a padded frame if either input or output is padded
        feats = {
            "source": code[:-1],
            "target": code[1:],
            "mask": code_mask[1:].logical_or(code_mask[:-1]),
            "dur_source": dur[:-1],
            "dur_target": dur[1:],
            "dur_mask": dur_mask[1:].logical_or(dur_mask[:-1]),
            "f0_source": f0[:-1],
            "f0_target": f0[1:],
            "f0_mask": f0_mask[1:].logical_or(f0_mask[:-1]),
        }

        if raw_f0 is not None:
            feats["raw_f0"] = raw_f0[1:]

        if self.return_filename:
            fname = self.file_names[index]
            feats["filename"] = (
                fname if not self.strip_filename else Path(fname).with_suffix("").name
            )
        return feats

    def __len__(self):
        return len(self.starts)

    def size(self, index):
        return self.ends[index] - self.starts[index] + self.shifts.extra_length

    def num_tokens(self, index):
        return self.size(index)

    def collater(self, samples):
        """Pad and stack a list of ``__getitem__`` outputs into a batch dict."""
        pad_idx, eos_idx = self.dictionary.pad(), self.dictionary.eos()
        if len(samples) == 0:
            return {}

        src_tokens = data_utils.collate_tokens(
            [s["source"] for s in samples], pad_idx, eos_idx, left_pad=False
        )

        tgt_tokens = data_utils.collate_tokens(
            [s["target"] for s in samples],
            pad_idx=pad_idx,
            eos_idx=pad_idx,  # appending padding, eos is there already
            left_pad=False,
        )

        src_durs, tgt_durs = [
            data_utils.collate_tokens(
                [s[k] for s in samples],
                pad_idx=self.pads.dur,
                eos_idx=self.pads.dur,
                left_pad=False,
            )
            for k in ["dur_source", "dur_target"]
        ]

        src_f0s, tgt_f0s = [
            data_utils.collate_tokens(
                [s[k] for s in samples],
                pad_idx=self.pads.f0,
                eos_idx=self.pads.f0,
                left_pad=False,
            )
            for k in ["f0_source", "f0_target"]
        ]

        # Padding positions of the masks are filled with 1 (masked).
        mask, dur_mask, f0_mask = [
            data_utils.collate_tokens(
                [s[k] for s in samples],
                pad_idx=1,
                eos_idx=1,
                left_pad=False,
            )
            for k in ["mask", "dur_mask", "f0_mask"]
        ]

        src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
        n_tokens = sum(len(s["source"]) for s in samples)

        result = {
            "nsentences": len(samples),
            "ntokens": n_tokens,
            "net_input": {
                "src_tokens": src_tokens,
                "src_lengths": src_lengths,
                "dur_src": src_durs,
                "f0_src": src_f0s,
            },
            "target": tgt_tokens,
            "dur_target": tgt_durs,
            "f0_target": tgt_f0s,
            "mask": mask,
            "dur_mask": dur_mask,
            "f0_mask": f0_mask,
        }

        if "filename" in samples[0]:
            result["filename"] = [s["filename"] for s in samples]

        # TODO: remove this hack into the inference dataset
        if "prefix" in samples[0]:
            result["prefix"] = [s["prefix"] for s in samples]

        if "raw_f0" in samples[0]:
            raw_f0s = data_utils.collate_tokens(
                [s["raw_f0"] for s in samples],
                pad_idx=self.pads.f0,
                eos_idx=self.pads.f0,
                left_pad=False,
            )
            result["raw_f0"] = raw_f0s
        return result


class ColorizeDataset(BaseWrapperDataset):
    """Adds 'colors' property to net input that is obtained from the provided color getter for use by models"""

    def __init__(self, dataset, color_getter):
        super().__init__(dataset)
        self.color_getter = color_getter

    def collater(self, samples):
        base_collate = super().collater(samples)
        if len(base_collate) > 0:
            # One color per sample, looked up by the sample's "id".
            base_collate["net_input"]["colors"] = torch.tensor(
                [self.color_getter(self.dataset, s["id"]) for s in samples],
                dtype=torch.long,
            )
        return base_collate
class ConcatDataset(FairseqDataset):
    """Concatenation of several datasets, each optionally up-sampled.

    A dataset with sample ratio ``r`` contributes ``int(r * len(dataset))``
    virtual items; indices beyond its real length wrap around (modulo).
    """

    @staticmethod
    def cumsum(sequence, sample_ratios):
        """Return the running totals of ``int(ratio * len(e))`` over *sequence*."""
        r, s = [], 0
        for e, ratio in zip(sequence, sample_ratios):
            curr_len = int(ratio * len(e))
            r.append(curr_len + s)
            s += curr_len
        return r

    def __init__(self, datasets, sample_ratios=1):
        super(ConcatDataset, self).__init__()
        assert len(datasets) > 0, "datasets should not be an empty iterable"
        self.datasets = list(datasets)
        if isinstance(sample_ratios, int):
            # A single int applies to every dataset.
            sample_ratios = [sample_ratios] * len(self.datasets)
        self.sample_ratios = sample_ratios
        self.cumulative_sizes = self.cumsum(self.datasets, sample_ratios)
        self.real_sizes = [len(d) for d in self.datasets]

    def __len__(self):
        return self.cumulative_sizes[-1]

    def __getitem__(self, idx):
        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
        return self.datasets[dataset_idx][sample_idx]

    def _get_dataset_and_sample_index(self, idx: int):
        """Map a global index to ``(dataset index, index within that dataset)``."""
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, idx)
        if dataset_idx == 0:
            sample_idx = idx
        else:
            sample_idx = idx - self.cumulative_sizes[dataset_idx - 1]
        # Wrap around for up-sampled datasets.
        sample_idx = sample_idx % self.real_sizes[dataset_idx]
        return dataset_idx, sample_idx

    def collater(self, samples, **extra_args):
        # For now only supports datasets with same underlying collater implementations
        if hasattr(self.datasets[0], "collater"):
            return self.datasets[0].collater(samples, **extra_args)
        else:
            return default_collate(samples, **extra_args)

    def size(self, idx: int):
        """
        Return an example's size as a float or tuple.
        """
        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
        return self.datasets[dataset_idx].size(sample_idx)

    def num_tokens(self, index: int):
        return np.max(self.size(index))

    def attr(self, attr: str, index: int):
        """Return attribute *attr* of the dataset that owns *index* (or None)."""
        dataset_idx = bisect.bisect_right(self.cumulative_sizes, index)
        return getattr(self.datasets[dataset_idx], attr, None)

    @property
    def sizes(self):
        """Concatenated size arrays, each tiled by its sample ratio.

        NOTE: the array is rebuilt on every access.
        """
        _dataset_sizes = []
        for ds, sr in zip(self.datasets, self.sample_ratios):
            if isinstance(ds.sizes, np.ndarray):
                _dataset_sizes.append(np.tile(ds.sizes, sr))
            else:
                # Only support underlying dataset with single size array.
                assert isinstance(ds.sizes, list)
                _dataset_sizes.append(np.tile(ds.sizes[0], sr))
        return np.concatenate(_dataset_sizes)

    @property
    def supports_prefetch(self):
        return all(d.supports_prefetch for d in self.datasets)

    def ordered_indices(self):
        """
        Returns indices sorted by length. So less padding is needed.
        """
        # ``self.sizes`` is rebuilt on every access; evaluate it only once.
        sizes = self.sizes
        if isinstance(sizes, np.ndarray) and len(sizes.shape) > 1:
            # special handling for concatenating lang_pair_datasets
            indices = np.arange(len(self))
            tgt_sizes = (
                sizes[:, 1] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else None
            )
            src_sizes = (
                sizes[:, 0] if len(sizes.shape) > 0 and sizes.shape[1] > 1 else sizes
            )
            # sort by target length, then source length
            if tgt_sizes is not None:
                indices = indices[np.argsort(tgt_sizes[indices], kind="mergesort")]
            return indices[np.argsort(src_sizes[indices], kind="mergesort")]
        else:
            return np.argsort(sizes)

    def prefetch(self, indices):
        """Forward each index to the dataset that owns it, in local coordinates."""
        frm = 0
        for to, ds in zip(self.cumulative_sizes, self.datasets):
            real_size = len(ds)
            if getattr(ds, "supports_prefetch", False):
                ds.prefetch([(i - frm) % real_size for i in indices if frm <= i < to])
            frm = to

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        return all(d.can_reuse_epoch_itr_across_epochs for d in self.datasets)

    def set_epoch(self, epoch):
        super().set_epoch(epoch)
        for ds in self.datasets:
            if hasattr(ds, "set_epoch"):
                ds.set_epoch(epoch)
class ConcatSentencesDataset(FairseqDataset):
    """Item-wise concatenation of several equally long datasets.

    Item ``i`` is ``torch.cat`` of item ``i`` from every wrapped dataset.
    """

    def __init__(self, *datasets):
        super().__init__()
        self.datasets = datasets
        lengths_match = all(len(ds) == len(datasets[0]) for ds in datasets)
        assert lengths_match, "datasets must have the same length"

    def __getitem__(self, index):
        pieces = [ds[index] for ds in self.datasets]
        return torch.cat(pieces)

    def __len__(self):
        first = self.datasets[0]
        return len(first)

    def collater(self, samples):
        """Collate with the first dataset's collater."""
        first = self.datasets[0]
        return first.collater(samples)

    @property
    def sizes(self):
        return sum(ds.sizes for ds in self.datasets)

    def num_tokens(self, index):
        return sum(ds.num_tokens(index) for ds in self.datasets)

    def size(self, index):
        return sum(ds.size(index) for ds in self.datasets)

    def ordered_indices(self):
        """Use the ordering of the first dataset."""
        first = self.datasets[0]
        return first.ordered_indices()

    @property
    def supports_prefetch(self):
        # True as soon as one wrapped dataset can prefetch.
        return any(getattr(ds, "supports_prefetch", False) for ds in self.datasets)

    def prefetch(self, indices):
        """Prefetch on every wrapped dataset that supports it."""
        for ds in self.datasets:
            if not getattr(ds, "supports_prefetch", False):
                continue
            ds.prefetch(indices)

    def set_epoch(self, epoch):
        super().set_epoch(epoch)
        for ds in self.datasets:
            if not hasattr(ds, "set_epoch"):
                continue
            ds.set_epoch(epoch)
try: from collections.abc import Iterable except ImportError: from collections import Iterable import contextlib import itertools import logging import re import warnings from typing import Optional, Tuple import math import numpy as np import torch from fairseq.file_io import PathManager from fairseq import utils import os logger = logging.getLogger(__name__) def infer_language_pair(path): """Infer language pair from filename: <split>.<lang1>-<lang2>.(...).idx""" src, dst = None, None for filename in PathManager.ls(path): parts = filename.split(".") if len(parts) >= 3 and len(parts[1].split("-")) == 2: return parts[1].split("-") return src, dst def collate_tokens( values, pad_idx, eos_idx=None, left_pad=False, move_eos_to_beginning=False, pad_to_length=None, pad_to_multiple=1, pad_to_bsz=None, ): """Convert a list of 1d tensors into a padded 2d tensor.""" size = max(v.size(0) for v in values) size = size if pad_to_length is None else max(size, pad_to_length) if pad_to_multiple != 1 and size % pad_to_multiple != 0: size = int(((size - 0.1) // pad_to_multiple + 1) * pad_to_multiple) batch_size = len(values) if pad_to_bsz is None else max(len(values), pad_to_bsz) res = values[0].new(batch_size, size).fill_(pad_idx) def copy_tensor(src, dst): assert dst.numel() == src.numel() if move_eos_to_beginning: if eos_idx is None: # if no eos_idx is specified, then use the last token in src dst[0] = src[-1] else: dst[0] = eos_idx dst[1:] = src[:-1] else: dst.copy_(src) for i, v in enumerate(values): copy_tensor(v, res[i][size - len(v) :] if left_pad else res[i][: len(v)]) return res def load_indexed_dataset( path, dictionary=None, dataset_impl=None, combine=False, default="cached" ): """A helper function for loading indexed datasets. Args: path (str): path to indexed dataset (e.g., 'data-bin/train') dictionary (~fairseq.data.Dictionary): data dictionary dataset_impl (str, optional): which dataset implementation to use. If not provided, it will be inferred automatically. 
For legacy indexed data we use the 'cached' implementation by default. combine (bool, optional): automatically load and combine multiple datasets. For example, if *path* is 'data-bin/train', then we will combine 'data-bin/train', 'data-bin/train1', ... and return a single ConcatDataset instance. """ import fairseq.data.indexed_dataset as indexed_dataset from fairseq.data.concat_dataset import ConcatDataset datasets = [] for k in itertools.count(): path_k = path + (str(k) if k > 0 else "") try: path_k = indexed_dataset.get_indexed_dataset_to_local(path_k) except Exception as e: if "StorageException: [404] Path not found" in str(e): logger.warning(f"path_k: {e} not found") else: raise e dataset_impl_k = dataset_impl if dataset_impl_k is None: dataset_impl_k = indexed_dataset.infer_dataset_impl(path_k) dataset = indexed_dataset.make_dataset( path_k, impl=dataset_impl_k or default, fix_lua_indexing=True, dictionary=dictionary, ) if dataset is None: break logger.info("loaded {:,} examples from: {}".format(len(dataset), path_k)) datasets.append(dataset) if not combine: break if len(datasets) == 0: return None elif len(datasets) == 1: return datasets[0] else: return ConcatDataset(datasets) @contextlib.contextmanager def numpy_seed(seed, *addl_seeds): """Context manager which seeds the NumPy PRNG with the specified seed and restores the state afterward""" if seed is None: yield return if len(addl_seeds) > 0: seed = int(hash((seed, *addl_seeds)) % 1e6) state = np.random.get_state() np.random.seed(seed) try: yield finally: np.random.set_state(state) def collect_filtered(function, iterable, filtered): """ Similar to :func:`filter` but collects filtered elements in ``filtered``. 
Args: function (callable): function that returns ``False`` for elements that should be filtered iterable (iterable): iterable to filter filtered (list): list to store filtered elements """ for el in iterable: if function(el): yield el else: filtered.append(el) def _filter_by_size_dynamic(indices, size_fn, max_positions, raise_exception=False): def compare_leq(a, b): return a <= b if not isinstance(a, tuple) else max(a) <= b def check_size(idx): if isinstance(max_positions, float) or isinstance(max_positions, int): return size_fn(idx) <= max_positions elif isinstance(max_positions, dict): idx_size = size_fn(idx) assert isinstance(idx_size, dict) intersect_keys = set(max_positions.keys()) & set(idx_size.keys()) return all( all( a is None or b is None or a <= b for a, b in zip(idx_size[key], max_positions[key]) ) for key in intersect_keys ) else: # For MultiCorpusSampledDataset, will generalize it later if not isinstance(size_fn(idx), Iterable): return all(size_fn(idx) <= b for b in max_positions) return all( a is None or b is None or a <= b for a, b in zip(size_fn(idx), max_positions) ) ignored = [] itr = collect_filtered(check_size, indices, ignored) indices = np.fromiter(itr, dtype=np.int64, count=-1) return indices, ignored def filter_by_size(indices, dataset, max_positions, raise_exception=False): """ [deprecated] Filter indices based on their size. Use `FairseqDataset::filter_indices_by_size` instead. Args: indices (List[int]): ordered list of dataset indices dataset (FairseqDataset): fairseq dataset instance max_positions (tuple): filter elements larger than this size. Comparisons are done component-wise. raise_exception (bool, optional): if ``True``, raise an exception if any elements are filtered (default: False). """ warnings.warn( "data_utils.filter_by_size is deprecated. 
def filter_paired_dataset_indices_by_size(src_sizes, tgt_sizes, indices, max_sizes):
    """Filter a list of sample indices. Remove those that are longer
        than specified in max_sizes.

    Args:
        src_sizes (np.array): source-side size of every sample in the dataset
        tgt_sizes (np.array, optional): target-side sizes; when ``None`` only
            the source side is checked
        indices (np.array): original array of sample indices
        max_sizes (int or list[int] or tuple[int]): max sample size,
            can be defined separately for src and tgt (then list or tuple).
            ``None`` disables filtering.

    Returns:
        np.array: filtered sample array
        list: list of removed indices
    """
    if max_sizes is None:
        return indices, []
    # isinstance (rather than an exact type() match) so that NumPy scalars
    # such as np.int64 are accepted as a single shared limit as well
    if isinstance(max_sizes, (int, float, np.integer, np.floating)):
        max_src_size = max_tgt_size = max_sizes
    else:
        max_src_size, max_tgt_size = max_sizes

    # compute the "too long" mask once and reuse it for both outputs
    too_long = src_sizes[indices] > max_src_size
    if tgt_sizes is not None:
        too_long = too_long | (tgt_sizes[indices] > max_tgt_size)

    ignored = indices[too_long]
    if len(ignored) > 0:
        indices = indices[~too_long]
    return indices, ignored.tolist()
""" try: from fairseq.data.data_utils_fast import ( batch_by_size_fn, batch_by_size_vec, batch_fixed_shapes_fast, ) except ImportError: raise ImportError( "Please build Cython components with: " "`python setup.py build_ext --inplace`" ) except ValueError: raise ValueError( "Please build (or rebuild) Cython components with `python setup.py build_ext --inplace`." ) # added int() to avoid TypeError: an integer is required max_tokens = int(max_tokens) if max_tokens is not None else -1 max_sentences = max_sentences if max_sentences is not None else -1 bsz_mult = required_batch_size_multiple if not isinstance(indices, np.ndarray): indices = np.fromiter(indices, dtype=np.int64, count=-1) if num_tokens_vec is not None and not isinstance(num_tokens_vec, np.ndarray): num_tokens_vec = np.fromiter(num_tokens_vec, dtype=np.int64, count=-1) if fixed_shapes is None: if num_tokens_vec is None: b = batch_by_size_fn( indices, num_tokens_fn, max_tokens, max_sentences, bsz_mult, ) else: b = batch_by_size_vec( indices, num_tokens_vec, max_tokens, max_sentences, bsz_mult, ) if bsz_mult > 1 and len(b[-1]) % bsz_mult != 0: b = b[:-1] return b else: fixed_shapes = np.array(fixed_shapes, dtype=np.int64) sort_order = np.lexsort( [ fixed_shapes[:, 1].argsort(), # length fixed_shapes[:, 0].argsort(), # bsz ] ) fixed_shapes_sorted = fixed_shapes[sort_order] return batch_fixed_shapes_fast(indices, num_tokens_fn, fixed_shapes_sorted) def post_process(sentence: str, symbol: str): if symbol == "sentencepiece": sentence = sentence.replace(" ", "").replace("\u2581", " ").strip() elif symbol == "wordpiece": sentence = sentence.replace(" ", "").replace("_", " ").strip() elif symbol == "letter": sentence = sentence.replace(" ", "").replace("|", " ").strip() elif symbol == "silence": import re sentence = sentence.replace("<SIL>", "") sentence = re.sub(" +", " ", sentence).strip() elif symbol == "_EOW": sentence = sentence.replace(" ", "").replace("_EOW", " ").strip() elif symbol in {"subword_nmt", "@@ ", 
def compute_mask_indices(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
    require_same_masks: bool = True,
    mask_dropout: float = 0.0,
    add_masks: bool = False,
    seed: Optional[int] = None,
    epoch: Optional[int] = None,
    indices: Optional[torch.Tensor] = None,
    idc_select_ver: int = 1,  # 2 to reproduce mask_tokens_dataset
    num_mask_ver: int = 2,  # 2 to reproduce mask_tokens_dataset
) -> np.ndarray:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: length of each span for "static"; otherwise the
            parameter of the length distribution (see mask_type)
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        mask_other: secondary parameter of the length distribution (see mask_type)
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
        mask_dropout: randomly dropout this percentage of masks in each example
        add_masks: with require_same_masks, top every row up to the largest
            masked count instead of trimming every row to the smallest
        seed, epoch, indices: when all three are given, row ``i`` uses its own
            generator seeded from ``hash((seed, epoch, indices[i]))``;
            otherwise each row uses an unseeded generator
        idc_select_ver: 1 draws span starts from ``[0, sz - min_len)``;
            2 draws them from ``[0, sz)``
        num_mask_ver: 1 rounds the span count with the global ``np.random``
            state; 2 rounds it with the per-row generator

    Returns:
        boolean array of size ``shape``; ``True`` marks a masked position

    Raises:
        ValueError: on an unsupported ``idc_select_ver`` / ``num_mask_ver``,
            when "static" masking yields no span, or when a whole row would
            be masked.
    """

    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    if num_mask_ver == 1:
        all_num_mask = int(
            # add a random number for probabilistic rounding
            mask_prob * all_sz / float(mask_length)
            + np.random.rand()
        )
        all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if seed is not None and epoch is not None and indices is not None:
            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
        else:
            seed_i = None

        rng = np.random.default_rng(seed_i)

        if padding_mask is not None:
            sz = all_sz - padding_mask[i].long().sum().item()
            assert sz >= 0, sz
        else:
            sz = all_sz

        if num_mask_ver == 1:
            if padding_mask is not None:
                num_mask = int(
                    # add a random number for probabilistic rounding
                    mask_prob * sz / float(mask_length)
                    + np.random.rand()
                )
                num_mask = max(min_masks, num_mask)
            else:
                num_mask = all_num_mask
        elif num_mask_ver == 2:
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length)
                + rng.random()
            )
            num_mask = max(min_masks, num_mask)
        else:
            raise ValueError()

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            # np.random.Generator has no randint(); integers() samples the
            # same half-open range [low, high)
            lengths = rng.integers(
                int(mask_other), mask_length * 2 + 1, size=num_mask
            )
        elif mask_type == "normal":
            lengths = rng.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = rng.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            if mask_type == "static":
                raise ValueError("this should never happens")
            else:
                lengths = [min(mask_length, sz - 1)]

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # place one span inside [s, e) and return the leftover
                # sub-ranges that are still large enough to host a span
                span_start = rng.integers(s, e - length)
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                # np.int was removed from NumPy; the builtin int is the
                # same dtype request
                lens = np.fromiter(
                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
                    int,
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    break
                # pick a free range with probability proportional to its size
                probs = lens / l_sum
                c = rng.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            # explicit integer dtype so that an empty result is still a
            # valid index array
            mask_idc = np.asarray(mask_idc, dtype=np.int64)
        else:
            if idc_select_ver == 1:
                min_len = min(lengths)
                if sz - min_len <= num_mask:
                    min_len = sz - num_mask - 1
                mask_idc = rng.choice(sz - min_len, num_mask, replace=False)
            elif idc_select_ver == 2:
                mask_idc = rng.choice(sz, num_mask, replace=False)
            else:
                raise ValueError()

            # expand every span start into the positions it covers
            mask_idc = np.asarray(
                [
                    mask_idc[j] + offset
                    for j in range(len(mask_idc))
                    for offset in range(lengths[j])
                ],
                dtype=np.int64,
            )

        mask_idc = np.unique(mask_idc[mask_idc < sz])
        if len(mask_idc) >= sz:
            raise ValueError(
                (
                    f"the entire sequence is masked. "
                    f"sz={sz}; mask_idc={mask_idc}; "
                    f"index={indices[i] if indices is not None else None}"
                )
            )
        mask_idcs.append(mask_idc)

    target_len = None
    if require_same_masks:
        if add_masks:
            target_len = max([len(m) for m in mask_idcs])
        else:
            target_len = min([len(m) for m in mask_idcs])

    # NOTE: `rng` below is the generator created for the last row
    for i, mask_idc in enumerate(mask_idcs):
        if target_len is not None and len(mask_idc) > target_len:
            mask_idc = rng.choice(mask_idc, target_len, replace=False)

        mask[i, mask_idc] = True

        if target_len is not None and len(mask_idc) < target_len:
            unmasked = np.flatnonzero(~mask[i])
            to_mask = rng.choice(unmasked, target_len - len(mask_idc), replace=False)
            mask[i, to_mask] = True

        if mask_dropout > 0:
            masked = np.flatnonzero(mask[i])
            num_holes = np.rint(len(masked) * mask_dropout).astype(int)
            to_drop = rng.choice(masked, num_holes, replace=False)
            mask[i, to_drop] = False

    return mask
def compute_block_mask_1d(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    mask_prob_adjust: float = 0,
    inverse_mask: bool = False,
    require_same_masks: bool = True,
    expand_adjcent: bool = False,
    mask_dropout: float = 0,
    non_overlapping: bool = False,
) -> torch.Tensor:
    """Sample a random block mask over a 1-D sequence for every batch row.

    Args:
        shape: ``(B, L)`` -- number of rows and sequence length.
        mask_prob: target fraction of masked positions; replaced by
            ``1 - mask_prob`` when *inverse_mask* is set.
        mask_length: width of each masked block.
        mask_prob_adjust: added to *mask_prob* when deciding how many block
            seeds to draw (it is not used for the final target count).
        inverse_mask: if true, the returned mask is ``1 - mask``.
        require_same_masks: if true, every row is adjusted to exactly
            ``int(L * mask_prob)`` masked positions.
        expand_adjcent: with *require_same_masks*, grow rows that are short
            of the target by masking unmasked positions that have a masked
            neighbour before the final adjustment.
        mask_dropout: inflates the number of seeds drawn, and the growth
            target, by a factor ``1 + mask_dropout``.
        non_overlapping: if true, blocks are placed on a grid of stride
            *mask_length*; otherwise blocks are centred on randomly drawn
            positions and may overlap.

    Returns:
        A ``(B, L)`` float tensor of zeros and ones.
    """

    B, L = shape

    if inverse_mask:
        mask_prob = 1 - mask_prob

    if non_overlapping:
        # work on a coarse grid with one cell per block
        sz = math.ceil(L / mask_length)

        inp = torch.zeros((B, 1, sz))
        w = torch.ones((1, 1, mask_length))

        # uniform weights: pick distinct grid cells for every row
        mask_inds = torch.multinomial(
            1 - inp.view(B, -1),
            int(sz * (mask_prob + mask_prob_adjust) * (1 + mask_dropout)),
            replacement=False,
        )
        inp.view(B, -1).scatter_(1, mask_inds, 1)

        # upsample: each selected cell becomes mask_length consecutive ones
        mask = torch.nn.functional.conv_transpose1d(inp, w, stride=mask_length).squeeze(
            1
        )
        # the grid may overshoot L; crop the excess
        if mask.size(-1) > L:
            mask = mask[..., :L]

    else:
        mask = torch.zeros((B, L))
        # block centres, drawn with replacement (duplicates collapse below)
        mask_inds = torch.randint(
            0,
            L,
            size=(
                B,
                int(
                    L
                    * ((mask_prob + mask_prob_adjust) / mask_length)
                    * (1 + mask_dropout)
                ),
            ),
        )

        mask.view(B, -1).scatter_(1, mask_inds, 1)
        centers = mask.nonzero(as_tuple=True)

        # (row indices, column indices) of every position covered by a block
        inds = ([], [])

        offset = mask_length // 2
        for i in range(mask_length):
            k1 = i - offset
            inds[0].append(centers[0])
            inds[1].append(centers[1] + k1)

        i0 = torch.cat(inds[0])
        # blocks reaching past either end are clamped onto the boundary
        i1 = torch.cat(inds[1]).clamp_(min=0, max=L - 1)

        mask[(i0, i1)] = 1

    def get_nbs(b, m, w):
        # 1 where the kernel `w` sees at least one masked position, else 0
        all_nbs = torch.nn.functional.conv1d(m.unsqueeze(1), w, padding="same")
        all_nbs = all_nbs.clamp_max_(1).view(b, -1)
        return all_nbs

    if require_same_masks and expand_adjcent:
        # kernel that looks at the left and right neighbour but not the centre
        w = torch.ones((1, 1, 3))
        w[..., 1] = 0
        all_nbs = get_nbs(B, mask, w)

    mask = mask.view(B, -1)

    if require_same_masks:
        n_masks = mask.sum(dim=-1)
        final_target_len = int(L * (mask_prob))
        target_len = int(final_target_len * (1 + mask_dropout))

        for i in range(len(mask)):
            n = n_masks[i]
            m = mask[i]
            r = 0
            while expand_adjcent and n < target_len:
                if r == 0:
                    nbs = all_nbs[i]
                else:
                    # the row changed in the previous round; recompute
                    nbs = get_nbs(1, m.unsqueeze(0), w).squeeze(0)

                # candidates: unmasked positions with a masked neighbour
                cands = (1 - m + nbs) > 1
                cand_sz = int(cands.sum().item())

                assert cand_sz > 0, f"{nbs} {cand_sz}"

                to_mask = torch.multinomial(
                    cands.float(), min(cand_sz, int(target_len - n)), replacement=False
                )
                m[to_mask] = 1
                assert to_mask.numel() > 0
                n += to_mask.numel()
                r += 1

            if n > final_target_len:
                # too many: unmask a random subset of the masked positions
                to_unmask = torch.multinomial(
                    m, int(n - final_target_len), replacement=False
                )
                m[to_unmask] = 0
            elif n < final_target_len:
                # too few: mask a random subset of the unmasked positions
                to_mask = torch.multinomial(
                    (1 - m), int(final_target_len - n), replacement=False
                )
                m[to_mask] = 1

    if inverse_mask:
        mask = 1 - mask

    return mask
def get_buckets(sizes, num_buckets):
    """Return the distinct upper edges of *num_buckets* equal-mass size buckets.

    The edges are the ``100 * k / num_buckets`` percentiles of *sizes* for
    ``k = 1 .. num_buckets``, each rounded down to an actual element of
    *sizes*; duplicate edges are collapsed, so fewer than *num_buckets*
    values may be returned.

    Args:
        sizes (array-like): sample sizes.
        num_buckets (int): requested number of buckets.

    Returns:
        np.ndarray: sorted, unique bucket upper bounds.
    """
    quantiles = np.linspace(0, 100, num_buckets + 1)
    try:
        # NumPy >= 1.22 spells the option `method`; the old `interpolation`
        # keyword is deprecated there
        edges = np.percentile(sizes, quantiles, method="lower")
    except TypeError:
        # older NumPy releases only know the original keyword
        edges = np.percentile(sizes, quantiles, interpolation="lower")
    # drop the 0th percentile: it is a lower bound, not a bucket edge
    return np.unique(edges[1:])
def compute_mask_indices_for_one(
    sz,
    mask_prob: float,
    mask_length: int,
    seed=None,
    epoch=None,
    index=None,
    min_masks=0,
):
    """
    Compute a span mask for a single sequence of length *sz*.

    Set seed, epoch, index for deterministic masking: when *seed* is truthy
    the generator is seeded from ``hash((seed, epoch, index))``. A falsy
    seed (``None`` or ``0``) leaves the generator unseeded.

    Args:
        sz (int): sequence length.
        mask_prob (float): approximate fraction of positions to mask
            (span overlaps and truncation at the end reduce it).
        mask_length (int): length of every span.
        seed, epoch, index: components of the per-sample seed.
        min_masks (int): lower bound on the number of spans.

    Returns:
        np.ndarray: boolean vector of length *sz*, ``True`` where masked.
    """
    seed = int(hash((seed, epoch, index)) % 1e6) if seed else None
    rng = np.random.default_rng(seed)

    # decide elements to mask
    mask = np.full(sz, False)
    num_mask = int(
        # add a random number for probabilistic rounding
        mask_prob * sz / float(mask_length)
        + rng.random()
    )
    num_mask = max(min_masks, num_mask)

    # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453)
    mask_idc = rng.choice(sz, num_mask, replace=False)
    # extend every start position to a span of mask_length positions
    mask_idc = np.concatenate([mask_idc + i for i in range(mask_length)])
    # spans starting near the end are truncated at the sequence boundary
    mask_idc = mask_idc[mask_idc < len(mask)]
    try:
        mask[mask_idc] = True
    except Exception:
        # report the offending indices, then let the error propagate;
        # `Exception` (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not intercepted here
        print(f"Assigning mask indexes {mask_idc} to mask {mask} failed!")
        raise

    return mask
# TODO: a copy of the original compute_mask_indices
def compute_mask_indices_v3(
    shape: Tuple[int, int],
    padding_mask: Optional[torch.Tensor],
    mask_prob: float,
    mask_length: int,
    mask_type: str = "static",
    mask_other: float = 0.0,
    min_masks: int = 0,
    no_overlap: bool = False,
    min_space: int = 0,
    require_same_masks: bool = True,
    mask_dropout: float = 0.0,
    seed: Optional[int] = None,
    epoch: Optional[int] = None,
    indices: Optional[torch.Tensor] = None,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape

    Args:
        shape: the shape for which to compute masks.
            should be of size 2 where first element is batch size and 2nd is timesteps
        padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
        mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
            number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
            however due to overlaps, the actual number will be smaller (unless no_overlap is True)
        mask_length: length of each span for "static"; otherwise the
            parameter of the length distribution (see mask_type)
        mask_type: how to compute mask lengths
            static = fixed size
            uniform = sample from uniform distribution [mask_other, mask_length*2]
            normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
            poisson = sample from poisson distribution with lambda = mask length
        mask_other: secondary parameter of the length distribution (see mask_type)
        min_masks: minimum number of masked spans
        no_overlap: if true, will switch to an alternative recursive algorithm that prevents spans from overlapping
        min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
        require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
        mask_dropout: randomly dropout this percentage of masks in each example
        seed, epoch, indices: when all three are given, row ``i`` uses its own
            generator seeded from ``hash((seed, epoch, indices[i]))``;
            otherwise each row uses an unseeded generator

    Returns:
        boolean array of size ``shape``; ``True`` marks a masked position
    """
    bsz, all_sz = shape
    mask = np.full((bsz, all_sz), False)

    # span count used for rows without padding information; note that the
    # rounding here draws from the global NumPy PRNG
    all_num_mask = int(
        # add a random number for probabilistic rounding
        mask_prob * all_sz / float(mask_length)
        + np.random.rand()
    )

    all_num_mask = max(min_masks, all_num_mask)

    mask_idcs = []
    for i in range(bsz):
        if seed is not None and epoch is not None and indices is not None:
            seed_i = int(hash((seed, epoch, indices[i].item())) % 1e6)
        else:
            seed_i = None
        rng = np.random.default_rng(seed_i)

        if padding_mask is not None:
            sz = all_sz - padding_mask[i].long().sum().item()
            num_mask = int(
                # add a random number for probabilistic rounding
                mask_prob * sz / float(mask_length)
                + rng.random()
            )
            num_mask = max(min_masks, num_mask)
        else:
            sz = all_sz
            num_mask = all_num_mask

        if mask_type == "static":
            lengths = np.full(num_mask, mask_length)
        elif mask_type == "uniform":
            # np.random.Generator has no randint(); integers() samples the
            # same half-open range [low, high)
            lengths = rng.integers(
                int(mask_other), mask_length * 2 + 1, size=num_mask
            )
        elif mask_type == "normal":
            lengths = rng.normal(mask_length, mask_other, size=num_mask)
            lengths = [max(1, int(round(x))) for x in lengths]
        elif mask_type == "poisson":
            lengths = rng.poisson(mask_length, size=num_mask)
            lengths = [int(round(x)) for x in lengths]
        else:
            raise Exception("unknown mask selection " + mask_type)

        if sum(lengths) == 0:
            lengths[0] = min(mask_length, sz - 1)

        if no_overlap:
            mask_idc = []

            def arrange(s, e, length, keep_length):
                # place one span inside [s, e) and return the leftover
                # sub-ranges that are still large enough to host a span
                span_start = rng.integers(s, e - length)
                mask_idc.extend(span_start + i for i in range(length))

                new_parts = []
                if span_start - s - min_space >= keep_length:
                    new_parts.append((s, span_start - min_space + 1))
                if e - span_start - length - min_space > keep_length:
                    new_parts.append((span_start + length + min_space, e))
                return new_parts

            parts = [(0, sz)]
            min_length = min(lengths)
            for length in sorted(lengths, reverse=True):
                # np.int was removed from NumPy; the builtin int is the
                # same dtype request
                lens = np.fromiter(
                    (e - s if e - s >= length + min_space else 0 for s, e in parts),
                    int,
                )
                l_sum = np.sum(lens)
                if l_sum == 0:
                    break
                # pick a free range with probability proportional to its size
                probs = lens / l_sum
                c = rng.choice(len(parts), p=probs)
                s, e = parts.pop(c)
                parts.extend(arrange(s, e, length, min_length))
            # explicit integer dtype so that an empty result is still a
            # valid index array
            mask_idc = np.asarray(mask_idc, dtype=np.int64)
        else:
            min_len = min(lengths)
            if sz - min_len <= num_mask:
                min_len = sz - num_mask - 1

            mask_idc = rng.choice(sz - min_len, num_mask, replace=False)

            # expand every span start into the positions it covers
            mask_idc = np.asarray(
                [
                    mask_idc[j] + offset
                    for j in range(len(mask_idc))
                    for offset in range(lengths[j])
                ],
                dtype=np.int64,
            )

        mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))

    min_len = min([len(m) for m in mask_idcs])
    # NOTE: `rng` below is the generator created for the last row
    for i, mask_idc in enumerate(mask_idcs):
        if len(mask_idc) > min_len and require_same_masks:
            mask_idc = rng.choice(mask_idc, min_len, replace=False)
        if mask_dropout > 0:
            num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int)
            mask_idc = rng.choice(mask_idc, len(mask_idc) - num_holes, replace=False)

        mask[i, mask_idc] = True

    return mask
import numpy as np cimport cython cimport numpy as np from libc.stdint cimport int32_t, int64_t from libcpp cimport bool as bool_t ctypedef int64_t DTYPE_t @cython.cdivision(True) @cython.boundscheck(False) @cython.wraparound(False) cpdef list batch_by_size_vec( np.ndarray[int64_t, ndim=1] indices, np.ndarray[int64_t, ndim=1] num_tokens_vec, int64_t max_tokens, int64_t max_sentences, int32_t bsz_mult, ): if indices.shape[0] == 0: return [] assert max_tokens <= 0 or np.max(num_tokens_vec) <= max_tokens, ( f"Sentences lengths should not exceed max_tokens={max_tokens}" ) cdef int32_t indices_len = indices.shape[0] cdef np.ndarray[int32_t, ndim=1] batches_ends = \ np.zeros(indices_len, dtype=np.int32) cdef int32_t[:] batches_ends_view = batches_ends cdef int64_t[:] num_tokens_view = num_tokens_vec cdef int32_t pos = 0 cdef int32_t new_batch_end = 0 cdef int64_t new_batch_max_tokens = 0 cdef int32_t new_batch_sentences = 0 cdef int64_t new_batch_num_tokens = 0 cdef bool_t overflow = False cdef bool_t size_matches_with_bsz_mult = False cdef int32_t batches_count = 0 cdef int32_t batch_start = 0 cdef int64_t tail_max_tokens = 0 cdef int64_t batch_max_tokens = 0 for pos in range(indices_len): # At every pos we keep stats about the last complete batch [batch_start:batch_end), # and tail [batch_end:pos]. # 1) Every time when (batch + tail) forms a valid batch # (according to max_tokens, max_sentences and bsz_mult) we append tail to batch. # 2) When (batch+tail) violates max_tokens or max_sentences constraints # we finalize running batch, and tail becomes a new batch. # 3) There is a corner case when tail also violates constraints. # In that situation [batch_end:pos-1] (tail without the current pos) # gets added to the finalized batches, while [pos:pos] becomes a new tail. # # Important: For the sake of performance try to avoid using function calls within this loop. 
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef list batch_by_size_fn(
    np.ndarray[DTYPE_t, ndim=1] indices,
    num_tokens_fn,
    int64_t max_tokens,
    int64_t max_sentences,
    int32_t bsz_mult,
):
    """Batch *indices* by size, obtaining each size from a Python callable.

    Evaluates ``num_tokens_fn`` once for every entry of *indices* to build a
    size vector, then delegates the actual batching to
    ``batch_by_size_vec`` with the remaining arguments passed through
    unchanged.
    """
    cdef int32_t indices_len = indices.shape[0]
    cdef np.ndarray[int64_t, ndim=1] num_tokens_vec = np.zeros(indices_len,
                                                               dtype=np.int64)
    cdef DTYPE_t[:] indices_view = indices
    # NOTE(review): this view is declared but never read in this function
    cdef DTYPE_t[:] num_tokens_vec_view = num_tokens_vec
    cdef int64_t pos
    # one Python-level call per index: this is the slow path compared with
    # handing batch_by_size_vec a precomputed vector
    for pos in range(indices_len):
        num_tokens_vec[pos] = num_tokens_fn(indices_view[pos])
    return batch_by_size_vec(indices, num_tokens_vec, max_tokens,
        max_sentences, bsz_mult,)
@cython.cdivision(True)
cpdef list batch_fixed_shapes_fast(
    np.ndarray[DTYPE_t, ndim=1] indices,
    num_tokens_fn,
    np.ndarray[DTYPE_t, ndim=2] fixed_shapes_sorted,
):
    """Greedily group *indices* into batches that fit one of the given shapes.

    Walks *indices* in order. A sample joins the running batch as long as
    ``_find_valid_shape`` finds a row of *fixed_shapes_sorted* whose first
    column is at least the new batch size and whose second column is at
    least the longest sample seen; otherwise the running batch is closed and
    a new one is started with the current sample.

    Returns a list of batches, each a list of indices.
    """
    # longest sample in the running batch
    cdef int64_t sample_len = 0
    # NOTE(review): sample_lens is appended to and reset but never read,
    # and mod_len is never assigned -- both look unused here
    cdef list sample_lens = []
    cdef list batch = []
    cdef list batches = []
    cdef int64_t mod_len
    cdef int64_t i
    cdef int64_t idx
    cdef int64_t num_tokens
    cdef DTYPE_t[:] indices_view = indices
    cdef DTYPE_t[:, :] shapes_view = fixed_shapes_sorted

    for i in range(len(indices_view)):
        idx = indices_view[i]
        num_tokens = num_tokens_fn(idx)
        sample_lens.append(num_tokens)
        sample_len = max(sample_len, num_tokens)

        shape_idx = _find_valid_shape(shapes_view, len(batch) + 1, sample_len)
        if shape_idx == -1:
            # no shape fits once this sample is added: close the batch
            # NOTE(review): if the very first sample fits no shape, `batch`
            # is still empty here and an empty batch is appended -- confirm
            # whether callers tolerate that
            batches.append(batch)
            batch = []
            sample_lens = []
            # NOTE(review): the current sample is appended to the fresh
            # batch below, but the tracked maximum restarts at 0 rather than
            # at num_tokens -- confirm this is intended
            sample_len = 0
            # start searching from the first shape again
            shapes_view = fixed_shapes_sorted
        elif shape_idx > 0:
            # small optimization for the next call to _find_valid_shape
            shapes_view = shapes_view[shape_idx:]

        batch.append(idx)

    # flush the trailing, possibly partial, batch
    if len(batch) > 0:
        batches.append(batch)

    return batches
def collate(
    samples,
    pad_idx,
    eos_idx,
    vocab,
    left_pad_source=False,
    left_pad_target=False,
    input_feeding=True,
    pad_to_length=None,
):
    """Collate denoising samples into a padded mini-batch dictionary.

    Args:
        samples (List[dict]): items with ``"id"``, ``"source"`` and
            optionally ``"target"`` entries.
        pad_idx (int): padding index forwarded to ``collate_tokens``.
        eos_idx: accepted for interface compatibility; the per-sample EOS is
            used instead (``eos_idx=None`` is forwarded).
        vocab: accepted for interface compatibility; not read here.
        left_pad_source (bool): left-pad the source side.
        left_pad_target (bool): left-pad the target side.
        input_feeding (bool): must be true; also build
            ``prev_output_tokens`` (target with EOS moved to the front).
        pad_to_length (dict, optional): ``{"source": int, "target": int}``
            minimum padded lengths.

    Returns:
        dict: ``{}`` for an empty sample list, otherwise a batch with
        ``id``, ``ntokens``, ``net_input``, ``target``, ``nsentences`` and
        ``sort_order``; rows are sorted by descending source length.
    """
    assert input_feeding
    if len(samples) == 0:
        return {}

    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
        return data_utils.collate_tokens(
            [s[key] for s in samples],
            pad_idx,
            eos_idx=None,  # use eos_idx of each sample instead of vocab.eos()
            left_pad=left_pad,
            move_eos_to_beginning=move_eos_to_beginning,
            pad_to_length=pad_to_length,
        )

    def min_length(side):
        # Looked up lazily so "target" is only required when targets exist.
        return pad_to_length[side] if pad_to_length is not None else None

    ids = torch.LongTensor([s["id"] for s in samples])
    src_tokens = merge(
        "source",
        left_pad=left_pad_source,
        pad_to_length=min_length("source"),
    )
    # sort by descending source length
    src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
    src_lengths, sort_order = src_lengths.sort(descending=True)
    ids = ids.index_select(0, sort_order)
    src_tokens = src_tokens.index_select(0, sort_order)

    prev_output_tokens = None
    target = None
    if samples[0].get("target", None) is not None:
        target = merge(
            "target",
            left_pad=left_pad_target,
            pad_to_length=min_length("target"),
        )
        target = target.index_select(0, sort_order)
        ntokens = sum(len(s["target"]) for s in samples)

        if input_feeding:
            # we create a shifted version of targets for feeding the
            # previous output token(s) into the next decoder step
            prev_output_tokens = merge(
                "target",
                left_pad=left_pad_target,
                move_eos_to_beginning=True,
                pad_to_length=min_length("target"),
            )
            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
    else:
        ntokens = sum(len(s["source"]) for s in samples)

    batch = {
        "id": ids,
        "ntokens": ntokens,
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
        },
        "target": target,
        # Number of sentences in the batch. The previous expression,
        # samples[0]["source"].size(0), was the token length of the first
        # sample, not a sentence count.
        "nsentences": len(samples),
        "sort_order": sort_order,
    }
    if prev_output_tokens is not None:
        batch["net_input"]["prev_output_tokens"] = prev_output_tokens

    return batch
def permute_sentences(self, source, p=1.0):
    """Return a copy of ``source`` with a fraction ``p`` of its sentences shuffled.

    Sentences are delimited by ``self.full_stop_index``. Position 0 is never
    moved, and tokens after the last detected sentence end keep their place.

    Args:
        source (Tensor): 1-D token tensor.
        p (float): fraction of sentences taking part in the shuffle.

    Returns:
        Tensor: new tensor of the same length as ``source``.
    """
    is_full_stop = source == self.full_stop_index
    # Pretend the second-to-last token is a full stop so that the trailing
    # span is treated as a sentence as well.
    is_full_stop[-2] = 1

    # A sentence ends at a full stop whose predecessor is not one; "+ 2"
    # turns the position in the shifted mask into an exclusive end offset.
    boundary_mask = is_full_stop[1:] * ~is_full_stop[:-1]
    sentence_ends = boundary_mask.nonzero(as_tuple=False) + 2
    num_sentences = sentence_ends.size(0)

    num_to_permute = math.ceil((num_sentences * 2 * p) / 2.0)
    chosen = torch.randperm(num_sentences)[:num_to_permute]
    ordering = torch.arange(0, num_sentences)
    ordering[chosen] = chosen[torch.randperm(num_to_permute)]

    result = source.clone()
    write_pos = 1  # skip <bos> at position 0
    for sent_idx in ordering:
        if sent_idx > 0:
            start = sentence_ends[sent_idx - 1]
        else:
            start = 1
        sentence = source[start : sentence_ends[sent_idx]]
        length = sentence.size(0)
        result[write_pos : write_pos + length] = sentence
        write_pos += length
    return result
def add_permuted_noise(self, tokens, p):
    """Shuffle a fraction ``p`` of the interior tokens of ``tokens`` in place.

    The first and last positions are never touched. The number of permuted
    positions is ``ceil(len(tokens) * p)``, capped at the number of interior
    positions. Without the cap the second ``randperm`` indexed past the end
    of ``substitutions`` (e.g. for ``p=1.0``) and raised ``IndexError``.

    Args:
        tokens (Tensor): 1-D token tensor; modified in place.
        p (float): fraction of tokens to permute.

    Returns:
        Tensor: the same ``tokens`` object.
    """
    num_words = len(tokens)
    num_interior = num_words - 2
    if num_interior <= 0:
        # Nothing between the first and last token to move around.
        return tokens
    num_to_permute = min(math.ceil(((num_words * 2) * p) / 2.0), num_interior)
    substitutions = torch.randperm(num_interior)[:num_to_permute] + 1
    tokens[substitutions] = tokens[substitutions[torch.randperm(num_to_permute)]]
    return tokens
def prefetch(self, indices):
    """Forward a prefetch request to the wrapped dataset.

    This class stores its only underlying dataset as ``self.dataset``; the
    ``src``/``tgt`` attributes referenced previously are never set on it.
    """
    self.dataset.prefetch(indices)

@property
def supports_prefetch(self):
    """Whether the wrapped dataset advertises prefetch support.

    Returns ``False`` when the wrapped dataset has no ``supports_prefetch``
    attribute.
    """
    return getattr(self.dataset, "supports_prefetch", False)
def string(
    self,
    tensor,
    bpe_symbol=None,
    escape_unk=False,
    extra_symbols_to_ignore=None,
    unk_string=None,
    include_eos=False,
    separator=" ",
):
    """Helper for converting a tensor of token indices to a string.

    Can optionally remove BPE symbols or escape <unk> words.

    Args:
        tensor: 1-D sequence of token indices, or a 2-D tensor whose rows
            are converted separately and joined with newlines.
        bpe_symbol: forwarded to ``data_utils.post_process``.
        escape_unk (bool): render unknown tokens via ``unk_string(True)``.
        extra_symbols_to_ignore: indices to drop from the output.
        unk_string (str, optional): replacement text for unknown tokens.
        include_eos (bool): keep the EOS symbol in the output.
        separator (str): string placed between tokens.

    Returns:
        str: the rendered sentence(s).
    """
    if torch.is_tensor(tensor) and tensor.dim() == 2:
        # Forward every option to the per-row call; unk_string and separator
        # used to be dropped here, so 2-D input ignored them.
        return "\n".join(
            self.string(
                t,
                bpe_symbol,
                escape_unk,
                extra_symbols_to_ignore,
                unk_string=unk_string,
                include_eos=include_eos,
                separator=separator,
            )
            for t in tensor
        )

    extra_symbols_to_ignore = set(extra_symbols_to_ignore or [])
    if not include_eos:
        extra_symbols_to_ignore.add(self.eos())

    def token_string(i):
        if i == self.unk():
            if unk_string is not None:
                return unk_string
            else:
                return self.unk_string(escape_unk)
        else:
            return self[i]

    # bos_index only exists when the special symbols were added.
    if hasattr(self, "bos_index"):
        extra_symbols_to_ignore.add(self.bos())

    sent = separator.join(
        token_string(i)
        for i in tensor
        if utils.item(i) not in extra_symbols_to_ignore
    )

    return data_utils.post_process(sent, bpe_symbol)
def add_from_file(self, f):
    """
    Loads a pre-existing dictionary from a text file and adds its symbols
    to this instance.

    Args:
        f: path to the dictionary file, or an open text file object. Each
            row is ``<token> <count>`` optionally followed by the
            ``#fairseq:overwrite`` flag.

    Raises:
        FileNotFoundError: the path does not exist.
        Exception: the file is not valid UTF-8.
        ValueError: a row does not match the expected format.
        RuntimeError: a token is repeated without the overwrite flag.
    """
    if isinstance(f, str):
        try:
            with open(PathManager.get_local_path(f), "r", encoding="utf-8") as fd:
                self.add_from_file(fd)
        except UnicodeError as err:
            raise Exception(
                "Incorrect encoding detected in {}, please "
                "rebuild the dataset".format(f)
            ) from err
        return

    lines = f.readlines()
    indices_start_line = self._load_meta(lines)

    for raw_line in lines[indices_start_line:]:
        row = raw_line.rstrip()
        try:
            word, field = row.rsplit(" ", 1)
            if field == "#fairseq:overwrite":
                overwrite = True
                word, field = word.rsplit(" ", 1)
            else:
                overwrite = False
            count = int(field)
        except ValueError:
            # Report the whole offending row; the old message printed the
            # partially split remainder when the count was not an integer.
            raise ValueError(
                f"Incorrect dictionary format, expected '<token> <cnt> [flags]': \"{row}\""
            )
        if word in self and not overwrite:
            raise RuntimeError(
                "Duplicate word found when loading Dictionary: '{}'. "
                "Duplicate words can overwrite earlier ones by adding the "
                "#fairseq:overwrite flag at the end of the corresponding row "
                "in the dictionary file. If using the Camembert model, please "
                "download an updated copy of the model file.".format(word)
            )
        self.add_symbol(word, n=count, overwrite=overwrite)
class TruncatedDictionary(object):
    """View of a dictionary that exposes only its first ``length`` symbols.

    The instance is re-classed to a dynamically created type deriving from
    both this class and the wrapped dictionary's class, and it shares the
    wrapped dictionary's ``__dict__``, so every other attribute and method
    behaves as on the wrapped object.
    """

    def __init__(self, wrapped_dict, length):
        self.__class__ = type(
            wrapped_dict.__class__.__name__,
            (self.__class__, wrapped_dict.__class__),
            {},
        )
        # Shared attribute storage: the two assignments below are therefore
        # also visible on ``wrapped_dict``.
        self.__dict__ = wrapped_dict.__dict__
        self.wrapped_dict = wrapped_dict
        self.length = min(len(self.wrapped_dict), length)

    def __len__(self):
        return self.length

    def __getitem__(self, i):
        """Return the symbol at index ``i``, or the unk word past the cut-off.

        Indexing yields symbols (strings), matching ``Dictionary.__getitem__``
        which returns ``unk_word`` for out-of-range indices. The previous
        code returned the integer ``unk()`` index here instead.
        """
        if i < self.length:
            return self.wrapped_dict[i]
        return self.wrapped_dict.unk_word
# Registries backing the --tokenizer and --bpe options. Each call returns a
# builder, a registration decorator and the registry mapping (the fourth
# value is unused here).
build_tokenizer, register_tokenizer, TOKENIZER_REGISTRY, _ = registry.setup_registry(
    "--tokenizer",
    default=None,
)


build_bpe, register_bpe, BPE_REGISTRY, _ = registry.setup_registry(
    "--bpe",
    default=None,
)


# automatically import any Python files in the encoders/ directory
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if file.endswith(".py") and not file.startswith("_"):
        # Strip only the trailing extension. ``file.find(".py")`` cut the
        # name at the first ".py" occurrence, which is wrong for file names
        # that contain ".py" before the suffix.
        module = file[: -len(".py")]
        importlib.import_module("fairseq.data.encoders." + module)
WHITESPACE_NORMALIZER = re.compile(r"\s+")
SPACE = chr(32)
SPACE_ESCAPE = chr(9601)
# excluding non-breaking space (160) here
PRINTABLE_LATIN = set(range(32, 126 + 1)) | set(range(161, 172 + 1)) | set(
    range(174, 255 + 1)
)
# Printable Latin-1 bytes map to themselves; every other byte is shifted up
# by 256.
BYTE_TO_BCHAR = {
    b: chr(b if b in PRINTABLE_LATIN else 256 + b) for b in range(256)
}
BCHAR_TO_BYTE = dict(zip(BYTE_TO_BCHAR.values(), BYTE_TO_BCHAR.keys()))


def byte_encode(x: str) -> str:
    """Collapse whitespace runs to one space and map each UTF-8 byte to its char."""
    utf8_bytes = WHITESPACE_NORMALIZER.sub(SPACE, x).encode("utf-8")
    return "".join(map(BYTE_TO_BCHAR.__getitem__, utf8_bytes))


def byte_decode(x: str) -> str:
    """Invert ``byte_encode``; return ``""`` when the bytes are not valid UTF-8."""
    try:
        raw = bytes([BCHAR_TO_BYTE[ch] for ch in x])
        return raw.decode("utf-8")
    except ValueError:
        return ""


def smart_byte_decode(x: str) -> str:
    """Decode ``x``, recovering as many characters as possible if it is broken.

    When a direct decode yields the empty string, a dynamic program over
    prefixes of ``x`` picks chunks of 1 to 4 byte-chars that decode on their
    own so that the number of decoded chunks is maximal; undecodable
    byte-chars are dropped.
    """
    decoded = byte_decode(x)
    if decoded != "":
        return decoded

    # DP the best recovery (max valid chars) if it's broken
    total = len(x)
    best = [0] * (total + 1)  # best[i]: max decodable chunks within x[:i]
    back = [0] * (total + 1)  # back[i]: predecessor position for best[i]
    for end in range(1, total + 1):
        best[end], back[end] = best[end - 1], end - 1
        for width in range(1, min(4, end) + 1):
            start = end - width
            if best[start] + 1 > best[end] and len(byte_decode(x[start:end])) > 0:
                best[end], back[end] = best[start] + 1, start

    # Walk the back-pointers from the end, collecting decoded chunks.
    chunks = []
    cursor = total
    while cursor > 0:
        start = back[cursor]
        if best[cursor] == best[start] + 1:
            chunks.append(byte_decode(x[start:cursor]))
        cursor = start
    return "".join(reversed(chunks))
@register_bpe("characters")
class Characters(object):
    """Character-level "BPE": every character becomes its own token.

    Real spaces are replaced by ``SPACE_ESCAPE`` before characters are
    separated by ``SPACE``, so decoding can tell the two apart.
    """

    def __init__(self, *unused):
        pass

    @staticmethod
    def add_args(parser):
        pass

    @staticmethod
    def encode(x: str) -> str:
        """Escape spaces, then join the individual characters with spaces."""
        return SPACE.join(x.replace(SPACE, SPACE_ESCAPE))

    @staticmethod
    def decode(x: str) -> str:
        """Drop the separator spaces and turn escaped spaces back into spaces."""
        without_separators = x.replace(SPACE, "")
        return without_separators.replace(SPACE_ESCAPE, SPACE)
@register_bpe("fastbpe", dataclass=fastBPEConfig)
class fastBPE(object):
    """Wrapper that applies and removes fastBPE segmentation."""

    def __init__(self, cfg):
        if cfg.bpe_codes is None:
            raise ValueError("--bpe-codes is required for --bpe=fastbpe")
        codes = file_utils.cached_path(cfg.bpe_codes)
        try:
            # Imported lazily so the dependency is only needed when used.
            import fastBPE

            self.bpe = fastBPE.fastBPE(codes)
            self.bpe_symbol = "@@ "
        except ImportError:
            raise ImportError("Please install fastBPE with: pip install fastBPE")

    def encode(self, x: str) -> str:
        """Apply BPE to a single sentence."""
        segmented = self.bpe.apply([x])
        return segmented[0]

    def decode(self, x: str) -> str:
        """Remove the continuation markers inserted by ``encode``."""
        # The trailing space lets a marker at the very end of ``x`` match.
        padded = x + " "
        return padded.replace(self.bpe_symbol, "").rstrip()
@register_bpe("gpt2", dataclass=GPT2BPEConfig)
class GPT2BPE(object):
    """GPT-2 byte-level BPE; encoded text is a space-separated list of ids."""

    def __init__(self, cfg):
        self.bpe = get_encoder(
            file_utils.cached_path(cfg.gpt2_encoder_json),
            file_utils.cached_path(cfg.gpt2_vocab_bpe),
        )

    def encode(self, x: str) -> str:
        """Return the BPE ids of ``x`` joined by single spaces."""
        return " ".join(str(token_id) for token_id in self.bpe.encode(x))

    def decode(self, x: str) -> str:
        """Turn a space-separated id string back into text.

        The literal tokens ``<unk>`` and ``<mask>`` are passed through to the
        underlying decoder unchanged instead of being parsed as integers.
        """
        passthrough = {"<unk>", "<mask>"}
        tokens = []
        for tok in x.split():
            tokens.append(tok if tok in passthrough else int(tok))
        return self.bpe.decode(tokens)

    def is_beginning_of_word(self, x: str) -> bool:
        """A token starts a word when its decoded text begins with a space."""
        decoded = self.decode(x)
        return decoded.startswith(" ")
def get_pairs(word):
    """Return set of symbol pairs in a word.

    Word is represented as tuple of symbols (symbols being variable-length
    strings). An empty or single-symbol word yields an empty set.
    """
    return set(zip(word, word[1:]))


class Encoder:
    """GPT-2 byte-level BPE encoder/decoder.

    Args:
        encoder (dict): maps BPE token strings to integer ids.
        bpe_merges (list): merge pairs, most preferred first.
        errors (str): error handling mode used when decoding bytes to text.
    """

    def __init__(self, encoder, bpe_merges, errors="replace"):
        self.encoder = encoder
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.errors = errors  # how to handle errors in decoding
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
        self.cache = {}

        try:
            import regex as re

            self.re = re
        except ImportError:
            raise ImportError("Please install regex with: pip install regex")

        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
        self.pat = self.re.compile(
            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
        )

    def bpe(self, token):
        """Apply the learned merges to ``token``; return symbols joined by spaces.

        Results are memoized in ``self.cache``.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token)
        pairs = get_pairs(word)

        if not pairs:
            return token

        while True:
            # Pick the adjacent pair with the lowest (best) merge rank.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # ``first`` does not occur again: copy the rest verbatim.
                    # (Was a bare ``except:`` that also swallowed
                    # KeyboardInterrupt and genuine bugs.)
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = " ".join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Return the list of BPE ids for ``text``."""
        bpe_tokens = []
        for token in self.re.findall(self.pat, text):
            # Map raw UTF-8 bytes to the printable stand-in characters.
            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
            bpe_tokens.extend(
                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
            )
        return bpe_tokens

    def decode(self, tokens):
        """Return the text for a list of ids; unknown entries are used verbatim."""
        text = "".join([self.decoder.get(token, token) for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            "utf-8", errors=self.errors
        )
        return text


def get_encoder(encoder_json_path, vocab_bpe_path):
    """Build an ``Encoder`` from an ``encoder.json`` and a ``vocab.bpe`` file."""
    # Read both files as UTF-8 explicitly; the JSON vocabulary contains
    # non-ASCII characters and must not depend on the platform default.
    with open(encoder_json_path, "r", encoding="utf-8") as f:
        encoder = json.load(f)
    with open(vocab_bpe_path, "r", encoding="utf-8") as f:
        bpe_data = f.read()
    # First line is a version header; the final split element is empty.
    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
    return Encoder(
        encoder=encoder,
        bpe_merges=bpe_merges,
    )
@dataclass
class BertBPEConfig(FairseqDataclass):
    # Whether to keep case; when False the tokenizer lower-cases its input.
    bpe_cased: bool = field(default=False, metadata={"help": "set for cased BPE"})
    # Optional local vocabulary; a pretrained one is loaded when unset.
    bpe_vocab_file: Optional[str] = field(
        default=None, metadata={"help": "bpe vocab file"}
    )


@register_bpe("bert", dataclass=BertBPEConfig)
class BertBPE(object):
    """WordPiece encoding backed by ``transformers.BertTokenizer``."""

    def __init__(self, cfg):
        try:
            from transformers import BertTokenizer
        except ImportError:
            raise ImportError(
                "Please install transformers with: pip install transformers"
            )

        if not cfg.bpe_vocab_file:
            # No explicit vocabulary: fall back to the stock checkpoints.
            pretrained_name = (
                "bert-base-cased" if cfg.bpe_cased else "bert-base-uncased"
            )
            self.bert_tokenizer = BertTokenizer.from_pretrained(pretrained_name)
            return

        lower_case = not cfg.bpe_cased
        self.bert_tokenizer = BertTokenizer(
            cfg.bpe_vocab_file, do_lower_case=lower_case
        )

    def encode(self, x: str) -> str:
        """Tokenize ``x`` and join the word pieces with single spaces."""
        pieces = self.bert_tokenizer.tokenize(x)
        return " ".join(pieces)

    def decode(self, x: str) -> str:
        """Rebuild text from space-separated word pieces."""
        pieces = x.split(" ")
        joined = self.bert_tokenizer.convert_tokens_to_string(pieces)
        return self.bert_tokenizer.clean_up_tokenization(joined)

    def is_beginning_of_word(self, x: str) -> bool:
        """A piece starts a word unless it carries the ``##`` prefix."""
        is_continuation = x.startswith("##")
        return not is_continuation
@dataclass
class HuggingFaceByteLevelBPEConfig(FairseqDataclass):
    bpe_merges: str = field(default="???", metadata={"help": "path to merges.txt"})
    bpe_vocab: str = field(default="???", metadata={"help": "path to vocab.json"})
    bpe_add_prefix_space: bool = field(
        default=False, metadata={"help": "add prefix space before encoding"}
    )


@register_bpe("hf_byte_bpe", dataclass=HuggingFaceByteLevelBPEConfig)
class HuggingFaceByteLevelBPE(object):
    """Byte-level BPE backed by ``tokenizers.ByteLevelBPETokenizer``."""

    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                "Please install huggingface/tokenizers with: " "pip install tokenizers"
            )

        # Resolve both resources through the cache helper before use.
        vocab_path = file_utils.cached_path(cfg.bpe_vocab)
        merges_path = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            vocab_path,
            merges_path,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

    def encode(self, x: str) -> str:
        """Return the token ids of ``x`` as a space-separated string."""
        ids = self.bpe.encode(x).ids
        return " ".join(str(token_id) for token_id in ids)

    def decode(self, x: str) -> str:
        """Decode a whitespace-separated id string back to text."""
        passthrough = {"<unk>", "<mask>"}
        items = []
        for tok in x.split():
            # The two placeholder symbols are forwarded untouched; everything
            # else is parsed as an integer id.
            items.append(tok if tok in passthrough else int(tok))
        return self.bpe.decode(items)

    def is_beginning_of_word(self, x: str) -> bool:
        """A token starts a word when its decoded text begins with a space."""
        decoded = self.decode(x)
        return decoded.startswith(" ")
@dataclass
class MosesTokenizerConfig(FairseqDataclass):
    source_lang: str = field(default="en", metadata={"help": "source language"})
    target_lang: str = field(default="en", metadata={"help": "target language"})
    moses_no_dash_splits: bool = field(
        default=False, metadata={"help": "don't apply dash split rules"}
    )
    moses_no_escape: bool = field(
        default=False,
        metadata={"help": "don't perform HTML escaping on apostrophe, quotes, etc."},
    )


@register_tokenizer("moses", dataclass=MosesTokenizerConfig)
class MosesTokenizer(object):
    """Tokenizer/detokenizer pair provided by ``sacremoses``."""

    def __init__(self, cfg: MosesTokenizerConfig):
        self.cfg = cfg

        try:
            from sacremoses import MosesDetokenizer as _Detokenizer
            from sacremoses import MosesTokenizer as _Tokenizer

            # Tokenize in the source language, detokenize in the target one.
            self.tok = _Tokenizer(cfg.source_lang)
            self.detok = _Detokenizer(cfg.target_lang)
        except ImportError:
            raise ImportError(
                "Please install Moses tokenizer with: pip install sacremoses"
            )

    def encode(self, x: str) -> str:
        """Tokenize ``x`` and return the tokens as one string."""
        split_dashes = not self.cfg.moses_no_dash_splits
        escape_html = not self.cfg.moses_no_escape
        return self.tok.tokenize(
            x,
            aggressive_dash_splits=split_dashes,
            return_str=True,
            escape=escape_html,
        )

    def decode(self, x: str) -> str:
        """Detokenize a whitespace-separated token string."""
        tokens = x.split()
        return self.detok.detokenize(tokens)
@register_tokenizer("nltk", dataclass=FairseqDataclass)
class NLTKTokenizer(object):
    """Word tokenizer delegating to ``nltk.tokenize.word_tokenize``."""

    def __init__(self, *unused):
        try:
            from nltk.tokenize import word_tokenize
        except ImportError:
            raise ImportError("Please install nltk with: pip install nltk")
        self.word_tokenize = word_tokenize

    def encode(self, x: str) -> str:
        """Tokenize ``x`` and join the tokens with single spaces."""
        words = self.word_tokenize(x)
        return " ".join(words)

    def decode(self, x: str) -> str:
        """Decoding is the identity: the input is returned unchanged."""
        return x


@dataclass
class SentencepieceConfig(FairseqDataclass):
    sentencepiece_model: str = field(
        default="???", metadata={"help": "path to sentencepiece model"}
    )
    sentencepiece_enable_sampling: bool = field(
        default=False, metadata={"help": "enable sampling"}
    )
    sentencepiece_alpha: Optional[float] = field(
        default=None,
        metadata={
            "help": "soothing parameter for unigram sampling, "
            "and merge probability for BPE-dropout"
        },
    )


@register_bpe("sentencepiece", dataclass=SentencepieceConfig)
class SentencepieceBPE(object):
    """Sub-word encoding backed by a SentencePiece model file."""

    # Symbols that are treated as word beginnings regardless of their text.
    def __init__(self, cfg):
        self.enable_sampling = cfg.sentencepiece_enable_sampling
        self.alpha = cfg.sentencepiece_alpha
        model_path = file_utils.cached_path(cfg.sentencepiece_model)
        try:
            import sentencepiece as spm

            processor = spm.SentencePieceProcessor()
            self.sp = processor
            processor.Load(model_path)
        except ImportError:
            raise ImportError(
                "Please install sentencepiece with: pip install sentencepiece"
            )

    def encode(self, x: str) -> str:
        """Encode ``x`` into pieces joined by single spaces."""
        pieces = self.sp.Encode(
            x,
            out_type=str,
            enable_sampling=self.enable_sampling,
            alpha=self.alpha,
        )
        return " ".join(pieces)

    def decode(self, x: str) -> str:
        """Drop piece separators, then turn the U+2581 marker back into spaces."""
        without_separators = x.replace(" ", "")
        with_spaces = without_separators.replace("\u2581", " ")
        return with_spaces.strip()

    def is_beginning_of_word(self, x: str) -> bool:
        """Tell whether piece ``x`` opens a new word."""
        # special elements are always considered beginnings
        # HACK: this logic is already present in fairseq/tasks/masked_lm.py
        # but these special tokens are also contained in the sentencepiece
        # vocabulary which causes duplicate special tokens. This hack makes
        # sure that they are all taken into account.
        is_special = x in ["<unk>", "<s>", "</s>", "<pad>"]
        return True if is_special else x.startswith("\u2581")


@register_tokenizer("space", dataclass=FairseqDataclass)
class SpaceTokenizer(object):
    """Tokenizer that only normalises whitespace."""

    def __init__(self, *unused):
        # Any run of whitespace characters.
        self.space_tok = re.compile(r"\s+")

    def encode(self, x: str) -> str:
        """Collapse every whitespace run in ``x`` to a single space."""
        collapsed = self.space_tok.sub(" ", x)
        return collapsed

    def decode(self, x: str) -> str:
        """Decoding is the identity: the input is returned unchanged."""
        return x
@dataclass
class SubwordNMTBPEConfig(FairseqDataclass):
    bpe_codes: str = field(default="???", metadata={"help": "path to subword NMT BPE"})
    bpe_separator: str = field(default="@@", metadata={"help": "BPE separator"})


@register_bpe("subword_nmt", dataclass=SubwordNMTBPEConfig)
class SubwordNMTBPE(object):
    """BPE segmentation delegating to the ``subword_nmt`` package."""

    def __init__(self, cfg):
        if cfg.bpe_codes is None:
            raise ValueError("--bpe-codes is required for --bpe=subword_nmt")
        codes_path = file_utils.cached_path(cfg.bpe_codes)
        try:
            from subword_nmt import apply_bpe

            # Reuse the package's own CLI parser so its defaults apply.
            cli_args = ["--codes", codes_path, "--separator", cfg.bpe_separator]
            parsed = apply_bpe.create_parser().parse_args(cli_args)
            self.bpe = apply_bpe.BPE(
                parsed.codes,
                parsed.merges,
                parsed.separator,
                None,
                parsed.glossaries,
            )
            # The separator followed by a space marks a split inside a word.
            self.bpe_symbol = parsed.separator + " "
        except ImportError:
            raise ImportError(
                "Please install subword_nmt with: pip install subword-nmt"
            )

    def encode(self, x: str) -> str:
        """Segment one line of text."""
        segmented = self.bpe.process_line(x)
        return segmented

    def decode(self, x: str) -> str:
        """Remove every ``<separator> `` marker and trailing whitespace."""
        # The appended space lets a separator at the very end match as well.
        padded = x + " "
        return padded.replace(self.bpe_symbol, "").rstrip()
def get_whole_word_mask(args, dictionary):
    """Build a byte tensor flagging which dictionary entries start a word.

    Returns ``None`` when ``encoders.build_bpe(args)`` yields no BPE.
    """
    bpe = encoders.build_bpe(args)
    if bpe is None:
        return None

    def starts_word(index):
        # special elements are always considered beginnings
        if index < dictionary.nspecial:
            return True
        symbol = dictionary[index]
        if symbol.startswith("madeupword"):
            return True
        try:
            return bpe.is_beginning_of_word(symbol)
        except ValueError:
            # Symbols the BPE rejects are counted as beginnings too.
            return True

    flags = [starts_word(index) for index in range(len(dictionary))]
    return torch.ByteTensor(flags)


logger = logging.getLogger(__name__)


class EpochListening:
    """Mixin for receiving updates whenever the epoch increments."""

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        """
        Whether the :class:`fairseq.data.EpochBatchIterator` built for this
        dataset may be reused from one epoch to the next.

        Return ``False`` when sample sizes can change between epochs, since
        batches may then have to be regenerated every epoch. Datasets that
        rely on ``set_epoch`` should consider returning ``False``.
        """
        return True

    def set_epoch(self, epoch):
        """Will receive the updated epoch number at the beginning of the epoch."""
        return None


class FairseqDataset(torch.utils.data.Dataset, EpochListening):
    """A dataset that provides helpers for batching."""

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError

    def collater(self, samples):
        """Merge a list of samples to form a mini-batch.

        Args:
            samples (List[dict]): samples to collate

        Returns:
            dict: a mini-batch suitable for forwarding with a Model
        """
        raise NotImplementedError

    def num_tokens(self, index):
        """Return the number of tokens in a sample. This value is used to
        enforce ``--max-tokens`` during batching."""
        raise NotImplementedError

    def num_tokens_vec(self, indices):
        """Return the number of tokens for a set of positions defined by indices.
        This value is used to enforce ``--max-tokens`` during batching."""
        raise NotImplementedError

    def size(self, index):
        """Return an example's size as a float or tuple. This value is used when
        filtering a dataset with ``--max-positions``."""
        raise NotImplementedError

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        count = len(self)
        return np.arange(count, dtype=np.int64)

    @property
    def supports_prefetch(self):
        """Whether this dataset supports prefetching."""
        return False

    def attr(self, attr: str, index: int):
        """Look up attribute ``attr`` on the dataset (``None`` when absent).

        ``index`` is accepted but not used by this base implementation.
        """
        return getattr(self, attr, None)

    def prefetch(self, indices):
        """Prefetch the data required for this epoch."""
        raise NotImplementedError

    def get_batch_shapes(self):
        """
        Return a list of valid batch shapes, for example::

            [(8, 512), (16, 256), (32, 128)]

        The first dimension of each tuple is the batch size and can be ``None``
        to automatically infer the max batch size based on ``--max-tokens``.
        The second dimension of each tuple is the max supported length as given
        by :func:`fairseq.data.FairseqDataset.num_tokens`.

        This will be used by :func:`fairseq.data.FairseqDataset.batch_by_size`
        to restrict batch shapes. This is useful on TPUs to avoid too many
        dynamic shapes (and recompilations).
        """
        return None

    def batch_by_size(
        self,
        indices,
        max_tokens=None,
        max_sentences=None,
        required_batch_size_multiple=1,
    ):
        """
        Given an ordered set of indices, return batches according to
        *max_tokens*, *max_sentences* and *required_batch_size_multiple*.
        """
        from fairseq.data import data_utils

        def resolve_bsz(batch_size, length):
            # A ``None`` batch size is derived from the token budget.
            if batch_size is None:
                assert max_tokens is not None, "Must specify --max-tokens"
                batch_size = max_tokens // length
            if max_sentences is not None:
                return min(batch_size, max_sentences)
            multiple = required_batch_size_multiple
            if batch_size >= multiple and batch_size % multiple != 0:
                # Round down to the nearest multiple.
                batch_size -= batch_size % multiple
            return batch_size

        fixed_shapes = self.get_batch_shapes()
        if fixed_shapes is not None:
            resolved = [
                [resolve_bsz(batch_size, length), length]
                for (batch_size, length) in fixed_shapes
            ]
            fixed_shapes = np.array(resolved)

        # Datasets without a vectorised length lookup fall back to the
        # per-index callback.
        try:
            num_tokens_vec = self.num_tokens_vec(indices).astype("int64")
        except NotImplementedError:
            num_tokens_vec = None

        return data_utils.batch_by_size(
            indices,
            num_tokens_fn=self.num_tokens,
            num_tokens_vec=num_tokens_vec,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
            fixed_shapes=fixed_shapes,
        )

    def filter_indices_by_size(self, indices, max_sizes):
        """
        Filter a list of sample indices. Remove those that are longer than
        specified in *max_sizes*.

        WARNING: don't update, override method in child classes

        Args:
            indices (np.array): original array of sample indices
            max_sizes (int or list[int] or tuple[int]): max sample size,
                can be defined separately for src and tgt (then list or tuple)

        Returns:
            np.array: filtered sample array
            list: list of removed indices
        """
        if isinstance(max_sizes, (float, int)) and hasattr(self, "sizes"):
            # Fast paths: sizes are available as one array (or a list that
            # wraps exactly one array), so filtering is a vectorised compare.
            if isinstance(self.sizes, np.ndarray):
                lengths = self.sizes[indices]
                ignored = indices[lengths > max_sizes].tolist()
                return indices[lengths <= max_sizes], ignored
            if isinstance(self.sizes, list) and len(self.sizes) == 1:
                lengths = self.sizes[0][indices]
                ignored = indices[lengths > max_sizes].tolist()
                return indices[lengths <= max_sizes], ignored

        indices, ignored = data_utils._filter_by_size_dynamic(
            indices, self.size, max_sizes
        )
        return indices, ignored

    @property
    def supports_fetch_outside_dataloader(self):
        """Whether this dataset supports fetching outside the workers of the dataloader."""
        return True


class FairseqIterableDataset(torch.utils.data.IterableDataset, EpochListening):
    """
    For datasets that need to be read sequentially, usually because the data is
    being streamed or otherwise can't be manipulated on a single machine.
    """

    def __iter__(self):
        raise NotImplementedError
def fasta_file_path(prefix_path):
    """Return the FASTA file name that belongs to ``prefix_path``."""
    return prefix_path + ".fasta"


class FastaDataset(torch.utils.data.Dataset):
    """
    For loading protein sequence datasets in the common FASTA data format

    Args:
        path (str): dataset prefix; the data is read from ``<path>.fasta`` and
            the index is cached in ``<path>.fasta.idx.npy``.
        cache_indices (bool): must be true; see the ``ValueError`` below.

    Raises:
        ValueError: when ``cache_indices`` is false.
    """

    def __init__(self, path: str, cache_indices=False):
        self.fn = fasta_file_path(path)
        self.threadlocal = threading.local()
        self.cache = Path(f"{path}.fasta.idx.npy")
        if not cache_indices:
            raise ValueError(
                "`cache_indices` is not supported anymore due to security concerns."
            )
        if self.cache.exists():
            # The cache stores offsets and sizes stacked as two rows.
            self.offsets, self.sizes = np.load(self.cache)
        else:
            self.offsets, self.sizes = self._build_index(path)
            np.save(self.cache, np.stack([self.offsets, self.sizes]))

    def _build_index(self, path: str):
        """Scan the FASTA file once and index its records.

        Done in pure Python (no shell pipeline) so the path is never
        interpreted by a shell.

        Returns:
            tuple: two ``int64`` arrays - the byte offset of every ``>``
            header line and the length of the sequence that follows it
            (line endings and surrounding whitespace excluded).
        """
        offsets = []
        sizes = []
        position = 0
        with open(fasta_file_path(path), "rb") as f:
            for raw_line in f:
                if raw_line.startswith(b">"):
                    offsets.append(position)
                    sizes.append(0)
                elif sizes:
                    # Lengths are counted in bytes.
                    # NOTE(review): equals the character count only for
                    # single-byte alphabets - confirm for non-ASCII data.
                    sizes[-1] += len(raw_line.strip())
                position += len(raw_line)
        return np.array(offsets, dtype=np.int64), np.array(sizes, dtype=np.int64)

    def _get_file(self):
        """Return this thread's file handle, opening it on first use."""
        if not hasattr(self.threadlocal, "f"):
            self.threadlocal.f = open(self.fn, "r")
        return self.threadlocal.f

    def __getitem__(self, idx):
        """Return ``(description_line, sequence)`` for record ``idx``."""
        f = self._get_file()
        f.seek(int(self.offsets[idx]))
        desc = f.readline().strip()
        # Collect the sequence lines up to the next header or end of file and
        # join once instead of growing a string line by line.
        parts = []
        line = f.readline()
        while line != "" and line[0] != ">":
            parts.append(line.strip())
            line = f.readline()
        return desc, "".join(parts)

    def __len__(self):
        return self.offsets.size

    def __setstate__(self, state):
        self.__dict__ = state
        # File handles are not pickled; each process re-opens lazily.
        self.threadlocal = threading.local()

    def __getstate__(self):
        return {k: v for k, v in self.__dict__.items() if k != "threadlocal"}

    def __del__(self):
        # ``threadlocal`` may be missing if construction failed very early.
        threadlocal = getattr(self, "threadlocal", None)
        if threadlocal is not None and hasattr(threadlocal, "f"):
            threadlocal.f.close()
            del threadlocal.f

    @staticmethod
    def exists(path):
        """Tell whether ``<path>.fasta`` exists."""
        return os.path.exists(fasta_file_path(path))


class EncodedFastaDataset(FastaDataset):
    """
    The FastaDataset returns raw sequences - this allows us to return
    indices with a dictionary instead.
    """

    def __init__(self, path, dictionary):
        super().__init__(path, cache_indices=True)
        self.dictionary = dictionary

    def __getitem__(self, idx):
        # The description line is dropped; each character is one token.
        desc, seq = super().__getitem__(idx)
        return self.dictionary.encode_line(seq, line_tokenizer=list).long()
# basically we have to write to addressable bytes for the memory mapped
# dataset loader. Sentences that get encoded to a length that is not a
# multiple of BLOCKSIZE (a byte) will be padded to fit. (see _pad in the coder)
BLOCKSIZE = 8


class HuffmanCoder:
    """Encode token lists to padded Huffman bit strings and back.

    Args:
        root: root node of the Huffman tree; its code table is computed here.
        bos, pad, eos, unk: special symbols, stored as ``*_word`` attributes.
    """

    def __init__(
        self, root: "HuffmanNode", bos="<s>", pad="<pad>", eos="</s>", unk="<unk>"
    ):
        self.root = root
        self.table = root.code_table()
        self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos

    def _pad(self, a: "bitarray") -> "bitarray":
        """
        bitpadding, 1 then 0.

        If the array is already a multiple of blocksize, we add a full block.
        """
        pad_len = BLOCKSIZE - (len(a) % BLOCKSIZE) - 1
        padding = bitarray("1" + "0" * pad_len)
        return a + padding

    def _unpad(self, a: "bitarray") -> "bitarray":
        """
        remove the bitpadding.

        There will be a set of 0s preceded by a 1 at the end of the bitarray, we remove that
        """
        # count the 0 padding at the end until we find the first 1
        # we want to remove the one too
        remove_cnt = util.rindex(a, 1)
        return a[:remove_cnt]

    def encode(self, iter: tp.List[str]) -> bytes:
        """
        encode a list of tokens a return bytes. We use bitpadding to make sure the encoded bits fit in bytes.

        Tokens without a code are replaced by the unknown word.

        Raises:
            Exception: if a token has no code and no unknown word is set.
        """
        a = bitarray()
        for token in iter:
            code = self.get_code(token)
            if code is None:
                if self.unk_word is None:
                    raise Exception(f"unknown token {token} cannot be encoded.")
                code = self.get_code(self.unk_word)
            # Extend in place instead of building a new bitarray per token.
            a.extend(code)
        return self._pad(a).tobytes()

    def decode(self, bits: bytes) -> tp.Iterator["HuffmanNode"]:
        """
        take bitpadded bytes and decode it to a set of leaves. You can then use each node to find the symbol/id
        """
        a = bitarray()
        a.frombytes(bits)
        return self.root.decode(self._unpad(a))

    def get_code(self, symbol: str) -> "tp.Optional[bitarray]":
        """Return the code of ``symbol`` or ``None`` when it is unknown."""
        node = self.get_node(symbol)
        return None if node is None else node.code

    def get_node(self, symbol: str) -> tp.Optional["HuffmanNode"]:
        """Return the leaf for ``symbol`` or ``None`` when it is unknown."""
        return self.table.get(symbol)

    @classmethod
    def from_file(
        cls,
        filename: str,
        bos="<s>",
        pad="<pad>",
        eos="</s>",
        unk="<unk>",
    ) -> "HuffmanCoder":
        """Build a coder from a ``symbol<sep>count`` file."""
        builder = HuffmanCodeBuilder.from_file(filename)
        return builder.build_code(bos=bos, pad=pad, eos=eos, unk=unk)

    def to_file(self, filename, sep="\t"):
        """Write ``symbol<sep>count`` lines ordered by node id."""
        nodes = sorted(self.table.values(), key=lambda n: n.id)
        with open(filename, "w", encoding="utf-8") as output:
            for n in nodes:
                output.write(f"{n.symbol}{sep}{n.count}\n")

    def __iter__(self):
        yield from self.table.values()

    def merge(self, other_coder: "HuffmanCoder") -> "HuffmanCoder":
        """Return a new coder built from the summed counts of both coders."""
        builder = HuffmanCodeBuilder()
        for n in self:
            builder.increment(n.symbol, n.count)
        for n in other_coder:
            builder.increment(n.symbol, n.count)
        return builder.build_code()

    def __eq__(self, other: "HuffmanCoder") -> bool:
        if not isinstance(other, HuffmanCoder):
            return NotImplemented
        return self.table == other.table

    def __len__(self) -> int:
        return len(self.table)

    def __contains__(self, sym: str) -> bool:
        return sym in self.table

    def to_dictionary(self) -> "Dictionary":
        """Export the symbols and their counts as a finalized ``Dictionary``."""
        # The special symbols are stored under the ``*_word`` attribute names.
        dictionary = Dictionary(
            bos=self.bos_word,
            unk=self.unk_word,
            pad=self.pad_word,
            eos=self.eos_word,
        )
        for n in self:
            dictionary.add_symbol(n.symbol, n=n.count)
        dictionary.finalize()
        return dictionary


@dataclass
class HuffmanNode:
    """
    a node in a Huffman tree
    """

    id: int
    count: int
    symbol: tp.Optional[str] = None
    left: tp.Optional["HuffmanNode"] = None
    right: tp.Optional["HuffmanNode"] = None
    code: "tp.Optional[bitarray]" = None

    def is_leaf(self) -> bool:
        """A node without children is a leaf."""
        return self.left is None and self.right is None

    def code_table(
        self, prefix: "tp.Optional[bitarray]" = None
    ) -> tp.Dict[str, "HuffmanNode"]:
        """Assign codes below this node and return ``{symbol: leaf}``.

        The right branch appends a 0 bit, the left branch a 1 bit.
        """
        defaulted_prefix = prefix if prefix is not None else bitarray()
        if self.is_leaf():
            self.code = (
                defaulted_prefix if len(defaulted_prefix) > 0 else bitarray("0")
            )  # leaf could be the root if there is only one symbol
            return {self.symbol: self}

        codes_right = self.right.code_table(defaulted_prefix + bitarray([0]))
        codes_left = self.left.code_table(defaulted_prefix + bitarray([1]))
        return {**codes_left, **codes_right}

    def decode(self, bits: "bitarray") -> tp.Iterator["HuffmanNode"]:
        """Walk the tree along ``bits`` and yield every leaf that is reached.

        Raises:
            Exception: if the walk leaves the tree, or the bits end in the
                middle of a code.
        """
        current_node = self
        for bit in bits:
            if bit == 0:  # go right
                current_node = current_node.right
            else:  # go left
                current_node = current_node.left
            if current_node is None:
                # we shouldn't be on a leaf here
                raise Exception("fell off a leaf")
            if current_node.is_leaf():
                yield current_node
                current_node = self
        # Being anywhere but the start node means a code was cut short.
        if current_node is not self:
            raise Exception("couldn't decode all the bits")


class HuffmanCodeBuilder:
    """
    build a dictionary with occurence count and then build the Huffman code for it.
    """

    def __init__(self):
        self.symbols = Counter()

    def add_symbols(self, *syms) -> None:
        """Count one occurrence of every symbol given."""
        self.symbols.update(syms)

    def increment(self, symbol: str, cnt: int) -> None:
        """Add ``cnt`` occurrences of ``symbol``."""
        self.symbols[symbol] += cnt

    @classmethod
    def from_file(cls, filename):
        """Load counts from whitespace-separated ``symbol count`` lines.

        Blank lines are ignored.
        """
        c = cls()
        with open(filename, "r", encoding="utf-8") as handle:
            for line in handle:
                if not line.strip():
                    continue
                split = re.split(r"[\s]+", line)
                c.increment(split[0], int(split[1]))
        return c

    def to_file(self, filename, sep="\t"):
        """Write ``symbol<sep>count`` lines, most common symbol first."""
        with open(filename, "w", encoding="utf-8") as output:
            for (tok, cnt) in self.symbols.most_common():
                output.write(f"{tok}{sep}{cnt}\n")

    def _smallest(self, q1: deque, q2: deque) -> HuffmanNode:
        """Pop the node with the smaller count from the right end of the queues.

        An empty queue defers to the other one; on ties ``q2`` is used.
        """
        if len(q1) == 0:
            return q2.pop()

        if len(q2) == 0:
            return q1.pop()

        if q1[-1].count < q2[-1].count:
            return q1.pop()

        return q2.pop()

    def __add__(self, c: "HuffmanCodeBuilder") -> "HuffmanCodeBuilder":
        new_c = self.symbols + c.symbols
        new_b = HuffmanCodeBuilder()
        new_b.symbols = new_c
        return new_b

    def build_code(
        self,
        bos="<s>",
        pad="<pad>",
        eos="</s>",
        unk="<unk>",
    ) -> HuffmanCoder:
        """Build the Huffman tree for the counted symbols.

        Special symbols with a zero count are added to ``self.symbols`` with
        a count of one before the tree is built.
        """
        assert len(self.symbols) > 0, "cannot build code from empty list of symbols"

        for special in (bos, pad, eos, unk):
            if self.symbols[special] == 0:
                self.add_symbols(special)

        node_id = 0
        leaves_queue = deque(
            [
                HuffmanNode(symbol=symbol, count=count, id=idx)
                for idx, (symbol, count) in enumerate(self.symbols.most_common())
            ]
        )  # left are the most common, right are the least common

        if len(leaves_queue) == 1:
            root = leaves_queue.pop()
            root.id = 0
            # Forward the special symbols here as well, like the general path.
            return HuffmanCoder(root, bos=bos, pad=pad, eos=eos, unk=unk)

        nodes_queue = deque()

        while len(leaves_queue) > 0 or len(nodes_queue) != 1:
            # get the lowest two nodes at the head of each queue
            node1 = self._smallest(leaves_queue, nodes_queue)
            node2 = self._smallest(leaves_queue, nodes_queue)

            # add new node
            nodes_queue.appendleft(
                HuffmanNode(
                    count=node1.count + node2.count, left=node1, right=node2, id=node_id
                )
            )
            node_id += 1

        # we are left with the root
        return HuffmanCoder(nodes_queue.pop(), bos=bos, pad=pad, eos=eos, unk=unk)
class HuffmanMMapIndex:
    """
    keep an index of the offsets in the huffman binary file.
    First a header, then the list of sizes (num tokens) for each instance and finally
    the addresses of each instance.

    On-disk layout, as written by :meth:`writer` and read by ``__init__``:
    9-byte magic, then three little-endian unsigned 64-bit integers (version,
    length of the data file, number of items), then the sizes as int32 and
    the pointers as int64.
    """

    _HDR_MAGIC = b"HUFFIDX\x00\x00"
    _VERSION = 1

    @classmethod
    def writer(cls, path: str, data_len: int):
        """Return a context manager that writes an index file to ``path``.

        Entering it writes the header (magic, version, ``data_len``); its
        ``write(sizes, pointers)`` appends the item count and both arrays.
        """

        class _Writer:
            def __enter__(self):
                self._file = open(path, "wb")

                # write header (magic + version)
                self._file.write(cls._HDR_MAGIC)
                self._file.write(struct.pack("<Q", cls._VERSION))
                self._file.write(struct.pack("<Q", data_len))

                return self

            def write(self, sizes, pointers):
                # add number of items in the index to the header
                self._file.write(struct.pack("<Q", len(sizes)))

                # write sizes
                sizes = np.array(sizes, dtype=np.int32)
                self._file.write(sizes.tobytes(order="C"))
                del sizes

                # write address pointers
                pointers = np.array(pointers, dtype=np.int64)
                self._file.write(pointers.tobytes(order="C"))
                del pointers

            def __exit__(self, exc_type, exc_val, exc_tb):
                self._file.close()

        return _Writer()

    def __init__(self, path):
        """Read the header of the index at ``path`` and memory-map its arrays."""
        with open(path, "rb") as stream:
            # read headers
            magic_test = stream.read(9)
            assert self._HDR_MAGIC == magic_test, (
                "Index file doesn't match expected format. "
                "Make sure that --dataset-impl is configured properly."
            )
            (version,) = struct.unpack("<Q", stream.read(8))
            assert (
                self._VERSION == version
            ), f"Unexpected file version{version} != code version {self._VERSION}"

            # read length of data file
            (self._data_len,) = struct.unpack("<Q", stream.read(8))
            # read number of items in data file/index
            (self._len,) = struct.unpack("<Q", stream.read(8))
            # everything after this position is the sizes/pointers payload
            offset = stream.tell()

        indexed_dataset._warmup_mmap_file(path)

        self._bin_buffer_mmap = np.memmap(path, mode="r", order="C")
        self._bin_buffer = memoryview(self._bin_buffer_mmap)
        # Both arrays are views into the mapped file, not copies.
        self._sizes = np.frombuffer(
            self._bin_buffer, dtype=np.int32, count=self._len, offset=offset
        )
        # The pointers start right after the sizes.
        self._pointers = np.frombuffer(
            self._bin_buffer,
            dtype=np.int64,
            count=self._len,
            offset=offset + self._sizes.nbytes,
        )

    def __del__(self):
        # NOTE(review): raises AttributeError if __init__ failed before the
        # memmap was created - confirm whether that case matters.
        self._bin_buffer_mmap._mmap.close()
        del self._bin_buffer_mmap

    def __iter__(self):
        """Yield ``(pointer, size)`` for every item, in order."""
        for i in range(self._len):
            yield self[i]

    @property
    def data_len(self):
        """Length of the data file as recorded in the header."""
        return self._data_len

    @property
    def sizes(self):
        """int32 array with one size per item."""
        return self._sizes

    # NOTE(review): lru_cache on a method keys on ``self`` and keeps the
    # instance referenced for as long as the entry stays cached.
    @lru_cache(maxsize=8)
    def __getitem__(self, i):
        """Return ``(pointer, size)`` of item ``i``."""
        return self._pointers[i], self._sizes[i]

    def __len__(self):
        return self._len


def vocab_file_path(prefix_path):
    """Return the vocabulary file name that belongs to ``prefix_path``."""
    return prefix_path + ".vocab"


class HuffmanMMapIndexedDataset(torch.utils.data.Dataset):
    """
    an indexed dataset that use mmap and memoryview to access data from disk
    that was compressed with a HuffmanCoder.
    """

    def __init__(self, prefix_path):
        super().__init__()

        self._prefix_path = None
        self._index = None
        self._bin_buffer = None
        self._coder = None
        self._file = None

        self._bin_buffer_mmap = None

        self._do_init(prefix_path)

    def __getstate__(self):
        # Only the prefix is pickled; everything else is rebuilt from it.
        return self._prefix_path

    def __setstate__(self, state):
        self._do_init(state)

    def _do_init(self, prefix_path):
        """Open the index, the coder vocabulary and the memory-mapped data file."""
        self._prefix_path = prefix_path
        self._index = HuffmanMMapIndex(
            indexed_dataset.index_file_path(self._prefix_path)
        )
        self._coder = HuffmanCoder.from_file(vocab_file_path(self._prefix_path))

        indexed_dataset._warmup_mmap_file(
            indexed_dataset.data_file_path(self._prefix_path)
        )
        self._file = os.open(
            indexed_dataset.data_file_path(self._prefix_path), os.O_RDONLY
        )
        # Map exactly the number of bytes recorded in the index header.
        self._bin_buffer_mmap = mmap.mmap(
            self._file,
            self._index.data_len,
            access=mmap.ACCESS_READ,
        )
        self._bin_buffer = memoryview(self._bin_buffer_mmap)

    def __del__(self):
        del self._bin_buffer
        # NOTE(review): a file descriptor of 0 is falsy and would be skipped.
        if self._file:
            os.close(self._file)
        del self._index

    def __len__(self):
        return len(self._index)

    def _decode(self, i):
        """Return an iterator over the Huffman leaves of item ``i``."""
        # Each pointer is the END of its item; the item starts where the
        # previous one ended (or at 0 for the first item).
        ptr, _ = self._index[i]
        if i == 0:
            raw_bytes = self._bin_buffer[:ptr]
        else:
            (prev_ptr, _) = self._index[i - 1]
            raw_bytes = self._bin_buffer[prev_ptr:ptr]

        return self._coder.decode(raw_bytes.tobytes())

    # NOTE(review): lru_cache on a method keys on ``self`` and keeps the
    # instance referenced for as long as the entry stays cached.
    @lru_cache(maxsize=8)
    def __getitem__(self, i):
        """Return item ``i`` as an int64 tensor of node ids."""
        nodes = self._decode(i)
        return torch.tensor([n.id for n in nodes], dtype=torch.int64)

    def __iter__(self):
        for idx in range(len(self)):
            yield self[idx]

    def get_symbols(self, i):
        """Yield the symbols (rather than the ids) of item ``i``."""
        nodes = self._decode(i)
        for n in nodes:
            yield n.symbol

    @property
    def sizes(self):
        return self._index.sizes

    @property
    def supports_prefetch(self):
        return False

    @property
    def coder(self):
        return self._coder

    @staticmethod
    def exists(prefix_path):
        """True only when the index, data and vocabulary files all exist."""
        return (
            PathManager.exists(indexed_dataset.index_file_path(prefix_path))
            and PathManager.exists(indexed_dataset.data_file_path(prefix_path))
            and PathManager.exists(vocab_file_path(prefix_path))
        )


class HuffmanMMapIndexedDatasetBuilder:
    """
    Helper to build a memory mapped datasets with a huffman encoder.
    You can either open/close this manually or use it as a ContextManager.
    Provide your own coder, it will then be stored alongside the dataset.
    The builder will first write the vocab file, then open the binary file so you can stream
    into it, finally the index will be written when the builder is closed (your index should fit in memory).
    """

    def __init__(self, path_prefix: str, coder: HuffmanCoder) -> None:
        self._path_prefix = path_prefix
        self._coder = coder
        self._sizes = []  # number of tokens per item
        self._ptrs = []  # cumulative end offset (in bytes) per item
        self._data_len = 0  # total bytes written to the data file

    def open(self):
        """Write the vocabulary file and open the data file for writing."""
        self._coder.to_file(vocab_file_path(self._path_prefix))
        self._data_file = open(indexed_dataset.data_file_path(self._path_prefix), "wb")

    def __enter__(self) -> "HuffmanMMapIndexedDatasetBuilder":
        self.open()
        return self

    def add_item(self, tokens: tp.List[str]) -> None:
        """
        add a list of tokens to the dataset, they will compressed with the
        provided coder before being written to file.
        """
        encoded = self._coder.encode(tokens)
        code_len = len(encoded)
        last_ptr = 0
        if len(self._ptrs) > 0:
            last_ptr = self._ptrs[-1]
        self._sizes.append(len(tokens))
        # Store where this item ends, i.e. previous end plus its byte length.
        self._ptrs.append(last_ptr + code_len)
        self._data_len += code_len
        self._data_file.write(encoded)

    def append(self, other_dataset_path_prefix: str) -> None:
        """
        append an existing dataset.
        Beware, if it wasn't built with the same coder, you are in trouble.
        """
        other_index = HuffmanMMapIndex(
            indexed_dataset.index_file_path(other_dataset_path_prefix)
        )
        # Shift the other dataset's pointers past the data already written.
        for (ptr, size) in other_index:
            self._ptrs.append(ptr + self._data_len)
            self._sizes.append(size)

        # Concatenate data
        with open(indexed_dataset.data_file_path(other_dataset_path_prefix), "rb") as f:
            shutil.copyfileobj(f, self._data_file)

        self._data_len += other_index.data_len

    def close(self):
        """Close the data file and write the index file."""
        self._data_file.close()
        with HuffmanMMapIndex.writer(
            indexed_dataset.index_file_path(self._path_prefix), self._data_len
        ) as index:
            index.write(self._sizes, self._ptrs)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()


class IdDataset(FairseqDataset):
    """Dataset whose item at ``index`` is the index itself."""

    def __getitem__(self, index):
        return index

    def __len__(self):
        # The reported length is always 0.
        return 0

    def collater(self, samples):
        """Collate the sample indices into a tensor."""
        return torch.tensor(samples)
def best_fitting_int_dtype(
    max_int_to_represent,
) -> Union[np.uint16, np.uint32, np.int64]:
    """Pick a compact numpy integer type for values up to *max_int_to_represent*.

    Returns the numpy scalar *type* (not an instance):

    - ``np.uint32`` when no maximum is given (a safe guess),
    - ``np.uint16`` for maxima strictly below 65500,
    - ``np.uint32`` for maxima strictly below 4294967295,
    - ``np.int64`` otherwise.
    """
    if max_int_to_represent is None:
        return np.uint32  # Safe guess
    if max_int_to_represent < 65500:
        return np.uint16
    if max_int_to_represent < 4294967295:
        return np.uint32
    # we avoid np.uint64 because it doesn't save space and its type promotion
    # behaves unexpectedly: https://github.com/numpy/numpy/issues/5745
    return np.int64
def read_longs(f, n):
    """Read *n* 64-bit integers from the binary file object *f*.

    The values are read straight into a freshly allocated ``int64`` array via
    ``f.readinto``. The number of bytes actually read is not checked, so a
    short read leaves the tail of the array uninitialised.
    """
    longs = np.empty(n, dtype=np.int64)
    f.readinto(longs)
    return longs
def prefetch(self, indices):
    """Load the items named by *indices* into one contiguous in-memory cache.

    Does nothing when every requested index is already cached. Otherwise the
    previous cache is discarded, the de-duplicated indices are read in sorted
    order from the data file into ``self.cache``, and ``self.cache_index``
    maps each index to the position of its first element inside the cache.
    """
    if all(idx in self.cache_index for idx in indices):
        return
    if not self.data_file:
        self.read_data(self.path)

    wanted = sorted(set(indices))
    offsets = self.data_offsets
    # Number of elements each requested item occupies in the data file.
    lengths = [offsets[idx + 1] - offsets[idx] for idx in wanted]

    self.cache = np.empty(sum(lengths), dtype=self.dtype)
    self.cache_index.clear()
    cursor = 0
    for idx, length in zip(wanted, lengths):
        self.cache_index[idx] = cursor
        # A slice of the cache is a view, so readinto fills the cache itself.
        window = self.cache[cursor : cursor + length]
        self.data_file.seek(offsets[idx] * self.element_size)
        self.data_file.readinto(window)
        cursor += length

    if self.data_file:
        # close and delete data file after prefetch so we can pickle
        self.data_file.close()
        self.data_file = None
class IndexedDatasetBuilder:
    """Incrementally writes the data file of a TorchNet-style indexed dataset.

    Items are appended to the data file with :func:`add_item` (or wholesale
    with :func:`merge_file_`); :func:`finalize` closes the data file and
    writes the matching index file.
    """

    # Bytes per element for each dtype that has an index header code.
    # ``np.double`` is the same type as ``np.float64``, hence a single entry.
    element_sizes = {
        np.uint8: 1,
        np.int8: 1,
        np.int16: 2,
        np.int32: 4,
        np.int64: 8,
        np.float64: 8,
        np.uint16: 2,
        np.uint32: 4,
        np.uint64: 8,
    }

    def __init__(self, out_file, dtype=np.int32):
        """Open *out_file* for writing items stored as *dtype*.

        Raises:
            KeyError: if *dtype* has no entry in ``element_sizes``.
        """
        # Resolve the element size first so that an unsupported dtype fails
        # before the output file is created and its handle leaked.
        self.element_size = self.element_sizes[dtype]
        self.out_file = open(out_file, "wb")
        self.dtype = dtype
        self.data_offsets = [0]
        self.dim_offsets = [0]
        self.sizes = []

    def add_item(self, tensor):
        """Append one tensor to the data file and record its offsets/shape."""
        # +1 for Lua compatibility
        n_bytes = self.out_file.write(np.array(tensor.numpy() + 1, dtype=self.dtype))
        # Offsets count elements, not bytes; floor division keeps them exact
        # integers (true division would turn them into floats).
        self.data_offsets.append(self.data_offsets[-1] + n_bytes // self.element_size)
        shape = tensor.size()
        self.sizes.extend(shape)
        self.dim_offsets.append(self.dim_offsets[-1] + len(shape))

    def merge_file_(self, another_file):
        """Append the dataset stored under the prefix *another_file*.

        The other dataset must have been written with the same dtype.
        """
        index = IndexedDataset(another_file)
        assert index.dtype == self.dtype

        # Shift the other file's offsets so they continue where ours end.
        begin = self.data_offsets[-1]
        self.data_offsets.extend(begin + offset for offset in index.data_offsets[1:])
        self.sizes.extend(index.sizes)

        begin = self.dim_offsets[-1]
        self.dim_offsets.extend(
            begin + dim_offset for dim_offset in index.dim_offsets[1:]
        )

        # Concatenate data
        with open(data_file_path(another_file), "rb") as f:
            shutil.copyfileobj(f, self.out_file)

    def finalize(self, index_file):
        """Close the data file and write the index to *index_file*."""
        self.out_file.close()
        # The context manager closes the index even if a write fails.
        with open(index_file, "wb") as index:
            index.write(b"TNTIDX\x00\x00")
            index.write(struct.pack("<Q", 1))
            index.write(
                struct.pack("<QQ", _dtype_header_code(self.dtype), self.element_size)
            )
            index.write(
                struct.pack("<QQ", len(self.data_offsets) - 1, len(self.sizes))
            )
            write_longs(index, self.dim_offsets)
            write_longs(index, self.data_offsets)
            write_longs(index, self.sizes)
" "Make sure that --dataset-impl is configured properly." ) version = struct.unpack("<Q", stream.read(8)) assert (1,) == version (dtype_code,) = struct.unpack("<B", stream.read(1)) self._dtype = _code_to_dtype[dtype_code] self._dtype_size = self._dtype().itemsize self._len = struct.unpack("<Q", stream.read(8))[0] offset = stream.tell() _warmup_mmap_file(path) self._bin_buffer_mmap = np.memmap(path, mode="r", order="C") self._bin_buffer = memoryview(self._bin_buffer_mmap) self._sizes = np.frombuffer( self._bin_buffer, dtype=np.int32, count=self._len, offset=offset ) self._pointers = np.frombuffer( self._bin_buffer, dtype=np.int64, count=self._len, offset=offset + self._sizes.nbytes, ) def __del__(self): self._bin_buffer_mmap._mmap.close() del self._bin_buffer_mmap @property def dtype(self): return self._dtype @property def sizes(self): return self._sizes @lru_cache(maxsize=8) def __getitem__(self, i): return self._pointers[i], self._sizes[i] def __len__(self): return self._len def __init__(self, path): super().__init__() self._path = None self._index = None self._bin_buffer = None self._do_init(path) def __getstate__(self): return self._path def __setstate__(self, state): self._do_init(state) def _do_init(self, path): self._path = path self._index = self.Index(index_file_path(self._path)) _warmup_mmap_file(data_file_path(self._path)) self._bin_buffer_mmap = np.memmap( data_file_path(self._path), mode="r", order="C" ) self._bin_buffer = memoryview(self._bin_buffer_mmap) def __del__(self): self._bin_buffer_mmap._mmap.close() del self._bin_buffer_mmap del self._index def __len__(self): return len(self._index) @lru_cache(maxsize=8) def __getitem__(self, i): ptr, size = self._index[i] np_array = np.frombuffer( self._bin_buffer, dtype=self._index.dtype, count=size, offset=ptr ) if self._index.dtype != np.int64: np_array = np_array.astype(np.int64) return torch.from_numpy(np_array) @property def sizes(self): return self._index.sizes @property def supports_prefetch(self): 
class MMapIndexedDatasetBuilder:
    """Incrementally writes the data file of an mmap-style indexed dataset.

    Items are appended with :func:`add_item` or :func:`merge_file_`;
    :func:`finalize` closes the data file and writes the index.
    """

    def __init__(self, out_file, dtype=np.int64):
        self._data_file = open(out_file, "wb")
        self._dtype = dtype
        # Element count of every item written so far, in order.
        self._sizes = []

    def add_item(self, tensor):
        """Append *tensor*, converted to the builder's dtype, to the data file."""
        item = np.array(tensor.numpy(), dtype=self._dtype)
        self._data_file.write(item.tobytes(order="C"))
        self._sizes.append(item.size)

    def merge_file_(self, another_file):
        """Append the dataset stored under the prefix *another_file*.

        The other dataset must use the same dtype as this builder.
        """
        # Concatenate index
        other_index = MMapIndexedDataset.Index(index_file_path(another_file))
        assert other_index.dtype == self._dtype
        self._sizes.extend(other_index.sizes)

        # Concatenate data
        with open(data_file_path(another_file), "rb") as src:
            shutil.copyfileobj(src, self._data_file)

    def finalize(self, index_file):
        """Close the data file and write the index to *index_file*."""
        self._data_file.close()
        with MMapIndexedDataset.Index.writer(index_file, self._dtype) as index:
            index.write(self._sizes)
class CountingIterator(object):
    """Iterator wrapper that tracks how many elements have been consumed.

    Args:
        iterable (iterable): iterable to wrap
        start (int): starting iteration count. Note that this doesn't
            actually advance the iterator. When falsy, the count is taken
            from ``iterable.n`` if present, else 0.
        total (int): override the iterator length returned by ``__len__``.
            This can be used to truncate *iterator*.

    Attributes:
        n (int): number of elements consumed from this iterator
    """

    def __init__(self, iterable, start=None, total=None):
        self._itr = iter(iterable)
        self.n = start or getattr(iterable, "n", 0)
        if total is None:
            # Remaining elements plus the ones already counted.
            total = self.n + len(iterable)
        self.total = total

    def __len__(self):
        return self.total

    def __iter__(self):
        return self

    def __next__(self):
        if not self.has_next():
            raise StopIteration
        try:
            item = next(self._itr)
        except StopIteration:
            # The wrapped iterator ran dry before the promised length.
            raise IndexError(
                f"Iterator expected to have length {self.total}, "
                f"but exhausted at position {self.n}."
            )
        self.n += 1
        return item

    def has_next(self):
        """Whether there are still elements left to consume."""
        return self.n < self.total

    def skip(self, n):
        """Fast-forward the iterator by skipping n elements."""
        for _ in range(n):
            next(self)
        return self

    def take(self, n):
        """Truncate the iterator to n elements at most."""
        self.total = min(self.total, n)
        # Propagate this change to the underlying iterator
        inner = self._itr
        if hasattr(inner, "take"):
            inner.take(max(n - self.n, 0))
        return self
Args: dataset (~torch.utils.data.IterableDataset): dataset from which to load the data max_sentences: batch size collate_fn (callable): merges a list of samples to form a mini-batch num_workers (int, optional): how many subprocesses to use for data loading. 0 means the data will be loaded in the main process (default: 0). epoch (int, optional): the epoch to start the iterator from (default: 1). buffer_size (int, optional): the number of batches to keep ready in the queue. Helps speed up data loading. When buffer_size is zero, the default torch.utils.data.DataLoader preloading is used. timeout (int, optional): if positive, the timeout value for collecting a batch from workers. Should always be non-negative (default: ``0``). """ def __init__( self, dataset, max_sentences=1, collate_fn=None, epoch=1, num_workers=0, buffer_size=0, timeout=0, persistent_workers=True, ): assert isinstance(dataset, torch.utils.data.IterableDataset) self.dataset = dataset self.max_sentences = max_sentences self.collate_fn = collate_fn self.epoch = max(epoch, 1) # we use 1-based indexing for epochs self.num_workers = num_workers self.persistent_workers = persistent_workers and num_workers > 0 # This upper limit here is to prevent people from abusing this feature # in a shared computing environment.
class FrozenBatchSampler:
    """Batch sampler holding a precomputed list of batches for one epoch.

    *ordered_batches* is a callable invoked as
    ``ordered_batches(epoch, fix_batches_to_gpus, shuffle)``; its result is
    stored in ``self.batches`` and can be regenerated for a later epoch with
    :func:`make_batches_for_epoch`.
    """

    def __init__(
        self,
        ordered_batches,
        epoch,
        fix_batches_to_gpus,
        shuffle,
        initial_offset,
    ):
        self.ordered_batches = ordered_batches
        self.fix_batches_to_gpus = fix_batches_to_gpus
        self.shuffle = shuffle
        self.make_batches_for_epoch(epoch, initial_offset)

    def make_batches_for_epoch(self, epoch, offset=0):
        """Recompute the batches for *epoch*, dropping the first *offset*."""
        self.batches = self.ordered_batches(
            epoch, self.fix_batches_to_gpus, self.shuffle
        )
        if offset <= 0:
            return
        # Resume mid-epoch: skip batches that were already consumed.
        self.batches = self.batches[offset:]

    def __iter__(self) -> Iterator[List[int]]:
        return iter(self.batches)

    def __len__(self) -> int:
        return len(self.batches)
Should always be non-negative (default: ``0``). disable_shuffling (bool, optional): force disable shuffling (default: ``False``). skip_remainder_batch (bool, optional): if set, discard the last batch in an epoch for the sake of training stability, as the last batch is usually smaller than local_batch_size * distributed_world_size (default: ``False``). grouped_shuffling (bool, optional): enable shuffling batches in groups of num_shards. Ensures that each GPU receives similar length sequences when batches are sorted by length. """ def __init__( self, dataset, collate_fn, batch_sampler, seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1, buffer_size=0, timeout=0, disable_shuffling=False, skip_remainder_batch=False, grouped_shuffling=False, reuse_dataloader=False, persistent_workers=True, ): assert isinstance(dataset, torch.utils.data.Dataset) self.dataset = dataset self.collate_fn = collate_fn self.batch_sampler = batch_sampler self._frozen_batches = ( tuple(batch_sampler) if not callable(batch_sampler) else None ) self.seed = seed self.num_shards = num_shards self.shard_id = shard_id self.num_workers = num_workers self.persistent_workers = persistent_workers and num_workers > 0 # This upper limit here is to prevent people from abusing this feature # in a shared computing environment.
@property
def first_batch(self):
    """The collated first frozen batch, or ``"DUMMY"``.

    Raises an ``Exception`` when there are no batches at all. The batch is
    only built when the dataset does not set
    ``supports_fetch_outside_dataloader`` to a falsy value; otherwise the
    placeholder string ``"DUMMY"`` is returned.
    """
    batches = self.frozen_batches
    if len(batches) == 0:
        raise Exception(
            "The dataset is empty. This could indicate "
            "that all elements in the dataset have been skipped. "
            "Try increasing the max number of allowed tokens or using "
            "a larger dataset."
        )

    if not getattr(self.dataset, "supports_fetch_outside_dataloader", True):
        return "DUMMY"
    samples = [self.dataset[i] for i in batches[0]]
    return self.collate_fn(samples)
def state_dict(self):
    """Returns a dictionary containing a whole state of the iterator.

    When the current epoch iterator is exhausted, the state points at the
    start of the next epoch; otherwise it records the current epoch together
    with the number of batches already consumed.
    """
    # end_of_epoch() dereferences the current epoch iterator, so only ask
    # once one exists (mirrors the guard in next_epoch_idx). Without it,
    # saving state before the first next_epoch_itr() call -- e.g. right
    # after load_state_dict() -- raised AttributeError.
    finished = self._cur_epoch_itr is not None and self.end_of_epoch()
    if finished:
        epoch = self.epoch + 1
        iter_in_epoch = 0
    else:
        epoch = self.epoch
        iter_in_epoch = self.iterations_in_epoch
    return {
        "version": 2,
        "epoch": epoch,
        "iterations_in_epoch": iter_in_epoch,
        "shuffle": self.shuffle,
    }
"Cannot resume training due to dataloader mismatch, please " "report this to the fairseq developers. You can relaunch " "training with `--reset-dataloader` and it should work." ) else: self._next_epoch_itr = None def _get_iterator_for_epoch( self, epoch, shuffle, fix_batches_to_gpus=False, offset=0 ): if self.reuse_dataloader and self.dataloader is not None: self.epoch_batch_sampler.make_batches_for_epoch(epoch, offset) itr = self.dataloader else: self.epoch_batch_sampler = FrozenBatchSampler( self.ordered_batches, epoch, fix_batches_to_gpus, shuffle, initial_offset=offset, ) if offset > 0 and len(self.epoch_batch_sampler) == 0: return None if self.num_workers > 0: os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning" # Create data loader itr = torch.utils.data.DataLoader( self.dataset, collate_fn=self.collate_fn, batch_sampler=self.epoch_batch_sampler, num_workers=self.num_workers, timeout=self.timeout, pin_memory=True, persistent_workers=self.persistent_workers, ) if self.reuse_dataloader: self.dataloader = itr # Wrap with a BufferedIterator if needed if self.buffer_size > 0: itr = BufferedIterator(self.buffer_size, itr) # Wrap with CountingIterator itr = CountingIterator(itr, start=offset) if self.skip_remainder_batch: # TODO: Below is a lazy implementation which discard the final batch regardless # of whether it is a full batch or not. 
total_num_itrs = len(itr) - 1 itr.take(total_num_itrs) logger.info(f"skip final residual batch, total_num_itrs = {total_num_itrs}") return itr def ordered_batches(self, epoch, fix_batches_to_gpus, shuffle): def shuffle_batches(batches, seed): with data_utils.numpy_seed(seed): if self.grouped_shuffling: grouped_batches = [ batches[(i * self.num_shards) : ((i + 1) * self.num_shards)] for i in range((len(batches) // self.num_shards)) ] np.random.shuffle(grouped_batches) batches = list(itertools.chain(*grouped_batches)) else: np.random.shuffle(batches) return batches if self._supports_prefetch: batches = self.frozen_batches if shuffle and not fix_batches_to_gpus: batches = shuffle_batches(list(batches), self.seed + epoch) batches = list( ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[]) ) self.dataset.prefetch([i for s in batches for i in s]) if shuffle and fix_batches_to_gpus: batches = shuffle_batches(batches, self.seed + epoch + self.shard_id) else: if shuffle: batches = shuffle_batches(list(self.frozen_batches), self.seed + epoch) else: batches = self.frozen_batches batches = list( ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[]) ) return batches class GroupedIterator(CountingIterator): """Wrapper around an iterable that returns groups (chunks) of items. Args: iterable (iterable): iterable to wrap chunk_size (int): size of each chunk skip_remainder_batch (bool, optional): if set, discard the last grouped batch in each training epoch, as the last grouped batch is usually smaller than local_batch_size * distributed_word_size * chunk_size (default: ``False``). 
class ShardedIterator(CountingIterator):
    """A sharded wrapper around an iterable, padded to length.

    Shard ``shard_id`` yields every ``num_shards``-th element of the wrapped
    iterable, starting at position ``shard_id``. Every shard reports the same
    length, ``ceil(len(iterable) / num_shards)``; shards that run out of real
    elements are padded with *fill_value*.

    Args:
        iterable (iterable): iterable to wrap
        num_shards (int): number of shards to split the iterable into
        shard_id (int): which shard to iterator over
        fill_value (Any, optional): padding value when the iterable doesn't
            evenly divide *num_shards* (default: None).

    Attributes:
        n (int): number of elements consumed from this iterator
    """

    def __init__(
        self, iterable, num_shards, shard_id, fill_value=None, skip_remainder_batch=None
    ):
        """
        Args:
            skip_remainder_batch: ignored"""
        if not 0 <= shard_id < num_shards:
            raise ValueError("shard_id must be between 0 and num_shards")

        source_len = len(iterable)
        per_shard = int(math.ceil(source_len / float(num_shards)))
        own_items = itertools.islice(iterable, shard_id, source_len, num_shards)
        # Pairing with range(per_shard) pads a short shard with fill_value;
        # only the second element of each pair is handed out.
        padded = itertools.zip_longest(
            range(per_shard), own_items, fillvalue=fill_value
        )
        already_consumed = int(
            math.ceil(getattr(iterable, "n", 0) / float(num_shards))
        )
        super().__init__(
            map(operator.itemgetter(1), padded),
            start=already_consumed,
            total=per_shard,
        )
class BufferedIterator(object):
    """Iterate over *iterable* through a bounded queue filled by a thread.

    A ``BackgroundConsumer`` thread is started lazily on the first call to
    ``__next__`` and pushes the items of *iterable* into a queue holding at
    most *size* entries.

    Args:
        size (int): maximum number of buffered items
        iterable: sized iterable to read from
    """

    def __init__(self, size, iterable):
        self._queue = queue.Queue(size)
        self._iterable = iterable
        self._consumer = None

        self.start_time = time.time()
        self.warning_time = None

        self.total = len(iterable)

    def _create_consumer(self):
        # Pass the current CUDA device (if any) on to the worker thread.
        device = torch.cuda.current_device() if torch.cuda.is_available() else None
        self._consumer = BackgroundConsumer(
            self._queue,
            self._iterable,
            self.total,
            device,
        )
        self._consumer.daemon = True
        self._consumer.start()

    def __iter__(self):
        return self

    def __len__(self):
        return self.total

    def take(self, n):
        """Cap the reported length at *n* and forward the cap downstream."""
        self.total = min(self.total, n)
        # Propagate this change to the underlying iterator
        if hasattr(self._iterable, "take"):
            self._iterable.take(n)
        return self

    def __next__(self):
        # Create consumer if not created yet
        if self._consumer is None:
            self._create_consumer()

        # Notify the user if there is a data loading bottleneck: only after
        # five minutes of running, and at most once every fifteen minutes.
        nearly_empty = self._queue.qsize() < min(2, max(1, self._queue.maxsize // 2))
        if nearly_empty and time.time() - self.start_time > 5 * 60:
            due = (
                self.warning_time is None
                or time.time() - self.warning_time > 15 * 60
            )
            if due:
                logger.debug(
                    "Data loading buffer is empty or nearly empty. This may "
                    "indicate a data loading bottleneck, and increasing the "
                    "number of workers (--num-workers) may help."
                )
                self.warning_time = time.time()

        # Get next example (blocks until the worker has produced one)
        item = self._queue.get(True)
        if isinstance(item, Exception):
            # The worker forwards its exception through the queue.
            raise item
        if item is _sentinel:
            raise StopIteration()
        return item
def __init__(
    self,
    dataset,
    collate_fn,
    batch_samplers,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    epoch=0,
    mult_rate=1,
    buffer_size=0,
    skip_remainder_batch=False,
):
    """Set up the parent iterator and the per-sampler bookkeeping.

    All arguments except *mult_rate* are forwarded to the parent
    constructor. ``step_size`` is ``mult_rate * num_shards`` and
    ``lengths`` holds, for every sub-sampler, its number of batches rounded
    down to a multiple of ``step_size``.
    """
    super().__init__(
        dataset,
        collate_fn,
        batch_samplers,
        seed,
        num_shards,
        shard_id,
        num_workers,
        epoch,
        buffer_size,
        skip_remainder_batch=skip_remainder_batch,
    )
    # level 0: sub-samplers 1: batch_idx 2: batches
    self._frozen_batches = tuple(tuple(sampler) for sampler in batch_samplers)
    self.step_size = mult_rate * num_shards

    # Drop the trailing batches that do not fill a whole step.
    self.lengths = [
        len(sampler) - len(sampler) % self.step_size
        for sampler in self.frozen_batches
    ]
def _get_iterator_for_epoch(
    self, epoch, shuffle, fix_batches_to_gpus=False, offset=0
):
    """Build the batch iterator of this shard for *epoch*.

    Args:
        epoch (int): epoch number; ``self.seed + epoch`` seeds all shuffling
        shuffle (bool): whether to shuffle within and across sub-samplers
        fix_batches_to_gpus (bool): not used by this implementation
        offset (int): number of leading batches of this shard to skip

    Returns:
        A ``CountingIterator`` starting at *offset*, or ``None`` when a
        positive *offset* is at or beyond the number of batches.

    Raises:
        NotImplementedError: if ``self._supports_prefetch`` is set.
        ValueError: if the truncated batch count is not a multiple of
            ``self.step_size`` when shuffling.
    """

    def shuffle_batches(batches, seed):
        # In-place shuffle under a temporary numpy seed.
        with data_utils.numpy_seed(seed):
            np.random.shuffle(batches)
        return batches

    def return_full_batches(batch_sets, seed, shuffle):
        # Every sub-sampler is shuffled on its own, all with the same seed.
        if shuffle:
            batch_sets = [shuffle_batches(list(x), seed) for x in batch_sets]

        # Truncate each sub-sampler to its precomputed length, then
        # concatenate them.
        batch_sets = [
            batch_sets[i][: self.lengths[i]] for i in range(len(batch_sets))
        ]
        batches = list(itertools.chain.from_iterable(batch_sets))

        if shuffle:
            with data_utils.numpy_seed(seed):
                # Permute chunks of step_size consecutive batches; the
                # order inside a chunk is kept.
                idx = np.random.permutation(len(batches) // self.step_size)
                if len(idx) * self.step_size != len(batches):
                    # Note: the exception receives two arguments (the
                    # formatted counts and the joined lengths).
                    raise ValueError(
                        "ERROR: %d %d %d %d"
                        % (len(idx), self.step_size, len(batches), self.shard_id),
                        ":".join(["%d" % x for x in self.lengths]),
                    )
                mini_shards = [
                    batches[i * self.step_size : (i + 1) * self.step_size]
                    for i in idx
                ]
                batches = list(itertools.chain.from_iterable(mini_shards))

        return batches

    if self._supports_prefetch:
        raise NotImplementedError("To be implemented")
    else:
        batches = return_full_batches(
            self.frozen_batches, self.seed + epoch, shuffle
        )
        # Keep only this shard's batches; short shards are padded with [].
        batches = list(
            ShardedIterator(batches, self.num_shards, self.shard_id, fill_value=[])
        )

    if offset > 0 and offset >= len(batches):
        return None

    if self.num_workers > 0:
        # Sets a warnings filter through the process environment.
        os.environ["PYTHONWARNINGS"] = "ignore:semaphore_tracker:UserWarning"

    itr = torch.utils.data.DataLoader(
        self.dataset,
        collate_fn=self.collate_fn,
        batch_sampler=batches[offset:],
        num_workers=self.num_workers,
        persistent_workers=self.persistent_workers,
    )
    # Optionally read ahead on a background thread.
    if self.buffer_size > 0:
        itr = BufferedIterator(self.buffer_size, itr)

    return CountingIterator(itr, start=offset)
def collate(
    samples,
    pad_idx,
    eos_idx,
    left_pad_source=True,
    left_pad_target=False,
    input_feeding=True,
    pad_to_length=None,
    pad_to_multiple=1,
):
    """Merge a list of sample dicts into one padded mini-batch dict.

    Samples are read through the keys ``id``, ``source`` and, optionally,
    ``target``, ``prev_output_tokens``, ``alignment`` and ``constraints``.
    The batch is ordered by descending source length.

    Args:
        samples (list): sample dicts to merge
        pad_idx: padding symbol index
        eos_idx: end-of-sentence symbol index
        left_pad_source (bool): pad source rows on the left
        left_pad_target (bool): pad target rows on the left
        input_feeding (bool): build ``prev_output_tokens`` from the targets
            when the samples do not carry them
        pad_to_length (dict, optional): ``{"source": ..., "target": ...}``
            lengths handed to the token collation helper
        pad_to_multiple (int): handed to the token collation helper

    Returns:
        dict: the mini-batch, or ``{}`` when *samples* is empty.
    """
    if len(samples) == 0:
        return {}

    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
        rows = [s[key] for s in samples]
        return data_utils.collate_tokens(
            rows,
            pad_idx,
            eos_idx,
            left_pad,
            move_eos_to_beginning,
            pad_to_length=pad_to_length,
            pad_to_multiple=pad_to_multiple,
        )

    def check_alignment(alignment, src_len, tgt_len):
        if alignment is None or len(alignment) == 0:
            return False
        src_out_of_range = alignment[:, 0].max().item() >= src_len - 1
        if src_out_of_range or alignment[:, 1].max().item() >= tgt_len - 1:
            logger.warning("alignment size mismatch found, skipping alignment!")
            return False
        return True

    def compute_alignment_weights(alignments):
        """
        Return, for every (source, target) row of *alignments*, the inverse
        of how often its target index occurs. E.g. for
        [[5, 7], [2, 3], [1, 3], [4, 2]] the result is [1., 0.5, 0.5, 1]
        because target index 3 appears twice.
        """
        align_tgt = alignments[:, 1]
        _, inverse, counts = torch.unique(
            align_tgt, return_inverse=True, return_counts=True
        )
        return 1.0 / counts[inverse].float()

    first = samples[0]
    pads_given = pad_to_length is not None

    sample_ids = torch.LongTensor([s["id"] for s in samples])
    src_tokens = merge(
        "source",
        left_pad=left_pad_source,
        pad_to_length=pad_to_length["source"] if pads_given else None,
    )
    # sort by descending source length
    src_lengths = torch.LongTensor(
        [s["source"].ne(pad_idx).long().sum() for s in samples]
    )
    src_lengths, sort_order = src_lengths.sort(descending=True)
    sample_ids = sample_ids.index_select(0, sort_order)
    src_tokens = src_tokens.index_select(0, sort_order)

    prev_output_tokens = None
    target = None
    if first.get("target", None) is None:
        # Without targets the token count is taken from the sources.
        ntokens = src_lengths.sum().item()
    else:
        tgt_pad_to = pad_to_length["target"] if pads_given else None
        target = merge("target", left_pad=left_pad_target, pad_to_length=tgt_pad_to)
        target = target.index_select(0, sort_order)
        tgt_lengths = torch.LongTensor(
            [s["target"].ne(pad_idx).long().sum() for s in samples]
        ).index_select(0, sort_order)
        ntokens = tgt_lengths.sum().item()

        if first.get("prev_output_tokens", None) is not None:
            prev_output_tokens = merge("prev_output_tokens", left_pad=left_pad_target)
        elif input_feeding:
            # we create a shifted version of targets for feeding the
            # previous output token(s) into the next decoder step
            prev_output_tokens = merge(
                "target",
                left_pad=left_pad_target,
                move_eos_to_beginning=True,
                pad_to_length=tgt_pad_to,
            )

    batch = {
        "id": sample_ids,
        "nsentences": len(samples),
        "ntokens": ntokens,
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
        },
        "target": target,
    }
    if prev_output_tokens is not None:
        batch["net_input"]["prev_output_tokens"] = prev_output_tokens.index_select(
            0, sort_order
        )

    if first.get("alignment", None) is not None:
        _, tgt_sz = batch["target"].shape
        src_sz = batch["net_input"]["src_tokens"].shape[1]

        # Per-row offsets added to the (source, target) index pairs.
        offsets = torch.zeros((len(sort_order), 2), dtype=torch.long)
        offsets[:, 1] += torch.arange(len(sort_order), dtype=torch.long) * tgt_sz
        if left_pad_source:
            offsets[:, 0] += src_sz - src_lengths
        if left_pad_target:
            offsets[:, 1] += tgt_sz - tgt_lengths

        kept = []
        for align_idx, offset, src_len, tgt_len in zip(
            sort_order, offsets, src_lengths, tgt_lengths
        ):
            alignment = samples[align_idx]["alignment"].view(-1, 2)
            if check_alignment(alignment, src_len, tgt_len):
                kept.append(alignment + offset)

        if len(kept) > 0:
            alignments = torch.cat(kept, dim=0)
            batch["alignments"] = alignments
            batch["align_weights"] = compute_alignment_weights(alignments)

    if first.get("constraints", None) is not None:
        # Collate the packed constraints across the samples, padding to
        # the length of the longest sample.
        packed = [sample.get("constraints") for sample in samples]
        lens = [row.size(0) for row in packed]
        constraints = torch.zeros((len(samples), max(lens))).long()
        for i, (row, row_len) in enumerate(zip(packed, lens)):
            constraints[i, 0:row_len] = row
        batch["constraints"] = constraints.index_select(0, sort_order)

    return batch
shuffle (bool, optional): shuffle dataset elements before batching (default: True). input_feeding (bool, optional): create a shifted version of the targets to be passed into the model for teacher forcing (default: True). remove_eos_from_source (bool, optional): if set, removes eos from end of source if it's present (default: False). append_eos_to_target (bool, optional): if set, appends eos to end of target if it's absent (default: False). align_dataset (torch.utils.data.Dataset, optional): dataset containing alignments. constraints (Tensor, optional): 2d tensor with a concatenated, zero- delimited list of constraints for each sentence. append_bos (bool, optional): if set, appends bos to the beginning of source/target sentence. num_buckets (int, optional): if set to a value greater than 0, then batches will be bucketed into the given number of batch shapes. src_lang_id (int, optional): source language ID, if set, the collated batch will contain a field 'src_lang_id' in 'net_input' which indicates the source language of the samples. tgt_lang_id (int, optional): target language ID, if set, the collated batch will contain a field 'tgt_lang_id' which indicates the target language of the samples. 
""" def __init__( self, src, src_sizes, src_dict, tgt=None, tgt_sizes=None, tgt_dict=None, left_pad_source=True, left_pad_target=False, shuffle=True, input_feeding=True, remove_eos_from_source=False, append_eos_to_target=False, align_dataset=None, constraints=None, append_bos=False, eos=None, num_buckets=0, src_lang_id=None, tgt_lang_id=None, pad_to_multiple=1, ): if tgt_dict is not None: assert src_dict.pad() == tgt_dict.pad() assert src_dict.eos() == tgt_dict.eos() assert src_dict.unk() == tgt_dict.unk() if tgt is not None: assert len(src) == len( tgt ), "Source and target must contain the same number of examples" self.src = src self.tgt = tgt self.src_sizes = np.array(src_sizes) self.tgt_sizes = np.array(tgt_sizes) if tgt_sizes is not None else None self.sizes = ( np.vstack((self.src_sizes, self.tgt_sizes)).T if self.tgt_sizes is not None else self.src_sizes ) self.src_dict = src_dict self.tgt_dict = tgt_dict self.left_pad_source = left_pad_source self.left_pad_target = left_pad_target self.shuffle = shuffle self.input_feeding = input_feeding self.remove_eos_from_source = remove_eos_from_source self.append_eos_to_target = append_eos_to_target self.align_dataset = align_dataset if self.align_dataset is not None: assert ( self.tgt_sizes is not None ), "Both source and target needed when alignments are provided" self.constraints = constraints self.append_bos = append_bos self.eos = eos if eos is not None else src_dict.eos() self.src_lang_id = src_lang_id self.tgt_lang_id = tgt_lang_id if num_buckets > 0: from fairseq.data import BucketPadLengthDataset self.src = BucketPadLengthDataset( self.src, sizes=self.src_sizes, num_buckets=num_buckets, pad_idx=self.src_dict.pad(), left_pad=self.left_pad_source, ) self.src_sizes = self.src.sizes logger.info("bucketing source lengths: {}".format(list(self.src.buckets))) if self.tgt is not None: self.tgt = BucketPadLengthDataset( self.tgt, sizes=self.tgt_sizes, num_buckets=num_buckets, pad_idx=self.tgt_dict.pad(), 
left_pad=self.left_pad_target, ) self.tgt_sizes = self.tgt.sizes logger.info( "bucketing target lengths: {}".format(list(self.tgt.buckets)) ) # determine bucket sizes using self.num_tokens, which will return # the padded lengths (thanks to BucketPadLengthDataset) num_tokens = np.vectorize(self.num_tokens, otypes=[np.compat.long]) self.bucketed_num_tokens = num_tokens(np.arange(len(self.src))) self.buckets = [ (None, num_tokens) for num_tokens in np.unique(self.bucketed_num_tokens) ] else: self.buckets = None self.pad_to_multiple = pad_to_multiple def get_batch_shapes(self): return self.buckets def __getitem__(self, index): tgt_item = self.tgt[index] if self.tgt is not None else None src_item = self.src[index] # Append EOS to end of tgt sentence if it does not have an EOS and remove # EOS from end of src sentence if it exists. This is useful when we use # use existing datasets for opposite directions i.e., when we want to # use tgt_dataset as src_dataset and vice versa if self.append_eos_to_target: eos = self.tgt_dict.eos() if self.tgt_dict else self.src_dict.eos() if self.tgt and self.tgt[index][-1] != eos: tgt_item = torch.cat([self.tgt[index], torch.LongTensor([eos])]) if self.append_bos: bos = self.tgt_dict.bos() if self.tgt_dict else self.src_dict.bos() if self.tgt and self.tgt[index][0] != bos: tgt_item = torch.cat([torch.LongTensor([bos]), self.tgt[index]]) bos = self.src_dict.bos() if self.src[index][0] != bos: src_item = torch.cat([torch.LongTensor([bos]), self.src[index]]) if self.remove_eos_from_source: eos = self.src_dict.eos() if self.src[index][-1] == eos: src_item = self.src[index][:-1] example = { "id": index, "source": src_item, "target": tgt_item, } if self.align_dataset is not None: example["alignment"] = self.align_dataset[index] if self.constraints is not None: example["constraints"] = self.constraints[index] return example def __len__(self): return len(self.src) def collater(self, samples, pad_to_length=None): """Merge a list of samples to form 
a mini-batch. Args: samples (List[dict]): samples to collate pad_to_length (dict, optional): a dictionary of {'source': source_pad_to_length, 'target': target_pad_to_length} to indicate the max length to pad to in source and target respectively. Returns: dict: a mini-batch with the following keys: - `id` (LongTensor): example IDs in the original input order - `ntokens` (int): total number of tokens in the batch - `net_input` (dict): the input to the Model, containing keys: - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in the source sentence of shape `(bsz, src_len)`. Padding will appear on the left if *left_pad_source* is ``True``. - `src_lengths` (LongTensor): 1D Tensor of the unpadded lengths of each source sentence of shape `(bsz)` - `prev_output_tokens` (LongTensor): a padded 2D Tensor of tokens in the target sentence, shifted right by one position for teacher forcing, of shape `(bsz, tgt_len)`. This key will not be present if *input_feeding* is ``False``. Padding will appear on the left if *left_pad_target* is ``True``. - `src_lang_id` (LongTensor): a long Tensor which contains source language IDs of each sample in the batch - `target` (LongTensor): a padded 2D Tensor of tokens in the target sentence of shape `(bsz, tgt_len)`. Padding will appear on the left if *left_pad_target* is ``True``. 
- `tgt_lang_id` (LongTensor): a long Tensor which contains target language IDs of each sample in the batch """ res = collate( samples, pad_idx=self.src_dict.pad(), eos_idx=self.eos, left_pad_source=self.left_pad_source, left_pad_target=self.left_pad_target, input_feeding=self.input_feeding, pad_to_length=pad_to_length, pad_to_multiple=self.pad_to_multiple, ) if self.src_lang_id is not None or self.tgt_lang_id is not None: src_tokens = res["net_input"]["src_tokens"] bsz = src_tokens.size(0) if self.src_lang_id is not None: res["net_input"]["src_lang_id"] = ( torch.LongTensor([[self.src_lang_id]]).expand(bsz, 1).to(src_tokens) ) if self.tgt_lang_id is not None: res["tgt_lang_id"] = ( torch.LongTensor([[self.tgt_lang_id]]).expand(bsz, 1).to(src_tokens) ) return res def num_tokens(self, index): """Return the number of tokens in a sample. This value is used to enforce ``--max-tokens`` during batching.""" return max( self.src_sizes[index], self.tgt_sizes[index] if self.tgt_sizes is not None else 0, ) def num_tokens_vec(self, indices): """Return the number of tokens for a set of positions defined by indices. This value is used to enforce ``--max-tokens`` during batching.""" sizes = self.src_sizes[indices] if self.tgt_sizes is not None: sizes = np.maximum(sizes, self.tgt_sizes[indices]) return sizes def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``.""" return ( self.src_sizes[index], self.tgt_sizes[index] if self.tgt_sizes is not None else 0, ) def ordered_indices(self): """Return an ordered list of indices. 
Batches will be constructed based on this order.""" if self.shuffle: indices = np.random.permutation(len(self)).astype(np.int64) else: indices = np.arange(len(self), dtype=np.int64) if self.buckets is None: # sort by target length, then source length if self.tgt_sizes is not None: indices = indices[np.argsort(self.tgt_sizes[indices], kind="mergesort")] return indices[np.argsort(self.src_sizes[indices], kind="mergesort")] else: # sort by bucketed_num_tokens, which is: # max(padded_src_len, padded_tgt_len) return indices[ np.argsort(self.bucketed_num_tokens[indices], kind="mergesort") ] @property def supports_prefetch(self): return getattr(self.src, "supports_prefetch", False) and ( getattr(self.tgt, "supports_prefetch", False) or self.tgt is None ) def prefetch(self, indices): self.src.prefetch(indices) if self.tgt is not None: self.tgt.prefetch(indices) if self.align_dataset is not None: self.align_dataset.prefetch(indices) def filter_indices_by_size(self, indices, max_sizes): """Filter a list of sample indices. Remove those that are longer than specified in max_sizes. Args: indices (np.array): original array of sample indices max_sizes (int or list[int] or tuple[int]): max sample size, can be defined separately for src and tgt (then list or tuple) Returns: np.array: filtered sample array list: list of removed indices """ return data_utils.filter_paired_dataset_indices_by_size( self.src_sizes, self.tgt_sizes, indices, max_sizes, ) ================================================ FILE: fairseq/data/legacy/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from .block_pair_dataset import BlockPairDataset from .masked_lm_dataset import MaskedLMDataset from .masked_lm_dictionary import BertDictionary, MaskedLMDictionary __all__ = [ "BertDictionary", "BlockPairDataset", "MaskedLMDataset", "MaskedLMDictionary", ] ================================================ FILE: fairseq/data/legacy/block_pair_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math import numpy as np import torch from fairseq.data import FairseqDataset class BlockPairDataset(FairseqDataset): """Break a Dataset of tokens into sentence pair blocks for next sentence prediction as well as masked language model. High-level logics are: 1. break input tensor to tensor blocks 2. pair the blocks with 50% next sentence and 50% random sentence 3. return paired blocks as well as related segment labels Args: dataset (~torch.utils.data.Dataset): dataset to break into blocks sizes: array of sentence lengths dictionary: dictionary for the task block_size: maximum block size break_mode: mode for breaking copurs into block pairs. currently we support 2 modes doc: respect document boundaries and each part of the pair should belong to on document none: don't respect any boundary and cut tokens evenly short_seq_prob: probability for generating shorter block pairs doc_break_size: Size for empty line separating documents. Typically 1 if the sentences have eos, 0 otherwise. 
""" def __init__( self, dataset, dictionary, sizes, block_size, break_mode="doc", short_seq_prob=0.1, doc_break_size=1, ): super().__init__() self.dataset = dataset self.pad = dictionary.pad() self.eos = dictionary.eos() self.cls = dictionary.cls() self.mask = dictionary.mask() self.sep = dictionary.sep() self.break_mode = break_mode self.dictionary = dictionary self.short_seq_prob = short_seq_prob self.block_indices = [] assert len(dataset) == len(sizes) if break_mode == "doc": cur_doc = [] for sent_id, sz in enumerate(sizes): assert doc_break_size == 0 or sz != 0, ( "when doc_break_size is non-zero, we expect documents to be" "separated by a blank line with a single eos." ) # empty line as document separator if sz == doc_break_size: if len(cur_doc) == 0: continue self.block_indices.append(cur_doc) cur_doc = [] else: cur_doc.append(sent_id) max_num_tokens = block_size - 3 # Account for [CLS], [SEP], [SEP] self.sent_pairs = [] self.sizes = [] for doc_id, doc in enumerate(self.block_indices): self._generate_sentence_pair(doc, doc_id, max_num_tokens, sizes) elif break_mode is None or break_mode == "none": # each block should have half of the block size since we are constructing block pair sent_length = (block_size - 3) // 2 total_len = sum(dataset.sizes) length = math.ceil(total_len / sent_length) def block_at(i): start = i * sent_length end = min(start + sent_length, total_len) return (start, end) sent_indices = np.array([block_at(i) for i in range(length)]) sent_sizes = np.array([e - s for s, e in sent_indices]) dataset_index = self._sent_to_dataset_index(sent_sizes) # pair sentences self._pair_sentences(dataset_index) else: raise ValueError("Invalid break_mode: " + break_mode) def _pair_sentences(self, dataset_index): """ Give a list of evenly cut blocks/sentences, pair these sentences with 50% consecutive sentences and 50% random sentences. 
This is used for none break mode """ # pair sentences for sent_id, sent in enumerate(dataset_index): next_sent_label = ( 1 if np.random.rand() > 0.5 and sent_id != len(dataset_index) - 1 else 0 ) if next_sent_label: next_sent = dataset_index[sent_id + 1] else: next_sent = dataset_index[ self._skip_sampling(len(dataset_index), [sent_id, sent_id + 1]) ] self.sent_pairs.append((sent, next_sent, next_sent_label)) # The current blocks don't include the special tokens but the # sizes already account for this self.sizes.append(3 + sent[3] + next_sent[3]) def _sent_to_dataset_index(self, sent_sizes): """ Build index mapping block indices to the underlying dataset indices """ dataset_index = [] ds_idx, ds_remaining = -1, 0 for to_consume in sent_sizes: sent_size = to_consume if ds_remaining == 0: ds_idx += 1 ds_remaining = sent_sizes[ds_idx] start_ds_idx = ds_idx start_offset = sent_sizes[ds_idx] - ds_remaining while to_consume > ds_remaining: to_consume -= ds_remaining ds_idx += 1 ds_remaining = sent_sizes[ds_idx] ds_remaining -= to_consume dataset_index.append( ( start_ds_idx, # starting index in dataset start_offset, # starting offset within starting index ds_idx, # ending index in dataset sent_size, # sentence length ) ) assert ds_remaining == 0 assert ds_idx == len(self.dataset) - 1 return dataset_index def _generate_sentence_pair(self, doc, doc_id, max_num_tokens, sizes): """ Go through a single document and genrate sentence paris from it """ current_chunk = [] current_length = 0 curr = 0 # To provide more randomness, we decrease target seq length for parts of # samples (10% by default). Note that max_num_tokens is the hard threshold # for batching and will never be changed. 
target_seq_length = max_num_tokens if np.random.random() < self.short_seq_prob: target_seq_length = np.random.randint(2, max_num_tokens) # loop through all sentences in document while curr < len(doc): sent_id = doc[curr] current_chunk.append(sent_id) current_length = sum(sizes[current_chunk]) # split chunk and generate pair when exceed target_seq_length or # finish the loop if curr == len(doc) - 1 or current_length >= target_seq_length: # split the chunk into 2 parts a_end = 1 if len(current_chunk) > 2: a_end = np.random.randint(1, len(current_chunk) - 1) sent_a = current_chunk[:a_end] len_a = sum(sizes[sent_a]) # generate next sentence label, note that if there is only 1 sentence # in current chunk, label is always 0 next_sent_label = ( 1 if np.random.rand() > 0.5 and len(current_chunk) != 1 else 0 ) if not next_sent_label: # if next sentence label is 0, sample sent_b from a random doc target_b_length = target_seq_length - len_a rand_doc_id = self._skip_sampling(len(self.block_indices), [doc_id]) random_doc = self.block_indices[rand_doc_id] random_start = np.random.randint(0, len(random_doc)) sent_b = [] len_b = 0 for j in range(random_start, len(random_doc)): sent_b.append(random_doc[j]) len_b = sum(sizes[sent_b]) if len_b >= target_b_length: break # return the second part of the chunk since it's not used num_unused_segments = len(current_chunk) - a_end curr -= num_unused_segments else: # if next sentence label is 1, use the second part of chunk as sent_B sent_b = current_chunk[a_end:] len_b = sum(sizes[sent_b]) # currently sent_a and sent_B may be longer than max_num_tokens, # truncate them and return block idx and offsets for them sent_a, sent_b = self._truncate_sentences( sent_a, sent_b, max_num_tokens ) self.sent_pairs.append((sent_a, sent_b, next_sent_label)) self.sizes.append(3 + sent_a[3] + sent_b[3]) current_chunk = [] curr += 1 def _skip_sampling(self, total, skip_ids): """ Generate a random integer which is not in skip_ids. 
Sample range is [0, total) TODO: ids in skip_ids should be consecutive, we can extend it to more generic version later """ rand_id = np.random.randint(total - len(skip_ids)) return rand_id if rand_id < min(skip_ids) else rand_id + len(skip_ids) def _truncate_sentences(self, sent_a, sent_b, max_num_tokens): """ Trancate a pair of sentence to limit total length under max_num_tokens Logics: 1. Truncate longer sentence 2. Tokens to be truncated could be at the beginning or the end of the sentnce Returns: Truncated sentences represented by dataset idx """ len_a, len_b = sum(self.dataset.sizes[sent_a]), sum(self.dataset.sizes[sent_b]) front_cut_a = front_cut_b = end_cut_a = end_cut_b = 0 while True: total_length = ( len_a + len_b - front_cut_a - front_cut_b - end_cut_a - end_cut_b ) if total_length <= max_num_tokens: break if len_a - front_cut_a - end_cut_a > len_b - front_cut_b - end_cut_b: if np.random.rand() < 0.5: front_cut_a += 1 else: end_cut_a += 1 else: if np.random.rand() < 0.5: front_cut_b += 1 else: end_cut_b += 1 # calculate ds indices as well as offsets and return truncated_sent_a = self._cut_sentence(sent_a, front_cut_a, end_cut_a) truncated_sent_b = self._cut_sentence(sent_b, front_cut_b, end_cut_b) return truncated_sent_a, truncated_sent_b def _cut_sentence(self, sent, front_cut, end_cut): """ Cut a sentence based on the numbers of tokens to be cut from beginning and end Represent the sentence as dataset idx and return """ start_ds_idx, end_ds_idx, offset = sent[0], sent[-1], 0 target_len = sum(self.dataset.sizes[sent]) - front_cut - end_cut while front_cut > 0: if self.dataset.sizes[start_ds_idx] > front_cut: offset += front_cut break else: front_cut -= self.dataset.sizes[start_ds_idx] start_ds_idx += 1 while end_cut > 0: if self.dataset.sizes[end_ds_idx] > end_cut: break else: end_cut -= self.dataset.sizes[end_ds_idx] end_ds_idx -= 1 return start_ds_idx, offset, end_ds_idx, target_len def _fetch_block(self, start_ds_idx, offset, end_ds_idx, length): """ 
Fetch a block of tokens based on its dataset idx """ buffer = torch.cat( [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)] ) s, e = offset, offset + length return buffer[s:e] def __getitem__(self, index): block1, block2, next_sent_label = self.sent_pairs[index] block1 = self._fetch_block(*block1) block2 = self._fetch_block(*block2) return block1, block2, next_sent_label def __len__(self): return len(self.sizes) @property def supports_prefetch(self): return getattr(self.dataset, "supports_prefetch", False) def prefetch(self, indices): prefetch_idx = set() for index in indices: for block1, block2, _ in [self.sent_pairs[index]]: for ds_idx in range(block1[0], block1[2] + 1): prefetch_idx.add(ds_idx) for ds_idx in range(block2[0], block2[2] + 1): prefetch_idx.add(ds_idx) self.dataset.prefetch(prefetch_idx) ================================================ FILE: fairseq/data/legacy/masked_lm_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math from typing import Dict, List, Tuple import numpy as np import torch from fairseq.data import Dictionary, FairseqDataset, data_utils from fairseq.data.concat_dataset import ConcatDataset from fairseq.data.legacy.block_pair_dataset import BlockPairDataset from fairseq.data.token_block_dataset import TokenBlockDataset class MaskedLMDataset(FairseqDataset): """ A wrapper Dataset for masked language modelling. The dataset wraps around TokenBlockDataset or BlockedPairDataset and creates a batch where the input blocks are masked according to the specified masking probability. Additionally the batch can also contain sentence level targets if this is specified. Args: dataset: Dataset which generates blocks of data. Only BlockPairDataset and TokenBlockDataset are supported. 
sizes: Sentence lengths vocab: Dictionary with the vocabulary and special tokens. pad_idx: Id of padding token in dictionary mask_idx: Id of mask token in dictionary classif_token_idx: Id of classification token in dictionary. This is the token associated with the sentence embedding (Eg: CLS for BERT) sep_token_idx: Id of separator token in dictionary (Eg: SEP in BERT) seed: Seed for random number generator for reproducibility. shuffle: Shuffle the elements before batching. has_pairs: Specifies whether the underlying dataset generates a pair of blocks along with a sentence_target or not. Setting it to True assumes that the underlying dataset generates a label for the pair of sentences which is surfaced as sentence_target. Setting it to False assumes a single block with no sentence target. Default is True. segment_id: An optional segment id for filling in the segment labels when we are in the single block setting (Eg: XLM). Default is 0. masking_ratio: specifies what fraction of the tokens in each block is selected for prediction. masking_prob: specifies the probability of a selected token being replaced with the "MASK" token. random_token_prob: specifies the probability of a selected token being replaced by a random token from the vocabulary. 
""" def __init__( self, dataset: FairseqDataset, sizes: np.ndarray, vocab: Dictionary, pad_idx: int, mask_idx: int, classif_token_idx: int, sep_token_idx: int, seed: int = 1, shuffle: bool = True, has_pairs: bool = True, segment_id: int = 0, masking_ratio: float = 0.15, masking_prob: float = 0.8, random_token_prob: float = 0.1, ): # Make sure the input datasets are the ones supported assert ( isinstance(dataset, TokenBlockDataset) or isinstance(dataset, BlockPairDataset) or isinstance(dataset, ConcatDataset) ), ( "MaskedLMDataset only wraps TokenBlockDataset or BlockPairDataset or " "ConcatDataset" ) self.dataset = dataset self.sizes = np.array(sizes) self.vocab = vocab self.pad_idx = pad_idx self.mask_idx = mask_idx self.classif_token_idx = classif_token_idx self.sep_token_idx = sep_token_idx self.shuffle = shuffle self.seed = seed self.has_pairs = has_pairs self.segment_id = segment_id self.masking_ratio = masking_ratio self.masking_prob = masking_prob self.random_token_prob = random_token_prob # If we have only one block then sizes needs to be updated to include # the classification token if not has_pairs: self.sizes = self.sizes + 1 def __getitem__(self, index: int): # if has_pairs, then expect 2 blocks and a sentence target if self.has_pairs: (block_one, block_two, sentence_target) = self.dataset[index] else: block_one = self.dataset[index] return { "id": index, "block_one": block_one, "block_two": block_two if self.has_pairs else None, "sentence_target": sentence_target if self.has_pairs else None, } def __len__(self): return len(self.dataset) def _mask_block( self, sentence: np.ndarray, mask_idx: int, pad_idx: int, dictionary_token_range: Tuple, ): """ Mask tokens for Masked Language Model training Samples mask_ratio tokens that will be predicted by LM. Note:This function may not be efficient enough since we had multiple conversions between np and torch, we can replace them with torch operators later. 
Args: sentence: 1d tensor to be masked mask_idx: index to use for masking the sentence pad_idx: index to use for masking the target for tokens we aren't predicting dictionary_token_range: range of indices in dictionary which can be used for random word replacement (e.g. without special characters) Return: masked_sent: masked sentence target: target with words which we are not predicting replaced by pad_idx """ masked_sent = np.copy(sentence) sent_length = len(sentence) mask_num = math.ceil(sent_length * self.masking_ratio) mask = np.random.choice(sent_length, mask_num, replace=False) target = np.copy(sentence) for i in range(sent_length): if i in mask: rand = np.random.random() # replace with mask if probability is less than masking_prob # (Eg: 0.8) if rand < self.masking_prob: masked_sent[i] = mask_idx # replace with random token if probability is less than # masking_prob + random_token_prob (Eg: 0.9) elif rand < (self.masking_prob + self.random_token_prob): # sample random token from dictionary masked_sent[i] = np.random.randint( dictionary_token_range[0], dictionary_token_range[1] ) else: target[i] = pad_idx return masked_sent, target def _collate(self, samples: List[Dict], pad_idx: int, eos_idx: int): """ Does the heavy lifting for creating a batch from the input list of examples. The logic is as follows: 1. Mask the input blocks. In case has_pair is True then we have 2 blocks to mask. 2. Prepend the first masked block tensor with the special token used as sentence embedding. Eg: CLS in BERT. This happens irrespective of the value of has_pair. 3. If has_pair is True, then append the first masked block with the special separator token (eg: SEP for BERT) and compute segment label accordingly. In this case, also append the second masked block with this special separator token and compute its segment label. 4. For the targets tensor, prepend and append with padding index accordingly. 5. Concatenate all tensors. 
""" if len(samples) == 0: return {} # To ensure determinism, we reset the state of the PRNG after every # batch based on the seed and the first id of the batch. This ensures # that across epochs we get the same mask for the same example. This # is needed for reproducibility and is how BERT does masking # TODO: Can we add deteminism without this constraint? with data_utils.numpy_seed(self.seed + samples[0]["id"]): for s in samples: # token range is needed for replacing with random token during # masking token_range = (self.vocab.nspecial, len(self.vocab)) # mask according to specified probabilities. masked_blk_one, masked_tgt_one = self._mask_block( s["block_one"], self.mask_idx, self.pad_idx, token_range, ) tokens = np.concatenate([[self.classif_token_idx], masked_blk_one]) targets = np.concatenate([[self.pad_idx], masked_tgt_one]) segments = np.ones(len(tokens)) * self.segment_id # if has_pairs is True then we need to add the SEP token to both # the blocks after masking and re-compute segments based on the new # lengths. 
if self.has_pairs: tokens_one = np.concatenate([tokens, [self.sep_token_idx]]) targets_one = np.concatenate([targets, [self.pad_idx]]) masked_blk_two, masked_tgt_two = self._mask_block( s["block_two"], self.mask_idx, self.pad_idx, token_range ) tokens_two = np.concatenate([masked_blk_two, [self.sep_token_idx]]) targets_two = np.concatenate([masked_tgt_two, [self.pad_idx]]) # block + 1 sep + 1 special (CLS) segments_one = np.zeros(len(tokens_one)) # block + 1 sep segments_two = np.ones(len(tokens_two)) tokens = np.concatenate([tokens_one, tokens_two]) targets = np.concatenate([targets_one, targets_two]) segments = np.concatenate([segments_one, segments_two]) s["source"] = torch.LongTensor(tokens) s["segment_labels"] = torch.LongTensor(segments) s["lm_target"] = torch.LongTensor(targets) def merge(key): return data_utils.collate_tokens( [s[key] for s in samples], pad_idx, eos_idx, left_pad=False ) return { "id": torch.LongTensor([s["id"] for s in samples]), "ntokens": sum(len(s["source"]) for s in samples), "net_input": { "src_tokens": merge("source"), "segment_labels": merge("segment_labels"), }, "lm_target": merge("lm_target"), "sentence_target": torch.LongTensor([s["sentence_target"] for s in samples]) if self.has_pairs else None, "nsentences": len(samples), } def collater(self, samples: List[Dict]): """Merge a list of samples to form a mini-batch. Args: samples (List[dict]): samples to collate Returns: dict: a mini-batch of data """ return self._collate(samples, self.vocab.pad(), self.vocab.eos()) def num_tokens(self, index: int): """ Return the number of tokens in a sample. This value is used to enforce max-tokens during batching. """ return self.sizes[index] def size(self, index: int): """ Return an example's size as a float or tuple. This value is used when filtering a dataset with max-positions. """ return self.sizes[index] def ordered_indices(self): """ Return an ordered list of indices. Batches will be constructed based on this order. 
""" if self.shuffle: return np.random.permutation(len(self)) else: order = [np.arange(len(self))] order.append(self.sizes) return np.lexsort(order) @property def supports_prefetch(self): return getattr(self.dataset, "supports_prefetch", False) def prefetch(self, indices): self.dataset.prefetch(indices) ================================================ FILE: fairseq/data/legacy/masked_lm_dictionary.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from fairseq.data import Dictionary class MaskedLMDictionary(Dictionary): """ Dictionary for Masked Language Modelling tasks. This extends Dictionary by adding the mask symbol. """ def __init__( self, pad="<pad>", eos="</s>", unk="<unk>", mask="<mask>", ): super().__init__(pad=pad, eos=eos, unk=unk) self.mask_word = mask self.mask_index = self.add_symbol(mask) self.nspecial = len(self.symbols) def mask(self): """Helper to get index of mask symbol""" return self.mask_index class BertDictionary(MaskedLMDictionary): """ Dictionary for BERT task. This extends MaskedLMDictionary by adding support for cls and sep symbols. """ def __init__( self, pad="<pad>", eos="</s>", unk="<unk>", mask="<mask>", cls="<cls>", sep="<sep>", ): super().__init__(pad=pad, eos=eos, unk=unk, mask=mask) self.cls_word = cls self.sep_word = sep self.cls_index = self.add_symbol(cls) self.sep_index = self.add_symbol(sep) self.nspecial = len(self.symbols) def cls(self): """Helper to get index of cls symbol""" return self.cls_index def sep(self): """Helper to get index of sep symbol""" return self.sep_index ================================================ FILE: fairseq/data/list_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
class ListDataset(BaseWrapperDataset):
    """Wrapper that returns collated samples as the plain list it was given."""

    def __init__(self, dataset, sizes=None):
        super().__init__(dataset)
        # optional per-item sizes, served by ``sizes``/``num_tokens``/``size``
        self._sizes = sizes

    def __iter__(self):
        yield from self.dataset

    def collater(self, samples):
        """Return ``samples`` unchanged; no padding or stacking is done."""
        return samples

    @property
    def sizes(self):
        return self._sizes

    def num_tokens(self, index):
        return self._sizes[index]

    def size(self, index):
        return self._sizes[index]

    def set_epoch(self, epoch):
        """Ignore epoch changes."""
        return None
def collater(self, samples) -> Dict:
    """
    Collate ``samples`` and prepend context from previously seen tokens.

    The wrapped dataset builds the batch; every row is then widened by
    ``context_window`` columns. Each row starts with the tokens remembered
    from earlier rows (trimmed so that context plus the row's non-pad
    tokens do not exceed ``tokens_per_sample + context_window``), followed
    by the row itself and padding. Targets are shifted right by the same
    amount, ``src_lengths`` is increased in place, and the number of
    context tokens per row is reported under ``"start_indices"``.

    The remembered context is kept on the instance, so rows must be
    processed in dataset order.
    """
    batch = self.dataset.collater(samples)

    pad = self.pad_idx
    window = self.context_window
    budget = self.tokens_per_sample + window

    net_input = batch["net_input"]
    src = net_input["src_tokens"]
    src_lengths = net_input["src_lengths"]
    targets = batch["target"]

    bsz, tsz = src.shape
    width = tsz + window
    widened_src = np.empty([bsz, width], dtype=np.int64)
    widened_tgt = np.full([bsz, width], pad, dtype=np.int64)
    non_pad_counts = src.ne(pad).long().sum(dim=1).cpu()

    start_idxs = []
    for row in range(bsz):
        # forget the oldest context tokens if the row would overflow
        overflow = len(self.prev_tokens) + non_pad_counts[row] - budget
        if overflow > 0:
            self.prev_tokens = self.prev_tokens[overflow:]

        n_context = len(self.prev_tokens)
        filler = np.full(window - n_context, pad)
        widened_src[row] = np.concatenate(
            [self.prev_tokens, src[row].numpy(), filler]
        )
        widened_tgt[row, n_context : n_context + len(targets[row])] = targets[row]

        start_idxs.append(n_context)
        src_lengths[row] += n_context

        # remember the last ``window`` real tokens for the next row
        current = widened_src[row]
        self.prev_tokens = current[current != pad][-window:]

    net_input["src_tokens"] = torch.from_numpy(widened_src)
    batch["target"] = torch.from_numpy(widened_tgt)
    batch["start_indices"] = start_idxs
    return batch
class LRUCacheDataset(BaseWrapperDataset):
    """Wrapper that memoizes item lookups and collation with small LRU caches."""

    def __init__(self, dataset, token=None):
        # NOTE(review): ``token`` is accepted but not used in this constructor.
        super().__init__(dataset)

    # ``lru_cache`` wraps the function once at class-definition time, so the
    # 8 cached entries are shared by all instances and ``self`` is part of
    # the cache key (the cache therefore keeps referenced instances alive).
    @lru_cache(maxsize=8)
    def __getitem__(self, index):
        """Return ``self.dataset[index]``, served from the cache when possible."""
        return self.dataset[index]

    # ``lru_cache`` hashes its arguments, so ``samples`` has to be hashable
    # for this call to succeed.
    @lru_cache(maxsize=8)
    def collater(self, samples):
        """Delegate collation to the wrapped dataset, caching the result."""
        return self.dataset.collater(samples)
@classmethod
def apply_mask(cls, dataset: torch.utils.data.Dataset, *args, **kwargs):
    """
    Return the source and target datasets for masked LM training.

    Both views are built on the same cached copy of ``dataset``. The first
    is constructed with ``return_masked_tokens=False`` (the source), the
    second with ``return_masked_tokens=True`` (the target); each view is
    itself wrapped in an ``LRUCacheDataset``.
    """
    shared = LRUCacheDataset(dataset)
    source_view = cls(shared, *args, **kwargs, return_masked_tokens=False)
    target_view = cls(shared, *args, **kwargs, return_masked_tokens=True)
    return LRUCacheDataset(source_view), LRUCacheDataset(target_view)
@lru_cache(maxsize=8)
def __getitem_cached__(self, seed: int, epoch: int, index: int):
    """
    Build the masked view of item ``index`` for the given seed and epoch.

    Depending on ``self.return_masked_tokens`` the result is either the
    noised input (masked positions replaced by ``mask_idx``, a random token,
    or left unchanged) or the target (original token at masked positions,
    ``pad_idx`` everywhere else). With ``self.skip_masking`` set, a copy of
    the item is returned untouched.

    Args:
        seed: base random seed
        epoch: current epoch, mixed into the seed so noise changes per epoch
        index: index of the item in the wrapped dataset

    Returns:
        A tensor created with ``torch.from_numpy``.
    """
    # All randomness below comes from this generator, which is seeded only
    # from (seed, epoch, index); the statements must keep their order so the
    # sequence of draws stays the same.
    seed = int(hash((seed, epoch, index)) % 1e6)
    rng = np.random.default_rng(seed)
    item = self.dataset[index]
    sz = len(item)

    assert (
        self.mask_idx not in item
    ), "Dataset contains mask_idx (={}), this is not expected!".format(
        self.mask_idx,
    )
    if self.skip_masking:
        return torch.from_numpy(np.copy(item))

    if self.mask_whole_words is not None:
        # From here on ``sz`` counts words rather than tokens: masking
        # decisions are taken per word and expanded back to tokens later
        # with ``np.repeat(..., word_lens)``.
        word_begins_mask = self.mask_whole_words.gather(0, item)
        word_begins_idx = word_begins_mask.nonzero().view(-1)
        sz = len(word_begins_idx)
        words = np.split(word_begins_mask, word_begins_idx)[1:]
        assert len(words) == sz
        word_lens = list(map(len, words))

    # decide elements to mask
    mask = np.full(sz, False)
    num_mask = int(
        # add a random number for probabilistic rounding
        self.mask_prob * sz / float(self.mask_multiple_length)
        + rng.random()
    )

    # multiple masking as described in the vq-wav2vec paper (https://arxiv.org/abs/1910.05453)
    mask_idc = rng.choice(sz, num_mask, replace=False)
    if self.mask_stdev > 0.0:
        # span lengths are drawn per start index and clipped at zero
        lengths = rng.normal(
            self.mask_multiple_length, self.mask_stdev, size=num_mask
        )
        lengths = [max(0, int(round(x))) for x in lengths]
        mask_idc = np.asarray(
            [
                mask_idc[j] + offset
                for j in range(len(mask_idc))
                for offset in range(lengths[j])
            ],
            dtype=np.int64,
        )
    else:
        # every start index is extended to a span of fixed length
        mask_idc = np.concatenate(
            [mask_idc + i for i in range(self.mask_multiple_length)]
        )
    # spans may run past the end of the item; drop those indices
    mask_idc = mask_idc[mask_idc < len(mask)]
    try:
        mask[mask_idc] = True
    except:  # something wrong
        print("Assigning mask indexes {} to mask {} failed!".format(mask_idc, mask))
        raise

    # if self.return_masked_tokens:
    #     print((
    #         f"IDX={index}; seed={seed}; epoch={epoch}; is_tgt={self.return_masked_tokens}: "
    #         f"{np.nonzero(mask)[0].sum()}"
    #     ))
    if self.return_masked_tokens:
        # exit early if we're just returning the masked tokens
        # (i.e., the targets for masked LM training)
        if self.mask_whole_words is not None:
            mask = np.repeat(mask, word_lens)
        new_item = np.full(len(mask), self.pad_idx)
        new_item[mask] = item[torch.from_numpy(mask.astype(np.uint8)) == 1]
        return torch.from_numpy(new_item)

    # decide unmasking and random replacement
    rand_or_unmask_prob = self.random_token_prob + self.leave_unmasked_prob
    if rand_or_unmask_prob > 0.0:
        # subset of the masked positions that will NOT receive mask_idx
        rand_or_unmask = mask & (rng.random(sz) < rand_or_unmask_prob)
        if self.random_token_prob == 0.0:
            unmask = rand_or_unmask
            rand_mask = None
        elif self.leave_unmasked_prob == 0.0:
            unmask = None
            rand_mask = rand_or_unmask
        else:
            # split the subset between "leave unchanged" and "random token"
            unmask_prob = self.leave_unmasked_prob / rand_or_unmask_prob
            decision = rng.random(sz) < unmask_prob
            unmask = rand_or_unmask & decision
            rand_mask = rand_or_unmask & (~decision)
    else:
        unmask = rand_mask = None

    if unmask is not None:
        # positions left unchanged are removed from the mask
        mask = mask ^ unmask

    if self.mask_whole_words is not None:
        mask = np.repeat(mask, word_lens)

    new_item = np.copy(item)
    new_item[mask] = self.mask_idx
    if rand_mask is not None:
        num_rand = rand_mask.sum()
        if num_rand > 0:
            if self.mask_whole_words is not None:
                rand_mask = np.repeat(rand_mask, word_lens)
                num_rand = rand_mask.sum()

            # random replacements overwrite the mask_idx written above
            new_item[rand_mask] = rng.choice(
                len(self.vocab),
                num_rand,
                p=self.weights,
            )

    return torch.from_numpy(new_item)
def collate(samples, pad_idx, eos_idx, fixed_pad_length=None, pad_to_bsz=None):
    """
    Merge language-model samples into a right-padded mini-batch.

    Each sample is a dict with ``"id"``, ``"source"`` and ``"target"``.
    The target of a sample may be ``None`` (the padded source is reused as
    target), a single tensor, or a list of tensors (one padded batch is
    produced per list position). An empty ``samples`` list yields ``{}``.
    """
    if not samples:
        return {}

    def pad_batch(tensors):
        # all batches share the same padding configuration
        return data_utils.collate_tokens(
            tensors,
            pad_idx,
            eos_idx,
            left_pad=False,
            pad_to_length=fixed_pad_length,
            pad_to_bsz=pad_to_bsz,
        )

    def merge(key, is_list=False):
        if not is_list:
            return pad_batch([s[key] for s in samples])
        return [
            pad_batch([s[key][i] for s in samples])
            for i in range(len(samples[0][key]))
        ]

    src_tokens = merge("source")

    first_target = samples[0]["target"]
    if first_target is None:
        target = src_tokens
    else:
        target = merge("target", isinstance(first_target, list))

    return {
        "id": torch.LongTensor([s["id"] for s in samples]),
        "nsentences": len(samples),
        "ntokens": sum(len(s["source"]) for s in samples),
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": torch.LongTensor([s["source"].numel() for s in samples]),
        },
        "target": target,
    }
""" def __init__( self, dataset, sizes, src_vocab, tgt_vocab=None, add_eos_for_other_targets=False, shuffle=False, targets=None, add_bos_token=False, fixed_pad_length=None, pad_to_bsz=None, src_lang_idx=None, tgt_lang_idx=None, ): self.dataset = dataset self.sizes = np.array(sizes) self.vocab = src_vocab self.tgt_vocab = tgt_vocab or src_vocab self.add_eos_for_other_targets = add_eos_for_other_targets self.shuffle = shuffle self.add_bos_token = add_bos_token self.fixed_pad_length = fixed_pad_length self.pad_to_bsz = pad_to_bsz self.src_lang_idx = src_lang_idx self.tgt_lang_idx = tgt_lang_idx assert targets is None or all( t in {"self", "future", "past"} for t in targets ), "targets must be none or one of 'self', 'future', 'past'" if targets is not None and len(targets) == 0: targets = None self.targets = targets def __getitem__(self, index): if self.targets is not None: # *future_target* is the original sentence # *source* is shifted right by 1 (maybe left-padded with eos) # *past_target* is shifted right by 2 (left-padded as needed) # # Left-to-right language models should condition on *source* and # predict *future_target*. # Right-to-left language models should condition on *source* and # predict *past_target*. 
source, future_target, past_target = self.dataset[index] source, target = self._make_source_target( source, future_target, past_target ) else: source = self.dataset[index] target = None source, target = self._maybe_add_bos(source, target) return {"id": index, "source": source, "target": target} def __len__(self): return len(self.dataset) def _make_source_target(self, source, future_target, past_target): if self.targets is not None: target = [] if ( self.add_eos_for_other_targets and (("self" in self.targets) or ("past" in self.targets)) and source[-1] != self.vocab.eos() ): # append eos at the end of source source = torch.cat([source, source.new([self.vocab.eos()])]) if "future" in self.targets: future_target = torch.cat( [future_target, future_target.new([self.vocab.pad()])] ) if "past" in self.targets: # first token is before the start of sentence which is only used in "none" break mode when # add_eos_for_other_targets is False past_target = torch.cat( [ past_target.new([self.vocab.pad()]), past_target[1:], source[-2, None], ] ) for t in self.targets: if t == "self": target.append(source) elif t == "future": target.append(future_target) elif t == "past": target.append(past_target) else: raise Exception("invalid target " + t) if len(target) == 1: target = target[0] else: target = future_target return source, self._filter_vocab(target) def _maybe_add_bos(self, source, target): if self.add_bos_token: source = torch.cat([source.new([self.vocab.bos()]), source]) if target is not None: target = torch.cat([target.new([self.tgt_vocab.bos()]), target]) return source, target def num_tokens_vec(self, indices): """Return the number of tokens for a set of positions defined by indices. 
This value is used to enforce ``--max-tokens`` during batching.""" return self.sizes[indices] def _filter_vocab(self, target): if len(self.tgt_vocab) != len(self.vocab): def _filter(target): mask = target.ge(len(self.tgt_vocab)) if mask.any(): target[mask] = self.tgt_vocab.unk() return target if isinstance(target, list): return [_filter(t) for t in target] return _filter(target) return target def collater(self, samples): """Merge a list of samples to form a mini-batch. Args: samples (List[dict]): samples to collate Returns: dict: a mini-batch with the following keys: - `id` (LongTensor): example IDs in the original input order - `ntokens` (int): total number of tokens in the batch - `net_input` (dict): the input to the Model, containing keys: - `src_tokens` (LongTensor): a padded 2D Tensor of tokens in the source sentence of shape `(bsz, src_len)`. Padding will appear on the right. - `target` (LongTensor): a padded 2D Tensor of tokens in the target sentence of shape `(bsz, tgt_len)`. Padding will appear on the right. """ return collate( samples, self.vocab.pad(), self.vocab.eos(), self.fixed_pad_length, self.pad_to_bsz, ) def num_tokens(self, index): """Return the number of tokens in a sample. This value is used to enforce ``--max-tokens`` during batching.""" return self.sizes[index] def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``.""" return self.sizes[index] def ordered_indices(self): """Return an ordered list of indices. 
def ordered_indices(self):
    """
    Return the indices ordered by example size.

    Batches will be constructed based on this order. Examples of equal
    size keep their original order, or a random one when ``self.shuffle``
    is set.
    """
    count = len(self)
    tie_breaker = np.random.permutation(count) if self.shuffle else np.arange(count)
    # lexsort treats the last key as the primary one
    return np.lexsort([tie_breaker, self.sizes])
def __init__(
    self,
    datasets: Dict[str, FairseqDataset],
    distribution: List[float],
    seed: int,
    sort_indices: bool = False,
    batch_sample: bool = False,
    distributed_rank: Optional[int] = None,
):
    """
    Register the datasets and precompute their index layout.

    Args:
        datasets: an OrderedDict of FairseqDataset instances, all of the
            same type
        distribution: sampling probability of each dataset, in the order
            of ``datasets``; must sum to 1 (up to floating-point rounding)
        seed: random seed for sampling the datasets
        sort_indices: if true, will sort the ordered indices by size
        batch_sample: if true, will ensure each batch is from a single
            dataset
        distributed_rank: stored on the instance for later use

    Raises:
        AssertionError: if any of the requirements above is violated.
    """
    super().__init__()
    assert isinstance(datasets, OrderedDict)
    assert len(datasets) == len(distribution)
    # Probabilities are floats: e.g. 0.1 + 0.2 + 0.7 does not add up to
    # exactly 1, so compare with a tolerance instead of `==`.
    assert np.isclose(
        sum(distribution), 1.0
    ), f"distribution must sum to 1, got {sum(distribution)}"

    self.datasets = datasets
    self.distribution = distribution
    self.seed = seed
    self.sort_indices = sort_indices
    self.batch_sample = batch_sample
    self.distributed_rank = distributed_rank

    # Avoid repeated conversions to list later
    self.dataset_list = list(datasets.values())
    self.total_num_instances = 0

    first_dataset = self.dataset_list[0]

    self.num_instances_per_dataset = []
    self.dataset_offsets = []
    for i, dataset in enumerate(self.dataset_list):
        assert isinstance(dataset, FairseqDataset)
        assert type(dataset) is type(first_dataset)
        # datasets with zero sampling probability contribute no instances
        self.num_instances_per_dataset.append(
            0 if self.distribution[i] == 0 else len(dataset)
        )
        # offset of this dataset inside the concatenated index space
        self.dataset_offsets.append(self.total_num_instances)
        self.total_num_instances += self.num_instances_per_dataset[i]
def _map_index(self, index: int):
    """
    Translate a global index into ``(local_index, dataset_key)``.

    If dataset A has length N and dataset B has length M
    then index 1 maps to index 1 of dataset A, and index N + 1
    maps to index 1 of B.

    Raises:
        ValueError: if ``index`` lies beyond the range of the last dataset.
    """
    counter = 0
    for num_instances, key in zip(self.num_instances_per_dataset, self.datasets):
        if index < counter + num_instances:
            return index - counter, key
        counter += num_instances
    raise ValueError(
        "Invalid index: {}, max: {}".format(index, self.total_num_instances)
    )


def __len__(self):
    """
    Length of this dataset is the sum of individual datasets
    """
    return self.total_num_instances


async def getitem(self, index):
    """
    Fetch one item by global index and tag it with ``full_id``.

    Uses the underlying dataset's own async ``getitem`` when it has one,
    otherwise plain indexing. Any exception is re-raised with the dataset
    key prepended to its args.
    """
    new_index, key = self._map_index(index)
    try:
        if hasattr(self.datasets[key], "getitem"):
            item = await self.datasets[key].getitem(new_index)
        else:
            item = self.datasets[key][new_index]
        # Remember the global index so collater() can find the source dataset.
        item["full_id"] = index
        return item
    except Exception as e:
        e.args = (f"Error from {key} dataset", *e.args)
        raise


def __getitem__(self, index):
    """Synchronous wrapper around :meth:`getitem`."""
    return asyncio.run(self.getitem(index))


async def getitems(self, indices):
    """
    Fetch several items concurrently, returned in the order of ``indices``.
    """
    # initialize a bunch of everstore read operations
    # wait in the end to reduce overhead
    # very helpful if io is latency bounded

    max_concurrency = 32
    sem = asyncio.Semaphore(max_concurrency)

    async def controlled_getitem(index):
        # The semaphore caps the number of reads in flight at once.
        async with sem:
            return await self.getitem(index)

    coroutines = [controlled_getitem(index) for index in indices]
    return await asyncio.gather(*coroutines)


def __getitems__(self, indices):
    """Synchronous wrapper around :meth:`getitems`."""
    return asyncio.run(self.getitems(indices))


def collater(self, samples):
    """
    If we are doing batch sampling, then pick the right collater to use.

    Otherwise we assume all collaters are the same.

    Returns ``None`` for an empty list of samples.
    """
    if len(samples) == 0:
        return None
    if "full_id" in samples[0]:
        _, key = self._map_index(samples[0]["full_id"])
        try:
            batch = self.datasets[key].collater(samples)
        except Exception:
            print(f"Collating failed for key {key}", flush=True)
            raise
        return batch
    else:
        # Subclasses may override __getitem__ to not specify full_id.
        # Reuse the list cached in __init__ instead of rebuilding it per batch.
        return self.dataset_list[0].collater(samples)


def num_tokens(self, index: int):
    """Return the token count reported by the dataset that owns ``index``."""
    index, key = self._map_index(index)
    return self.datasets[key].num_tokens(index)


def size(self, index: int):
    """Return the size reported by the dataset that owns ``index``."""
    index, key = self._map_index(index)
    return self.datasets[key].size(index)


@property
def can_reuse_epoch_itr_across_epochs(self):
    return False


def set_epoch(self, epoch, **unused):
    """Record the epoch; ``ordered_indices`` and ``batch_by_size`` seed from it."""
    super().set_epoch(epoch)
    logger.info(f"setting epoch of multi_corpus_dataset to {epoch}")
    self.epoch = epoch


@property
def supports_prefetch(self):
    return False


@property
def supports_fetch_outside_dataloader(self):
    return all(
        self.datasets[key].supports_fetch_outside_dataloader
        for key in self.datasets
    )


def batch_by_size(
    self,
    indices,
    max_tokens=None,
    max_sentences=None,
    required_batch_size_multiple=1,
):
    """
    Batch ``indices`` by size.

    Without ``batch_sample`` this defers entirely to the parent class.
    With it, indices are first grouped by source dataset and batched per
    group, so every batch holds items of a single dataset.
    """
    if not self.batch_sample:
        return super().batch_by_size(
            indices, max_tokens, max_sentences, required_batch_size_multiple
        )

    dataset_indices = {key: [] for key in self.datasets}
    for i in indices:
        _, key = self._map_index(i)
        dataset_indices[key].append(i)

    batches = []
    for key in dataset_indices:
        cur_batches = super().batch_by_size(
            np.array(dataset_indices[key], dtype=np.int64),
            max_tokens,
            max_sentences,
            required_batch_size_multiple,
        )
        logger.info(f"Created {len(cur_batches)} batches for dataset {key}")
        batches += cur_batches

    # If this dataset is used in a distributed training setup,
    # then shuffle such that the order is seeded by the distributed rank
    # as well
    if self.distributed_rank is not None:
        with data_utils.numpy_seed(self.seed, self.epoch, self.distributed_rank):
            np.random.shuffle(batches)
    return batches
def uniform_sampler(x):
    """Draw a single element of ``x`` uniformly at random."""
    drawn = np.random.choice(x, 1)
    return drawn.item()


class MultiCorpusSampledDataset(FairseqDataset):
    """
    Wraps several FairseqDataset instances. Every item carries one sample
    from each wrapped dataset; at collate time a single dataset is picked
    with ``sampling_func`` and only its samples form the batch.

    Args:
        datasets: an OrderedDict of FairseqDataset instances.
        sampling_func: A function for sampling over list of dataset keys.
            The default strategy is to sample uniformly.
    """

    def __init__(
        self,
        datasets: Dict[str, FairseqDataset],
        sampling_func: Callable[[List], int] = None,
    ):
        super().__init__()
        assert isinstance(datasets, OrderedDict)
        self.datasets = datasets
        self.sampling_func = (
            uniform_sampler if sampling_func is None else sampling_func
        )

        self.total_num_instances = 0
        for dataset in datasets.values():
            assert isinstance(dataset, FairseqDataset)
            self.total_num_instances += len(dataset)

        # Filled lazily by ordered_indices().
        self._ordered_indices = None

    def __len__(self):
        """
        Total number of instances, summed over the wrapped datasets.
        """
        return self.total_num_instances

    def ordered_indices(self):
        """
        Return ``0 .. len(self) - 1`` as the batching order.

        On the first call, the ordering of each wrapped dataset is captured
        (and kept) so that lookups follow the same order the wrapped dataset
        would have produced on its own.
        """
        if self._ordered_indices is None:
            per_dataset = OrderedDict()
            for name, dataset in self.datasets.items():
                per_dataset[name] = dataset.ordered_indices()
            self._ordered_indices = per_dataset
        return np.arange(len(self))

    def _map_index_to_dataset(self, key: int, index: int):
        """
        Map a global index to an index of the dataset stored under ``key``.

        The wrapped datasets have different lengths, so the index wraps
        around the length of the target dataset before being looked up in
        that dataset's captured ordering. ``ordered_indices()`` must have
        been called beforehand.
        """
        assert (
            self._ordered_indices is not None
        ), "Must call MultiCorpusSampledDataset.ordered_indices() first"
        wrapped = index % len(self.datasets[key])
        return self._ordered_indices[key][wrapped]

    def __getitem__(self, index: int):
        """
        Return an OrderedDict holding, for every wrapped dataset, the item
        that the global ``index`` maps to in that dataset.
        """
        item = OrderedDict()
        for name, dataset in self.datasets.items():
            item[name] = dataset[self._map_index_to_dataset(name, index)]
        return item

    def collater(self, samples: List[Dict]):
        """
        Build a mini-batch from ``samples``:

            1. Pick a dataset key with ``sampling_func``.
            2. Collate that dataset's entries with its own collater.

        Returns ``None`` when ``samples`` is empty.
        """
        if len(samples) == 0:
            return None

        chosen = self.sampling_func(list(self.datasets.keys()))
        picked = []
        for sample in samples:
            picked.append(sample[chosen])
        return self.datasets[chosen].collater(picked)

    def num_tokens(self, index: int):
        """
        Number of tokens used for batching: the maximum, over all wrapped
        datasets, of the token count of the example ``index`` maps to.
        """
        counts = (
            dataset.num_tokens(self._map_index_to_dataset(name, index))
            for name, dataset in self.datasets.items()
        )
        return max(counts)

    def size(self, index: int):
        """
        Size of the example (float or tuple): the maximum over all wrapped
        datasets. Used when filtering a dataset with max-positions.
        """
        sizes = (
            dataset.size(self._map_index_to_dataset(name, index))
            for name, dataset in self.datasets.items()
        )
        return max(sizes)

    @property
    def supports_prefetch(self):
        # True only if every wrapped dataset advertises prefetch support.
        for dataset in self.datasets.values():
            if not getattr(dataset, "supports_prefetch", False):
                return False
        return True

    def prefetch(self, indices):
        """Forward a prefetch request to each wrapped dataset, using its mapped indices."""
        for name, dataset in self.datasets.items():
            mapped = [self._map_index_to_dataset(name, index) for index in indices]
            dataset.prefetch(mapped)

    @property
    def supports_fetch_outside_dataloader(self):
        # True only if every wrapped dataset supports it.
        for name in self.datasets:
            if not self.datasets[name].supports_fetch_outside_dataloader:
                return False
        return True
def load_sampling_weights(from_file):
    """Load the sampling weights dictionary stored as JSON in ``from_file``.

    The file is decoded as UTF-8 explicitly, so dataset keys containing
    non-ASCII characters load the same way regardless of the platform's
    default locale encoding.
    """
    with open(from_file, encoding="utf-8") as f:
        weights = json.load(f)
    return weights
@staticmethod
def add_args(parser):
    """Register the multilingual data options on ``parser``.

    Args:
        parser: an argparse parser (or argument group) to extend.
    """
    parser.add_argument(
        "data",
        help="colon separated path to data directories list, "
        "will be iterated upon during epochs in round-robin manner",
        action=FileContentsAction,
    )
    parser.add_argument(
        "--langs",
        default=None,
        type=csv_str_list,
        help="a comma separated list of languages which can appear in lang-pairs; "
        "note that the ordering determines language token IDs",
    )
    parser.add_argument(
        "--lang-dict",
        default=None,
        type=str,
        help="an external file which contains a list of "
        "languages which can appear in lang-pairs; "
        "note that the ordering determines language token IDs; "
        "--langs and --lang-dict are two exclusive options",
    )
    parser.add_argument(
        "--source-dict",
        default=None,
        type=str,
        help="path to source dictionary; if specified it will override per language dictionary loading",
    )
    parser.add_argument(
        "--target-dict",
        default=None,
        type=str,
        help="path to target dictionary; if specified it will override per language dictionary loading",
    )
    parser.add_argument(
        "--lang-tok-style",
        default=LangTokStyle.multilingual.value,
        type=str,
        choices=[LangTokStyle.multilingual.value, LangTokStyle.mbart.value],
        help="language token styles",
    )

    parser.add_argument(
        "--load-alignments",
        action="store_true",
        help="load the binarized alignments",
    )
    parser.add_argument(
        "--left-pad-source",
        default="True",
        type=str,
        metavar="BOOL",
        help="pad the source on the left",
    )
    parser.add_argument(
        "--left-pad-target",
        default="False",
        type=str,
        metavar="BOOL",
        help="pad the target on the left",
    )
    # These options might have already been defined elsewhere. Each one is
    # guarded separately so that a conflict on the first does not silently
    # skip registering the second.
    # Once we transition this to hydra it should be fine to add it here.
    for option, side in (
        ("--max-source-positions", "source"),
        ("--max-target-positions", "target"),
    ):
        try:
            parser.add_argument(
                option,
                default=1024,
                type=int,
                metavar="N",
                help=f"max number of tokens in the {side} sequence",
            )
        except ArgumentError:
            pass
    parser.add_argument(
        "--upsample-primary",
        default=1,
        type=int,
        help="amount to upsample primary dataset",
    )
    parser.add_argument(
        "--truncate-source",
        action="store_true",
        default=False,
        help="truncate source to max-source-positions",
    )
    parser.add_argument(
        "--encoder-langtok",
        default=None,
        type=str,
        choices=[EncoderLangtok.src.value, EncoderLangtok.tgt.value],
        metavar="SRCTGT",
        help="prepend to the beginning of source sentence the source or target "
        "language token. (src/tgt)",
    )
    parser.add_argument(
        "--decoder-langtok",
        action="store_true",
        help="prepend to the beginning of target sentence the target language token",
    )
    parser.add_argument(
        "--lang-tok-replacing-bos-eos", action="store_true", default=False
    )
    parser.add_argument(
        "--enable-lang-ids",
        default=False,
        action="store_true",
        help="whether to include language IDs in samples",
    )
    # NOTE: the misspelled flag name is part of the CLI and is kept as is.
    parser.add_argument(
        "--enable-reservsed-directions-shared-datasets",
        default=False,
        action="store_true",
        help="whether to allow datasets be used in reversed directions",
    )

    parser.add_argument(
        "--extra-data",
        help="a dictionary of data name to this path, "
        'e.g. {"mined": path_to_mined_data, "denoised": path_to_denoised_data}',
        type=lambda uf: eval_str_dict(uf, type=str),
        default=None,
    )
    parser.add_argument(
        "--extra-lang-pairs",
        help="a dictionary of data name to the language pairs they serve, "
        'e.g. {"mined": comma-separated-lang-pairs, "denoised": comma-separated-lang-pairs}',
        type=lambda uf: eval_str_dict(uf, type=str),
        default=None,
    )
    parser.add_argument(
        "--fixed-dictionary",
        help="Fixed dictionary to use with model path",
        default=None,
        type=str,
    )
    parser.add_argument(
        "--langtoks-specs",
        help="a list of comma separated data types that a set of language tokens to be specialized for, "
        'e.g. "main,dae,mined". There will be a set of language tokens added to the vocab to '
        "distinguish languages in different training data types. If not specified, default language "
        "tokens per languages will be added",
        default=LangTokSpec.main.value,
        type=csv_str_list,
    )
    parser.add_argument(
        "--langtoks",
        help="a dictionary of how to add language tokens, "
        'e.g. {"mined": (None, "tgt"), "mono_dae": ("src.dae", "tgt"), "main": '
        '("src", "tgt")}, or {"mined": ("src.mined", "tgt")}',
        default=None,
        type=lambda uf: eval_str_dict(uf, type=str),
    )
    parser.add_argument(
        "--sampling-weights-from-file",
        help="a JSON file containing a dictionary of how to sample data sets, "
        'e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, '
        '"mono_dae:es_XX-es_XX": 0.3, "main:en_xx-fr_XX": 0.8 }',
        default=None,
        type=str,
    )
    parser.add_argument(
        "--sampling-weights",
        help="a dictionary of how to sample data sets, "
        'e.g. { "main:en_XX-es_XX": 0.2, "mined:en_XX-pt_XX": 0.5, '
        '"mono_dae:es_XX-es_XX": 0.3, "main:en_xx-fr_XX": 0.8 }',
        default=None,
        type=lambda uf: eval_str_dict(uf, type=str),
    )
    parser.add_argument(
        "--virtual-epoch-size",
        default=None,
        type=int,
        help="virtual epoch size to speed up data loading",
    )
    parser.add_argument(
        "--virtual-data-size",
        default=None,
        type=int,
        help="virtual data size of the whole joint dataset to speed "
        "up data loading and have specific dynamic sampling strategy interval",
    )
def estimate_global_pass_epoch(self, epoch):
    """Map a (1-based) virtual epoch to the 1-based shard epoch it belongs to.

    Returns ``None`` unless both ``virtual_epoch_size`` and
    ``virtual_data_size`` are configured.
    """
    args = self.args
    if args.virtual_epoch_size is None:
        return None
    if args.virtual_data_size is None:
        return None
    # Rounding up gives one extra virtual epoch for the data left over
    # in each shard.
    epochs_per_shard = math.ceil(args.virtual_data_size / args.virtual_epoch_size)
    # fairseq epochs and shard epochs both start from 1
    return 1 + (epoch - 1) // epochs_per_shard
@classmethod
def load_all_dictionaries(cls, args, language_list, load_dictionary, training):
    """Load every dictionary needed for the configured language pairs.

    Args:
        args: parsed options; reads ``source_dict``, ``target_dict``,
            ``fixed_dictionary``, ``data``, ``lang_pairs``,
            ``extra_lang_pairs``, ``source_lang`` and ``target_lang``.
        language_list: not used by this method.
        load_dictionary: callable taking a path and returning a dictionary.
        training: if true, languages come from ``lang_pairs`` plus
            ``extra_lang_pairs``; otherwise only ``source_lang`` and
            ``target_lang`` are loaded.

    Returns:
        A mapping from language (or ``SRC_DICT_NAME`` / ``TGT_DICT_NAME``)
        to dictionary. With ``--fixed-dictionary`` it is a plain dict that
        maps every language to the same fixed dictionary.
    """
    dicts = OrderedDict()
    if args.source_dict is not None:
        dicts[SRC_DICT_NAME] = load_dictionary(args.source_dict)
    if args.target_dict is not None:
        dicts[TGT_DICT_NAME] = load_dictionary(args.target_dict)

    if training:
        extra_lang_pairs = (
            list(
                {p for _, v in args.extra_lang_pairs.items() for p in v.split(",")}
            )
            if args.extra_lang_pairs
            else []
        )
        all_lang_pairs = args.lang_pairs + extra_lang_pairs
        src_langs_to_load_dicts = sorted({p.split("-")[0] for p in all_lang_pairs})
        tgt_langs_to_load_dicts = sorted({p.split("-")[1] for p in all_lang_pairs})
    else:
        src_langs_to_load_dicts = [args.source_lang]
        tgt_langs_to_load_dicts = [args.target_lang]

    paths = utils.split_paths(args.data)
    assert len(paths) > 0

    def load_dicts(langs_to_load_dicts):
        # Per-language dictionaries are read from the first data path.
        for lang in langs_to_load_dicts:
            dicts[lang] = load_dictionary(
                os.path.join(paths[0], "dict.{}.txt".format(lang))
            )
            # Validate and log inside the loop so that every language is
            # checked against the first loaded dictionary, and so that an
            # empty language list never touches an unbound ``lang``.
            dict0 = next(iter(dicts.values()))
            assert dicts[lang].pad() == dict0.pad()
            assert dicts[lang].eos() == dict0.eos()
            assert dicts[lang].unk() == dict0.unk()
            logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))

    if args.fixed_dictionary is not None:
        fixed_dict = load_dictionary(args.fixed_dictionary)
        dicts = {
            lang: fixed_dict
            for lang in src_langs_to_load_dicts + tgt_langs_to_load_dicts
        }
    else:
        if args.source_dict is None:
            load_dicts(src_langs_to_load_dicts)
        if args.target_dict is None:
            load_dicts(tgt_langs_to_load_dicts)
    return dicts
@classmethod
def split_exists(cls, split, src, tgt, lang, data_path, dataset_impl):
    """Report whether ``<data_path>/<split>.<src>-<tgt>.<lang>`` exists as an indexed dataset."""
    basename = f"{split}.{src}-{tgt}.{lang}"
    candidate = os.path.join(data_path, basename)
    return indexed_dataset.dataset_exists(candidate, impl=dataset_impl)
def load_langpair_dataset(
    self,
    data_path,
    split,
    src,
    src_dict,
    tgt,
    tgt_dict,
    combine,
    dataset_impl,
    upsample_primary,
    left_pad_source,
    left_pad_target,
    max_source_positions,
    max_target_positions,
    prepend_bos=False,
    load_alignments=False,
    truncate_source=False,
    src_dataset_transform_func=lambda dataset: dataset,
    tgt_dataset_transform_func=lambda dataset: dataset,
    src_lang_id=None,
    tgt_lang_id=None,
    langpairs_sharing_datasets=None,
):
    """Build the LanguagePairDataset for one ``src``-``tgt`` direction.

    When ``langpairs_sharing_datasets`` (a dict used as a cache) is given,
    the source, target and alignment datasets of the training split are
    stored in it and reused by the reversed direction instead of being
    loaded a second time. Freshly loaded datasets are passed through the
    two transform functions before being cached.
    """
    cache = langpairs_sharing_datasets
    # Both directions of a pair share one normalized cache key prefix.
    norm_direction = "-".join(sorted([src, tgt]))
    src_key = (data_path, split, norm_direction, src)
    tgt_key = (data_path, split, norm_direction, tgt)
    align_key = (data_path, split, norm_direction, src, tgt)

    if cache is not None:
        src_dataset = cache.get(src_key, "NotInCache")
        tgt_dataset = cache.get(tgt_key, "NotInCache")
        align_dataset = cache.get(align_key, "NotInCache")

    # a hack: any one is not in cache, we need to reload them
    needs_loading = (
        cache is None
        or src_dataset == "NotInCache"
        or tgt_dataset == "NotInCache"
        or align_dataset == "NotInCache"
        or split != getattr(self.args, "train_subset", None)
    )
    if not needs_loading:
        logger.info(
            f"Reusing source and target datasets of [{split}] {tgt}-{src} for reversed direction: "
            f"[{split}] {src}-{tgt}: src length={len(src_dataset)}; tgt length={len(tgt_dataset)}"
        )
    else:
        # source and target datasets can be reused in reversed directions to save memory
        # reversed directions of valid and test data will not share source and target datasets
        src_dataset, tgt_dataset, align_dataset = self.load_lang_dataset(
            data_path,
            split,
            src,
            src_dict,
            tgt,
            tgt_dict,
            combine,
            dataset_impl,
            upsample_primary,
            max_source_positions=max_source_positions,
            prepend_bos=prepend_bos,
            load_alignments=load_alignments,
            truncate_source=truncate_source,
        )
        src_dataset = src_dataset_transform_func(src_dataset)
        tgt_dataset = tgt_dataset_transform_func(tgt_dataset)
        if cache is not None:
            cache[src_key] = src_dataset
            cache[tgt_key] = tgt_dataset
            cache[align_key] = align_dataset
            if align_dataset is None:
                # no align data so flag the reverse direction as well in sharing
                cache[(data_path, split, norm_direction, tgt, src)] = align_dataset

    tgt_sizes = tgt_dataset.sizes if tgt_dataset is not None else None
    return LanguagePairDataset(
        src_dataset,
        src_dataset.sizes,
        src_dict,
        tgt_dataset,
        tgt_sizes,
        tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        align_dataset=align_dataset,
        src_lang_id=src_lang_id,
        tgt_lang_id=tgt_lang_id,
    )
def alter_dataset_langtok(
    self,
    lang_pair_dataset,
    src_eos=None,
    src_lang=None,
    tgt_eos=None,
    tgt_lang=None,
    src_langtok_spec=None,
    tgt_langtok_spec=None,
):
    """Wrap ``lang_pair_dataset`` so language tokens replace EOS/BOS.

    Returns the dataset untouched when neither langtok spec is given.
    Otherwise a TransformEosLangPairDataset is returned. A side whose
    replacement cannot be determined gets ``None`` for both its original
    and its new token.
    """
    if src_langtok_spec is None and tgt_langtok_spec is None:
        return lang_pair_dataset

    replace_src = (
        src_langtok_spec is not None
        and src_eos is not None
        and (src_lang is not None or tgt_lang is not None)
    )
    if replace_src:
        new_src_eos = self.get_encoder_langtok(src_lang, tgt_lang, src_langtok_spec)
    else:
        src_eos = None
        new_src_eos = None

    replace_tgt = tgt_langtok_spec and tgt_eos is not None and tgt_lang is not None
    if replace_tgt:
        new_tgt_bos = self.get_decoder_langtok(tgt_lang, tgt_langtok_spec)
    else:
        tgt_eos = None
        new_tgt_bos = None

    return TransformEosLangPairDataset(
        lang_pair_dataset,
        src_eos=src_eos,
        new_src_eos=new_src_eos,
        tgt_bos=tgt_eos,
        new_tgt_bos=new_tgt_bos,
    )
def load_split_langpair_datasets(self, split, data_param_list):
    """Load one dataset per entry of ``data_param_list`` for ``split``.

    Each entry is forwarded to ``load_a_dataset`` as keyword arguments.
    When reversed-direction sharing is enabled, a single dict is passed to
    every call as the shared cache; otherwise ``None`` is passed.
    """
    if self.args.enable_reservsed_directions_shared_datasets:
        shared_cache = {}
    else:
        shared_cache = None
    return [
        self.load_a_dataset(
            split=split,
            langpairs_sharing_datasets=shared_cache,
            **param,
        )
        for param in data_param_list
    ]
def get_split_num_data_shards(self, split):
    """Return (and cache) the shard count of every dataset key in ``split``.

    The result maps ``get_dataset_key(category, src, tgt)`` to the number of
    shards reported by ``_get_shard_num_dict``. Categories whose name contains
    ``"mono_"`` are looked up by target language only. Other categories try
    the ``src-tgt`` direction first and fall back to ``tgt-src``; a pair
    found in neither direction gets no entry.
    """
    if split in self._num_shards_dict:
        return self._num_shards_dict[split]

    data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split)
    num_shards_dict = {}
    for data_category, raw_paths in data_paths.items():
        if data_category not in lang_pairs:
            continue
        shards_dict = self._get_shard_num_dict(split, utils.split_paths(raw_paths))

        # A pair without "-" is treated as the same language on both sides.
        lang_dirs = []
        for lang_pair in lang_pairs[data_category]:
            parts = lang_pair.split("-")
            lang_dirs.append(parts if len(parts) > 1 else (parts[0], parts[0]))

        is_monolingual = "mono_" in data_category
        for src, tgt in lang_dirs:
            key = self.get_dataset_key(data_category, src, tgt)
            forward = f"{src}-{tgt}"
            backward = f"{tgt}-{src}"
            if is_monolingual:
                # monolingual data requires tgt only
                assert src is None or src == tgt, (
                    f"error: src={src}, "
                    f"tgt={tgt} for data_category={data_category}"
                )
                num_shards_dict[key] = shards_dict[tgt]
            elif forward in shards_dict:
                num_shards_dict[key] = shards_dict[forward]
            elif backward in shards_dict:
                # follow the fairseq tradition to use reversed direction data if it is not available
                num_shards_dict[key] = shards_dict[backward]

    self._num_shards_dict[split] = num_shards_dict
    logger.info(f"[{split}] num of shards: {num_shards_dict}")
    return num_shards_dict
def get_split_data_param_list(self, split, epoch, shard_epoch=None):
    """Build the ``load_a_dataset`` keyword dicts for every dataset of ``split``.

    One dict is produced per (data category, language pair). It holds the
    dataset key, the shard path selected through ``get_split_data_path``, the
    split, the language codes with their dictionaries, the data category and
    the category's langtok spec (``(None, None)`` when none is configured).
    Sets ``self._has_sharded_data`` when a category lists more than one path.
    """
    # TODO: to extend with extra datasets and keys and loop over different shard data paths
    data_paths, lang_pairs = self.get_data_paths_and_lang_pairs(split)
    logger.info(f"langtoks settings: {self.args.langtoks}")
    split_num_shards_dict = self.get_split_num_data_shards(split)

    param_list = []
    for data_category, raw_paths in data_paths.items():
        if data_category not in lang_pairs:
            continue
        paths = utils.split_paths(raw_paths)
        assert len(paths) > 0
        if len(paths) > 1:
            self._has_sharded_data = True
        if split != getattr(self.args, "train_subset", None):
            # if not training data set, use the first shard for valid and test
            paths = paths[:1]

        # default to (None, None) when the category has no langtok setting
        lang_tok_spec = (
            self.args.langtoks[data_category]
            if data_category in self.args.langtoks
            else (None, None)
        )

        # infer langcode; a pair without "-" uses the same code on both sides
        lang_dirs = []
        for lang_pair in lang_pairs[data_category]:
            parts = lang_pair.split("-")
            lang_dirs.append(parts if len(parts) > 1 else (parts[0], parts[0]))

        for src, tgt in lang_dirs:
            assert src is not None or data_category == "mono_dae", (
                f"error: src={src}, "
                f"tgt={tgt} for data_category={data_category}"
            )
            key = self.get_dataset_key(data_category, src, tgt)
            data_path = self.get_split_data_path(
                paths, epoch, shard_epoch, split_num_shards_dict[key]
            )
            if src and data_category != "mono_dae":
                src_dict = self.get_source_dictionary(src)
            else:
                src_dict = None
            tgt_dict = self.get_target_dictionary(tgt)
            param_list.append(
                {
                    "key": key,
                    "data_path": data_path,
                    "split": split,
                    "src": src,
                    "src_dict": src_dict,
                    "tgt": tgt,
                    "tgt_dict": tgt_dict,
                    "data_category": data_category,
                    "langtok_spec": lang_tok_spec,
                }
            )
    return param_list
def get_train_sampling_ratios(
    self, data_param_list, datasets, epoch=1, shard_epoch=None
):
    """Turn estimated training-set sizes into sampling ratios.

    The sizes come from ``get_train_dataset_sizes``; the conversion function
    comes from ``self.sampling_method.sampling_method_selector()``. Returns
    ``None`` when the selector yields no function.
    """
    estimated_sizes = self.get_train_dataset_sizes(
        data_param_list, datasets, epoch, shard_epoch
    )
    to_ratios = self.sampling_method.sampling_method_selector()
    if to_ratios is None:
        return None
    return to_ratios(estimated_sizes)
def load_sampled_multi_epoch_dataset(
    self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs
):
    """Load ``split`` as a ``SampledMultiEpochDataset`` (training) or a concat.

    The sampled dataset is only built when ``training`` is true and ``split``
    equals ``args.train_subset``; every other case is delegated to
    ``load_into_concat_dataset``.

    Args:
        split: name of the split to load.
        training: whether the data is loaded for training.
        epoch: epoch number forwarded to loading and sampling.
        combine: forwarded to ``load_split_datasets``.
        shard_epoch: epoch used for shard selection; forwarded both to
            ``load_split_datasets`` and to ``get_sampling_ratios``.
        **kwargs: forwarded to ``load_split_datasets``.
    """
    datasets, data_param_list = self.load_split_datasets(
        split, training, epoch, combine, shard_epoch=shard_epoch, **kwargs
    )
    if training and split == getattr(self.args, "train_subset", None):
        # The datasets above were picked with ``shard_epoch``; the size
        # estimation behind the sampling ratios must use the same value,
        # otherwise the loaded shard's length is recorded under the shard
        # index derived from ``epoch``.
        sample_ratios = self.get_sampling_ratios(
            data_param_list, datasets, epoch, shard_epoch=shard_epoch
        )
        return SampledMultiEpochDataset(
            OrderedDict(datasets),
            epoch=epoch,
            shard_epoch=shard_epoch,
            # valid and test datasets will be degenerate to concating datasets:
            sampling_ratios=sample_ratios,
            eval_key=None,
            collate_format=CollateFormat.single,
            virtual_size=self.args.virtual_data_size,
            split=split,
            virtual_epoch_size=self.args.virtual_epoch_size,
            # if not using lang_tok altering, simplified to use the same collater
            shared_collater=self._shared_collater(),
        )
    else:
        return self.load_into_concat_dataset(split, datasets, data_param_list)
def load_dataset(
    self, split, training, epoch=0, combine=False, shard_epoch=None, **kwargs
):
    """Load ``split`` with the loader matching ``args.virtual_epoch_size``.

    Without a virtual epoch size the call goes to
    ``load_sampled_multi_dataset``; with one it goes to
    ``load_sampled_multi_epoch_dataset``. All arguments are passed through
    unchanged.
    """
    if self.args.virtual_epoch_size is None:
        loader = self.load_sampled_multi_dataset
    else:
        loader = self.load_sampled_multi_epoch_dataset
    return loader(split, training, epoch, combine, shard_epoch, **kwargs)
def augment_dictionary(
    dictionary: Dictionary,
    language_list: List[str],
    lang_tok_style: str,
    langtoks_specs: Sequence[str] = (LangTokSpec.main.value,),
    extra_data: Optional[Dict[str, str]] = None,
) -> None:
    """Add language tokens (and possibly ``<mask>``) to ``dictionary`` in place.

    For every spec in ``langtoks_specs`` and every language in
    ``language_list`` the token produced by ``get_lang_tok`` is added.
    ``<mask>`` is added as well when the style is the mbart style or when
    ``extra_data`` contains the ``mono_dae`` key.
    """
    for spec in langtoks_specs:
        for language in language_list:
            token = get_lang_tok(
                lang=language, lang_tok_style=lang_tok_style, spec=spec
            )
            dictionary.add_symbol(token)

    needs_mask = lang_tok_style == LangTokStyle.mbart.value
    if not needs_mask and extra_data is not None:
        needs_mask = LangTokSpec.mono_dae.value in extra_data
    if needs_mask:
        dictionary.add_symbol("<mask>")
def default_virtual_size_func(datasets, ratios, max_scale_up=1.5):
    """Compute the default virtual size for a set of sampled datasets.

    With ``ratios`` of ``None`` this is the plain sum of the dataset
    lengths. Otherwise each dataset is scaled by its ratio relative to the
    ratio of the largest dataset, and the total is capped at
    ``max_scale_up`` times the real total size. The result is truncated to
    ``int``.
    """
    lengths = [len(ds) for ds in datasets]
    if ratios is None:
        return sum(lengths)

    # set virtual sizes relative to the largest dataset
    anchor = np.argmax(lengths)
    anchor_ratio = ratios[anchor]
    anchor_length = lengths[anchor]
    scaled_total = sum(
        [(ratio / anchor_ratio) * anchor_length for ratio in ratios]
    )
    cap = sum(lengths) * max_scale_up
    if scaled_total < cap:
        return int(scaled_total)
    return int(cap)
def __init__(
    self,
    datasets,
    sampling_ratios=None,
    seed=2,
    epoch=1,
    eval_key=None,
    collate_format=CollateFormat.single,
    virtual_size=default_virtual_size_func,
    split="",
    shared_collater=False,
    shuffle=True,
):
    """Store the sub-datasets, configure sampling and establish ``epoch``.

    ``datasets`` must be an ``OrderedDict`` (its keys become ``self.keys``)
    or a list (keys are the positions); anything else raises
    ``AssertionError``. When ``eval_key`` is given, the collate format is
    forced to ``CollateFormat.single``.
    """
    super().__init__()
    self.shared_collater = shared_collater
    self.shuffle = shuffle

    if isinstance(datasets, OrderedDict):
        keys, members = list(datasets.keys()), list(datasets.values())
    elif isinstance(datasets, List):
        keys, members = list(range(len(datasets))), datasets
    else:
        raise AssertionError()
    self.keys = keys
    self.datasets = members

    self.split = split
    self.eval_key = eval_key
    self.collate_format = (
        CollateFormat.single if eval_key is not None else collate_format
    )
    self.seed = seed
    self._cur_epoch = None

    self.cumulated_sizes = None
    # self.datasets[k][self._cur_indices[i]] is the data item i in this sampled dataset
    # namely, data item i is sampled from the kth sub-dataset self.datasets[k]
    # where self.cumulated_sizes[k-1] <= i < self.cumulated_sizes[k]
    self._cur_indices = None

    self._sizes = None
    self.virtual_size_per_dataset = None
    # caching properties
    self._reset_cached_properties()
    # sampling must be configured before the first epoch is established
    self.setup_sampling(sampling_ratios, virtual_size)
    self.set_epoch(epoch)
self.virtual_size = sum(sizes) else: if not isinstance(sample_ratios, np.ndarray): sample_ratios = np.array(sample_ratios) self.sample_ratios = sample_ratios virtual_size = ( default_virtual_size_func if virtual_size is None else virtual_size ) self.virtual_size = ( virtual_size(self.datasets, self.sample_ratios) if callable(virtual_size) else virtual_size ) def adjust_sampling(self, epoch, sampling_ratios, virtual_size): if sampling_ratios is not None: sampling_ratios = self._sync_sample_ratios(sampling_ratios) self.setup_sampling(sampling_ratios, virtual_size) def _sync_sample_ratios(self, ratios): # in case the ratios are not precisely the same across processes # also to ensure every procresses update the ratios in the same pace ratios = torch.DoubleTensor(ratios) if torch.distributed.is_initialized(): if torch.cuda.is_available(): distributed_utils.all_reduce( ratios.cuda(), group=distributed_utils.get_data_parallel_group() ) else: distributed_utils.all_reduce( ratios, group=distributed_utils.get_data_parallel_group() ) ret = ratios.cpu() ret = ret.numpy() return ret def random_choice_in_dataset(self, rng, dataset, choice_size): if hasattr(dataset, "random_choice_in_dataset"): return dataset.random_choice_in_dataset(rng, choice_size) dataset_size = len(dataset) return rng.choice( dataset_size, choice_size, replace=(choice_size > dataset_size) ) def get_virtual_indices(self, rng, datasets, sample_ratios, virtual_size): def get_counts(sample_ratios): counts = np.array([virtual_size * r for r in sample_ratios], dtype=np.int64) diff = virtual_size - counts.sum() assert diff >= 0 # due to round-offs, the size might not match the desired sizes if diff > 0: dataset_indices = rng.choice( len(sample_ratios), size=diff, p=sample_ratios ) for i in dataset_indices: counts[i] += 1 return counts def get_in_dataset_indices(datasets, sizes, sample_ratios): counts = get_counts(sample_ratios) # uniformally sample desired counts for each dataset # if the desired counts are large, 
def collater(self, samples, **extra_args):
    """Merge a list of samples to form a mini-batch.

    Args:
        samples (list): ``(dataset_index, sample)`` pairs, the form returned
            by ``__getitem__``.
        **extra_args: may contain ``pad_to_length``, a mapping with
            ``"source"``/``"target"`` entries that is updated with the
            longest lengths seen and forwarded to the sub-dataset collaters.

    Returns:
        ``None`` for an empty ``samples`` list. In the ordered-dict collate
        format, an ``OrderedDict`` mapping each dataset key that has samples
        to that dataset's collated batch. Otherwise a single batch: either
        the first dataset's collater applied to all samples (shared
        collater), or the per-dataset batches concatenated along dim 0 and
        re-ordered by descending source length.
    """
    if len(samples) == 0:
        return None
    collate_format = self.collate_format
    # ``collate_format`` is normally a ``CollateFormat`` member, which never
    # compares equal to a plain string; match on the member name as well so
    # the ordered-dict format is actually reachable. The bare string is still
    # accepted for callers that pass it.
    if (
        collate_format == "ordered_dict"
        or getattr(collate_format, "name", None) == "ordered_dict"
    ):
        collect_samples = [[] for _ in range(len(self.datasets))]
        for (i, sample) in samples:
            collect_samples[i].append(sample)
        batch = OrderedDict(
            [
                (key, dataset.collater(group))
                for key, dataset, group in zip(
                    self.keys, self.datasets, collect_samples
                )
                if len(group) > 0
            ]
        )
    elif self.shared_collater:
        batch = self.datasets[0].collater([s for _, s in samples])
    else:
        samples_dict = defaultdict(list)
        pad_to_length = (
            defaultdict(int)
            if "pad_to_length" not in extra_args
            else extra_args["pad_to_length"]
        )
        # Track the longest source/target so every sub-batch is padded to
        # the same width and can be concatenated below.
        for ds_idx, s in samples:
            pad_to_length["source"] = max(
                pad_to_length["source"], s["source"].size(0)
            )
            if s["target"] is not None:
                pad_to_length["target"] = max(
                    pad_to_length["target"], s["target"].size(0)
                )
            samples_dict[ds_idx].append(s)
        batches = [
            self.datasets[i].collater(samples_dict[i], pad_to_length=pad_to_length)
            for i in range(len(self.datasets))
            if len(samples_dict[i]) > 0
        ]

        def straight_data(tensors):
            batch = torch.cat(tensors, dim=0)
            return batch

        src_lengths = straight_data(
            [b["net_input"]["src_lengths"] for b in batches]
        )
        src_lengths, sort_order = src_lengths.sort(descending=True)

        def straight_order(tensors):
            # concatenate, then apply the descending-source-length order
            batch = straight_data(tensors)
            return batch.index_select(0, sort_order)

        batch = {
            "id": straight_order([b["id"] for b in batches]),
            "nsentences": sum(b["nsentences"] for b in batches),
            "ntokens": sum(b["ntokens"] for b in batches),
            "net_input": {
                "src_tokens": straight_order(
                    [b["net_input"]["src_tokens"] for b in batches]
                ),
                "src_lengths": src_lengths,
            },
            "target": straight_order([b["target"] for b in batches])
            if batches[0]["target"] is not None
            else None,
        }
        if "prev_output_tokens" in batches[0]["net_input"]:
            batch["net_input"]["prev_output_tokens"] = straight_order(
                [b["net_input"]["prev_output_tokens"] for b in batches]
            )
        if "src_lang_id" in batches[0]["net_input"]:
            batch["net_input"]["src_lang_id"] = straight_order(
                [b["net_input"]["src_lang_id"] for b in batches]
            )
        if "tgt_lang_id" in batches[0]:
            batch["tgt_lang_id"] = straight_order(
                [b["tgt_lang_id"] for b in batches]
            )
    return batch
def filter_indices_by_size(self, indices, max_sizes):
    """Drop the sample indices whose sizes exceed ``max_sizes``.

    Args:
        indices (np.array): original array of sample indices
        max_sizes (int or list[int] or tuple[int]): max sample size,
            can be defined separately for src and tgt (then list or tuple)

    Returns:
        np.array: filtered sample array
        list: list of removed indices
    """
    sizes = self.sizes
    # Column 0 is used as source size and column 1 as target size when
    # there is more than one column; otherwise the array is source-only.
    has_target_column = len(sizes.shape) > 0 and sizes.shape[1] > 1
    if has_target_column:
        src_sizes, tgt_sizes = sizes[:, 0], sizes[:, 1]
    else:
        src_sizes, tgt_sizes = sizes, None
    return data_utils.filter_paired_dataset_indices_by_size(
        src_sizes, tgt_sizes, indices, max_sizes
    )
collate_format (CollateFormat): collater output format, either CollateFormat.ordered_dict or CollateFormat.single (default: CollateFormat.single) where CollateFormat.single configures the collater to output batches of data mixed from all sub-datasets, and CollateFormat.ordered_dict configures the collater to output a dictionary of batches indexed by keys of sub-datasets. Note that not all sub-datasets will present in a single batch in both formats. virtual_size (int, or callable): the expected virtual size of the dataset (default: default_virtual_size_func). split (str): the split of the data, e.g. 'train', 'valid' or 'test'. virtual_epoch_size (int): virtual epoch size, the dataset will go through the data by this virtual epoch size one by one to speed up data loading, e.g. indicing and filtering can be performed whenever a virtual epoch is loaded without waiting for the whole dataset to be loaded. shared_collater (bool): whether or not to all sub-datasets have the same collater. shard_epoch (int): the real epoch number for shard selection. shuffle (bool): whether or not to shuffle data (default: True). 
""" def __init__( self, datasets, sampling_ratios=None, seed=2, epoch=1, eval_key=None, collate_format=CollateFormat.single, virtual_size=default_virtual_size_func, split="", virtual_epoch_size=None, shared_collater=False, shard_epoch=1, shuffle=True, ): self.virtual_epoch_size = virtual_epoch_size self._current_epoch_start_index = None self._random_global_indices = None self.shard_epoch = shard_epoch if shard_epoch is not None else 1 self.load_next_shard = None self._epoch_sizes = None super().__init__( datasets=datasets, sampling_ratios=sampling_ratios, seed=seed, epoch=epoch, eval_key=eval_key, collate_format=collate_format, virtual_size=virtual_size, split=split, shared_collater=shared_collater, shuffle=shuffle, ) def _setup(self, epoch): self.virtual_epoch_size = ( self.virtual_epoch_size if self.virtual_epoch_size is not None else self.virtual_size ) if self.virtual_epoch_size > self.virtual_size: logger.warning( f"virtual epoch size {self.virtual_epoch_size} " f"is greater than virtual dataset size {self.virtual_size}" ) self.virtual_epoch_size = self.virtual_size self.num_virtual_epochs = math.ceil(self.virtual_size / self.virtual_epoch_size) self._current_epoch_start_index = self._get_epoch_start_index(epoch) logger.info( f"virtual epoch size {self.virtual_epoch_size}; virtual dataset size {self.virtual_size}" ) def _map_epoch_index_to_global(self, index): index = self._current_epoch_start_index + index # add randomness return self._random_global_indices[index] @property def sizes(self): if self._epoch_sizes is not None: return self._epoch_sizes _sizes = super().sizes indices = self._random_global_indices[ self._current_epoch_start_index : self._current_epoch_start_index + len(self) ] self._epoch_sizes = _sizes[indices] # del super()._sizes to save memory del self._sizes self._sizes = None return self._epoch_sizes def _get_dataset_and_index(self, index): i = self._map_epoch_index_to_global(index) return super()._get_dataset_and_index(i) def __len__(self): 
return ( self.virtual_epoch_size if self._current_epoch_start_index + self.virtual_epoch_size < self.virtual_size else self.virtual_size - self._current_epoch_start_index ) def set_epoch(self, epoch): if self._current_epoch_start_index is None: # initializing epoch idnices of a virtual dataset self._setup(epoch) self._next_virtual_epoch(epoch) else: # working on already intialized epoch indices if epoch == self._cur_epoch: # re-enter so return return self._next_virtual_epoch(epoch) def _get_epoch_start_index(self, epoch): assert epoch >= 1 # fairseq is using 1-based epoch everywhere return ((epoch - 1) % self.num_virtual_epochs) * self.virtual_epoch_size def _next_global_indices(self, epoch): rng = np.random.RandomState( [ int( hashlib.sha1( str(self.__class__.__name__).encode("utf-8") ).hexdigest(), 16, ) % (2**32), self.seed % (2**32), # global seed epoch, # epoch index, ] ) del self._random_global_indices self._random_global_indices = rng.choice( self.virtual_size, self.virtual_size, replace=False ) if self.load_next_shard is None: self.load_next_shard = False else: # increase shard epoch for next loading self.shard_epoch += 1 self.load_next_shard = True logger.info( "to load next epoch/shard in next load_dataset: " f"epoch={epoch}/shard_epoch={self.shard_epoch}" ) def _next_virtual_epoch(self, epoch): index = self._get_epoch_start_index(epoch) if index == 0 or self._random_global_indices is None: # need to start from the beginning, # so call super().set_epoch(epoch) to establish the global virtual indices logger.info( "establishing a new set of global virtual indices for " f"epoch={epoch}/shard_epoch={self.shard_epoch}" ) super().set_epoch(epoch) self._next_global_indices(epoch) else: self._cur_epoch = epoch # reset cache sizes and ordered_indices for the epoch after moving to a new epoch self._clean_if_not_none( [ self._epoch_sizes, ] ) self._epoch_sizes = None self._current_epoch_start_index = index ================================================ FILE: 
def temperature_sampling(dataset_sizes, temp):
    """Return temperature-scaled sampling weights for ``dataset_sizes``.

    Each weight is the dataset's share of the total size raised to the power
    ``1 / temp``. The weights are not re-normalised here.
    """
    denominator = sum(dataset_sizes)
    weights = []
    for size in dataset_sizes:
        share = size / denominator
        weights.append(share ** (1.0 / temp))
    return weights
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from collections import OrderedDict import torch from torch.utils.data.dataloader import default_collate from . import FairseqDataset def _flatten(dico, prefix=None): """Flatten a nested dictionary.""" new_dico = OrderedDict() if isinstance(dico, dict): prefix = prefix + "." if prefix is not None else "" for k, v in dico.items(): if v is None: continue new_dico.update(_flatten(v, prefix + k)) elif isinstance(dico, list): for i, v in enumerate(dico): new_dico.update(_flatten(v, prefix + ".[" + str(i) + "]")) else: new_dico = OrderedDict({prefix: dico}) return new_dico def _unflatten(dico): """Unflatten a flattened dictionary into a nested dictionary.""" new_dico = OrderedDict() for full_k, v in dico.items(): full_k = full_k.split(".") node = new_dico for k in full_k[:-1]: if k.startswith("[") and k.endswith("]"): k = int(k[1:-1]) if k not in node: node[k] = OrderedDict() node = node[k] node[full_k[-1]] = v return new_dico class NestedDictionaryDataset(FairseqDataset): def __init__(self, defn, sizes=None): super().__init__() self.defn = _flatten(defn) self.sizes = [sizes] if not isinstance(sizes, (list, tuple)) else sizes first = None for v in self.defn.values(): if not isinstance( v, ( FairseqDataset, torch.utils.data.Dataset, ), ): raise ValueError("Expected Dataset but found: {}".format(v.__class__)) first = first or v if len(v) > 0: assert len(v) == len(first), "dataset lengths must match" self._len = len(first) def __getitem__(self, index): return OrderedDict((k, ds[index]) for k, ds in self.defn.items()) def __len__(self): return self._len def collater(self, samples): """Merge a list of samples to form a mini-batch. 
Args: samples (List[dict]): samples to collate Returns: dict: a mini-batch suitable for forwarding with a Model """ if len(samples) == 0: return {} sample = OrderedDict() for k, ds in self.defn.items(): try: sample[k] = ds.collater([s[k] for s in samples]) except NotImplementedError: sample[k] = default_collate([s[k] for s in samples]) return _unflatten(sample) def num_tokens(self, index): """Return the number of tokens in a sample. This value is used to enforce ``--max-tokens`` during batching.""" return max(s[index] for s in self.sizes) def size(self, index): """Return an example's size as a float or tuple. This value is used when filtering a dataset with ``--max-positions``.""" if len(self.sizes) == 1: return self.sizes[0][index] else: return (s[index] for s in self.sizes) @property def supports_prefetch(self): """Whether this dataset supports prefetching.""" return any(ds.supports_prefetch for ds in self.defn.values()) def prefetch(self, indices): """Prefetch the data required for this epoch.""" for ds in self.defn.values(): if getattr(ds, "supports_prefetch", False): ds.prefetch(indices) @property def can_reuse_epoch_itr_across_epochs(self): return all(ds.can_reuse_epoch_itr_across_epochs for ds in self.defn.values()) def set_epoch(self, epoch): super().set_epoch(epoch) for ds in self.defn.values(): ds.set_epoch(epoch) ================================================ FILE: fairseq/data/noising.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import numpy as np import torch from fairseq.data import data_utils class WordNoising(object): """Generate a noisy version of a sentence, without changing words themselves.""" def __init__(self, dictionary, bpe_cont_marker="@@", bpe_end_marker=None): self.dictionary = dictionary self.bpe_end = None if bpe_cont_marker: self.bpe_end = np.array( [ not self.dictionary[i].endswith(bpe_cont_marker) for i in range(len(self.dictionary)) ] ) elif bpe_end_marker: self.bpe_end = np.array( [ self.dictionary[i].endswith(bpe_end_marker) for i in range(len(self.dictionary)) ] ) self.get_word_idx = ( self._get_bpe_word_idx if self.bpe_end is not None else self._get_token_idx ) def noising(self, x, lengths, noising_prob=0.0): raise NotImplementedError() def _get_bpe_word_idx(self, x): """ Given a list of BPE tokens, for every index in the tokens list, return the index of the word grouping that it belongs to. For example, for input x corresponding to ["how", "are", "y@@", "ou"], return [[0], [1], [2], [2]]. """ # x: (T x B) bpe_end = self.bpe_end[x] if x.size(0) == 1 and x.size(1) == 1: # Special case when we only have one word in x. If x = [[N]], # bpe_end is a scalar (bool) instead of a 2-dim array of bools, # which makes the sum operation below fail. return np.array([[0]]) # do a reduce front sum to generate word ids word_idx = bpe_end[::-1].cumsum(0)[::-1] word_idx = word_idx.max(0)[None, :] - word_idx return word_idx def _get_token_idx(self, x): """ This is to extend noising functions to be able to apply to non-bpe tokens, e.g. word or characters. """ x = torch.t(x) word_idx = np.array([range(len(x_i)) for x_i in x]) return np.transpose(word_idx) class WordDropout(WordNoising): """Randomly drop input words. If not passing blank_idx (default is None), then dropped words will be removed. 
Otherwise, it will be replaced by the blank_idx.""" def __init__( self, dictionary, default_dropout_prob=0.1, bpe_cont_marker="@@", bpe_end_marker=None, ): super().__init__(dictionary, bpe_cont_marker, bpe_end_marker) self.default_dropout_prob = default_dropout_prob def noising(self, x, lengths, dropout_prob=None, blank_idx=None): if dropout_prob is None: dropout_prob = self.default_dropout_prob # x: (T x B), lengths: B if dropout_prob == 0: return x, lengths assert 0 < dropout_prob < 1 # be sure to drop entire words word_idx = self.get_word_idx(x) sentences = [] modified_lengths = [] for i in range(lengths.size(0)): # Since dropout probabilities need to apply over non-pad tokens, # it is not trivial to generate the keep mask without consider # input lengths; otherwise, this could be done outside the loop # We want to drop whole words based on word_idx grouping num_words = max(word_idx[:, i]) + 1 # ith example: [x0, x1, ..., eos, pad, ..., pad] # We should only generate keep probs for non-EOS tokens. Thus if the # input sentence ends in EOS, the last word idx is not included in # the dropout mask generation and we append True to always keep EOS. # Otherwise, just generate the dropout mask for all word idx # positions. has_eos = x[lengths[i] - 1, i] == self.dictionary.eos() if has_eos: # has eos? keep = np.random.rand(num_words - 1) >= dropout_prob keep = np.append(keep, [True]) # keep EOS symbol else: keep = np.random.rand(num_words) >= dropout_prob words = x[: lengths[i], i].tolist() # TODO: speed up the following loop # drop words from the input according to keep new_s = [ w if keep[word_idx[j, i]] else blank_idx for j, w in enumerate(words) ] new_s = [w for w in new_s if w is not None] # we need to have at least one word in the sentence (more than the # start / end sentence symbols) if len(new_s) <= 1: # insert at beginning in case the only token left is EOS # EOS should be at end of list. 
new_s.insert(0, words[np.random.randint(0, len(words))]) assert len(new_s) >= 1 and ( not has_eos # Either don't have EOS at end or last token is EOS or (len(new_s) >= 2 and new_s[-1] == self.dictionary.eos()) ), "New sentence is invalid." sentences.append(new_s) modified_lengths.append(len(new_s)) # re-construct input modified_lengths = torch.LongTensor(modified_lengths) modified_x = torch.LongTensor( modified_lengths.max(), modified_lengths.size(0) ).fill_(self.dictionary.pad()) for i in range(modified_lengths.size(0)): modified_x[: modified_lengths[i], i].copy_(torch.LongTensor(sentences[i])) return modified_x, modified_lengths class WordShuffle(WordNoising): """Shuffle words by no more than k positions.""" def __init__( self, dictionary, default_max_shuffle_distance=3, bpe_cont_marker="@@", bpe_end_marker=None, ): super().__init__(dictionary, bpe_cont_marker, bpe_end_marker) self.default_max_shuffle_distance = 3 def noising(self, x, lengths, max_shuffle_distance=None): if max_shuffle_distance is None: max_shuffle_distance = self.default_max_shuffle_distance # x: (T x B), lengths: B if max_shuffle_distance == 0: return x, lengths # max_shuffle_distance < 1 will return the same sequence assert max_shuffle_distance > 1 # define noise word scores noise = np.random.uniform( 0, max_shuffle_distance, size=(x.size(0), x.size(1)), ) noise[0] = -1 # do not move start sentence symbol # be sure to shuffle entire words word_idx = self.get_word_idx(x) x2 = x.clone() for i in range(lengths.size(0)): length_no_eos = lengths[i] if x[lengths[i] - 1, i] == self.dictionary.eos(): length_no_eos = lengths[i] - 1 # generate a random permutation scores = word_idx[:length_no_eos, i] + noise[word_idx[:length_no_eos, i], i] # ensure no reordering inside a word scores += 1e-6 * np.arange(length_no_eos.item()) permutation = scores.argsort() # shuffle words x2[:length_no_eos, i].copy_( x2[:length_no_eos, i][torch.from_numpy(permutation)] ) return x2, lengths class 
UnsupervisedMTNoising(WordNoising): """ Implements the default configuration for noising in UnsupervisedMT (github.com/facebookresearch/UnsupervisedMT) """ def __init__( self, dictionary, max_word_shuffle_distance, word_dropout_prob, word_blanking_prob, bpe_cont_marker="@@", bpe_end_marker=None, ): super().__init__(dictionary) self.max_word_shuffle_distance = max_word_shuffle_distance self.word_dropout_prob = word_dropout_prob self.word_blanking_prob = word_blanking_prob self.word_dropout = WordDropout( dictionary=dictionary, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker, ) self.word_shuffle = WordShuffle( dictionary=dictionary, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker, ) def noising(self, x, lengths): # 1. Word Shuffle noisy_src_tokens, noisy_src_lengths = self.word_shuffle.noising( x=x, lengths=lengths, max_shuffle_distance=self.max_word_shuffle_distance, ) # 2. Word Dropout noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising( x=noisy_src_tokens, lengths=noisy_src_lengths, dropout_prob=self.word_dropout_prob, ) # 3. Word Blanking noisy_src_tokens, noisy_src_lengths = self.word_dropout.noising( x=noisy_src_tokens, lengths=noisy_src_lengths, dropout_prob=self.word_blanking_prob, blank_idx=self.dictionary.unk(), ) return noisy_src_tokens class NoisingDataset(torch.utils.data.Dataset): def __init__( self, src_dataset, src_dict, seed, noiser=None, noising_class=UnsupervisedMTNoising, **kwargs ): """ Wrap a :class:`~torch.utils.data.Dataset` and apply noise to the samples based on the supplied noising configuration. Args: src_dataset (~torch.utils.data.Dataset): dataset to wrap. to build self.src_dataset -- a LanguagePairDataset with src dataset as the source dataset and None as the target dataset. Should NOT have padding so that src_lengths are accurately calculated by language_pair_dataset collate function. 
We use language_pair_dataset here to encapsulate the tgt_dataset so we can re-use the LanguagePairDataset collater to format the batches in the structure that SequenceGenerator expects. src_dict (~fairseq.data.Dictionary): source dictionary seed (int): seed to use when generating random noise noiser (WordNoising): a pre-initialized :class:`WordNoising` instance. If this is None, a new instance will be created using *noising_class* and *kwargs*. noising_class (class, optional): class to use to initialize a default :class:`WordNoising` instance. kwargs (dict, optional): arguments to initialize the default :class:`WordNoising` instance given by *noiser*. """ self.src_dataset = src_dataset self.src_dict = src_dict self.seed = seed self.noiser = ( noiser if noiser is not None else noising_class( dictionary=src_dict, **kwargs, ) ) self.sizes = src_dataset.sizes def __getitem__(self, index): """ Returns a single noisy sample. Multiple samples are fed to the collater create a noising dataset batch. """ src_tokens = self.src_dataset[index] src_lengths = torch.LongTensor([len(src_tokens)]) src_tokens = src_tokens.unsqueeze(0) # Transpose src tokens to fit expected shape of x in noising function # (batch size, sequence length) -> (sequence length, batch size) src_tokens_t = torch.t(src_tokens) with data_utils.numpy_seed(self.seed + index): noisy_src_tokens = self.noiser.noising(src_tokens_t, src_lengths) # Transpose back to expected src_tokens format # (sequence length, 1) -> (1, sequence length) noisy_src_tokens = torch.t(noisy_src_tokens) return noisy_src_tokens[0] def __len__(self): """ The length of the noising dataset is the length of src. 
""" return len(self.src_dataset) @property def supports_prefetch(self): return self.src_dataset.supports_prefetch def prefetch(self, indices): if self.src_dataset.supports_prefetch: self.src_dataset.prefetch(indices) ================================================ FILE: fairseq/data/num_samples_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import FairseqDataset class NumSamplesDataset(FairseqDataset): def __getitem__(self, index): return 1 def __len__(self): return 0 def collater(self, samples): return sum(samples) ================================================ FILE: fairseq/data/numel_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import numpy as np import torch from . import BaseWrapperDataset class NumelDataset(BaseWrapperDataset): def __init__(self, dataset, reduce=False): super().__init__(dataset) self.reduce = reduce def __getitem__(self, index): item = self.dataset[index] if torch.is_tensor(item): return torch.numel(item) else: return np.size(item) def __len__(self): return len(self.dataset) def collater(self, samples): if self.reduce: return sum(samples) else: return torch.tensor(samples) ================================================ FILE: fairseq/data/offset_tokens_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . 
import BaseWrapperDataset class OffsetTokensDataset(BaseWrapperDataset): def __init__(self, dataset, offset): super().__init__(dataset) self.offset = offset def __getitem__(self, idx): return self.dataset[idx] + self.offset ================================================ FILE: fairseq/data/pad_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from fairseq.data import data_utils from . import BaseWrapperDataset class PadDataset(BaseWrapperDataset): def __init__(self, dataset, pad_idx, left_pad, pad_length=None): super().__init__(dataset) self.pad_idx = pad_idx self.left_pad = left_pad self.pad_length = pad_length def collater(self, samples): return data_utils.collate_tokens( samples, self.pad_idx, left_pad=self.left_pad, pad_to_length=self.pad_length ) class LeftPadDataset(PadDataset): def __init__(self, dataset, pad_idx): super().__init__(dataset, pad_idx, left_pad=True) class RightPadDataset(PadDataset): def __init__(self, dataset, pad_idx): super().__init__(dataset, pad_idx, left_pad=False) ================================================ FILE: fairseq/data/padding_mask_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from fairseq.data import data_utils from . 
import BaseWrapperDataset class PaddingMaskDataset(BaseWrapperDataset): def __init__(self, dataset, left_pad, pad_length=None): super().__init__(dataset) self.left_pad = left_pad self.pad_length = pad_length def __getitem__(self, index): item = self.dataset[index] return torch.zeros_like(item).bool() def __len__(self): return len(self.dataset) def collater(self, samples): return data_utils.collate_tokens( samples, True, left_pad=self.left_pad, pad_to_length=self.pad_length ) class LeftPaddingMaskDataset(PaddingMaskDataset): def __init__(self, dataset): super().__init__(dataset, left_pad=True) class RightPaddingMaskDataset(PaddingMaskDataset): def __init__(self, dataset): super().__init__(dataset, left_pad=False) ================================================ FILE: fairseq/data/plasma_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import hashlib import json import subprocess import tempfile from typing import Hashable try: import pyarrow.plasma as plasma PYARROW_AVAILABLE = True except ImportError: plasma = None PYARROW_AVAILABLE = False class PlasmaArray: """ Wrapper around numpy arrays that automatically moves the data to shared memory upon serialization. This is particularly helpful when passing numpy arrays through multiprocessing, so that data is not unnecessarily duplicated or pickled. 
""" def __init__(self, array): super().__init__() self.array = array self.disable = array.nbytes < 134217728 # disable for arrays <128MB self.object_id = None self.path = None # variables with underscores shouldn't be pickled self._client = None self._server = None self._server_tmp = None self._plasma = None @property def plasma(self): if self._plasma is None and not self.disable: self._plasma = plasma return self._plasma def start_server(self): if self.plasma is None or self._server is not None: return assert self.object_id is None assert self.path is None self._server_tmp = tempfile.NamedTemporaryFile() self.path = self._server_tmp.name self._server = subprocess.Popen( ["plasma_store", "-m", str(int(1.05 * self.array.nbytes)), "-s", self.path] ) @property def client(self): if self._client is None: assert self.path is not None self._client = self.plasma.connect(self.path, num_retries=200) return self._client def __getstate__(self): """Called on pickle load""" if self.plasma is None: return self.__dict__ if self.object_id is None: self.start_server() self.object_id = self.client.put(self.array) state = self.__dict__.copy() del state["array"] state["_client"] = None state["_server"] = None state["_server_tmp"] = None state["_plasma"] = None return state def __setstate__(self, state): """Called on pickle save""" self.__dict__.update(state) if self.plasma is None: return self.array = self.client.get(self.object_id) def __del__(self): if self._server is not None: self._server.kill() self._server = None self._server_tmp.close() self._server_tmp = None DEFAULT_PLASMA_PATH = "/tmp/plasma" class PlasmaView: """Interface to write and read from shared memory. Whereas PlasmaArray writes to plasma on serialization, PlasmaView writes to shared memory on instantiation.""" def __init__(self, array, split_path: str, hash_data: Hashable, plasma_path=None): """ Args: array: numpy array to store. 
This can be read with ``PlasmaView().array`` split_path: the path whence the data was read, used for hashing hash_data: other metadata about the array that can be used to create a unique key. as of writing, the 3 callers in ``TokenBlockDataset`` use:: hash_data = ((block_size, document_sep_len, str(break_mode), len(dataset)), 0|1|2) """ assert PYARROW_AVAILABLE assert split_path is not None if plasma_path is None: plasma_path = DEFAULT_PLASMA_PATH self.path = plasma_path self.split_path = split_path self._client = None # Initialize lazily for pickle. plasma clients should not be deep copied or serialized. self._n = None self.object_id = self.get_object_id(self.split_path, hash_data) try: self.client.put(array, object_id=self.object_id) except plasma.PlasmaObjectExists: pass @property def client(self): if self._client is None: self._client = plasma.connect(self.path, num_retries=200) return self._client @property def array(self): """Fetch a read only view of an np.array, stored in plasma.""" ret = self.client.get(self.object_id) return ret @staticmethod def get_object_id(split_path: str, hash_data: Hashable): """Returns plasma.ObjectID from hashing split_path and object_num.""" hash = hashlib.blake2b(bytes(split_path, "utf-8"), digest_size=20) harg = json.dumps(hash_data).encode("utf-8") hash.update(harg) return plasma.ObjectID(hash.digest()) def __getstate__(self): """Called on pickle save""" self.disconnect() state = self.__dict__.copy() assert state["_client"] is None assert "object_id" in state return state def __setstate__(self, state): """Called on pickle load""" self.__dict__.update(state) def __del__(self): self.disconnect() def disconnect(self): if self._client is not None: self._client.disconnect() self._client = None def __len__(self): """Save reads by caching len""" if self._n is None: self._n = len(self.array) return self._n GB100 = (1024**3) * 100 class PlasmaStore: def __init__(self, path=DEFAULT_PLASMA_PATH, nbytes: int = GB100): self.server = 
self.start(path, nbytes) def __del__(self): self.server.kill() @staticmethod def start(path=DEFAULT_PLASMA_PATH, nbytes: int = GB100) -> subprocess.Popen: if not PYARROW_AVAILABLE: raise ImportError("please run pip install pyarrow to use --use_plasma_view") # best practice is to allocate more space than we need. The limitation seems to be the size of /dev/shm _server = subprocess.Popen(["plasma_store", "-m", str(nbytes), "-s", path]) plasma.connect(path, num_retries=200) # If we can't connect we fail immediately return _server ================================================ FILE: fairseq/data/prepend_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import numpy as np import torch from . import BaseWrapperDataset class PrependDataset(BaseWrapperDataset): def __init__(self, dataset, prepend_getter, ensure_first_token_is=None): super().__init__(dataset) self.prepend_getter = prepend_getter self.ensure_first_token = ensure_first_token_is def __getitem__(self, idx): item = self.dataset[idx] is_tuple = isinstance(item, tuple) src = item[0] if is_tuple else item assert self.ensure_first_token is None or src[0] == self.ensure_first_token prepend_idx = self.prepend_getter(self.dataset, idx) assert isinstance(prepend_idx, int) src[0] = prepend_idx item = tuple((src,) + item[1:]) if is_tuple else src return item ================================================ FILE: fairseq/data/prepend_token_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import numpy as np import torch from . 
import BaseWrapperDataset class PrependTokenDataset(BaseWrapperDataset): def __init__(self, dataset, token=None): super().__init__(dataset) self.token = token if token is not None: self._sizes = np.array(dataset.sizes) + 1 else: self._sizes = dataset.sizes def __getitem__(self, idx): item = self.dataset[idx] if self.token is not None: item = torch.cat([item.new([self.token]), item]) return item @property def sizes(self): return self._sizes def num_tokens(self, index): n = self.dataset.num_tokens(index) if self.token is not None: n += 1 return n def size(self, index): n = self.dataset.size(index) if self.token is not None: n += 1 return n ================================================ FILE: fairseq/data/raw_label_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from . import FairseqDataset class RawLabelDataset(FairseqDataset): def __init__(self, labels): super().__init__() self.labels = labels def __getitem__(self, index): return self.labels[index] def __len__(self): return len(self.labels) def collater(self, samples): return torch.tensor(samples) ================================================ FILE: fairseq/data/replace_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from . import BaseWrapperDataset class ReplaceDataset(BaseWrapperDataset): """Replaces tokens found in the dataset by a specified replacement token Args: dataset (~torch.utils.data.Dataset): dataset to replace tokens in replace_map(Dictionary[int,int]): map of token to replace -> replacement token offsets (List[int]): do not replace tokens before (from left if pos, right if neg) this offset. 
should be as many as the number of objects returned by the underlying dataset __getitem__ method. """ def __init__(self, dataset, replace_map, offsets): super().__init__(dataset) assert len(replace_map) > 0 self.replace_map = replace_map self.offsets = offsets def __getitem__(self, index): item = self.dataset[index] is_tuple = isinstance(item, tuple) srcs = item if is_tuple else [item] for offset, src in zip(self.offsets, srcs): for k, v in self.replace_map.items(): src_off = src[offset:] if offset >= 0 else src[:offset] src_off.masked_fill_(src_off == k, v) item = srcs if is_tuple else srcs[0] return item ================================================ FILE: fairseq/data/resampling_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import numpy as np from fairseq.data import BaseWrapperDataset, plasma_utils logger = logging.getLogger(__name__) class ResamplingDataset(BaseWrapperDataset): """Randomly samples from a given dataset at each epoch. Sampling is done with or without replacement, depending on the "replace" parameter. Optionally, the epoch size can be rescaled. This is potentially desirable to increase per-epoch coverage of the base dataset (since sampling with replacement means that many items in the dataset will be left out). In the case of sampling without replacement, size_ratio should be strictly less than 1. Args: dataset (~torch.utils.data.Dataset): dataset on which to sample. weights (List[float]): list of probability weights (default: None, which corresponds to uniform sampling). replace (bool): sampling mode; True for "with replacement", or False for "without replacement" (default: True) size_ratio (float): the ratio to subsample to; must be positive (default: 1.0). batch_by_size (bool): whether or not to batch by sequence length (default: True). 
seed (int): RNG seed to use (default: 0). epoch (int): starting epoch number (default: 1). """ def __init__( self, dataset, weights=None, replace=True, size_ratio=1.0, batch_by_size=True, seed=0, epoch=1, ): super().__init__(dataset) if weights is None: self.weights = None else: assert len(weights) == len(dataset) weights_arr = np.array(weights, dtype=np.float64) weights_arr /= weights_arr.sum() self.weights = plasma_utils.PlasmaArray(weights_arr) self.replace = replace assert size_ratio > 0.0 if not self.replace: assert size_ratio < 1.0 self.size_ratio = float(size_ratio) self.actual_size = np.ceil(len(dataset) * self.size_ratio).astype(int) self.batch_by_size = batch_by_size self.seed = seed self._cur_epoch = None self._cur_indices = None self.set_epoch(epoch) def __getitem__(self, index): return self.dataset[self._cur_indices.array[index]] def __len__(self): return self.actual_size @property def sizes(self): if isinstance(self.dataset.sizes, list): return [s[self._cur_indices.array] for s in self.dataset.sizes] return self.dataset.sizes[self._cur_indices.array] def num_tokens(self, index): return self.dataset.num_tokens(self._cur_indices.array[index]) def size(self, index): return self.dataset.size(self._cur_indices.array[index]) def ordered_indices(self): if self.batch_by_size: order = [ np.arange(len(self)), self.sizes, ] # No need to handle `self.shuffle == True` return np.lexsort(order) else: return np.arange(len(self)) def prefetch(self, indices): self.dataset.prefetch(self._cur_indices.array[indices]) @property def can_reuse_epoch_itr_across_epochs(self): return False def set_epoch(self, epoch): logger.debug("ResamplingDataset.set_epoch: {}".format(epoch)) super().set_epoch(epoch) if epoch == self._cur_epoch: return self._cur_epoch = epoch # Generate a weighted sample of indices as a function of the # random seed and the current epoch. 
rng = np.random.RandomState( [ 42, # magic number self.seed % (2**32), # global seed self._cur_epoch, # epoch index ] ) self._cur_indices = plasma_utils.PlasmaArray( rng.choice( len(self.dataset), self.actual_size, replace=self.replace, p=(None if self.weights is None else self.weights.array), ) ) ================================================ FILE: fairseq/data/roll_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from . import BaseWrapperDataset class RollDataset(BaseWrapperDataset): def __init__(self, dataset, shifts): super().__init__(dataset) self.shifts = shifts def __getitem__(self, index): item = self.dataset[index] return torch.roll(item, self.shifts) ================================================ FILE: fairseq/data/round_robin_zip_datasets.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging from collections import OrderedDict from typing import Dict, Sequence import numpy as np from . import FairseqDataset, LanguagePairDataset logger = logging.getLogger(__name__) class RoundRobinZipDatasets(FairseqDataset): """Zip multiple :class:`~fairseq.data.FairseqDataset` instances together. Shorter datasets are repeated in a round-robin fashion to match the length of the longest one. Args: datasets (Dict[~fairseq.data.FairseqDataset]): a dictionary of :class:`~fairseq.data.FairseqDataset` instances. eval_key (str, optional): a key used at evaluation time that causes this instance to pass-through batches from *datasets[eval_key]*. 
""" def __init__(self, datasets, eval_key=None): super().__init__() if isinstance(datasets, dict): datasets = OrderedDict(datasets) assert isinstance(datasets, OrderedDict) assert datasets, "Can't make a RoundRobinZipDatasets out of nothing" for dataset in datasets.values(): assert isinstance(dataset, FairseqDataset) self.datasets = datasets self.eval_key = eval_key self.longest_dataset_key = max(datasets, key=lambda k: len(datasets[k])) self.longest_dataset = datasets[self.longest_dataset_key] self._ordered_indices: Dict[str, Sequence[int]] = None def _map_index(self, key, index): assert ( self._ordered_indices is not None ), "Must call RoundRobinZipDatasets.ordered_indices() first" o = self._ordered_indices[key] return o[index % len(o)] def __getitem__(self, index): if self.eval_key is None: return OrderedDict( [ (key, dataset[self._map_index(key, index)]) for key, dataset in self.datasets.items() ] ) else: # at evaluation time it's useful to pass-through batches from a single key return self.datasets[self.eval_key][self._map_index(self.eval_key, index)] def __len__(self): if self._ordered_indices is not None: return len(self._ordered_indices[self.longest_dataset_key]) return len(self.longest_dataset) def collater(self, samples): """Merge a list of samples to form a mini-batch.""" if len(samples) == 0: return None if self.eval_key is None: return OrderedDict( [ (key, dataset.collater([sample[key] for sample in samples])) for key, dataset in self.datasets.items() ] ) else: # at evaluation time it's useful to pass-through batches from a single key return self.datasets[self.eval_key].collater(samples) def num_tokens(self, index): """Return an example's length (number of tokens), used for batching.""" # TODO make it configurable whether to use max() or sum() here return max( dataset.num_tokens(self._map_index(key, index)) for key, dataset in self.datasets.items() ) def size(self, index): """Return an example's size as a float or tuple. 
def ordered_indices(self):
    """Ordered indices for batching.

    On the first call the ordering of every sub-dataset is computed and
    cached in ``self._ordered_indices``; later calls reuse the cache. The
    value returned is always ``0 .. len(self) - 1``.
    """
    if self._ordered_indices is None:
        # Ask each sub-dataset for its own ordering so that we get the same
        # random ordering as we would from using the sub-datasets directly.
        per_dataset = OrderedDict()
        for name, sub_dataset in self.datasets.items():
            per_dataset[name] = sub_dataset.ordered_indices()
        self._ordered_indices = per_dataset
    return np.arange(len(self))
class RandomCropDataset(TruncateDataset):
    """Truncate a sequence by returning a random crop of truncation_length tokens.

    The crop offset is drawn inside ``data_utils.numpy_seed(seed, epoch, index)``,
    so it depends only on the seed, the current epoch and the item index.

    Args:
        dataset: dataset to wrap
        truncation_length (int): number of tokens kept from longer items
        seed (int, optional): seed mixed into the per-item RNG state (default: 1)
    """

    def __init__(self, dataset, truncation_length, seed=1):
        super().__init__(dataset, truncation_length)
        self.seed = seed
        self.epoch = 0

    @property
    def can_reuse_epoch_itr_across_epochs(self):
        return True  # only the crop changes, not item sizes

    def set_epoch(self, epoch, **unused):
        super().set_epoch(epoch)
        self.epoch = epoch

    def __getitem__(self, index):
        """Return item *index*, cropped to ``truncation_length`` tokens if longer."""
        with data_utils.numpy_seed(self.seed, self.epoch, index):
            item = self.dataset[index]
            item_len = item.size(0)
            excess = item_len - self.truncation_length
            if excess > 0:
                # Valid start offsets are 0..excess inclusive. randint's upper
                # bound is exclusive, so pass excess + 1; otherwise the window
                # ending on the last token could never be selected.
                start_idx = np.random.randint(0, excess + 1)
                item = item[start_idx : start_idx + self.truncation_length]
            return item
def collate(
    samples,
    pad_idx,
    eos_idx,
    vocab,
    left_pad_source=False,
    left_pad_target=False,
    input_feeding=True,
    pad_to_length=None,
):
    """Collate span-masked samples into a mini-batch sorted by source length.

    Args:
        samples (List[dict]): items with keys ``id`` and ``source`` and,
            optionally, ``target``.
        pad_idx (int): padding index passed to ``data_utils.collate_tokens``.
        eos_idx: not used here (``collate_tokens`` is called with
            ``eos_idx=None``); kept for interface compatibility.
        vocab: not used here; kept for interface compatibility.
        left_pad_source (bool, optional): pad sources on the left.
        left_pad_target (bool, optional): pad targets on the left.
        input_feeding (bool, optional): must be True; a shifted copy of the
            target is added as ``net_input["prev_output_tokens"]``.
        pad_to_length (dict, optional): minimum padded lengths under the keys
            ``"source"`` and ``"target"``.

    Returns:
        dict: ``{}`` for an empty *samples*; otherwise a batch with keys
        ``id``, ``ntokens``, ``net_input``, ``target``, ``target_lengths``,
        ``nsentences`` and ``sort_order``. All per-sentence tensors follow
        the descending-source-length order given by ``sort_order``.
        ``target`` and ``target_lengths`` are ``None`` when the samples carry
        no target.
    """
    assert input_feeding
    if len(samples) == 0:
        return {}

    def merge(key, left_pad, move_eos_to_beginning=False, pad_to_length=None):
        return data_utils.collate_tokens(
            [s[key] for s in samples],
            pad_idx,
            eos_idx=None,  # use eos_idx of each sample instead of vocab.eos()
            left_pad=left_pad,
            move_eos_to_beginning=move_eos_to_beginning,
            pad_to_length=pad_to_length,
        )

    src_pad_to = pad_to_length["source"] if pad_to_length is not None else None
    tgt_pad_to = pad_to_length["target"] if pad_to_length is not None else None

    ids = torch.LongTensor([s["id"] for s in samples])
    src_tokens = merge("source", left_pad=left_pad_source, pad_to_length=src_pad_to)

    # sort by descending source length
    src_lengths = torch.LongTensor([s["source"].numel() for s in samples])
    src_lengths, sort_order = src_lengths.sort(descending=True)
    ids = ids.index_select(0, sort_order)
    src_tokens = src_tokens.index_select(0, sort_order)

    prev_output_tokens = None
    target = None
    target_lengths = None
    if samples[0].get("target", None) is not None:
        target = merge("target", left_pad=left_pad_target, pad_to_length=tgt_pad_to)
        target = target.index_select(0, sort_order)
        # Unpadded length of every target, in the same order as ``target``.
        # Measuring rows of the padded matrix would yield the padded width
        # for every sentence.
        target_lengths = torch.LongTensor(
            [s["target"].numel() for s in samples]
        ).index_select(0, sort_order)
        ntokens = sum(len(s["target"]) for s in samples)

        if input_feeding:
            # we create a shifted version of targets for feeding the
            # previous output token(s) into the next decoder step
            prev_output_tokens = merge(
                "target",
                left_pad=left_pad_target,
                move_eos_to_beginning=True,
                pad_to_length=tgt_pad_to,
            )
            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
    else:
        ntokens = sum(len(s["source"]) for s in samples)

    batch = {
        "id": ids,
        "ntokens": ntokens,
        "net_input": {
            "src_tokens": src_tokens,
            "src_lengths": src_lengths,
        },
        "target": target,
        "target_lengths": target_lengths,
        # number of sentences in the batch (not the length of the first one)
        "nsentences": len(samples),
        "sort_order": sort_order,
    }
    if prev_output_tokens is not None:
        batch["net_input"]["prev_output_tokens"] = prev_output_tokens
    return batch
The number of noise tokens and the number of noise spans and non-noise spans are determined deterministically as follows: num_noise_tokens = round(length * noise_density) num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length) Spans alternate between non-noise and noise, beginning with non-noise. Subject to the above restrictions, all masks are equally likely. Args: length: an int32 scalar (length of the incoming token sequence) Returns: a boolean tensor with shape [length] """ orig_length = length num_noise_tokens = int(np.round(length * self.noise_density)) # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens. num_noise_tokens = min(max(num_noise_tokens, 1), length - 1) num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length)) # avoid degeneracy by ensuring positive number of noise spans num_noise_spans = max(num_noise_spans, 1) num_nonnoise_tokens = length - num_noise_tokens # pick the lengths of the noise spans and the non-noise spans def _random_segmentation(num_items, num_segments): """ Partition a sequence of items randomly into non-empty segments. 
def create_sentinel_ids(self, mask_indices):
    """
    Sentinel ids creation given the indices that should be masked.

    The first position of every masked span receives a sentinel id counted
    down from the end of the vocabulary (``len(self.vocab) - 1`` for the
    first span, ``len(self.vocab) - 2`` for the second, ...). The remaining
    positions of a span are set to ``-1`` so they can be deleted later, and
    unmasked positions are ``0``.

    Args:
        mask_indices: integer array with 1 at masked positions and 0
            elsewhere; the last axis is the sequence axis.

    Returns:
        integer array with the same shape as *mask_indices*.
    """
    mask_indices = np.asarray(mask_indices)
    start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
    # np.roll wraps the last position around to the front, so a span that
    # begins at position 0 would not be seen as a start when the final
    # position is masked as well. Position 0 starts a span iff it is masked.
    if start_indices.shape[-1] > 0:
        start_indices[..., 0] = mask_indices[..., 0]

    sentinel_ids = np.where(
        start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices
    )
    # making sure all sentinel tokens are unique over the example
    sentinel_ids = np.where(sentinel_ids != 0, len(self.vocab) - sentinel_ids, 0)
    # masked positions that are not a span start become -1
    sentinel_ids -= mask_indices - start_indices
    return sentinel_ids
def prefetch(self, indices):
    """Forward the prefetch request to the wrapped dataset.

    This class stores a single wrapped dataset in ``self.dataset``; it has
    no ``src`` / ``tgt`` attributes.
    """
    self.dataset.prefetch(indices)

@property
def supports_prefetch(self):
    """Whether the wrapped dataset advertises prefetch support."""
    return getattr(self.dataset, "supports_prefetch", False)
from collections import OrderedDict import numpy as np import torch from fairseq.data import FairseqDataset, MonolingualDataset, data_utils class SpeechDLMDataset(FairseqDataset): """The dataset used to train the SpeechDLM model as described in the paper: https://arxiv.org/pdf/2203.16502.pdf The input datasets is expected to be a dict over channel names with the values being instances of :class:`~fairseq.data.MonolingualDataset`. Each element of SpeechDLMDataset is a dictionary with the following keys: - `id` (int) : index of the item - `source` (OrderedDict[str, Tensor of shape (seq_len,)]) : dictionary over channels with the values containing the input unit tokens - `target_next` (OrderedDict[str, Tensor of shape (seq_len,)]) : dictionary over channels with the values containing the next unit tokens (input tokens shifted by 1). Its value is None if 'next' not in self.targets - `target_edge` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : dictionary over channels with the values containing the edge unit tokens (input tokens deduplicated). Its value is None if 'edge' not in self.targets - `target_duration` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : dictionary over channels with the values being the durations of the edge units. Its value is None if 'duration' not in targets. - `target_edge_indices` (OrderedDict[str, Tensor of shape (dedup_seq_len,)]) : dictionary over channels with the values being the indices of the edge units in the source sequence. Its value is None if neither 'edge' or 'duration in targets. Args: datasets (Dict[str, ~fairseq.data.MonolingualDataset]): a dictionary of :class:`~fairseq.data.MonolingualDataset` instances. targets (List[str]): list of the target types that the SpeechDLM model should predict. Can be one of "next", "edge", "duration". shuffle (bool, optional): shuffle the elements before batching (default: True). 
def __init__(
    self, datasets, targets=None, max_target_durations=None, shuffle=False
):
    """Validate the per-channel datasets and record their shared metadata.

    Args:
        datasets (dict): channel name -> MonolingualDataset; a plain dict is
            converted to an OrderedDict.
        targets (List[str], optional): target types to produce.
        max_target_durations (optional): upper bound stored for duration
            targets; ``None`` or a non-positive value is stored as infinity.
        shuffle (bool, optional): stored on the instance (default: False).

    ``sizes``, ``vocab`` and ``length`` are taken from the first channel;
    every channel must agree with it on length, sizes and the pad/eos/bos/unk
    indices.
    """
    super().__init__()

    if isinstance(datasets, dict):
        datasets = OrderedDict(datasets)
    assert isinstance(
        datasets, OrderedDict
    ), "datasets is expected to be an instance of Dictionary or OrderedDict"
    assert datasets, "datasets is None"
    for channel_dataset in datasets.values():
        assert isinstance(
            channel_dataset, MonolingualDataset
        ), "Each value of datasets is expected to be an instance of MonolingualDataset"

    self.datasets = datasets
    self.targets = targets

    has_limit = max_target_durations is not None and max_target_durations > 0
    self.max_target_durations = max_target_durations if has_limit else float("inf")

    # the first channel acts as the reference for the consistency checks below
    reference = next(iter(datasets.values()))
    self.sizes = reference.sizes
    self.vocab = reference.vocab
    self.length = len(reference)
    self.shuffle = shuffle

    for channel, channel_dataset in datasets.items():
        assert (
            len(channel_dataset) == self.length
        ), "[{}] length mismatch ({} vs {})".format(
            channel, len(channel_dataset), self.length
        )
        assert (
            channel_dataset.sizes == self.sizes
        ).all(), "[{}] sizes mismatch".format(channel)

        assert (
            channel_dataset.vocab.pad() == self.vocab.pad()
        ), "pad token is expected to be the same"
        assert (
            channel_dataset.vocab.eos() == self.vocab.eos()
        ), "eos token is expected to be the same"
        assert (
            channel_dataset.vocab.bos() == self.vocab.bos()
        ), "bos token is expected to be the same"
        assert (
            channel_dataset.vocab.unk() == self.vocab.unk()
        ), "unk token is expected to be the same"
def _get_target(self, index, channel):
    """Get target in one of ['next', 'edge', 'duration']
    - 'next' is the future unit
    - 'edge' is the edge unit
    - 'duration' is the duration of the edge unit

    Args:
        index (int): item index
        channel (str): key into ``self.datasets``

    Returns:
        dict or None: ``None`` when ``self.targets`` is None. Otherwise a
        dict with one entry per requested target type, plus
        ``"edge_indices"`` (start offset of every edge unit) whenever
        'edge' or 'duration' is requested.

    Raises:
        Exception: if ``self.targets`` contains an unknown target type.
    """
    if self.targets is None:
        return None

    target = {}
    pad_idx = self.vocab.pad()
    max_dur = self.max_target_durations
    future_target = self.datasets[channel][index]["target"]

    if "edge" in self.targets or "duration" in self.targets:
        # collapse runs of identical units into (unit, run length) pairs
        edge_units, edge_unit_counts = torch.unique_consecutive(
            future_target, return_counts=True
        )
        padding_end = edge_units[-1] == pad_idx
        if padding_end:
            # the trailing padding run is not a real unit
            edge_units = edge_units[:-1]
            edge_unit_counts = edge_unit_counts[:-1]
        edge_indices = torch.cumsum(edge_unit_counts, 0)
        edge_indices = torch.cat([torch.tensor([0]), edge_indices[:-1]])
        target["edge_indices"] = edge_indices

    for t in self.targets:
        if t == "next":
            target[t] = future_target
        elif t == "edge":
            target[t] = edge_units
        elif t == "duration":
            # count the remaining duration of the last edge indices in the next sentence
            if not padding_end and index < len(self.datasets[channel]) - 1:
                next_sentence_target = self.datasets[channel][index + 1]["target"]
                next_len = len(next_sentence_target)
                i = 0
                # stop at the end of the next sentence: it may consist
                # entirely of the unit that is being extended
                while (
                    i < next_len
                    and next_sentence_target[i] == edge_units[-1]
                    and edge_unit_counts[-1] + i < max_dur
                ):
                    i += 1
                edge_unit_counts[-1] += i

            # cut off to the maximal threshold; with no finite limit there is
            # nothing to clamp, and an integer tensor cannot store infinity
            if max_dur and max_dur != float("inf"):
                edge_unit_counts[edge_unit_counts > max_dur] = max_dur

            target[t] = edge_unit_counts
        else:
            raise Exception("invalid target " + t)

    return target
Padding will appear on the right. - `src_lengths` (LongTensor): lengths of source sentences in the mini-batch - `target` (dict): the target of the Model, containing keys: - `next` (OrderedDict[str, LongTensor]): dictionary over channel with the values being padded 2D Tensor of batch samples' `target_next` of shape `(bsz, tgt_len)`. Padding will appear on the right. - `edge` (OrderedDict[str, LongTensor]): dictionary over channel with the values being the concatenated 1D Tensor of batch samples' `target_edge` of shape `(sum of dedup_tgt_len,)` - `duration` (OrderedDict[str, LongTensor]): dictionary over channel with the values being the concatenated 1D Tensor of batch samples' `target_duration` of shape `(sum of dedup_tgt_len,)` - `edge_indices` (OrderedDict[str, LongTensor]): dictionary over channel with the values being the concatenated 1D Tensor of batch samples' `target_edge_indices` of shape `(sum of dedup_tgt_len,)`. The indices are added to multiplies of batch size such that they are the actual indices in the flatten `src_tokens` Tensor """ if len(samples) == 0: return {} pad_idx = self.vocab.pad() eos_idx = self.vocab.eos() def merge(key, max_size=None): if samples[0][key] is None: return None res = OrderedDict() for channel in samples[0][key]: if key in ["source", "target_next"]: # fill batch of shape: (batch_size, max_size) res[channel] = data_utils.collate_tokens( [s[key][channel] for s in samples], pad_idx, eos_idx, left_pad=False, ) elif key in ["target_edge", "target_duration"]: # concatenate the edge units/duration res[channel] = torch.cat([s[key][channel] for s in samples]) elif key == "target_edge_indices": # increase the edge indices to the indices in the flatten batch res[channel] = torch.cat( [s[key][channel] + i * max_size for i, s in enumerate(samples)] ) return res src_tokens = merge("source") tgt_next = merge("target_next") tgt_edge = merge("target_edge") tgt_duration = merge("target_duration") tgt_edge_indices = merge( "target_edge_indices", 
def ordered_indices(self):
    """Return an ordered list of indices. Batches will be constructed based
    on this order.

    Indices are sorted by ``self.sizes``; ties are broken by a random
    permutation when ``self.shuffle`` is set and by index otherwise.
    """
    count = len(self)
    tie_breaker = np.random.permutation(count) if self.shuffle else np.arange(count)
    # np.lexsort treats the LAST key as the primary one
    return np.lexsort([tie_breaker, self.sizes])
class StripTokenDataset(BaseWrapperDataset):
    """Wrapper that removes every leading and trailing ``id_to_strip`` from items.

    Args:
        dataset: dataset to wrap
        id_to_strip: token value removed from both ends of each item
    """

    def __init__(self, dataset, id_to_strip):
        super().__init__(dataset)
        self.id_to_strip = id_to_strip

    def __getitem__(self, index):
        unwanted = self.id_to_strip
        tokens = self.dataset[index]

        # peel matching tokens off the end first ...
        while len(tokens) > 0:
            if not (tokens[-1] == unwanted):
                break
            tokens = tokens[:-1]

        # ... then off the front
        while len(tokens) > 0:
            if not (tokens[0] == unwanted):
                break
            tokens = tokens[1:]

        return tokens
def __init__(self, dataset, size_ratio, shuffle=False, seed=None):
    """Pick the subset of examples that this dataset exposes.

    Args:
        dataset: dataset to subsample
        size_ratio (float): fraction of examples to keep; must be < 1. The
            subset size is ``ceil(len(dataset) * size_ratio)``.
        shuffle (bool, optional): stored on the instance (default: False)
        seed (int, optional): if given, the subset is drawn inside
            ``numpy_seed(seed)``; otherwise the current global NumPy RNG
            state is used.
    """
    super().__init__(dataset)
    assert size_ratio < 1
    self.actual_size = np.ceil(len(dataset) * size_ratio).astype(int)
    with numpy_seed(seed) if seed is not None else contextlib.ExitStack():
        # Passing the population size as an int samples from
        # np.arange(len(dataset)) without first materialising a Python list
        # of every index.
        self.indices = np.random.choice(
            len(self.dataset), self.actual_size, replace=False
        )
    self.shuffle = shuffle
    logger.info(
        "subsampled dataset from {} to {} (ratio={})".format(
            len(self.dataset), self.actual_size, size_ratio
        )
    )
class TextCompressionLevel(Enum):
    """Compression strength understood by :class:`TextCompressor`."""

    none = 0
    low = 1
    high = 2


class TextCompressor(object):
    """Compress and decompress text according to a :class:`TextCompressionLevel`.

    - ``none``: plain ``str.encode()`` / ``bytes.decode()``
    - ``low``: zlib (standard library)
    - ``high``: unishox2 (optional dependency, imported on first use)

    Args:
        level (TextCompressionLevel): compression level to use
        max_input_byte_length (int, optional): largest encoded input accepted
            at the ``high`` level; also passed to ``unishox2.decompress``
            (default: 2**16)
    """

    def __init__(
        self, level: TextCompressionLevel, max_input_byte_length: int = 2**16
    ):
        self.level = level
        self.max_input_length = max_input_byte_length

    def compress(self, text: str) -> bytes:
        """Return *text* compressed at the configured level.

        Raises:
            ImportError: at the ``high`` level when unishox2 is not installed.
        """
        if self.level == TextCompressionLevel.low:
            import zlib

            # zlib: built-in, fast. Level 0 would store the data uncompressed
            # (output larger than input), so use the fastest level that
            # actually compresses; zlib.decompress reads either.
            return zlib.compress(text.encode(), level=zlib.Z_BEST_SPEED)
        elif self.level == TextCompressionLevel.high:
            try:
                import unishox2

                # unishox2: optimized for short text but slower
            except ImportError as err:
                raise ImportError(
                    "Please install unishox2 for the text compression feature: "
                    "pip install unishox2-py3"
                ) from err
            assert len(text.encode()) <= self.max_input_length
            return unishox2.compress(text)[0]
        else:
            return text.encode()

    def decompress(self, compressed: bytes) -> str:
        """Invert :meth:`compress` for the configured level.

        Raises:
            ImportError: at the ``high`` level when unishox2 is not installed.
        """
        if self.level == TextCompressionLevel.low:
            import zlib

            return zlib.decompress(compressed).decode()
        elif self.level == TextCompressionLevel.high:
            try:
                import unishox2
            except ImportError as err:
                raise ImportError(
                    "Please install unishox2 for the text compression feature: "
                    "pip install unishox2-py3"
                ) from err
            return unishox2.decompress(compressed, self.max_input_length)
        else:
            return compressed.decode()
Args: dataset (~torch.utils.data.Dataset): dataset to break into blocks sizes (List[int]): sentence lengths (required for 'complete' and 'eos') block_size (int): maximum block size (ignored in 'eos' break mode) break_mode (str, optional): Mode used for breaking tokens. Values can be one of: - 'none': break tokens into equally sized blocks (up to block_size) - 'complete': break tokens into blocks (up to block_size) such that blocks contains complete sentences, although block_size may be exceeded if some sentences exceed block_size - 'complete_doc': similar to 'complete' mode, but do not cross document boundaries - 'eos': each block contains one sentence (block_size is ignored) include_targets (bool, optional): return next tokens as targets (default: False). document_sep_len (int, optional): document separator size (required for 'complete_doc' break mode). Typically 1 if the sentences have eos and 0 otherwise. """ def __init__( self, dataset, sizes, block_size, pad, eos, break_mode=None, include_targets=False, document_sep_len=1, use_plasma_view=False, split_path=None, plasma_path=None, ): super().__init__() self.dataset = dataset self.pad = pad self.eos = eos self.include_targets = include_targets assert len(dataset) > 0 assert len(dataset) == len(sizes) _sizes, block_to_dataset_index, slice_indices = self._build_slice_indices( sizes, break_mode, document_sep_len, block_size ) if use_plasma_view: plasma_id = (block_size, document_sep_len, str(break_mode), len(dataset)) self._slice_indices = plasma_utils.PlasmaView( slice_indices, split_path, (plasma_id, 0), plasma_path=plasma_path ) self._sizes = plasma_utils.PlasmaView( _sizes, split_path, (plasma_id, 1), plasma_path=plasma_path ) self._block_to_dataset_index = plasma_utils.PlasmaView( block_to_dataset_index, split_path, (plasma_id, 2), plasma_path=plasma_path, ) else: self._slice_indices = plasma_utils.PlasmaArray(slice_indices) self._sizes = plasma_utils.PlasmaArray(_sizes) self._block_to_dataset_index = 
@staticmethod
def _build_slice_indices(
    sizes, break_mode, document_sep_len, block_size
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Use token_block_utils_fast to build arrays for indexing into self.dataset

    Args:
        sizes: per-entry lengths as a list, NumPy array or torch tensor
        break_mode (str or None): block breaking mode; ``None`` means "none"
        document_sep_len (int): forwarded to ``_get_slice_indices_fast``
        block_size (int or None): forwarded to ``_get_slice_indices_fast``;
            may be ``None`` only in "eos" mode, where 0 is used instead

    Returns:
        ``(_sizes, block_to_dataset_index, slice_indices)`` where ``_sizes``
        is the length of every block, ``slice_indices`` holds the
        ``(start, end)`` offsets of every block and
        ``block_to_dataset_index`` holds ``(start dataset index, offset
        within it, end dataset index)`` per block.

    Raises:
        ImportError: if the Cython helpers have not been built.
    """
    try:
        from fairseq.data.token_block_utils_fast import (
            _get_slice_indices_fast,
            _get_block_to_dataset_index_fast,
        )
    except ImportError:
        raise ImportError(
            "Please build Cython components with: `pip install --editable .` "
            "or `python setup.py build_ext --inplace`"
        )

    if isinstance(sizes, list):
        sizes = np.array(sizes, dtype=np.int64)
    else:
        if torch.is_tensor(sizes):
            sizes = sizes.numpy()
        sizes = sizes.astype(np.int64)

    break_mode = break_mode if break_mode is not None else "none"

    # For "eos" break-mode, block_size is not required parameters.
    if break_mode == "eos" and block_size is None:
        block_size = 0

    slice_indices = _get_slice_indices_fast(
        sizes, str(break_mode), block_size, document_sep_len
    )
    _sizes = slice_indices[:, 1] - slice_indices[:, 0]

    # build index mapping block indices to the underlying dataset indices
    if break_mode == "eos":
        # much faster version for eos break mode
        block_to_dataset_index = np.stack(
            [
                np.arange(len(sizes)),  # starting index in dataset
                # starting offset within starting index; an explicit dtype
                # replaces the Python-2 compat alias np.compat.long (the
                # array is re-cast below, so the result is unchanged)
                np.zeros(len(sizes), dtype=np.int64),
                np.arange(len(sizes)),  # ending index in dataset
            ],
            1,
        )
    else:
        block_to_dataset_index = _get_block_to_dataset_index_fast(
            sizes,
            slice_indices,
        )

    # A block can be longer than block_size ("eos" ignores it, "complete"
    # may exceed it), so size the dtype from the real maximum as well to
    # avoid silently wrapping long blocks.
    largest_block = max(block_size, int(_sizes.max()))
    size_dtype = np.uint16 if largest_block < 65535 else np.uint32
    num_tokens = slice_indices[-1].max()
    slice_indices_dtype = best_fitting_int_dtype(num_tokens)
    slice_indices = slice_indices.astype(slice_indices_dtype)
    _sizes = _sizes.astype(size_dtype)
    block_to_dataset_index = block_to_dataset_index.astype(slice_indices_dtype)
    return _sizes, block_to_dataset_index, slice_indices
def __getitem__(self, index):
    """Return block *index*.

    Returns:
        The block's tokens, or ``(source, item, past_target)`` when
        ``include_targets`` is set, where all three have the same length:

        - *item* (the target) is the block itself
        - *source* is the block shifted right by 1 (left-padded with eos)
        - *past_target* is the block shifted right by 2 (left-padded with
          pad, eos as needed)
    """
    start_ds_idx, start_offset, end_ds_idx = self.block_to_dataset_index[index]

    buffer = torch.cat(
        [self.dataset[idx] for idx in range(start_ds_idx, end_ds_idx + 1)]
    )
    slice_s, slice_e = self.slice_indices[index]
    # Work with plain Python ints: the index arrays are NumPy arrays that may
    # use small or unsigned dtypes, where ``e - 2`` could wrap around.
    s = int(start_offset)
    e = s + int(slice_e - slice_s)
    item = buffer[s:e]

    if self.include_targets:
        # *target* is the original sentence (=item)
        # *source* is shifted right by 1 (maybe left-padded with eos)
        # *past_target* is shifted right by 2 (left-padded as needed)
        if s == 0:
            source = torch.cat([item.new([self.eos]), buffer[0 : e - 1]])
            # For a one-token block e - 2 is negative: take nothing from the
            # buffer and trim the two-token prefix to the block length.
            past_target = torch.cat(
                [item.new([self.pad, self.eos]), buffer[0 : max(e - 2, 0)]]
            )[:e]
        else:
            source = buffer[s - 1 : e - 1]

            if s == 1:
                past_target = torch.cat([item.new([self.eos]), buffer[0 : e - 2]])
            else:
                past_target = buffer[s - 2 : e - 2]

        return source, item, past_target

    return item
cdef np.ndarray[DTYPE_t, ndim=2] _fast_convert_to_np_array(list list_of_list):
    """
    Faster function to convert DTYPE_t list of list.
    Only fast when there are huge number of rows and low number of columns.

    Every row must have the same number of entries. An empty input yields an
    empty array of shape (0, 2): the callers in this module only ever pass
    (start, end) pairs, and the column count cannot be inferred from no rows.
    """
    cdef Py_ssize_t n_rows = len(list_of_list)
    cdef np.ndarray[DTYPE_t, ndim=1] flat
    if n_rows == 0:
        # reshape((0, -1)) is ambiguous for a zero-size array and raises
        # ValueError, so build the empty result explicitly.
        return np.zeros((0, 2), dtype=DTYPE)
    flat = np.fromiter(chain.from_iterable(list_of_list), DTYPE, -1)
    return flat.reshape((n_rows, -1))
@cython.boundscheck(False)
@cython.wraparound(False)
@cython.nonecheck(False)
cpdef np.ndarray[DTYPE_t, ndim=2] _get_block_to_dataset_index_fast(np.ndarray[DTYPE_t, ndim=1] sizes, np.ndarray[DTYPE_t, ndim=2] slice_indices):
    """Map every (start, end) row of *slice_indices* to dataset coordinates.

    Args:
        sizes: per-item sizes handed to the DatasetSearcher.
        slice_indices: one (start, end) pair of "flat" indices per block.

    Returns:
        An array with one row per block holding
        (starting index in dataset, starting offset within that item,
        ending index in dataset).
    """
    cdef DTYPE_t start_ds_idx
    cdef DTYPE_t start_offset
    cdef DTYPE_t end_ds_idx
    cdef DTYPE_t i
    cdef DTYPE_t s
    cdef DTYPE_t e
    # A single searcher is shared by all rows; seek() moves its cursor to the
    # requested flat index (resetting first when asked to go backwards).
    cdef DatasetSearcher ds = DatasetSearcher(sizes)
    cdef np.ndarray[DTYPE_t, ndim=2] block_to_dataset_index = np.zeros([len(slice_indices), 3], dtype=DTYPE)
    cdef DTYPE_t[:, :] block_to_dataset_index_view = block_to_dataset_index
    cdef DTYPE_t[:, :] slice_indices_view = slice_indices
    cdef Py_ssize_t x_max = slice_indices.shape[0]

    for i in range(x_max):
        s = slice_indices_view[i][0]
        e = slice_indices_view[i][1]
        # Locate the item (and the offset inside it) that holds flat index s.
        ds.seek(s)
        start_ds_idx = ds.current_index
        start_offset = ds.current_offset
        if e <= s:
            # Empty block: it starts and ends in the same item.
            end_ds_idx = start_ds_idx
        else:
            # e is exclusive, so the last element of the block is e - 1.
            ds.seek(e - 1)
            end_ds_idx = ds.current_index
        block_to_dataset_index_view[i][0] = start_ds_idx  # starting index in dataset
        block_to_dataset_index_view[i][1] = start_offset  # starting offset within starting index
        block_to_dataset_index_view[i][2] = end_ds_idx  # ending index in dataset
    return block_to_dataset_index
class TransformEosConcatLangPairDataset(ConcatDataset):
    """
    It is a combination of TransformEosLangPairDataset and ConcatDataset for multiple LangPairDataset datasets.
    Assume all datasets share the same src_eos, tgt_bos, left_pad_source and left_pad_target

    The symbol replacement happens in :func:`collater`: every sample is tagged
    with the index of the dataset it came from, and that index selects the
    replacement symbol.

    Args:
        datasets (list): datasets to concatenate
        src_eos (int): source end-of-sentence index expected in every sample
        tgt_bos (int): target beginning-of-sentence index expected at the
            start of ``prev_output_tokens``
        new_src_eos (list of int, optional): one replacement source EOS index
            per dataset; ``None`` or ``[]`` disables the replacement
        new_tgt_bos (list of int, optional): one replacement target BOS index
            per dataset; ``None`` or ``[]`` disables the replacement
    """

    def __init__(
        self,
        datasets,
        src_eos,
        tgt_bos,
        new_src_eos=None,
        new_tgt_bos=None,
    ):
        super().__init__(datasets)
        if new_src_eos is not None and new_src_eos != []:
            assert len(new_src_eos) == len(datasets)
        else:
            new_src_eos = []
        if new_tgt_bos is not None and new_tgt_bos != []:
            assert len(new_tgt_bos) == len(datasets)
        else:
            new_tgt_bos = []
        self.src_eos = src_eos
        self.tgt_bos = tgt_bos
        # Kept as tensors so they can be indexed with a list of dataset ids.
        self.new_src_eos = (
            torch.LongTensor(new_src_eos).cpu() if len(new_src_eos) > 0 else []
        )
        self.new_tgt_bos = (
            torch.LongTensor(new_tgt_bos).cpu() if len(new_tgt_bos) > 0 else []
        )
        self.left_pad_source = self.is_left_pad_source(datasets)
        self.left_pad_target = self.is_left_pad_target(datasets)
        self.pad_idx = self.src_dict_pad()

    def src_dict_pad(self):
        """Return the padding index of the first dataset's source dictionary.

        Wrapper datasets are unwrapped through their ``dataset`` attribute
        until one exposing ``src_dict`` is found.

        Raises:
            NotImplementedError: if no dataset in the chain has ``src_dict``.
        """
        ds = self.datasets[0]
        while True:
            if hasattr(ds, "src_dict"):
                return ds.src_dict.pad()
            if not hasattr(ds, "dataset"):
                raise NotImplementedError("No src_dict is found")
            ds = ds.dataset

    def __getitem__(self, idx):
        """Return ``(dataset_idx, sample)`` so collater knows the origin."""
        dataset_idx, sample_idx = self._get_dataset_and_sample_index(idx)
        return dataset_idx, self.datasets[dataset_idx][sample_idx]

    @staticmethod
    def _lookup_pad_flag(ds, attr, default):
        """Find ``attr`` on ``ds`` or on the datasets it wraps, else ``default``."""
        while True:
            if hasattr(ds, attr):
                return getattr(ds, attr)
            if not hasattr(ds, "dataset"):
                break
            ds = ds.dataset
        logger.warning(f"{type(ds)} has no {attr}, using default {default}")
        return default

    def _shared_pad_flag(self, datasets, attr, default):
        """Return the value of ``attr`` shared by all datasets or raise."""
        flags = [self._lookup_pad_flag(ds, attr, default) for ds in datasets]
        if any(flag != flags[0] for flag in flags):
            raise ValueError(f"Different {attr} setting detected!")
        return flags[0]

    def is_left_pad_source(self, datasets):
        """Return the ``left_pad_source`` setting common to all datasets.

        Raises:
            ValueError: if the datasets disagree.
        """
        return self._shared_pad_flag(datasets, "left_pad_source", True)

    def is_left_pad_target(self, datasets):
        """Return the ``left_pad_target`` setting common to all datasets.

        Raises:
            ValueError: if the datasets disagree.
        """
        return self._shared_pad_flag(datasets, "left_pad_target", False)

    def collater(self, samples, **extra_args):
        """Collate ``(dataset_idx, sample)`` pairs and swap EOS/BOS symbols.

        Collation is delegated to the first dataset's ``collater`` when it
        has one, otherwise to ``default_collate``.

        Raises:
            NotImplementedError: if a target BOS replacement is requested
                while the target side is left-padded.
        """
        if len(samples) == 0:
            return samples

        dataset_ids = [s[0] for s in samples]
        samples = [s[1] for s in samples]

        if hasattr(self.datasets[0], "collater"):
            samples = self.datasets[0].collater(samples, **extra_args)
        else:
            samples = default_collate(samples, **extra_args)

        if len(self.new_src_eos) > 0:
            if self.left_pad_source:
                # With left padding, EOS is always the last column.
                assert (
                    samples["net_input"]["src_tokens"][:, -1] != self.src_eos
                ).sum() == 0
                samples["net_input"]["src_tokens"][:, -1] = self.new_src_eos[
                    dataset_ids
                ]
            else:
                # With right padding, EOS sits at position length - 1.
                eos_idx = samples["net_input"]["src_lengths"] - 1
                assert (
                    samples["net_input"]["src_tokens"][
                        torch.arange(eos_idx.size(0)), eos_idx
                    ]
                    != self.src_eos
                ).sum() == 0
                samples["net_input"]["src_tokens"].scatter_(
                    1, eos_idx.view(-1, 1), self.new_src_eos[dataset_ids].view(-1, 1)
                )

        if len(self.new_tgt_bos) > 0 and "prev_output_tokens" in samples["net_input"]:
            if self.left_pad_target:
                # TODO: support different padding direction on target side
                raise NotImplementedError(
                    "TransformEosConcatLangPairDataset does not implement --left-pad-target True option"
                )
            else:
                assert (
                    samples["net_input"]["prev_output_tokens"][:, 0] != self.tgt_bos
                ).sum() == 0
                samples["net_input"]["prev_output_tokens"][:, 0] = self.new_tgt_bos[
                    dataset_ids
                ]

        return samples
def collater(self, samples):
    """Apply the configured EOS edits to every sample, then collate.

    Each sample dict is modified in place: EOS is appended to or stripped
    from ``item["source"]`` / ``item["target"]`` according to the wrapper's
    flags, after which the wrapped dataset's ``collater`` builds the batch.
    """
    # (enabled, key, checker, appending) in the order the edits are applied.
    plan = (
        (self.append_eos_to_src, "source", self._check_src, True),
        (self.remove_eos_from_src, "source", self._check_src, False),
        (self.append_eos_to_tgt, "target", self._check_tgt, True),
        (self.remove_eos_from_tgt, "target", self._check_tgt, False),
    )
    active = [step[1:] for step in plan if step[0]]

    edited = []
    for item in samples:
        for key, check, appending in active:
            # keep the EOS tensor on the same device as the sample
            self.eos = self.eos.to(device=item[key].device)
            # appending expects no trailing EOS, stripping expects one
            check(item[key], expect_eos=not appending)
            if appending:
                item[key] = torch.cat([item[key], self.eos])
            else:
                item[key] = item[key][:-1]
        edited.append(item)

    return self.dataset.collater(edited)
def collater(self, samples, **extra_args):
    """Collate with the wrapped dataset, then swap source EOS / target BOS.

    The batch is returned untouched when it is empty or has no
    ``"net_input"`` entry.

    Raises:
        NotImplementedError: if a target BOS replacement is requested while
            the wrapped dataset left-pads the target side.
    """
    samples = self.dataset.collater(samples, **extra_args)
    if len(samples) == 0 or "net_input" not in samples:
        return samples

    net_input = samples["net_input"]

    if self.new_src_eos is not None:
        src_tokens = net_input["src_tokens"]
        if self.dataset.left_pad_source:
            # left padding: EOS is the last column of every row
            assert (src_tokens[:, -1] != self.src_eos).sum() == 0
            src_tokens[:, -1] = self.new_src_eos
        else:
            # right padding: EOS sits at position length - 1 of each row
            src_lengths = net_input["src_lengths"]
            eos_idx = src_lengths - 1
            rows = torch.arange(eos_idx.size(0))
            assert (src_tokens[rows, eos_idx] != self.src_eos).sum() == 0
            src_tokens.scatter_(
                1, eos_idx.view(len(src_lengths), 1), self.new_src_eos
            )

    if self.new_tgt_bos is not None and "prev_output_tokens" in net_input:
        if self.dataset.left_pad_target:
            # TODO: support different padding direction on target side
            raise NotImplementedError(
                "TransformEosLangPairDataset does not implement --left-pad-target True option"
            )
        prev_output_tokens = net_input["prev_output_tokens"]
        assert (prev_output_tokens[:, 0] != self.tgt_bos).sum() == 0
        prev_output_tokens[:, 0] = self.new_tgt_bos

    return samples
@dataclass
class FairseqDataclass:
    """Base dataclass for fairseq configs.

    Adds small helpers for reading a field's name, type, default and
    ``metadata`` entries, and for building a config from an object that
    carries attributes of the same names.
    """

    _name: Optional[str] = None

    @staticmethod
    def name():
        return None

    def _get_all_attributes(self) -> List[str]:
        """Return the names of all dataclass fields."""
        return list(self.__dataclass_fields__)

    def _get_meta(
        self, attribute_name: str, meta: str, default: Optional[Any] = None
    ) -> Any:
        """Return entry ``meta`` of the field's metadata, or ``default``."""
        spec = self.__dataclass_fields__[attribute_name]
        return spec.metadata.get(meta, default)

    def _get_name(self, attribute_name: str) -> str:
        spec = self.__dataclass_fields__[attribute_name]
        return spec.name

    def _get_default(self, attribute_name: str) -> Any:
        """Return the effective default of a field.

        A value starting with ``"${"`` (on the instance first, then on the
        field declaration) is returned as a string. Otherwise an instance
        value that differs from the declared default wins, and finally the
        ``default_factory`` result or the declared default is used.
        """
        if hasattr(self, attribute_name):
            current = getattr(self, attribute_name)
            if str(current).startswith("${"):
                return str(current)
            declared = self.__dataclass_fields__[attribute_name].default
            if str(declared).startswith("${"):
                return str(declared)
            if current != declared:
                return current

        spec = self.__dataclass_fields__[attribute_name]
        if isinstance(spec.default_factory, _MISSING_TYPE):
            return spec.default
        return spec.default_factory()

    def _get_type(self, attribute_name: str) -> Any:
        spec = self.__dataclass_fields__[attribute_name]
        return spec.type

    def _get_help(self, attribute_name: str) -> Any:
        return self._get_meta(attribute_name, "help")

    def _get_argparse_const(self, attribute_name: str) -> Any:
        return self._get_meta(attribute_name, "argparse_const")

    def _get_argparse_alias(self, attribute_name: str) -> Any:
        return self._get_meta(attribute_name, "argparse_alias")

    def _get_choices(self, attribute_name: str) -> Any:
        return self._get_meta(attribute_name, "choices")

    @classmethod
    def from_namespace(cls, args):
        """Build a config from ``args``; an instance of ``cls`` is returned as is.

        Only fields whose name does not start with ``"_"`` and which exist
        on ``args`` are copied onto a freshly constructed config.
        """
        if isinstance(args, cls):
            return args

        config = cls()
        public_fields = (
            key for key in config.__dataclass_fields__ if not key.startswith("_")
        )
        for key in public_fields:
            if hasattr(args, key):
                setattr(config, key, getattr(args, key))
        return config
If skipped, creates or continues run " "based on save_dir" }, ) tensorboard_logdir: Optional[str] = field( default=None, metadata={ "help": "path to save logs for tensorboard, should match --logdir " "of running tensorboard (default: no tensorboard logging)" }, ) wandb_project: Optional[str] = field( default=None, metadata={"help": "Weights and Biases project name to use for logging"}, ) azureml_logging: Optional[bool] = field( default=False, metadata={"help": "Log scalars to AzureML context"}, ) seed: int = field( default=1, metadata={"help": "pseudo random number generator seed"} ) cpu: bool = field(default=False, metadata={"help": "use CPU instead of CUDA"}) tpu: bool = field(default=False, metadata={"help": "use TPU instead of CUDA"}) bf16: bool = field(default=False, metadata={"help": "use bfloat16; implies --tpu"}) memory_efficient_bf16: bool = field( default=False, metadata={ "help": "use a memory-efficient version of BF16 training; implies --bf16" }, ) fp16: bool = field(default=False, metadata={"help": "use FP16"}) memory_efficient_fp16: bool = field( default=False, metadata={ "help": "use a memory-efficient version of FP16 training; implies --fp16" }, ) fp16_no_flatten_grads: bool = field( default=False, metadata={"help": "don't flatten FP16 grads tensor"} ) fp16_init_scale: int = field( default=2**7, metadata={"help": "default FP16 loss scale"} ) fp16_scale_window: Optional[int] = field( default=None, metadata={"help": "number of updates before increasing loss scale"}, ) fp16_scale_tolerance: float = field( default=0.0, metadata={ "help": "pct of updates that can overflow before decreasing the loss scale" }, ) on_cpu_convert_precision: bool = field( default=False, metadata={ "help": "if set, the floating point conversion to fp16/bf16 runs on CPU. " "This reduces bus transfer time and GPU memory usage." 
}, ) min_loss_scale: float = field( default=1e-4, metadata={ "help": "minimum FP16/AMP loss scale, after which training is stopped" }, ) threshold_loss_scale: Optional[float] = field( default=None, metadata={"help": "threshold FP16 loss scale from below"} ) amp: bool = field(default=False, metadata={"help": "use automatic mixed precision"}) amp_batch_retries: int = field( default=2, metadata={ "help": "number of retries of same batch after reducing loss scale with AMP" }, ) amp_init_scale: int = field( default=2**7, metadata={"help": "default AMP loss scale"} ) amp_scale_window: Optional[int] = field( default=None, metadata={"help": "number of updates before increasing AMP loss scale"}, ) user_dir: Optional[str] = field( default=None, metadata={ "help": "path to a python module containing custom extensions (tasks and/or architectures)" }, ) empty_cache_freq: int = field( default=0, metadata={"help": "how often to clear the PyTorch CUDA cache (0 to disable)"}, ) all_gather_list_size: int = field( default=16384, metadata={"help": "number of bytes reserved for gathering stats from workers"}, ) model_parallel_size: int = field( default=1, metadata={"help": "total number of GPUs to parallelize model over"} ) quantization_config_path: Optional[str] = field( default=None, metadata={"help": "path to quantization config file"} ) profile: bool = field( default=False, metadata={"help": "enable autograd profiler emit_nvtx"} ) reset_logging: bool = field( default=False, metadata={ "help": "when using Hydra, reset the logging at the beginning of training" }, ) suppress_crashes: bool = field( default=False, metadata={ "help": "suppress crashes when training with the hydra_train entry point so that the " "main method can return a value (useful for sweeps)" }, ) use_plasma_view: bool = field( default=False, metadata={"help": "Store indices and sizes in shared memory"} ) plasma_path: Optional[str] = field( default="/tmp/plasma", metadata={ "help": "path to run plasma_store, defaults 
@dataclass
class DistributedTrainingConfig(FairseqDataclass):
    """Options for distributed / multi-GPU training.

    Several defaults are evaluated once, when this module is imported:
    the visible CUDA device count and the ``LOCAL_RANK`` environment variable.
    """

    distributed_world_size: int = field(
        default=max(1, torch.cuda.device_count()),
        metadata={
            "help": "total number of GPUs across all nodes (default: all visible GPUs)"
        },
    )
    distributed_num_procs: Optional[int] = field(
        default=max(1, torch.cuda.device_count()),
        metadata={
            "help": "total number of processes to fork (default: all visible GPUs)"
        },
    )
    distributed_rank: Optional[int] = field(
        default=0, metadata={"help": "rank of the current worker"}
    )
    distributed_backend: str = field(
        default="nccl", metadata={"help": "distributed backend"}
    )
    distributed_init_method: Optional[str] = field(
        default=None,
        metadata={
            "help": "typically tcp://hostname:port that will be used to "
            "establish initial connection"
        },
    )
    distributed_port: int = field(
        default=-1,
        metadata={
            "help": "port number (not required if using --distributed-init-method)"
        },
    )
    device_id: int = field(
        # os.getenv returns a string whenever the variable is set, so convert
        # explicitly to keep the default consistent with the declared int type.
        default=int(os.getenv("LOCAL_RANK", 0)),
        metadata={
            "help": "which GPU to use (by default looks for $LOCAL_RANK, usually configured automatically)",
            "argparse_alias": "--local_rank",
        },
    )
    distributed_no_spawn: bool = field(
        default=False,
        metadata={
            "help": "do not spawn multiple processes even if multiple GPUs are visible"
        },
    )
    ddp_backend: DDP_BACKEND_CHOICES = field(
        default="pytorch_ddp", metadata={"help": "DistributedDataParallel backend"}
    )
    ddp_comm_hook: DDP_COMM_HOOK_CHOICES = field(
        default="none", metadata={"help": "communication hook"}
    )
    bucket_cap_mb: int = field(
        default=25, metadata={"help": "bucket size for reduction"}
    )
    fix_batches_to_gpus: bool = field(
        default=False,
        metadata={
            "help": "don't shuffle batches between GPUs; this reduces overall "
            "randomness and may affect precision but avoids the cost of re-reading the data"
        },
    )
    find_unused_parameters: bool = field(
        default=False,
        metadata={
            "help": "enable unused parameter detection (not applicable to "
            "--ddp-backend=legacy_ddp)"
        },
    )
    gradient_as_bucket_view: bool = field(
        default=False,
        metadata={
            "help": "when set to True, gradients will be views pointing to different offsets of allreduce communication buckets. This can reduce peak memory usage, where the saved memory size will be equal to the total gradients size."
        },
    )
    fast_stat_sync: bool = field(
        default=False,
        metadata={"help": "[deprecated] this is now defined per Criterion"},
    )
    heartbeat_timeout: int = field(
        default=-1,
        metadata={
            "help": "kill the job if no progress is made in N seconds; "
            "set to -1 to disable"
        },
    )
    broadcast_buffers: bool = field(
        default=False,
        metadata={
            "help": "Copy non-trainable parameters between GPUs, such as "
            "batchnorm population statistics"
        },
    )
    slowmo_momentum: Optional[float] = field(
        default=None,
        metadata={
            "help": "SlowMo momentum term; by default use 0.0 for 16 GPUs, "
            "0.2 for 32 GPUs; 0.5 for 64 GPUs, 0.6 for > 64 GPUs"
        },
    )
    slowmo_base_algorithm: str = field(
        default="localsgd",
        metadata={
            "help": "Base algorithm. Either 'localsgd' or 'sgp'. Please refer "
            "to the documentation of 'slowmo_base_algorithm' parameter in "
            "https://fairscale.readthedocs.io/en/latest/api/experimental/nn/slowmo_ddp.html "
            "for more details"
        },
    )
    localsgd_frequency: int = field(
        default=3, metadata={"help": "Local SGD allreduce frequency"}
    )
    nprocs_per_node: int = field(
        default=max(1, torch.cuda.device_count()),
        metadata={
            "help": "number of GPUs in each node. An allreduce operation across GPUs in "
            "a node is very fast. Hence, we do allreduce across GPUs in a node, "
            "and gossip across different nodes"
        },
    )
    pipeline_model_parallel: bool = field(
        default=False,
        metadata={"help": "if set, use pipeline model parallelism across GPUs"},
    )
    pipeline_balance: Optional[str] = field(
        default=None,
        metadata={
            "help": "partition the model into N_K pieces, where each piece "
            "contains N_i layers. The sum(args.pipeline_balance) "
            "should equal the total number of layers in the model"
        },
    )
    pipeline_devices: Optional[str] = field(
        default=None,
        metadata={
            "help": "a list of device indices indicating which device to place "
            "each of the N_K partitions. The length of this list should "
            "equal the length of the --pipeline-balance argument"
        },
    )
    pipeline_chunks: Optional[int] = field(
        default=0, metadata={"help": "microbatch count for pipeline model parallelism"}
    )
    pipeline_encoder_balance: Optional[str] = field(
        default=None,
        metadata={
            "help": "partition the pipeline parallel encoder into N_K pieces, where each piece "
            "contains N_i layers. The sum(args.pipeline_encoder_balance) "
            "should equal the total number of encoder layers in the model"
        },
    )
    pipeline_encoder_devices: Optional[str] = field(
        default=None,
        metadata={
            "help": "a list of device indices indicating which device to place "
            "each of the N_K partitions. The length of this list should "
            "equal the length of the --pipeline-encoder-balance argument"
        },
    )
    pipeline_decoder_balance: Optional[str] = field(
        default=None,
        metadata={
            "help": "partition the pipeline parallel decoder into N_K pieces, where each piece "
            "contains N_i layers. The sum(args.pipeline_decoder_balance) "
            "should equal the total number of decoder layers in the model"
        },
    )
    pipeline_decoder_devices: Optional[str] = field(
        default=None,
        metadata={
            "help": "a list of device indices indicating which device to place "
            "each of the N_K partitions. The length of this list should "
            "equal the length of the --pipeline-decoder-balance argument"
        },
    )
    pipeline_checkpoint: PIPELINE_CHECKPOINT_CHOICES = field(
        default="never",
        metadata={"help": "checkpointing mode for pipeline model parallelism"},
    )
    zero_sharding: ZERO_SHARDING_CHOICES = field(
        default="none", metadata={"help": "ZeRO sharding"}
    )
    # Interpolated from the "common" config group.
    fp16: bool = II("common.fp16")
    memory_efficient_fp16: bool = II("common.memory_efficient_fp16")
    tpu: bool = II("common.tpu")
    # configuration for --ddp-backend=fully_sharded
    no_reshard_after_forward: bool = field(
        default=False,
        metadata={"help": "don't reshard parameters after forward pass"},
    )
    fp32_reduce_scatter: bool = field(
        default=False,
        metadata={"help": "reduce-scatter grads in FP32"},
    )
    cpu_offload: bool = field(
        default=False, metadata={"help": "offload FP32 params to CPU"}
    )
    use_sharded_state: bool = field(
        default=False,
        metadata={"help": "use sharded checkpoint files"},
    )
    not_fsdp_flatten_parameters: bool = field(
        default=False,
        metadata={"help": "not flatten parameter param for fsdp"},
    )
) data_buffer_size: int = field( default=10, metadata={"help": "Number of batches to preload"} ) train_subset: str = field( default="train", metadata={"help": "data subset to use for training (e.g. train, valid, test)"}, ) valid_subset: str = field( default="valid", metadata={ "help": "comma separated list of data subsets to use for validation" " (e.g. train, valid, test)" }, ) combine_valid_subsets: Optional[bool] = field( default=None, metadata={ "help": "comma separated list of data subsets to use for validation" " (e.g. train, valid, test)", "argparse_alias": "--combine-val", }, ) ignore_unused_valid_subsets: Optional[bool] = field( default=False, metadata={"help": "do not raise error if valid subsets are ignored"}, ) validate_interval: int = field( default=1, metadata={"help": "validate every N epochs"} ) validate_interval_updates: int = field( default=0, metadata={"help": "validate every N updates"} ) validate_after_updates: int = field( default=0, metadata={"help": "dont validate until reaching this many updates"} ) fixed_validation_seed: Optional[int] = field( default=None, metadata={"help": "specified random seed for validation"} ) disable_validation: bool = field( default=False, metadata={"help": "disable validation"} ) max_tokens_valid: Optional[int] = field( default=II("dataset.max_tokens"), metadata={ "help": "maximum number of tokens in a validation batch" " (defaults to --max-tokens)" }, ) batch_size_valid: Optional[int] = field( default=II("dataset.batch_size"), metadata={ "help": "batch size of the validation batch (defaults to --batch-size)", "argparse_alias": "--max-sentences-valid", }, ) max_valid_steps: Optional[int] = field( default=None, metadata={"help": "How many batches to evaluate", "argparse_alias": "--nval"}, ) curriculum: int = field( default=0, metadata={"help": "don't shuffle batches for first N epochs"} ) gen_subset: str = field( default="test", metadata={"help": "data subset to generate (train, valid, test)"}, ) num_shards: int = 
field( default=1, metadata={"help": "shard generation over N shards"} ) shard_id: int = field( default=0, metadata={"help": "id of the shard to generate (id < num_shards)"} ) grouped_shuffling: bool = field( default=False, metadata={ "help": "shuffle batches in groups of num_shards to enable similar sequence lengths on each GPU worker when batches are sorted by length", }, ) update_epoch_batch_itr: bool = field( default=II("dataset.grouped_shuffling"), metadata={ "help": "if true then prevents the reuse the epoch batch iterator by setting can_reuse_epoch_itr to false, defaults to --grouped-shuffling )", }, ) update_ordered_indices_seed: bool = field( default=False, metadata={ "help": "if true then increment seed with epoch for getting batch iterators, defautls to False.", }, ) @dataclass class OptimizationConfig(FairseqDataclass): max_epoch: int = field( default=0, metadata={"help": "force stop training at specified epoch"} ) max_update: int = field( default=0, metadata={"help": "force stop training at specified update"} ) stop_time_hours: float = field( default=0, metadata={ "help": "force stop training after specified cumulative time (if >0)" }, ) clip_norm: float = field( default=0.0, metadata={"help": "clip threshold of gradients"} ) sentence_avg: bool = field( default=False, metadata={ "help": "normalize gradients by the number of sentences in a batch" " (default is to normalize by number of tokens)" }, ) update_freq: List[int] = field( default_factory=lambda: [1], metadata={"help": "update parameters every N_i batches, when in epoch i"}, ) lr: List[float] = field( default_factory=lambda: [0.25], metadata={ "help": "learning rate for the first N epochs; all epochs >N using LR_N" " (note: this may be interpreted differently depending on --lr-scheduler)" }, ) stop_min_lr: float = field( default=-1.0, metadata={"help": "stop training when the learning rate reaches this minimum"}, ) use_bmuf: bool = field( default=False, metadata={ "help": "specify global 
optimizer for syncing models on different GPUs/shards" }, ) skip_remainder_batch: Optional[bool] = field( default=False, metadata={ "help": "if set, include the last (partial) batch of each epoch in training" " (default is to skip it)." }, ) debug_param_names: bool = False @dataclass class CheckpointConfig(FairseqDataclass): save_dir: str = field( default="checkpoints", metadata={"help": "path to save checkpoints"} ) restore_file: str = field( default="checkpoint_last.pt", metadata={ "help": "filename from which to load checkpoint " "(default: <save-dir>/checkpoint_last.pt" }, ) continue_once: Optional[str] = field( default=None, metadata={ "help": "continues from this checkpoint, unless a checkpoint indicated in 'restore_file' option is present" }, ) finetune_from_model: Optional[str] = field( default=None, metadata={ "help": "finetune from a pretrained model; note that meters and lr scheduler will be reset" }, ) reset_dataloader: bool = field( default=False, metadata={ "help": "if set, does not reload dataloader state from the checkpoint" }, ) reset_lr_scheduler: bool = field( default=False, metadata={ "help": "if set, does not load lr scheduler state from the checkpoint" }, ) reset_meters: bool = field( default=False, metadata={"help": "if set, does not load meters from the checkpoint"}, ) reset_optimizer: bool = field( default=False, metadata={"help": "if set, does not load optimizer state from the checkpoint"}, ) optimizer_overrides: str = field( default="{}", metadata={ "help": "a dictionary used to override optimizer args when loading a checkpoint" }, ) save_interval: int = field( default=1, metadata={"help": "save a checkpoint every N epochs"} ) save_interval_updates: int = field( default=0, metadata={"help": "save a checkpoint (and validate) every N updates"} ) keep_interval_updates: int = field( default=-1, metadata={ "help": "keep the last N checkpoints saved with --save-interval-updates" }, ) keep_interval_updates_pattern: int = field( default=-1, 
metadata={ "help": "when used with --keep-interval-updates, skips deleting " "any checkpoints with update X where " "X %% keep_interval_updates_pattern == 0" }, ) keep_last_epochs: int = field( default=-1, metadata={"help": "keep last N epoch checkpoints"} ) keep_best_checkpoints: int = field( default=-1, metadata={"help": "keep best N checkpoints based on scores"} ) no_save: bool = field( default=False, metadata={"help": "don't save models or checkpoints"} ) no_epoch_checkpoints: bool = field( default=False, metadata={"help": "only store last and best checkpoints"} ) no_last_checkpoints: bool = field( default=False, metadata={"help": "don't store last checkpoints"} ) no_save_optimizer_state: bool = field( default=False, metadata={"help": "don't save optimizer-state as part of checkpoint"}, ) best_checkpoint_metric: str = field( default="loss", metadata={"help": 'metric to use for saving "best" checkpoints'} ) maximize_best_checkpoint_metric: bool = field( default=False, metadata={ "help": 'select the largest metric value for saving "best" checkpoints' }, ) patience: int = field( default=-1, metadata={ "help": ( "early stop training if valid performance doesn't " "improve for N consecutive validation runs; note " "that this is influenced by --validate-interval" ) }, ) checkpoint_suffix: str = field( default="", metadata={"help": "suffix to add to the checkpoint file name"} ) checkpoint_shard_count: int = field( default=1, metadata={ "help": "Number of shards containing the checkpoint - " "if the checkpoint is over 300GB, it is preferable " "to split it into shards to prevent OOM on CPU while loading " "the checkpoint" }, ) load_checkpoint_on_all_dp_ranks: bool = field( default=False, metadata={ "help": "load checkpoints on all data parallel devices " "(default: only load on rank 0 and broadcast to other devices)" }, ) write_checkpoints_asynchronously: bool = field( default=False, metadata={ "help": ( "Write checkpoints asynchronously in a separate " "thread. 
NOTE: This feature is currently being tested." ), "argparse_alias": "--save-async", }, ) model_parallel_size: int = II("common.model_parallel_size") @dataclass class FairseqBMUFConfig(FairseqDataclass): block_lr: float = field( default=1, metadata={"help": "block learning rate for bmuf"} ) block_momentum: float = field( default=0.875, metadata={"help": "block momentum for bmuf"} ) global_sync_iter: int = field( default=50, metadata={"help": "Iteration for syncing global model"} ) warmup_iterations: int = field( default=500, metadata={"help": "warmup iterations for model to broadcast"} ) use_nbm: bool = field( default=False, metadata={"help": "Specify whether you want to use classical BM / Nesterov BM"}, ) average_sync: bool = field( default=False, metadata={ "help": "Specify whether you want to average the local momentum after each sync" }, ) distributed_world_size: int = II("distributed_training.distributed_world_size") @dataclass class GenerationConfig(FairseqDataclass): beam: int = field( default=5, metadata={"help": "beam size"}, ) beam_mt: int = field( default=0, metadata={"help": "beam size for the first-pass decoder"}, ) nbest: int = field( default=1, metadata={"help": "number of hypotheses to output"}, ) max_len_a: float = field( default=0, metadata={ "help": "generate sequences of maximum length ax + b, where x is the source length" }, ) max_len_b: int = field( default=200, metadata={ "help": "generate sequences of maximum length ax + b, where x is the source length" }, ) max_len_a_mt: float = field( default=0, metadata={ "help": "generate sequences of maximum length ax + b, where x is the source length for the first-pass decoder" }, ) max_len_b_mt: int = field( default=200, metadata={ "help": "generate sequences of maximum length ax + b, where x is the source length for the first-pass decoder" }, ) min_len: int = field( default=1, metadata={"help": "minimum generation length"}, ) match_source_len: bool = field( default=False, metadata={"help": 
"generations should match the source length"}, ) unnormalized: bool = field( default=False, metadata={"help": "compare unnormalized hypothesis scores"}, ) no_early_stop: bool = field( default=False, metadata={"help": "deprecated"}, ) no_beamable_mm: bool = field( default=False, metadata={"help": "don't use BeamableMM in attention layers"}, ) lenpen: float = field( default=1, metadata={ "help": "length penalty: <1.0 favors shorter, >1.0 favors longer sentences" }, ) lenpen_mt: float = field( default=1, metadata={ "help": "length penalty for the first-pass decoder: <1.0 favors shorter, >1.0 favors longer sentences" }, ) unkpen: float = field( default=0, metadata={ "help": "unknown word penalty: <0 produces more unks, >0 produces fewer" }, ) replace_unk: Optional[str] = field( default=None, metadata={ "help": "perform unknown replacement (optionally with alignment dictionary)", "argparse_const": "@@ ", }, ) sacrebleu: bool = field( default=False, metadata={"help": "score with sacrebleu"}, ) score_reference: bool = field( default=False, metadata={"help": "just score the reference translation"}, ) prefix_size: int = field( default=0, metadata={"help": "initialize generation by target prefix of given length"}, ) no_repeat_ngram_size: int = field( default=0, metadata={ "help": "ngram blocking such that this size ngram cannot be repeated in the generation" }, ) sampling: bool = field( default=False, metadata={"help": "sample hypotheses instead of using beam search"}, ) sampling_topk: int = field( default=-1, metadata={"help": "sample from top K likely next words instead of all words"}, ) sampling_topp: float = field( default=-1.0, metadata={ "help": "sample from the smallest set whose cumulative probability mass exceeds p for next words" }, ) constraints: Optional[GENERATION_CONSTRAINTS_CHOICES] = field( default=None, metadata={ "help": "enables lexically constrained decoding", "argparse_const": "ordered", }, ) temperature: float = field( default=1.0, metadata={"help": 
"temperature for generation"}, ) diverse_beam_groups: int = field( default=-1, metadata={"help": "number of groups for Diverse Beam Search"}, ) diverse_beam_strength: float = field( default=0.5, metadata={"help": "strength of diversity penalty for Diverse Beam Search"}, ) diversity_rate: float = field( default=-1.0, metadata={"help": "strength of diversity penalty for Diverse Siblings Search"}, ) print_alignment: Optional[PRINT_ALIGNMENT_CHOICES] = field( default=None, metadata={ "help": "if set, uses attention feedback to compute and print alignment to source tokens " "(valid options are: hard, soft, otherwise treated as hard alignment)", "argparse_const": "hard", }, ) print_step: bool = field( default=False, metadata={"help": "print steps"}, ) lm_path: Optional[str] = field( default=None, metadata={"help": "path to lm checkpoint for lm fusion"}, ) lm_weight: float = field( default=0.0, metadata={"help": "weight for lm probs for lm fusion"}, ) # arguments for iterative refinement generator iter_decode_eos_penalty: float = field( default=0.0, metadata={"help": "if > 0.0, it penalized early-stopping in decoding."}, ) iter_decode_max_iter: int = field( default=10, metadata={"help": "maximum iterations for iterative refinement."}, ) iter_decode_force_max_iter: bool = field( default=False, metadata={ "help": "if set, run exact the maximum number of iterations without early stop" }, ) iter_decode_with_beam: int = field( default=1, metadata={ "help": "if > 1, model will generate translations varying by the lengths." 
}, ) iter_decode_with_external_reranker: bool = field( default=False, metadata={ "help": "if set, the last checkpoint are assumed to be a reranker to rescore the translations" }, ) retain_iter_history: bool = field( default=False, metadata={ "help": "if set, decoding returns the whole history of iterative refinement" }, ) retain_dropout: bool = field( default=False, metadata={"help": "Use dropout at inference time"}, ) # temporarily set to Any until https://github.com/facebookresearch/hydra/issues/1117 is fixed # retain_dropout_modules: Optional[List[str]] = field( retain_dropout_modules: Any = field( default=None, metadata={ "help": "if set, only retain dropout for the specified modules; " "if not set, then dropout will be retained for all modules" }, ) # special decoding format for advanced decoding. decoding_format: Optional[GENERATION_DECODING_FORMAT_CHOICES] = field( default=None, metadata={"help": "special decoding format for advanced decoding."}, ) no_seed_provided: bool = field( default=False, metadata={"help": "if set, dont use seed for initializing random generators"}, ) eos_token: Optional[str] = field( default=None, metadata={"help": "EOS token"}, ) @dataclass class CommonEvalConfig(FairseqDataclass): path: Optional[str] = field( default=None, metadata={"help": "path(s) to model file(s), colon separated"}, ) post_process: Optional[str] = field( default=None, metadata={ "help": ( "post-process text by removing BPE, letter segmentation, etc. " "Valid options can be found in fairseq.data.utils.post_process." 
), "argparse_const": "subword_nmt", "argparse_alias": "--remove-bpe", }, ) quiet: bool = field(default=False, metadata={"help": "only print final scores"}) model_overrides: str = field( default="{}", metadata={ "help": "a dictionary used to override model args at generation that were used during model training" }, ) results_path: Optional[str] = field( default=None, metadata={"help": "path to save eval results (optional)"} ) @dataclass class EvalLMConfig(FairseqDataclass): output_word_probs: bool = field( default=False, metadata={ "help": "if set, outputs words and their predicted log probabilities to standard output" }, ) output_word_stats: bool = field( default=False, metadata={ "help": "if set, outputs word statistics such as word count, average probability, etc" }, ) context_window: int = field( default=0, metadata={ "help": "ensures that every evaluated token has access to a context of at least this size, if possible" }, ) softmax_batch: int = field( default=sys.maxsize, metadata={ "help": "if BxT is more than this, will batch the softmax over vocab to this amount of tokens, in order to fit into GPU memory" }, ) @dataclass class InteractiveConfig(FairseqDataclass): buffer_size: int = field( default=0, metadata={ "help": "read this many sentences into a buffer before processing them" }, ) input: str = field( default="-", metadata={"help": "file to read from; use - for stdin"}, ) @dataclass class EMAConfig(FairseqDataclass): store_ema: bool = field( default=False, metadata={help: "store exponential moving average shadow model"} ) ema_decay: float = field( default=0.9999, metadata={"help": "decay for exponential moving average model"} ) ema_start_update: int = field( default=0, metadata={"help": "start EMA update after this many model updates"} ) ema_seed_model: Optional[str] = field( default=None, metadata={ "help": "Seed to load EMA model from. " "Used to load EMA model separately from the actual model." 
}, ) ema_update_freq: int = field( default=1, metadata={"help": "Do EMA update every this many model updates"} ) ema_fp32: bool = field( default=False, metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, ) @dataclass class FairseqConfig(FairseqDataclass): common: CommonConfig = CommonConfig() common_eval: CommonEvalConfig = CommonEvalConfig() distributed_training: DistributedTrainingConfig = DistributedTrainingConfig() dataset: DatasetConfig = DatasetConfig() optimization: OptimizationConfig = OptimizationConfig() checkpoint: CheckpointConfig = CheckpointConfig() bmuf: FairseqBMUFConfig = FairseqBMUFConfig() generation: GenerationConfig = GenerationConfig() eval_lm: EvalLMConfig = EvalLMConfig() interactive: InteractiveConfig = InteractiveConfig() model: Any = MISSING task: Any = None criterion: Any = None optimizer: Any = None lr_scheduler: Any = None scoring: Any = None bpe: Any = None tokenizer: Any = None ema: EMAConfig = EMAConfig() ================================================ FILE: fairseq/dataclass/constants.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from enum import Enum, EnumMeta
from typing import List


class StrEnumMeta(EnumMeta):
    """Metaclass for :class:`StrEnum` with a relaxed ``isinstance`` check."""

    # this is workaround for submitit pickling leading to instance checks failing in hydra for StrEnum, see
    # https://github.com/facebookresearch/hydra/issues/1156
    @classmethod
    def __instancecheck__(cls, other):
        # Accept any object whose type's string form mentions "enum".
        type_repr = str(type(other))
        return "enum" in type_repr


class StrEnum(Enum, metaclass=StrEnumMeta):
    """Enum whose members print, compare and hash through their value."""

    def __repr__(self):
        return self.value

    def __str__(self):
        return self.value

    def __hash__(self):
        # Hash the string form so a member hashes like its string form.
        return hash(str(self))

    def __eq__(self, other: str):
        return self.value == other


def ChoiceEnum(choices: List[str]):
    """Build a ``StrEnum`` named "Choices" whose member names equal their values."""
    members = {choice: choice for choice in choices}
    return StrEnum("Choices", members)


LOG_FORMAT_CHOICES = ChoiceEnum(["json", "none", "simple", "tqdm"])
DDP_BACKEND_CHOICES = ChoiceEnum(
    [
        "c10d",  # alias for pytorch_ddp
        "fully_sharded",  # FullyShardedDataParallel from fairscale
        "legacy_ddp",
        "no_c10d",  # alias for legacy_ddp
        "pytorch_ddp",
        "slowmo",
    ]
)
DDP_COMM_HOOK_CHOICES = ChoiceEnum(["none", "fp16"])
DATASET_IMPL_CHOICES = ChoiceEnum(["raw", "lazy", "cached", "mmap", "fasta", "huffman"])
GENERATION_CONSTRAINTS_CHOICES = ChoiceEnum(["ordered", "unordered"])
GENERATION_DECODING_FORMAT_CHOICES = ChoiceEnum(
    ["unigram", "ensemble", "vote", "dp", "bs"]
)
ZERO_SHARDING_CHOICES = ChoiceEnum(["none", "os"])
PIPELINE_CHECKPOINT_CHOICES = ChoiceEnum(["always", "never", "except_last"])
PRINT_ALIGNMENT_CHOICES = ChoiceEnum(["hard", "soft"])


# ================================================
# FILE: fairseq/dataclass/initialize.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""isort:skip_file""" import logging from hydra.core.config_store import ConfigStore from fairseq.dataclass.configs import FairseqConfig from omegaconf import DictConfig, OmegaConf logger = logging.getLogger(__name__) def hydra_init(cfg_name="config") -> None: cs = ConfigStore.instance() cs.store(name=f"{cfg_name}", node=FairseqConfig) for k in FairseqConfig.__dataclass_fields__: v = FairseqConfig.__dataclass_fields__[k].default try: cs.store(name=k, node=v) except BaseException: logger.error(f"{k} - {v}") raise def add_defaults(cfg: DictConfig) -> None: """This function adds default values that are stored in dataclasses that hydra doesn't know about""" from fairseq.registry import REGISTRIES from fairseq.tasks import TASK_DATACLASS_REGISTRY from fairseq.models import ARCH_MODEL_NAME_REGISTRY, MODEL_DATACLASS_REGISTRY from fairseq.dataclass.utils import merge_with_parent from typing import Any OmegaConf.set_struct(cfg, False) for k, v in FairseqConfig.__dataclass_fields__.items(): field_cfg = cfg.get(k) if field_cfg is not None and v.type == Any: dc = None if isinstance(field_cfg, str): field_cfg = DictConfig({"_name": field_cfg}) field_cfg.__dict__["_parent"] = field_cfg.__dict__["_parent"] name = getattr(field_cfg, "_name", None) if k == "task": dc = TASK_DATACLASS_REGISTRY.get(name) elif k == "model": name = ARCH_MODEL_NAME_REGISTRY.get(name, name) dc = MODEL_DATACLASS_REGISTRY.get(name) elif k in REGISTRIES: dc = REGISTRIES[k]["dataclass_registry"].get(name) if dc is not None: cfg[k] = merge_with_parent(dc, field_cfg) ================================================ FILE: fairseq/dataclass/utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import ast
import inspect
import logging
import os
import re
from argparse import ArgumentError, ArgumentParser, Namespace
from dataclasses import _MISSING_TYPE, MISSING, is_dataclass
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple, Type

from fairseq.dataclass import FairseqDataclass
from fairseq.dataclass.configs import FairseqConfig
from hydra.core.global_hydra import GlobalHydra
from hydra.experimental import compose, initialize
from omegaconf import DictConfig, OmegaConf, open_dict, _utils

logger = logging.getLogger(__name__)


def eval_str_list(x, x_type=float):
    """Convert *x* to a list whose items are passed through *x_type*.

    ``None`` is returned unchanged and an empty string yields ``[]``. Any other
    string is first parsed with ``ast.literal_eval``. A value that cannot be
    iterated (``TypeError``) is wrapped in a one-element list.
    """
    if x is None:
        return None
    if isinstance(x, str):
        if len(x) == 0:
            return []
        x = ast.literal_eval(x)
    try:
        return list(map(x_type, x))
    except TypeError:
        return [x_type(x)]


def interpret_dc_type(field_type):
    """Resolve a dataclass field annotation to the type used for parsing.

    ``Any`` maps to ``str``; ``Optional[X]`` / ``Union[X, None]`` map to ``X``;
    every other annotation is returned unchanged.

    Raises:
        RuntimeError: if *field_type* is a string rather than a type.
    """
    if isinstance(field_type, str):
        raise RuntimeError("field should be a type")

    if field_type == Any:
        return str

    typestring = str(field_type)
    if re.match(
        r"(typing.|^)Union\[(.*), NoneType\]$", typestring
    ) or typestring.startswith("typing.Optional"):
        return field_type.__args__[0]
    return field_type


def gen_parser_from_dataclass(
    parser: ArgumentParser,
    dataclass_instance: FairseqDataclass,
    delete_default: bool = False,
    with_prefix: Optional[str] = None,
) -> None:
    """
    convert a dataclass instance to tailing parser arguments.

    If `with_prefix` is provided, prefix all the keys in the resulting parser with it. It means that we are
    building a flat namespace from a structured dataclass (see transformer_config.py for example).

    If `delete_default` is set, no default is passed to argparse for any argument.
    Arguments that argparse rejects with ``ArgumentError`` are skipped silently.
    """

    def argparse_name(name: str):
        if name == "data" and (with_prefix is None or with_prefix == ""):
            # normally data is positional args, so we don't add the -- nor the prefix
            return name
        if name == "_name":
            # private member, skip
            return None
        full_name = "--" + name.replace("_", "-")
        if with_prefix is not None and with_prefix != "":
            # if a prefix is specified, construct the prefixed arg name
            full_name = with_prefix + "-" + full_name[2:]  # strip -- when composing
        return full_name

    def get_kwargs_from_dc(
        dataclass_instance: FairseqDataclass, k: str
    ) -> Dict[str, Any]:
        """k: dataclass attributes"""

        kwargs = {}

        field_type = dataclass_instance._get_type(k)
        inter_type = interpret_dc_type(field_type)

        field_default = dataclass_instance._get_default(k)

        if isinstance(inter_type, type) and issubclass(inter_type, Enum):
            field_choices = [t.value for t in list(inter_type)]
        else:
            field_choices = None

        field_help = dataclass_instance._get_help(k)
        field_const = dataclass_instance._get_argparse_const(k)

        if isinstance(field_default, str) and field_default.startswith("${"):
            # interpolation string: pass it through untouched
            kwargs["default"] = field_default
        else:
            if field_default is MISSING:
                kwargs["required"] = True
            if field_choices is not None:
                kwargs["choices"] = field_choices
            if (
                isinstance(inter_type, type)
                and (issubclass(inter_type, List) or issubclass(inter_type, Tuple))
            ) or ("List" in str(inter_type) or "Tuple" in str(inter_type)):
                if "int" in str(inter_type):
                    kwargs["type"] = lambda x: eval_str_list(x, int)
                elif "float" in str(inter_type):
                    kwargs["type"] = lambda x: eval_str_list(x, float)
                elif "str" in str(inter_type):
                    kwargs["type"] = lambda x: eval_str_list(x, str)
                else:
                    raise NotImplementedError(
                        "parsing of type " + str(inter_type) + " is not implemented"
                    )
                if field_default is not MISSING:
                    kwargs["default"] = (
                        ",".join(map(str, field_default))
                        if field_default is not None
                        else None
                    )
            elif (
                isinstance(inter_type, type) and issubclass(inter_type, Enum)
            ) or "Enum" in str(inter_type):
                kwargs["type"] = str
                if field_default is not MISSING:
                    if isinstance(field_default, Enum):
                        kwargs["default"] = field_default.value
                    else:
                        kwargs["default"] = field_default
            elif inter_type is bool:
                kwargs["action"] = (
                    "store_false" if field_default is True else "store_true"
                )
                kwargs["default"] = field_default
            else:
                kwargs["type"] = inter_type
                if field_default is not MISSING:
                    kwargs["default"] = field_default

        # build the help with the hierarchical prefix
        if with_prefix is not None and with_prefix != "" and field_help is not None:
            field_help = with_prefix[2:] + ": " + field_help

        kwargs["help"] = field_help
        if field_const is not None:
            kwargs["const"] = field_const
            kwargs["nargs"] = "?"

        return kwargs

    for k in dataclass_instance._get_all_attributes():
        field_name = argparse_name(dataclass_instance._get_name(k))
        field_type = dataclass_instance._get_type(k)
        if field_name is None:
            continue
        elif inspect.isclass(field_type) and issubclass(field_type, FairseqDataclass):
            # for fields that are of type FairseqDataclass, we can recursively
            # add their fields to the namespace (so we add the args from model, task, etc. to the root namespace)
            prefix = None
            if with_prefix is not None:
                # if a prefix is specified, then we don't want to copy the subfields directly to the root namespace
                # but we prefix them with the name of the current field.
                prefix = field_name
            gen_parser_from_dataclass(parser, field_type(), delete_default, prefix)
            continue

        kwargs = get_kwargs_from_dc(dataclass_instance, k)

        field_args = [field_name]
        alias = dataclass_instance._get_argparse_alias(k)
        if alias is not None:
            field_args.append(alias)

        if "default" in kwargs:
            if isinstance(kwargs["default"], str) and kwargs["default"].startswith(
                "${"
            ):
                if kwargs["help"] is None:
                    # this is a field with a name that will be added elsewhere
                    continue
                else:
                    del kwargs["default"]
            if delete_default and "default" in kwargs:
                del kwargs["default"]
        try:
            parser.add_argument(*field_args, **kwargs)
        except ArgumentError:
            pass


def _set_legacy_defaults(args, cls):
    """Helper to set default arguments based on *add_args*."""
    if not hasattr(cls, "add_args"):
        return

    import argparse

    parser = argparse.ArgumentParser(
        argument_default=argparse.SUPPRESS, allow_abbrev=False
    )
    cls.add_args(parser)
    # copied from argparse.py:
    defaults = argparse.Namespace()
    for action in parser._actions:
        if action.dest is not argparse.SUPPRESS:
            if not hasattr(defaults, action.dest):
                if action.default is not argparse.SUPPRESS:
                    setattr(defaults, action.dest, action.default)
    # only fill in attributes that args does not already carry
    for key, default_value in vars(defaults).items():
        if not hasattr(args, key):
            setattr(args, key, default_value)


def _override_attr(
    sub_node: str, data_class: Type[FairseqDataclass], args: Namespace
) -> List[str]:
    """Build ``sub_node.key=value`` override strings for the public fields of *data_class*.

    Each value comes from *args* when it has an attribute of that name,
    otherwise from the field default. An empty list is returned when
    *data_class* is not a ``FairseqDataclass`` subclass.
    """
    overrides = []

    if not inspect.isclass(data_class) or not issubclass(data_class, FairseqDataclass):
        return overrides

    def get_default(f):
        if not isinstance(f.default_factory, _MISSING_TYPE):
            return f.default_factory()
        return f.default

    for k, v in data_class.__dataclass_fields__.items():
        if k.startswith("_"):
            # private member, skip
            continue

        val = get_default(v) if not hasattr(args, k) else getattr(args, k)

        field_type = interpret_dc_type(v.type)
        if (
            isinstance(val, str)
            and not val.startswith("${")  # not interpolation
            and field_type != str
            and (
                not inspect.isclass(field_type) or not issubclass(field_type, Enum)
            )  # not choices enum
        ):
            # upgrade old models that stored complex parameters as string
            val = ast.literal_eval(val)

        if isinstance(val, tuple):
            val = list(val)

        v_type = getattr(v.type, "__origin__", None)
        if (
            (v_type is List or v_type is list or v_type is Optional)
            # skip interpolation
            and not (isinstance(val, str) and val.startswith("${"))
        ):
            # if type is int but val is float, then we will crash later - try to convert here
            if hasattr(v.type, "__args__"):
                t_args = v.type.__args__
                if len(t_args) == 1 and (t_args[0] is float or t_args[0] is int):
                    val = list(map(t_args[0], val))
        elif val is not None and (
            field_type is int or field_type is bool or field_type is float
        ):
            try:
                val = field_type(val)
            except Exception:
                # ignore errors here, they are often from interpolation args;
                # Exception (not a bare except) so KeyboardInterrupt/SystemExit propagate
                pass

        if val is None:
            overrides.append("{}.{}=null".format(sub_node, k))
        elif val == "":
            overrides.append("{}.{}=''".format(sub_node, k))
        elif isinstance(val, str):
            val = val.replace("'", r"\'")
            overrides.append("{}.{}='{}'".format(sub_node, k, val))
        elif isinstance(val, FairseqDataclass):
            overrides += _override_attr(f"{sub_node}.{k}", type(val), args)
        elif isinstance(val, Namespace):
            sub_overrides, _ = override_module_args(val)
            for so in sub_overrides:
                overrides.append(f"{sub_node}.{k}.{so}")
        else:
            overrides.append("{}.{}={}".format(sub_node, k, val))

    return overrides


def migrate_registry(
    name, value, registry, args, overrides, deletes, use_name_as_val=False
):
    """Append the overrides (or the delete) for one registry-backed config group.

    When *value* is in *registry*, the group choice, its ``_name`` and the
    field overrides of the registered dataclass are appended to *overrides*.
    Otherwise the bare choice is appended if *use_name_as_val* is set and
    *value* is not ``None``; failing that, *name* is appended to *deletes*.
    """
    if value in registry:
        overrides.append("{}={}".format(name, value))
        overrides.append("{}._name={}".format(name, value))
        overrides.extend(_override_attr(name, registry[value], args))
    elif use_name_as_val and value is not None:
        overrides.append("{}={}".format(name, value))
    else:
        deletes.append(name)


def override_module_args(args: Namespace) -> Tuple[List[str], List[str]]:
    """use the field in args to overrides those in cfg

    Returns a pair ``(overrides, deletes)``: the override strings to compose
    with, and the top-level keys to be removed.
    """
    overrides = []
    deletes = []

    for k in FairseqConfig.__dataclass_fields__.keys():
        overrides.extend(
            _override_attr(k, FairseqConfig.__dataclass_fields__[k].type, args)
        )

    if args is not None:
        if hasattr(args, "task"):
            from fairseq.tasks import TASK_DATACLASS_REGISTRY

            migrate_registry(
                "task", args.task, TASK_DATACLASS_REGISTRY, args, overrides, deletes
            )
        else:
            deletes.append("task")

        # these options will be set to "None" if they have not yet been migrated
        # so we can populate them with the entire flat args
        CORE_REGISTRIES = {"criterion", "optimizer", "lr_scheduler"}

        from fairseq.registry import REGISTRIES

        for k, v in REGISTRIES.items():
            if hasattr(args, k):
                migrate_registry(
                    k,
                    getattr(args, k),
                    v["dataclass_registry"],
                    args,
                    overrides,
                    deletes,
                    use_name_as_val=k not in CORE_REGISTRIES,
                )
            else:
                deletes.append(k)

        no_dc = True
        if hasattr(args, "arch"):
            from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_MODEL_NAME_REGISTRY

            if args.arch in ARCH_MODEL_REGISTRY:
                m_cls = ARCH_MODEL_REGISTRY[args.arch]
                dc = getattr(m_cls, "__dataclass", None)
                if dc is not None:
                    m_name = ARCH_MODEL_NAME_REGISTRY[args.arch]
                    overrides.append("model={}".format(m_name))
                    overrides.append("model._name={}".format(args.arch))
                    # override model params with those exist in args
                    overrides.extend(_override_attr("model", dc, args))
                    no_dc = False
        if no_dc:
            deletes.append("model")

    return overrides, deletes


class omegaconf_no_object_check:
    """Context manager that makes omegaconf's primitive-type check accept any value.

    On entry the check function in ``omegaconf._utils`` is replaced by one that
    always returns ``True``; on exit the function captured at construction
    time is put back.
    """

    def __init__(self):
        # Changed in https://github.com/omry/omegaconf/pull/911 - both are kept for back compat.
        if hasattr(_utils, "is_primitive_type"):
            self.old_is_primitive = _utils.is_primitive_type
        else:
            self.old_is_primitive = _utils.is_primitive_type_annotation

    def __enter__(self):
        if hasattr(_utils, "is_primitive_type"):
            _utils.is_primitive_type = lambda _: True
        else:
            _utils.is_primitive_type_annotation = lambda _: True

    def __exit__(self, type, value, traceback):
        if hasattr(_utils, "is_primitive_type"):
            _utils.is_primitive_type = self.old_is_primitive
        else:
            _utils.is_primitive_type_annotation = self.old_is_primitive


def convert_namespace_to_omegaconf(args: Namespace) -> DictConfig:
    """Convert a flat argparse.Namespace to a structured DictConfig."""

    # Here we are using field values provided in args to override counterparts inside config object
    overrides, deletes = override_module_args(args)

    # configs will be in fairseq/config after installation
    config_path = os.path.join("..", "config")

    GlobalHydra.instance().clear()

    with initialize(config_path=config_path):
        try:
            composed_cfg = compose("config", overrides=overrides, strict=False)
        except:
            logger.error("Error when composing. Overrides: " + str(overrides))
            raise

        for k in deletes:
            composed_cfg[k] = None

    cfg = OmegaConf.create(
        OmegaConf.to_container(composed_cfg, resolve=True, enum_to_str=True)
    )

    # hack to be able to set Namespace in dict config. this should be removed when we update to newer
    # omegaconf version that supports object flags, or when we migrate all existing models
    with omegaconf_no_object_check():
        if cfg.task is None and getattr(args, "task", None):
            cfg.task = Namespace(**vars(args))
            from fairseq.tasks import TASK_REGISTRY

            _set_legacy_defaults(cfg.task, TASK_REGISTRY[args.task])
            cfg.task._name = args.task
        if cfg.model is None and getattr(args, "arch", None):
            cfg.model = Namespace(**vars(args))
            from fairseq.models import ARCH_MODEL_REGISTRY

            _set_legacy_defaults(cfg.model, ARCH_MODEL_REGISTRY[args.arch])
            cfg.model._name = args.arch
        if cfg.optimizer is None and getattr(args, "optimizer", None):
            cfg.optimizer = Namespace(**vars(args))
            from fairseq.optim import OPTIMIZER_REGISTRY

            _set_legacy_defaults(cfg.optimizer, OPTIMIZER_REGISTRY[args.optimizer])
            cfg.optimizer._name = args.optimizer
        if cfg.lr_scheduler is None and getattr(args, "lr_scheduler", None):
            cfg.lr_scheduler = Namespace(**vars(args))
            from fairseq.optim.lr_scheduler import LR_SCHEDULER_REGISTRY

            _set_legacy_defaults(
                cfg.lr_scheduler, LR_SCHEDULER_REGISTRY[args.lr_scheduler]
            )
            cfg.lr_scheduler._name = args.lr_scheduler
        if cfg.criterion is None and getattr(args, "criterion", None):
            cfg.criterion = Namespace(**vars(args))
            from fairseq.criterions import CRITERION_REGISTRY

            _set_legacy_defaults(cfg.criterion, CRITERION_REGISTRY[args.criterion])
            cfg.criterion._name = args.criterion

    OmegaConf.set_struct(cfg, True)
    return cfg


def overwrite_args_by_name(cfg: DictConfig, overrides: Dict[str, Any]):
    """Apply *overrides* to *cfg* in place, matching keys by name at any depth.

    A dict-valued override for a ``DictConfig`` key is applied to that group
    only; other overrides are pushed down recursively into every nested
    ``DictConfig`` and set as attributes on every ``Namespace`` value.
    """
    # this will be deprecated when we get rid of argparse and model_overrides logic

    from fairseq.registry import REGISTRIES

    with open_dict(cfg):
        for k in cfg.keys():
            # "k in cfg" will return false if its a "mandatory value (e.g. ???)"
            if k in cfg and isinstance(cfg[k], DictConfig):
                if k in overrides and isinstance(overrides[k], dict):
                    for ok, ov in overrides[k].items():
                        if isinstance(ov, dict) and cfg[k][ok] is not None:
                            overwrite_args_by_name(cfg[k][ok], ov)
                        else:
                            cfg[k][ok] = ov
                else:
                    overwrite_args_by_name(cfg[k], overrides)
            elif k in cfg and isinstance(cfg[k], Namespace):
                for override_key, val in overrides.items():
                    setattr(cfg[k], override_key, val)
            elif k in overrides:
                if (
                    k in REGISTRIES
                    and overrides[k] in REGISTRIES[k]["dataclass_registry"]
                ):
                    cfg[k] = DictConfig(
                        REGISTRIES[k]["dataclass_registry"][overrides[k]]
                    )
                    overwrite_args_by_name(cfg[k], overrides)
                    cfg[k]._name = overrides[k]
                else:
                    cfg[k] = overrides[k]


def merge_with_parent(dc: FairseqDataclass, cfg: DictConfig, remove_missing=False):
    """Merge *cfg* onto the dataclass *dc* and return the result in struct mode.

    The merged config takes over *cfg*'s ``_parent``. With *remove_missing*,
    keys of *cfg* that *dc* does not define are first deleted from *cfg*
    (recursively for nested configs).
    """
    if remove_missing:

        def remove_missing_rec(src_keys, target_cfg):
            if is_dataclass(target_cfg):
                target_keys = set(target_cfg.__dataclass_fields__.keys())
            else:
                target_keys = set(target_cfg.keys())

            # iterate over a copy of the keys: entries are deleted in the loop
            for k in list(src_keys.keys()):
                if k not in target_keys:
                    del src_keys[k]
                elif OmegaConf.is_config(src_keys[k]):
                    tgt = getattr(target_cfg, k)
                    if tgt is not None and (is_dataclass(tgt) or hasattr(tgt, "keys")):
                        remove_missing_rec(src_keys[k], tgt)

        with open_dict(cfg):
            remove_missing_rec(cfg, dc)

    merged_cfg = OmegaConf.merge(dc, cfg)
    merged_cfg.__dict__["_parent"] = cfg.__dict__["_parent"]
    OmegaConf.set_struct(merged_cfg, True)
    return merged_cfg


# ================================================
# FILE: fairseq/distributed/__init__.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
class DistributedTimeoutWrapper(nn.Module):
    """
    A wrapper that kills the process if no progress is made within a given
    *timeout*. The timer is reset every time :func:`forward` is called.

    Usage::

        module = DistributedTimeoutWrapper(module, timeout=30)
        x = module(input)
        time.sleep(20)  # safe
        x = module(input)
        time.sleep(45)  # job will be killed before this returns

    Args:
        module (nn.Module): module to wrap
        timeout (int): number of seconds before killing the process
            (set to a value <= 0 to disable the timeout)
        signal (Optional): signal to send once timeout is triggered
    """

    def __init__(self, module: nn.Module, timeout: int, signal=signal.SIGINT):
        super().__init__()
        self.module = module
        self.timeout = timeout
        self.signal = signal
        # Set before the watcher thread starts (and also when the timeout is
        # disabled) so the flag always exists when it is read.
        self._terminated = False

        if timeout > 0:
            self._heartbeat = threading.Event()
            self._heartbeat_thread = threading.Thread(
                target=self._check_heartbeat,
                args=(os.getpid(),),
                daemon=True,
            )
            self._heartbeat_thread.start()
        else:
            self._heartbeat = None
            self._heartbeat_thread = None

    def __del__(self):
        self.stop_timeout()

    def __getattr__(self, name):
        """Forward missing attributes to wrapped module."""
        try:
            return super().__getattr__(name)  # defer to nn.Module's logic
        except AttributeError:
            return getattr(self.module, name)

    def stop_timeout(self):
        """Stop the watcher thread (if any) and wait for it to exit.

        Safe to call more than once, and safe to call before the first
        :func:`forward`.
        """
        if self._heartbeat_thread is not None:
            self._terminated = True
            # Wake the watcher: it may be blocked waiting for the first
            # forward pass (no timeout) or for the next heartbeat. Without
            # this, join() could block for *timeout* seconds or forever.
            self._heartbeat.set()
            self._heartbeat_thread.join()

    def state_dict(self, *args, **kwargs):
        """Return the wrapped module's state dict."""
        return self.module.state_dict(*args, **kwargs)

    def load_state_dict(self, *args, **kwargs):
        """Load a state dict into the wrapped module."""
        return self.module.load_state_dict(*args, **kwargs)

    def forward(self, *args, **kwargs):
        """Signal a heartbeat, then run the wrapped module."""
        if self._heartbeat is not None:
            self._heartbeat.set()
        return self.module(*args, **kwargs)

    def _check_heartbeat(self, parent_pid):
        """Watcher loop: send ``self.signal`` to *parent_pid* on a missed heartbeat."""
        self._heartbeat.wait()  # wait for the first forward pass
        while True:
            self._heartbeat.clear()
            # Re-check after clear(): if stop_timeout() set the event just
            # before the clear above, its wake-up was erased, but the flag
            # it set beforehand is visible here.
            if self._terminated:
                break
            success = self._heartbeat.wait(timeout=self.timeout)
            if self._terminated:
                break
            elif not success:
                logger.error(
                    (
                        "Killing job for not making progress in {} seconds. "
                        "Set --heartbeat-timeout=-1 to disable this timeout."
                    ).format(int(self.timeout))
                )
                os.kill(parent_pid, self.signal)
                return
class DummyProcessGroup:
    """Minimal stand-in for a process group with a fixed rank and size.

    Exposes only the ``rank()`` and ``size()`` accessors.

    Args:
        rank (int): value returned by :func:`rank`
        size (int): value returned by :func:`size`
    """

    def __init__(self, rank: int, size: int):
        self._rank, self._size = rank, size

    def rank(self) -> int:
        """Return the rank given at construction."""
        return self._rank

    def size(self) -> int:
        """Return the group size given at construction."""
        return self._size
def fsdp_wrap(module, min_num_params: Optional[int] = None, **kwargs):
    """
    Helper to wrap layers/modules in FSDP. This falls back to a no-op if
    fairscale is not available.

    Args:
        module (nn.Module): module to (maybe) wrap
        min_num_params (int, Optional): minimum number of layer params to wrap
        **kwargs: forwarded unchanged to ``fairscale.nn.wrap``

    Returns:
        the result of ``fairscale.nn.wrap(module, **kwargs)``, or *module*
        itself when fairscale cannot be imported or the module has fewer
        than *min_num_params* parameters
    """
    # Guard only the import: an ImportError raised later (inside wrap() or
    # while iterating parameters) is a real failure and must not be mistaken
    # for "fairscale is not installed".
    try:
        from fairscale.nn import wrap
    except ImportError:
        return module

    if min_num_params is not None:
        num_params = sum(p.numel() for p in module.parameters())
        if num_params < min_num_params:
            return module
    return wrap(module, **kwargs)
class LegacyDistributedDataParallel(nn.Module):
    """Implements distributed data parallelism at the module level.

    A simplified version of :class:`torch.nn.parallel.DistributedDataParallel`.
    This version uses a c10d process group for communication and does not
    broadcast buffers.

    Gradients are not reduced automatically: call :func:`all_reduce_grads`
    after the backward pass.

    Args:
        module (~torch.nn.Module): module to be parallelized
        process_group: the c10d process group to be used for distributed data
            parallel all-reduction.
        buffer_size (int, optional): number of elements to buffer before
            performing all-reduce (default: 256M).
    """

    def __init__(self, module, process_group, buffer_size=2**28):
        super().__init__()

        self.module = module
        self.process_group = process_group
        self.world_size = utils.get_world_size(self.process_group)

        # Never use a bigger buffer than the number of model params
        self.buffer_size = min(buffer_size, sum(p.numel() for p in module.parameters()))
        self.buffer = None  # allocated lazily on the first all-reduce

        # We can also forcibly accumulate grads locally and only do the
        # all-reduce at some later time
        self.accumulate_grads = False

        # make per-device lists of parameters
        paramlists = OrderedDict()
        for param in self.module.parameters():
            paramlists.setdefault(param.device, []).append(param)
        self.per_device_params = list(paramlists.values())

    @contextmanager
    def no_sync(self):
        """A context manager to disable gradient synchronization.

        The previous ``accumulate_grads`` value is restored on exit, including
        when the body of the ``with`` block raises.
        """
        old_accumulate_grads = self.accumulate_grads
        self.accumulate_grads = True
        try:
            yield
        finally:
            # Without the finally, an exception inside the block would leave
            # gradient synchronization disabled for all later steps.
            self.accumulate_grads = old_accumulate_grads

    def forward(self, *inputs, **kwargs):
        """Run the wrapped module."""
        return self.module(*inputs, **kwargs)

    def all_reduce_grads(self):
        """
        This function must be called explicitly after backward to reduce
        gradients. There is no automatic hook like c10d.

        Does nothing while ``accumulate_grads`` is set (see :func:`no_sync`).

        Raises:
            RuntimeError: if a parameter's gradient itself requires grad
        """

        def all_reduce_params(params):
            # Average the grads of *params* across workers and write the
            # result back into each ``p.grad``.
            buffer = self.buffer
            nonzero_buffer = False
            if len(params) > 1:
                # pack all grads contiguously into the shared flat buffer
                offset = 0
                for p in params:
                    sz = p.numel()
                    if p.grad is not None:
                        buffer[offset : offset + sz].copy_(p.grad.data.view(-1))
                        nonzero_buffer = True
                    else:
                        buffer[offset : offset + sz].zero_()
                    offset += sz
            else:
                # we only have a single grad to all-reduce
                p = params[0]
                if p.grad is not None:
                    # reduce in place on the grad tensor, no copy needed
                    buffer = p.grad.data
                    nonzero_buffer = True
                elif p.numel() <= self.buffer.numel():
                    buffer = buffer[: p.numel()]
                    buffer.zero_()
                else:
                    buffer = torch.zeros_like(p)

            # pre-divide so that the all-reduce yields the mean
            if nonzero_buffer:
                buffer.div_(self.world_size)

            utils.all_reduce(buffer, self.process_group)

            # copy all-reduced grads back into their original place
            offset = 0
            for p in params:
                sz = p.numel()
                if p.grad is not None:
                    p.grad.data.copy_(buffer[offset : offset + sz].view_as(p))
                else:
                    p.grad = buffer[offset : offset + sz].view_as(p).clone()
                offset += sz

        def reduction_fn():
            # This function only needs to be called once
            if self.accumulate_grads:
                return

            if self.buffer is None:
                self.buffer = next(self.module.parameters()).new(self.buffer_size)

            for params in self.per_device_params:
                # All-reduce the gradients in buckets
                offset = 0
                buffered_params = []
                for param in params:
                    if not param.requires_grad:
                        continue
                    if param.grad is None:
                        param.grad = torch.zeros_like(param)

                    if hasattr(param, "expert"):
                        # Skip gradient sync for unshared parameters
                        continue

                    if param.grad.requires_grad:
                        raise RuntimeError(
                            "DistributedDataParallel only works "
                            "with gradients that don't require "
                            "grad"
                        )
                    sz = param.numel()
                    if sz > self.buffer.numel():
                        # all-reduce big params directly
                        all_reduce_params([param])
                    else:
                        if offset + sz > self.buffer.numel():
                            # bucket is full: flush it before adding more
                            all_reduce_params(buffered_params)
                            offset = 0
                            buffered_params.clear()
                        buffered_params.append(param)
                        offset += sz

                # flush the last, partially filled bucket
                if len(buffered_params) > 0:
                    all_reduce_params(buffered_params)

        reduction_fn()
class TPUDistributedDataParallel(nn.Module):
    """Data-parallel wrapper that averages gradients with ``torch_xla``.

    Gradients are only reduced when :func:`all_reduce_grads` is called.

    Args:
        module (nn.Module): module to wrap
        process_group: process group; its element at index 1 is passed as
            ``groups`` to ``xm.all_reduce``
    """

    def __init__(self, module, process_group):
        super().__init__()
        self.module = module
        self.process_group = process_group
        self.world_size = utils.get_world_size(self.process_group)

    def forward(self, *inputs, **kwargs):
        """Run the wrapped module."""
        wrapped = self.module
        return wrapped(*inputs, **kwargs)

    def all_reduce_grads(self):
        """Sum the gradients across the group, scaled by ``1 / world_size``.

        Trainable parameters without a gradient get a zero gradient first.

        Raises:
            RuntimeError: if a gradient itself requires grad
        """
        trainable = (param for param in self.parameters() if param.requires_grad)
        grads = []
        for param in trainable:
            if param.grad is None:
                param.grad = torch.zeros_like(param)
            grad = param.grad
            if grad.requires_grad:
                raise RuntimeError(
                    "TPUDistributedDataParallel only works with gradients that don't "
                    "require grad"
                )
            grads.append(grad)

        # torch_xla is imported only when gradients are actually reduced
        import torch_xla.core.xla_model as xm

        averaging_scale = 1.0 / self.world_size
        xm.all_reduce(
            "sum",
            grads,
            scale=averaging_scale,
            groups=self.process_group[1],
        )
def _infer_slurm_init(cfg: DistributedTrainingConfig, num_pipelines_per_node):
    """Fill in the distributed settings of *cfg* from the Slurm environment.

    The node list is read from ``SLURM_STEP_NODELIST`` (falling back to
    ``SLURM_JOB_NODELIST``) and resolved with ``scontrol show hostnames``;
    the first host becomes the TCP rendezvous address.

    Args:
        cfg (DistributedTrainingConfig): config to update in place
        num_pipelines_per_node: number of pipelines per node; only used when
            ``cfg.pipeline_model_parallel`` is set

    Returns:
        bool: True if *cfg* was configured from Slurm; False if no node list
        is present in the environment or ``scontrol`` cannot be found.

    Raises:
        subprocess.CalledProcessError: if ``scontrol`` runs but fails
    """
    node_list = os.environ.get("SLURM_STEP_NODELIST")
    if node_list is None:
        node_list = os.environ.get("SLURM_JOB_NODELIST")
    if node_list is None:
        return False

    # Only the scontrol call is guarded: a missing executable means Slurm is
    # not installed. A CalledProcessError (scontrol failed) propagates.
    try:
        hostnames = subprocess.check_output(
            ["scontrol", "show", "hostnames", node_list]
        )
    except FileNotFoundError:  # Slurm is not installed
        return False

    cfg.distributed_init_method = "tcp://{host}:{port}".format(
        host=hostnames.split()[0].decode("utf-8"),
        port=cfg.distributed_port,
    )
    nnodes = int(os.environ.get("SLURM_NNODES"))
    ntasks_per_node = os.environ.get("SLURM_NTASKS_PER_NODE")
    if ntasks_per_node is not None:
        ntasks_per_node = int(ntasks_per_node)
    else:
        ntasks = int(os.environ.get("SLURM_NTASKS"))
        assert ntasks % nnodes == 0
        ntasks_per_node = ntasks // nnodes

    if ntasks_per_node == 1:
        # one task per node: the rank is the node's first GPU index and the
        # world size counts every GPU
        gpus_per_node = torch.cuda.device_count()
        node_id = int(os.environ.get("SLURM_NODEID"))
        cfg.distributed_rank = node_id * gpus_per_node
        cfg.distributed_world_size = nnodes * gpus_per_node
    elif cfg.pipeline_model_parallel:
        assert ntasks_per_node == num_pipelines_per_node, (
            "SLURM --ntasks-per-node must match number of pipelines per "
            "node (={})".format(num_pipelines_per_node)
        )
        cfg.distributed_no_spawn = True
        # For 4-way MP on nodes with 8 GPUs, ranks will be [0, 1] on
        # the first node, [2, 3] on the second node, etc. This
        # matches torch.distributed.launch.
        node_id = int(os.environ.get("SLURM_NODEID"))
        local_id = int(os.environ.get("SLURM_LOCALID"))
        cfg.distributed_rank = node_id * num_pipelines_per_node + local_id
        # In the above example, device_id will always be in [0, 1],
        # which also matches torch.distributed.launch.
        cfg.device_id = local_id
        # We also want to set distributed_world_size to be the total
        # number of pipelines across all nodes.
        cfg.distributed_world_size = nnodes * num_pipelines_per_node
    else:
        # one task per GPU: Slurm already assigned the global and local ids
        assert (
            ntasks_per_node == cfg.distributed_world_size // nnodes
        ), f"{ntasks_per_node}, {cfg.distributed_world_size}, {nnodes}"
        cfg.distributed_no_spawn = True
        cfg.distributed_rank = int(os.environ.get("SLURM_PROCID"))
        cfg.device_id = int(os.environ.get("SLURM_LOCALID"))

    logger.info(f"Rank {cfg.distributed_rank}, device_id: {cfg.device_id}")
    return True
def _pipeline_parallel_post_init(
    cfg: DistributedTrainingConfig, num_pipeline_devices, num_pipelines_per_node
):
    """Convert GPU-based distributed settings in *cfg* to pipeline-based ones.

    Args:
        cfg (DistributedTrainingConfig): config to update in place
        num_pipeline_devices: number of devices used by one pipeline
        num_pipelines_per_node: number of pipelines (processes) per node
    """
    spawns_workers = not cfg.distributed_no_spawn
    if spawns_workers:
        # When workers are spawned here, distributed_rank and
        # distributed_world_size are expected to count GPUs; rescale both so
        # that they count pipelines instead.
        assert cfg.distributed_world_size % num_pipeline_devices == 0
        cfg.distributed_world_size = cfg.distributed_world_size // num_pipeline_devices

        # The rank must be the index of the first GPU of a pipeline, e.g. for
        # 4-way MP on 8-GPU nodes it becomes 0, 2, ... after rescaling.
        gpus_per_node = torch.cuda.device_count()
        assert cfg.distributed_rank % gpus_per_node == 0
        assert cfg.distributed_rank % num_pipeline_devices == 0

        with open_dict(cfg):
            cfg.distributed_rank = cfg.distributed_rank // num_pipeline_devices
            # one process is launched per pipeline
            cfg.distributed_num_procs = num_pipelines_per_node

    # With 4-way MP on an 8-GPU node the device ids become 0 and 4, i.e. the
    # first device of each pipeline.
    cfg.device_id = cfg.device_id * num_pipeline_devices

    if cfg.device_id <= 0:
        return

    # Several pipelines share this node (e.g. 4-way MP on 8 GPUs), so shift
    # pipeline_devices by this pipeline's first device id.
    logger.debug(
        "setting CUDA device={} on rank {}".format(
            cfg.device_id, cfg.distributed_rank
        )
    )
    torch.cuda.set_device(cfg.device_id)
    with open_dict(cfg):
        shifted_devices = []
        for dev in cfg.pipeline_devices:
            shifted_devices.append(cfg.device_id + dev)
        cfg.pipeline_devices = shifted_devices
    logger.info(
        "setting pipeline_devices={} on rank {}".format(
            cfg.pipeline_devices, cfg.distributed_rank
        )
    )
def distributed_main(i, main, cfg: FairseqConfig, kwargs):
    """Per-process entry point: set up distributed state, then run *main*.

    Args:
        i: device id for this process (stored in
            ``cfg.distributed_training.device_id``)
        main: callable invoked as ``main(cfg, **kwargs)``
        cfg (FairseqConfig): config, updated in place
        kwargs (dict): keyword arguments for *main*; the optional entries
            ``start_rank`` and ``after_distributed_init_fn`` are popped
            before *main* is called
    """
    cfg.distributed_training.device_id = i

    wants_cuda = torch.cuda.is_available() and not (
        cfg.common.cpu or cfg.common.tpu
    )
    if wants_cuda:
        torch.cuda.set_device(cfg.distributed_training.device_id)

    # No rank assigned yet (torch.multiprocessing.spawn): derive it from the
    # starting rank plus this process's index.
    if cfg.distributed_training.distributed_rank is None:
        first_rank = kwargs.pop("start_rank", 0)
        cfg.distributed_training.distributed_rank = first_rank + i

    assigned_rank = distributed_init(cfg)
    cfg.distributed_training.distributed_rank = assigned_rank

    # optional hook that may return a replacement config
    post_init_hook = kwargs.pop("after_distributed_init_fn", None)
    if post_init_hook:
        cfg = post_init_hook(cfg)

    main(cfg, **kwargs)

    if not torch.distributed.is_initialized():
        return
    # synchronize all workers before returning
    torch.distributed.barrier(get_global_group())
def _find_my_group_index(grouped_ranks):
    """Return the index of the first group that contains this worker's rank.

    Args:
        grouped_ranks (List[List[int]]): groups of global ranks

    Returns:
        int: index into *grouped_ranks*

    Raises:
        RuntimeError: if the global rank appears in none of the groups
    """
    my_rank = get_global_rank()
    for i, group in enumerate(grouped_ranks):
        if my_rank in group:
            return i
    # Same exception type as before, now with enough context to debug a bad
    # group layout.
    raise RuntimeError(
        "global rank {} does not belong to any of the {} given groups".format(
            my_rank, len(grouped_ranks)
        )
    )
def all_gather(tensor, group, return_tensor=False):
    """Gather *tensor* from every worker in *group*.

    Args:
        tensor (torch.Tensor): local tensor to contribute
        group: group of the collective
        return_tensor (bool, optional): if True, return one tensor with the
            per-worker results stacked along a new first dimension;
            otherwise return a list with one tensor per worker

    Returns:
        a stacked tensor or a list of tensors, depending on *return_tensor*
    """
    if not use_xla():
        n_workers = get_world_size(group=group)
        my_rank = get_rank(group=group)
        # this worker's slot reuses the input tensor, the others are
        # uninitialized placeholders to be filled by the collective
        gathered = []
        for idx in range(n_workers):
            slot = tensor if idx == my_rank else torch.empty_like(tensor)
            gathered.append(slot)
        dist.all_gather(gathered, tensor, group=group)
        return torch.stack(gathered, dim=0) if return_tensor else gathered

    stacked = xm.all_gather(tensor, groups=group[1])
    n_workers = get_world_size(group=group)
    # give the gathered result a leading per-worker dimension
    stacked = stacked.view(n_workers, *tensor.size())
    if return_tensor:
        return stacked
    return [stacked[idx] for idx in range(n_workers)]
buffer = buffer.cpu() try: result = [] for i in range(world_size): out_buffer = buffer[i * max_size : (i + 1) * max_size] (enc_size,) = struct.unpack(">I", bytes(out_buffer[:header_size].tolist())) if enc_size > 0: result.append( pickle.loads( bytes(out_buffer[header_size : header_size + enc_size].tolist()) ) ) return result except pickle.UnpicklingError: raise Exception( "Unable to unpickle data from other workers. all_gather_list requires all " "workers to enter the function together, so this error usually indicates " "that the workers have fallen out of sync somehow. Workers can fall out of " "sync if one of them runs out of memory, or if there are other conditions " "in your training script that can cause one worker to finish an epoch " "while other workers are still iterating over their portions of the data. " "Try rerunning with --ddp-backend=legacy_ddp and see if that helps." ) def all_reduce_dict(data: Mapping[str, Any], device, group) -> Dict[str, Any]: """ AllReduce a dictionary of values across workers. We separately reduce items that are already on the device and items on CPU for better performance. Args: data (Mapping[str, Any]): dictionary of data to all-reduce, but cannot be a nested dictionary device (torch.device): device for the reduction group: group of the collective """ data_keys = list(data.keys()) # We want to separately reduce items that are already on the # device and items on CPU for performance reasons. 
cpu_data = OrderedDict() device_data = OrderedDict() for k in data_keys: t = data[k] if not torch.is_tensor(t): cpu_data[k] = torch.tensor(t, dtype=torch.double) elif t.device.type != device.type: cpu_data[k] = t.to(dtype=torch.double) else: device_data[k] = t.to(dtype=torch.double) def _all_reduce_dict(data: OrderedDict): if len(data) == 0: return data buf = torch.cat([t.view(-1) for t in data.values()]).to(device=device) all_reduce(buf, group=group) split_buf = torch.split(buf.clone(), [t.numel() for t in data.values()]) reduced_data = [t.view_as(orig) for t, orig in zip(split_buf, data.values())] return OrderedDict(zip(data.keys(), reduced_data)) cpu_data = _all_reduce_dict(cpu_data) device_data = _all_reduce_dict(device_data) def get_from_stack(key): if key in cpu_data: return cpu_data[key] elif key in device_data: return device_data[key] raise KeyError return OrderedDict([(key, get_from_stack(key)) for key in data_keys]) def broadcast_tensors( tensors: Optional[List[torch.Tensor]], src_rank: int, group: object, dist_device: Optional[torch.device] = None, ) -> List[torch.Tensor]: """ Broadcasts a list of tensors without other (non-src) ranks needing to know the dtypes/shapes of the tensors. 
""" if dist_device is None: if torch.distributed.get_backend(group) == "nccl": dist_device = torch.device("cuda") else: dist_device = torch.device("cpu") # share metadata first to simplify transfer is_src_rank = get_rank(group) == src_rank if is_src_rank: metadata = [ {"size": t.size(), "dtype": t.dtype, "device": t.device} for t in tensors ] metadata = _broadcast_object_slow(metadata, src_rank, group, dist_device) else: metadata = _broadcast_object_slow(None, src_rank, group, dist_device) out_tensors = [] for i, meta in enumerate(metadata): if is_src_rank: tensor = tensors[i] broadcast(tensors[i].to(dist_device), src=src_rank, group=group) else: tensor = torch.zeros( [meta["size"].numel()], dtype=meta["dtype"], device=dist_device ) broadcast(tensor, src=src_rank, group=group) tensor = tensor.view(meta["size"]).to(meta["device"]) out_tensors.append(tensor) return out_tensors def broadcast_object( obj: Any, src_rank: int, group: object, dist_device: Optional[torch.device] = None, ) -> Any: """Broadcast an arbitrary Python object to other workers.""" if dist_device is None: if torch.distributed.get_backend(group) == "nccl": dist_device = torch.device("cuda") else: dist_device = torch.device("cpu") if get_rank(group) == src_rank: # split the tensors from the non-tensors so we can broadcast them # directly, avoiding unnecessary serialization/deserialization tensors = [] obj = _split_tensors_from_obj(obj, tensors) obj = _broadcast_object_slow(obj, src_rank, group, dist_device) tensors = broadcast_tensors(tensors, src_rank, group, dist_device) else: obj = _broadcast_object_slow(None, src_rank, group, dist_device) tensors = broadcast_tensors(None, src_rank, group, dist_device) return _put_tensors_in_obj(obj, tensors) def _broadcast_object_slow( obj: Any, src_rank: int, group: object, dist_device: torch.device, ) -> Any: if get_rank(group) == src_rank: # Emit data buffer = io.BytesIO() torch.save(obj, buffer) buffer = torch.ByteTensor(buffer.getbuffer()).to(dist_device) 
length = torch.LongTensor([len(buffer)]).to(dist_device) broadcast(length, src=src_rank, group=group) broadcast(buffer, src=src_rank, group=group) else: # Fetch from the source length = torch.LongTensor([0]).to(dist_device) broadcast(length, src=src_rank, group=group) buffer = torch.ByteTensor(int(length.item())).to(dist_device) broadcast(buffer, src=src_rank, group=group) buffer = io.BytesIO(buffer.cpu().numpy()) obj = torch.load(buffer, map_location="cpu") return obj @dataclass(frozen=True) class _TensorPlaceholder: index: int def _split_tensors_from_obj(obj: Any, tensors: List[torch.Tensor]) -> Any: if torch.is_tensor(obj): placeholder = _TensorPlaceholder(index=len(tensors)) tensors.append(obj) return placeholder elif isinstance(obj, dict): return {k: _split_tensors_from_obj(v, tensors) for k, v in obj.items()} elif isinstance(obj, list): return [_split_tensors_from_obj(v, tensors) for v in obj] elif isinstance(obj, tuple): return tuple(_split_tensors_from_obj(v, tensors) for v in obj) elif isinstance(obj, set): return {_split_tensors_from_obj(v, tensors) for v in obj} else: return obj def _put_tensors_in_obj(obj: Any, tensors: List[torch.Tensor]) -> Any: if isinstance(obj, _TensorPlaceholder): return tensors[obj.index] elif isinstance(obj, dict): return {k: _put_tensors_in_obj(v, tensors) for k, v in obj.items()} elif isinstance(obj, list): return [_put_tensors_in_obj(v, tensors) for v in obj] elif isinstance(obj, tuple): return tuple(_put_tensors_in_obj(v, tensors) for v in obj) elif isinstance(obj, set): return {_put_tensors_in_obj(v, tensors) for v in obj} else: return obj ================================================ FILE: fairseq/file_chunker_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import os
import typing as tp


def _safe_readline(fd) -> str:
    """Read one line from *fd*, stepping back on a decoding error.

    If ``readline()`` raises ``UnicodeDecodeError`` the position recorded
    before the read is moved back by one and the read is retried, until a
    line decodes successfully.
    """
    pos = fd.tell()
    while True:
        try:
            return fd.readline()
        except UnicodeDecodeError:
            pos -= 1
            fd.seek(pos)  # search where this character begins


def find_offsets(filename: str, num_chunks: int) -> tp.List[int]:
    """
    Given a file and a number of chunks, find the offsets in the file
    to be able to chunk around full lines.

    Returns a list of ``num_chunks + 1`` offsets: the first is 0, the last
    is the file size, and each inner offset is the position right after the
    line that contains the corresponding ``size // num_chunks`` boundary.
    """
    with open(filename, "r", encoding="utf-8") as f:
        size = os.fstat(f.fileno()).st_size
        chunk_size = size // num_chunks
        offsets = [0 for _ in range(num_chunks + 1)]
        for i in range(1, num_chunks):
            f.seek(chunk_size * i)
            # finish the line we landed in so the offset sits on a line start
            _safe_readline(f)
            offsets[i] = f.tell()
        offsets[-1] = size
        return offsets


class ChunkLineIterator:
    """
    Iterator to properly iterate over lines of a file chunk.
    """

    def __init__(self, fd, start_offset: int, end_offset: int):
        self._fd = fd
        self._start_offset = start_offset
        self._end_offset = end_offset

    def __iter__(self) -> tp.Iterable[str]:
        """Yield lines from ``start_offset`` until the read passes ``end_offset``."""
        self._fd.seek(self._start_offset)
        # next(f) breaks f.tell(), hence readline() must be used
        line = _safe_readline(self._fd)
        while line:
            pos = self._fd.tell()
            # f.tell() does not always give the byte position in the file
            # sometimes it skips to a very large number
            # it is unlikely that through a normal read we go from
            # end bytes to end + 2**32 bytes (4 GB) and this makes it unlikely
            # that the procedure breaks by the nondeterministic behavior of
            # f.tell()
            if (
                self._end_offset > 0
                and pos > self._end_offset
                and pos < self._end_offset + 2**32
            ):
                break
            yield line
            line = self._fd.readline()


class Chunker:
    """
    Context manager to read a chunk of a file line by line.
    """

    def __init__(self, path: str, start_offset: int, end_offset: int):
        self.path = path
        self.start_offset = start_offset
        self.end_offset = end_offset

    def __enter__(self) -> ChunkLineIterator:
        self.fd = open(self.path, "r", encoding="utf-8")
        return ChunkLineIterator(self.fd, self.start_offset, self.end_offset)

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.fd.close()


# ================================================
# FILE: fairseq/file_io.py
# ================================================
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os
import shutil
from typing import List, Optional


logger = logging.getLogger(__file__)


try:
    from iopath.common.file_io import g_pathmgr as IOPathManager

    try:
        # [FB only - for now] AWS PathHandler for PathManager
        from .fb_pathhandlers import S3PathHandler

        IOPathManager.register_handler(S3PathHandler())
    except KeyError:
        logging.warning("S3PathHandler already registered.")
    except ImportError:
        logging.debug(
            "S3PathHandler couldn't be imported. Either missing fb-only files, or boto3 module."
        )

except ImportError:
    IOPathManager = None


class PathManager:
    """
    Wrapper for insulating OSS I/O (using Python builtin operations) from
    iopath's PathManager abstraction (for transparently handling various
    internal backends).

    Every method delegates to ``IOPathManager`` when iopath was imported and
    falls back to the equivalent ``os`` / ``shutil`` / builtin call otherwise.
    """

    @staticmethod
    def open(
        path: str,
        mode: str = "r",
        buffering: int = -1,
        encoding: Optional[str] = None,
        errors: Optional[str] = None,
        newline: Optional[str] = None,
    ):
        """Open *path*; arguments mirror the builtin ``open``."""
        if IOPathManager:
            return IOPathManager.open(
                path=path,
                mode=mode,
                buffering=buffering,
                encoding=encoding,
                errors=errors,
                newline=newline,
            )
        return open(
            path,
            mode=mode,
            buffering=buffering,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )

    @staticmethod
    def copy(src_path: str, dst_path: str, overwrite: bool = False) -> bool:
        """Copy *src_path* to *dst_path* and report success.

        The builtin fallback ignores ``overwrite`` (``shutil.copyfile`` always
        replaces the destination) and returns ``True`` once the copy is done,
        so the declared ``bool`` return type holds on both code paths.
        """
        if IOPathManager:
            return IOPathManager.copy(
                src_path=src_path, dst_path=dst_path, overwrite=overwrite
            )
        shutil.copyfile(src_path, dst_path)
        return True

    @staticmethod
    def get_local_path(path: str, **kwargs) -> str:
        """Return a local path for *path* (the path itself without iopath)."""
        if IOPathManager:
            return IOPathManager.get_local_path(path, **kwargs)
        return path

    @staticmethod
    def exists(path: str) -> bool:
        if IOPathManager:
            return IOPathManager.exists(path)
        return os.path.exists(path)

    @staticmethod
    def isfile(path: str) -> bool:
        if IOPathManager:
            return IOPathManager.isfile(path)
        return os.path.isfile(path)

    @staticmethod
    def ls(path: str) -> List[str]:
        if IOPathManager:
            return IOPathManager.ls(path)
        return os.listdir(path)

    @staticmethod
    def mkdirs(path: str) -> None:
        if IOPathManager:
            return IOPathManager.mkdirs(path)
        os.makedirs(path, exist_ok=True)

    @staticmethod
    def rm(path: str) -> None:
        if IOPathManager:
            return IOPathManager.rm(path)
        os.remove(path)

    @staticmethod
    def chmod(path: str, mode: int) -> None:
        """Change permissions, but only for paths that plain ``os`` can handle."""
        if not PathManager.path_requires_pathmanager(path):
            os.chmod(path, mode)

    @staticmethod
    def register_handler(handler) -> None:
        if IOPathManager:
            return IOPathManager.register_handler(handler=handler)

    @staticmethod
    def copy_from_local(
        local_path: str, dst_path: str, overwrite: bool = False, **kwargs
    ) -> None:
        if IOPathManager:
            return IOPathManager.copy_from_local(
                local_path=local_path, dst_path=dst_path, overwrite=overwrite, **kwargs
            )
        return shutil.copyfile(local_path, dst_path)

    @staticmethod
    def path_requires_pathmanager(path: str) -> bool:
        """Do we require PathManager to access given path?"""
        if IOPathManager:
            for p in IOPathManager._path_handlers.keys():
                if path.startswith(p):
                    return True
        return False

    @staticmethod
    def supports_rename(path: str) -> bool:
        # PathManager doesn't yet support renames
        return not PathManager.path_requires_pathmanager(path)

    @staticmethod
    def rename(src: str, dst: str):
        os.rename(src, dst)

    """
    ioPath async PathManager methods:
    """

    @staticmethod
    def opena(
        path: str,
        mode: str = "r",
        buffering: int = -1,
        encoding: Optional[str] = None,
        errors: Optional[str] = None,
        newline: Optional[str] = None,
    ):
        """
        Return file descriptor with asynchronous write operations.

        Raises whatever error prevented the ioPath PathManager from being
        created (for example ``ImportError`` when iopath is missing).
        """
        global IOPathManager
        if not IOPathManager:
            logging.info("ioPath is initializing PathManager.")
            try:
                from iopath.common.file_io import PathManager

                IOPathManager = PathManager()
            except Exception:
                logging.exception("Failed to initialize ioPath PathManager object.")
                # Without a manager the call below could only fail with an
                # unrelated AttributeError on None; surface the real cause.
                raise
        return IOPathManager.opena(
            path=path,
            mode=mode,
            buffering=buffering,
            encoding=encoding,
            errors=errors,
            newline=newline,
        )

    @staticmethod
    def async_close() -> bool:
        """
        Wait for files to be written and clean up asynchronous PathManager.
        NOTE: `PathManager.async_close()` must be called at the end of any
        script that uses `PathManager.opena(...)`.
        """
        global IOPathManager
        if IOPathManager:
            return IOPathManager.async_close()
        return False


# ================================================
# FILE: fairseq/file_utils.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Utilities for working with the local dataset cache.
This file is adapted from `AllenNLP <https://github.com/allenai/allennlp>`_.
and `huggingface <https://github.com/huggingface>`_.
"""

import fnmatch
import json
import logging
import os
import shutil
import tarfile
import tempfile
from functools import partial, wraps
from hashlib import sha256
from io import open


try:
    from torch.hub import _get_torch_home

    torch_cache_home = _get_torch_home()
except ImportError:
    torch_cache_home = os.path.expanduser(
        os.getenv(
            "TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch")
        )
    )
default_cache_path = os.path.join(torch_cache_home, "pytorch_fairseq")

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse

try:
    from pathlib import Path

    PYTORCH_FAIRSEQ_CACHE = Path(os.getenv("PYTORCH_FAIRSEQ_CACHE", default_cache_path))
except (AttributeError, ImportError):
    PYTORCH_FAIRSEQ_CACHE = os.getenv("PYTORCH_FAIRSEQ_CACHE", default_cache_path)

CONFIG_NAME = "config.json"
WEIGHTS_NAME = "pytorch_model.bin"

logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def load_archive_file(archive_file):
    """Resolve *archive_file* and unpack it in place if it is not a directory.

    The path or URL is resolved through :func:`cached_path`. When the
    resolved path is not a directory it is opened as a tar archive (the
    compression mode is taken from the file extension), extracted to a
    temporary directory, and the archive file is replaced by the extracted
    top-level directory.

    Returns the resolved path, or ``None`` when :func:`cached_path` raises
    ``EnvironmentError``.
    """
    # redirect to the cache, if necessary
    try:
        resolved_archive_file = cached_path(archive_file, cache_dir=None)
    except EnvironmentError:
        logger.info(
            "Archive name '{}' was not found in archive name list. "
            "We assumed '{}' was a path or URL but couldn't find any file "
            "associated to this path or URL.".format(
                archive_file,
                archive_file,
            )
        )
        return None

    if resolved_archive_file == archive_file:
        logger.info("loading archive file {}".format(archive_file))
    else:
        logger.info(
            "loading archive file {} from cache at {}".format(
                archive_file, resolved_archive_file
            )
        )

    # Extract archive to temp dir and replace .tar.bz2 if necessary
    if not os.path.isdir(resolved_archive_file):
        tempdir = tempfile.mkdtemp()
        try:
            logger.info(
                "extracting archive file {} to temp dir {}".format(
                    resolved_archive_file, tempdir
                )
            )
            ext = os.path.splitext(archive_file)[1][1:]
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from trusted sources.
            with tarfile.open(resolved_archive_file, "r:" + ext) as archive:
                top_dir = os.path.commonprefix(archive.getnames())
                archive.extractall(tempdir)
            os.remove(resolved_archive_file)
            shutil.move(os.path.join(tempdir, top_dir), resolved_archive_file)
        finally:
            # Always drop the scratch directory, including when opening or
            # extracting the archive fails part-way.
            shutil.rmtree(tempdir, ignore_errors=True)

    return resolved_archive_file


def url_to_filename(url, etag=None):
    """
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the URL's, delimited
    by a period.
    """
    url_bytes = url.encode("utf-8")
    url_hash = sha256(url_bytes)
    filename = url_hash.hexdigest()

    if etag:
        etag_bytes = etag.encode("utf-8")
        etag_hash = sha256(etag_bytes)
        filename += "." + etag_hash.hexdigest()

    return filename


def filename_to_url(filename, cache_dir=None):
    """
    Return the url and etag (which may be ``None``) stored for `filename`.
    Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_FAIRSEQ_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    cache_path = os.path.join(cache_dir, filename)
    if not os.path.exists(cache_path):
        raise EnvironmentError("file {} not found".format(cache_path))

    meta_path = cache_path + ".json"
    if not os.path.exists(meta_path):
        raise EnvironmentError("file {} not found".format(meta_path))

    with open(meta_path, encoding="utf-8") as meta_file:
        metadata = json.load(meta_file)
    url = metadata["url"]
    etag = metadata["etag"]

    return url, etag


def cached_path_from_pm(url_or_filename):
    """
    Tries to cache the specified URL using PathManager class.
    Returns the cached path on success, otherwise ``None``.
    """
    try:
        from fairseq.file_io import PathManager

        local_path = PathManager.get_local_path(url_or_filename)
        return local_path
    except Exception:
        return None


def cached_path(url_or_filename, cache_dir=None):
    """
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.

    Raises ``EnvironmentError`` for a scheme-less path that does not exist
    and ``ValueError`` when the input cannot be resolved at all.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_FAIRSEQ_CACHE
    if isinstance(url_or_filename, Path):
        url_or_filename = str(url_or_filename)
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    parsed = urlparse(url_or_filename)

    if parsed.scheme in ("http", "https", "s3"):
        # URL, so get it from the cache (downloading if necessary)
        return get_from_cache(url_or_filename, cache_dir)
    elif os.path.exists(url_or_filename):
        # File, and it exists.
        return url_or_filename
    elif parsed.scheme == "":
        # File, but it doesn't exist.
        raise EnvironmentError("file {} not found".format(url_or_filename))
    else:
        cached_path = cached_path_from_pm(url_or_filename)
        if cached_path:
            return cached_path
        # Something unknown
        raise ValueError(
            "unable to parse {} as a URL or as a local path".format(url_or_filename)
        )


def split_s3_path(url):
    """Split a full s3 path into the bucket name and path."""
    parsed = urlparse(url)
    if not parsed.netloc or not parsed.path:
        raise ValueError("bad s3 path {}".format(url))
    bucket_name = parsed.netloc
    s3_path = parsed.path
    # Remove '/' at beginning of path.
    if s3_path.startswith("/"):
        s3_path = s3_path[1:]
    return bucket_name, s3_path


def s3_request(func):
    """
    Wrapper function for s3 requests in order to create more helpful error
    messages.

    A ``ClientError`` whose error code is ``404`` becomes an
    ``EnvironmentError``; every other ``ClientError`` is re-raised unchanged.
    """

    @wraps(func)
    def wrapper(url, *args, **kwargs):
        from botocore.exceptions import ClientError

        try:
            return func(url, *args, **kwargs)
        except ClientError as exc:
            # Compare as text: the code is not guaranteed to be numeric, and
            # int() on a symbolic code would raise ValueError and hide the
            # original ClientError.
            if str(exc.response["Error"]["Code"]) == "404":
                raise EnvironmentError("file {} not found".format(url))
            else:
                raise

    return wrapper


@s3_request
def s3_etag(url):
    """Check ETag on S3 object."""
    import boto3

    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_object = s3_resource.Object(bucket_name, s3_path)
    return s3_object.e_tag


@s3_request
def s3_get(url, temp_file):
    """Pull a file directly from S3."""
    import boto3

    s3_resource = boto3.resource("s3")
    bucket_name, s3_path = split_s3_path(url)
    s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)


def request_wrap_timeout(func, url):
    """Call ``func(timeout=...)`` with growing timeouts until it succeeds.

    Timeouts of 10, 20, 40, 60 and 60 seconds are tried in turn; each
    ``requests.exceptions.Timeout`` is logged. Raises ``RuntimeError`` when
    every attempt timed out. *url* is only used in the messages.
    """
    import requests

    for attempt, timeout in enumerate([10, 20, 40, 60, 60]):
        try:
            return func(timeout=timeout)
        except requests.exceptions.Timeout as e:
            logger.warning(
                "Request for %s timed-out (attempt %d). Retrying with a timeout of %d secs",
                url,
                attempt,
                timeout,
                exc_info=e,
            )
            continue
    raise RuntimeError(f"Unable to fetch file {url}")


def http_get(url, temp_file):
    """Stream *url* into *temp_file* in 1 KiB chunks with a progress bar."""
    import requests
    from tqdm import tqdm

    req = request_wrap_timeout(partial(requests.get, url, stream=True), url)
    content_length = req.headers.get("Content-Length")
    total = int(content_length) if content_length is not None else None
    progress = tqdm(unit="B", total=total)
    for chunk in req.iter_content(chunk_size=1024):
        if chunk:  # filter out keep-alive new chunks
            progress.update(len(chunk))
            temp_file.write(chunk)
    progress.close()


def get_from_cache(url, cache_dir=None):
    """
    Given a URL, look for the corresponding dataset in the local cache.
    If it's not there, download it. Then return the path to the cached file.
    """
    if cache_dir is None:
        cache_dir = PYTORCH_FAIRSEQ_CACHE
    if isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        # exist_ok guards against another process creating it in between
        os.makedirs(cache_dir, exist_ok=True)

    # Get eTag to add to filename, if it exists.
    if url.startswith("s3://"):
        etag = s3_etag(url)
    else:
        try:
            import requests

            response = request_wrap_timeout(
                partial(requests.head, url, allow_redirects=True), url
            )
            if response.status_code != 200:
                etag = None
            else:
                etag = response.headers.get("ETag")
        except RuntimeError:
            etag = None

    filename = url_to_filename(url, etag)

    # get cache path to put the file
    cache_path = os.path.join(cache_dir, filename)

    # If we don't have a connection (etag is None) and can't identify the file
    # try to get the last downloaded one
    if not os.path.exists(cache_path) and etag is None:
        matching_files = fnmatch.filter(os.listdir(cache_dir), filename + ".*")
        matching_files = list(filter(lambda s: not s.endswith(".json"), matching_files))
        if matching_files:
            cache_path = os.path.join(cache_dir, matching_files[-1])

    if not os.path.exists(cache_path):
        # Download to temporary file, then copy to cache dir once finished.
        # Otherwise you get corrupt cache entries if the download gets interrupted.
        with tempfile.NamedTemporaryFile() as temp_file:
            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)

            # GET file object
            if url.startswith("s3://"):
                s3_get(url, temp_file)
            else:
                http_get(url, temp_file)

            # we are copying the file before closing it, so flush to avoid truncation
            temp_file.flush()
            # shutil.copyfileobj() starts at the current position, so go to the start
            temp_file.seek(0)

            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
            with open(cache_path, "wb") as cache_file:
                shutil.copyfileobj(temp_file, cache_file)

            logger.info("creating metadata file for %s", cache_path)
            meta = {"url": url, "etag": etag}
            meta_path = cache_path + ".json"
            with open(meta_path, "w") as meta_file:
                output_string = json.dumps(meta)
                meta_file.write(output_string)

            logger.info("removing temp file %s", temp_file.name)

    return cache_path


def read_set_from_file(filename):
    """
    Extract a de-duped collection (set) of text from a file.
    Expected file format is one item per line.
    """
    collection = set()
    with open(filename, "r", encoding="utf-8") as file_:
        for line in file_:
            collection.add(line.rstrip())
    return collection


def get_file_extension(path, dot=True, lower=True):
    """Return the extension of *path*.

    Args:
        dot: keep the leading ``.`` when true.
        lower: lower-case the result when true.
    """
    ext = os.path.splitext(path)[1]
    ext = ext if dot else ext[1:]
    return ext.lower() if lower else ext


# ================================================
# FILE: fairseq/hub_utils.py
# ================================================
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import argparse
import copy
import logging
import os
from typing import Any, Dict, Iterator, List

import torch
from omegaconf import open_dict
from torch import nn

from fairseq import utils
from fairseq.data import encoders


logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


def from_pretrained(
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    archive_map=None,
    **kwargs
):
    """Load a model ensemble (or vocoder arguments) from an archive.

    Args:
        model_name_or_path: path/URL of the model archive, or a key of
            *archive_map*. An *archive_map* entry may be a dict with a
            ``"path"`` key plus default overrides.
        checkpoint_file: checkpoint file name(s) inside the archive, joined
            with ``os.pathsep`` for an ensemble.
        data_name_or_path: data location; a value starting with ``.`` is
            resolved relative to the extracted model directory.
        archive_map: optional mapping from short names to archive locations.
        **kwargs: argument overrides handed to the checkpoint loader.

    Returns:
        dict with the keys ``"args"``, ``"task"`` and ``"models"``.
    """
    from fairseq import checkpoint_utils, file_utils

    if archive_map is not None:
        if model_name_or_path in archive_map:
            model_name_or_path = archive_map[model_name_or_path]
        if data_name_or_path is not None and data_name_or_path in archive_map:
            data_name_or_path = archive_map[data_name_or_path]

        # allow archive_map to set default arg_overrides (e.g., tokenizer, bpe)
        # for each model
        if isinstance(model_name_or_path, dict):
            for k, v in model_name_or_path.items():
                if k == "checkpoint_file":
                    checkpoint_file = v
                elif (
                    k != "path"
                    # only set kwargs that don't already have overrides
                    and k not in kwargs
                ):
                    kwargs[k] = v
            model_name_or_path = model_name_or_path["path"]

    model_path = file_utils.load_archive_file(model_name_or_path)

    # convenience hack for loading data and BPE codes from model archive
    if data_name_or_path.startswith("."):
        kwargs["data"] = os.path.abspath(os.path.join(model_path, data_name_or_path))
    else:
        kwargs["data"] = file_utils.load_archive_file(data_name_or_path)
    for file, arg in {
        "code": "bpe_codes",
        "bpecodes": "bpe_codes",
        "sentencepiece.bpe.model": "sentencepiece_model",
        "merges.txt": "bpe_merges",
        "vocab.json": "bpe_vocab",
    }.items():
        path = os.path.join(model_path, file)
        if os.path.exists(path):
            kwargs[arg] = path

    if "user_dir" in kwargs:
        utils.import_user_module(argparse.Namespace(user_dir=kwargs["user_dir"]))

    model_path = [
        os.path.join(model_path, cpt) for cpt in checkpoint_file.split(os.pathsep)
    ]

    if "is_vocoder" in kwargs:
        # vocoders are not loaded here; only their arguments are returned
        args = {"data": kwargs["data"], "model_path": model_path}
        task = None
        models = None
    else:
        models, args, task = checkpoint_utils.load_model_ensemble_and_task(
            model_path,
            arg_overrides=kwargs,
        )
    if "generation_args" in kwargs and kwargs["generation_args"]:
        for key in kwargs["generation_args"]:
            setattr(args["generation"], key, kwargs["generation_args"][key])

    return {
        "args": args,
        "task": task,
        "models": models,
    }


class GeneratorHubInterface(nn.Module):
    """
    PyTorch Hub interface for generating sequences from a pre-trained
    translation or language model.
    """

    def __init__(self, cfg, task, models):
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.models = nn.ModuleList(models)
        self.src_dict = task.source_dictionary
        self.tgt_dict = task.target_dictionary

        # optimize model for generation
        for model in self.models:
            model.prepare_for_inference_(cfg)

        # Load alignment dictionary for unknown word replacement
        # (None if no unknown word replacement, empty if no path to align dictionary)
        self.align_dict = utils.load_align_dict(cfg.generation.replace_unk)

        self.tokenizer = encoders.build_tokenizer(cfg.tokenizer)
        self.bpe = encoders.build_bpe(cfg.bpe)

        self.max_positions = utils.resolve_max_positions(
            self.task.max_positions(), *[model.max_positions() for model in models]
        )

        # this is useful for determining the device
        self.register_buffer("_float_tensor", torch.tensor([0], dtype=torch.float))

    @property
    def device(self):
        """Device of the registered ``_float_tensor`` buffer."""
        return self._float_tensor.device

    def translate(
        self, sentences: List[str], beam: int = 5, verbose: bool = False, **kwargs
    ) -> List[str]:
        """Same as :meth:`sample`, with a default beam size of 5."""
        return self.sample(sentences, beam, verbose, **kwargs)

    def sample(
        self, sentences: List[str], beam: int = 1, verbose: bool = False, **kwargs
    ) -> List[str]:
        """Encode, generate and decode the first hypothesis of each sentence.

        A single ``str`` is accepted as well and yields a single ``str``.
        """
        if isinstance(sentences, str):
            return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
        tokenized_sentences = [self.encode(sentence) for sentence in sentences]
        batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)
        return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos]

    def score(
        self, sentences: List[str], replace_newline_with_eos: bool = False, **kwargs
    ):
        """Run :meth:`generate` with ``score_reference=True``.

        Returns the first hypothesis for each sentence (or for the single
        sentence when a ``str`` is passed). With ``replace_newline_with_eos``
        every line is encoded separately and the results are concatenated.
        """
        if isinstance(sentences, str):
            return self.score(
                [sentences], replace_newline_with_eos=replace_newline_with_eos, **kwargs
            )[0]

        def encode(sentence):
            if replace_newline_with_eos:
                return torch.cat([self.encode(line) for line in sentence.splitlines()])
            else:
                return self.encode(sentence)

        # NOTE: this doesn't support translation tasks currently
        tokenized_sentences = [encode(sentence) for sentence in sentences]
        return [
            hypos[0]
            for hypos in self.generate(
                tokenized_sentences, score_reference=True, **kwargs
            )
        ]

    def generate(
        self,
        tokenized_sentences: List[torch.LongTensor],
        beam: int = 5,
        verbose: bool = False,
        skip_invalid_size_inputs=False,
        inference_step_args=None,
        prefix_allowed_tokens_fn=None,
        **kwargs
    ) -> List[List[Dict[str, torch.Tensor]]]:
        """Generate hypotheses for already-binarized sentences.

        Args:
            tokenized_sentences: list of token tensors; a single 1-D tensor
                is treated as one sentence and its hypotheses are returned
                directly.
            beam: beam size written into the generation config.
            verbose: log source, hypotheses, scores and alignments.
            skip_invalid_size_inputs: forwarded to the batch iterator as
                ``ignore_invalid_inputs``.
            inference_step_args: extra keyword arguments for the task's
                ``inference_step``.
            prefix_allowed_tokens_fn: forwarded to ``build_generator``.
            **kwargs: further overrides set on the generation config.

        Returns:
            One list of hypotheses per input, ordered by sample id.
        """
        if torch.is_tensor(tokenized_sentences) and tokenized_sentences.dim() == 1:
            # Forward every option, not just beam/verbose, so the
            # single-sentence shortcut behaves like the batched call.
            return self.generate(
                tokenized_sentences.unsqueeze(0),
                beam=beam,
                verbose=verbose,
                skip_invalid_size_inputs=skip_invalid_size_inputs,
                inference_step_args=inference_step_args,
                prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
                **kwargs
            )[0]

        # build generator using current args as well as any kwargs
        gen_args = copy.deepcopy(self.cfg.generation)
        with open_dict(gen_args):
            gen_args.beam = beam
            for k, v in kwargs.items():
                setattr(gen_args, k, v)
        generator = self.task.build_generator(
            self.models,
            gen_args,
            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
        )

        inference_step_args = inference_step_args or {}
        results = []
        for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
            batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
            translations = self.task.inference_step(
                generator, self.models, batch, **inference_step_args
            )
            for sample_id, hypos in zip(batch["id"].tolist(), translations):
                results.append((sample_id, hypos))

        # sort output to match input order
        outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])]

        if verbose:

            def getarg(name, default):
                return getattr(gen_args, name, getattr(self.cfg, name, default))

            # NOTE(review): if the batch iterator dropped any input, this zip
            # presumably pairs sources with the wrong outputs - confirm.
            for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs):
                src_str_with_unk = self.string(source_tokens)
                logger.info("S\t{}".format(src_str_with_unk))
                for hypo in target_hypotheses:
                    hypo_str = self.decode(hypo["tokens"])
                    logger.info("H\t{}\t{}".format(hypo["score"], hypo_str))
                    logger.info(
                        "P\t{}".format(
                            " ".join(
                                map(
                                    lambda x: "{:.4f}".format(x),
                                    hypo["positional_scores"].tolist(),
                                )
                            )
                        )
                    )
                    if hypo["alignment"] is not None and getarg(
                        "print_alignment", False
                    ):
                        logger.info(
                            "A\t{}".format(
                                " ".join(
                                    [
                                        "{}-{}".format(src_idx, tgt_idx)
                                        for src_idx, tgt_idx in hypo["alignment"]
                                    ]
                                )
                            )
                        )
        return outputs

    def encode(self, sentence: str) -> torch.LongTensor:
        """Tokenize, apply BPE and binarize *sentence*."""
        sentence = self.tokenize(sentence)
        sentence = self.apply_bpe(sentence)
        return self.binarize(sentence)

    def decode(self, tokens: torch.LongTensor) -> str:
        """Inverse of :meth:`encode`: stringify, remove BPE, detokenize."""
        sentence = self.string(tokens)
        sentence = self.remove_bpe(sentence)
        return self.detokenize(sentence)

    def tokenize(self, sentence: str) -> str:
        if self.tokenizer is not None:
            sentence = self.tokenizer.encode(sentence)
        return sentence

    def detokenize(self, sentence: str) -> str:
        if self.tokenizer is not None:
            sentence = self.tokenizer.decode(sentence)
        return sentence

    def apply_bpe(self, sentence: str) -> str:
        if self.bpe is not None:
            sentence = self.bpe.encode(sentence)
        return sentence

    def remove_bpe(self, sentence: str) -> str:
        if self.bpe is not None:
            sentence = self.bpe.decode(sentence)
        return sentence

    def binarize(self, sentence: str) -> torch.LongTensor:
        """Map *sentence* to ids with the source dictionary, without growing it."""
        return self.src_dict.encode_line(sentence, add_if_not_exist=False).long()

    def string(self, tokens: torch.LongTensor) -> str:
        """Render *tokens* with the target dictionary."""
        return self.tgt_dict.string(tokens)

    def _build_batches(
        self, tokens: List[torch.LongTensor], skip_invalid_size_inputs: bool
    ) -> Iterator[Dict[str, Any]]:
        """Return an unshuffled batch iterator over *tokens* for inference."""
        lengths = torch.LongTensor([t.numel() for t in tokens])
        batch_iterator = self.task.get_batch_iterator(
            dataset=self.task.build_dataset_for_inference(tokens, lengths),
            max_tokens=self.cfg.dataset.max_tokens,
            max_sentences=self.cfg.dataset.batch_size,
            max_positions=self.max_positions,
            ignore_invalid_inputs=skip_invalid_size_inputs,
            disable_iterator_cache=True,
        ).next_epoch_itr(shuffle=False)
        return batch_iterator


class BPEHubInterface(object):
    """PyTorch Hub interface for Byte-Pair Encoding (BPE)."""

    def __init__(self, bpe, **kwargs):
        super().__init__()
        args = argparse.Namespace(bpe=bpe, **kwargs)
        self.bpe = encoders.build_bpe(args)
        assert self.bpe is not None

    def encode(self, sentence: str) -> str:
        return self.bpe.encode(sentence)

    def decode(self, sentence: str) -> str:
        return self.bpe.decode(sentence)


class TokenizerHubInterface(object):
    """PyTorch Hub interface for tokenization."""

    def __init__(self, tokenizer, **kwargs):
        super().__init__()
        args = argparse.Namespace(tokenizer=tokenizer, **kwargs)
        self.tokenizer = encoders.build_tokenizer(args)
        assert self.tokenizer is not None

    def encode(self, sentence: str) -> str:
        return self.tokenizer.encode(sentence)

    def decode(self, sentence: str) -> str:
        return self.tokenizer.decode(sentence)


# ================================================
# FILE: fairseq/incremental_decoding_utils.py
# ================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
class FairseqIncrementalState(object):
    """Mixin giving each instance a private namespace in a shared state dict.

    Every instance receives a random UUID on construction. Keys passed to
    :meth:`get_incremental_state` / :meth:`set_incremental_state` are
    prefixed with that UUID, so several instances can store entries in
    the same ``incremental_state`` dictionary without colliding.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.init_incremental_state()

    def init_incremental_state(self):
        """Assign a fresh random identifier to this instance."""
        self._incremental_state_id = str(uuid.uuid4())

    def _get_full_incremental_state_key(self, key: str) -> str:
        """Return *key* prefixed with this instance's identifier."""
        owner_id = self._incremental_state_id
        return "{}.{}".format(owner_id, key)

    def get_incremental_state(
        self,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
        key: str,
    ) -> Optional[Dict[str, Optional[Tensor]]]:
        """Helper for getting incremental state for an nn.Module.

        Returns the entry stored for *key* by this instance, or ``None``
        when no state dict is given or nothing has been stored yet.
        """
        full_key = self._get_full_incremental_state_key(key)
        if incremental_state is not None and full_key in incremental_state:
            return incremental_state[full_key]
        return None

    def set_incremental_state(
        self,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
        key: str,
        value: Dict[str, Optional[Tensor]],
    ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
        """Helper for setting incremental state for an nn.Module.

        Stores *value* under this instance's namespaced *key* and returns
        the (mutated) state dict; a ``None`` state dict is returned as is.
        """
        if incremental_state is None:
            return incremental_state
        incremental_state[self._get_full_incremental_state_key(key)] = value
        return incremental_state
class IterativeRefinementGenerator(object):
    """Generator that decodes by repeatedly refining a full output sequence.

    Each call to :meth:`generate` encodes the source once, asks the model
    for an initial output, and then calls ``model.forward_decoder`` up to
    ``max_iter + 1`` times, finalizing sentences as they terminate.
    """

    def __init__(
        self,
        tgt_dict,
        models=None,
        eos_penalty=0.0,
        max_iter=10,
        max_ratio=2,
        beam_size=1,
        decoding_format=None,
        retain_dropout=False,
        adaptive=True,
        retain_history=False,
        reranking=False,
    ):
        """
        Generates translations based on iterative refinement.

        Args:
            tgt_dict: target dictionary
            models: models stored on the generator; passed to
                :meth:`generate` by :meth:`generate_batched_itr`
            eos_penalty: if > 0.0, it penalized early-stopping in decoding
            max_iter: maximum number of refinement iterations
            max_ratio: generate sequences of maximum length ax, where x is the source length
            beam_size: number of length-beam candidates per source
                sentence; values > 1 enable length-beam decoding
            decoding_format: decoding mode in {'unigram', 'ensemble', 'vote', 'dp', 'bs'}
            retain_dropout: retaining dropout in the inference
            adaptive: decoding with early stop
            retain_history: if True, the output tokens of intermediate
                iterations are attached to each hypothesis under "history"
            reranking: if True, the last model passed to :meth:`generate`
                is used to rescore the length-beam candidates
        """
        # special symbol indices and vocabulary size from the target dictionary
        self.bos = tgt_dict.bos()
        self.pad = tgt_dict.pad()
        self.unk = tgt_dict.unk()
        self.eos = tgt_dict.eos()
        self.vocab_size = len(tgt_dict)
        self.eos_penalty = eos_penalty
        self.max_iter = max_iter
        self.max_ratio = max_ratio
        self.beam_size = beam_size
        self.reranking = reranking
        self.decoding_format = decoding_format
        self.retain_dropout = retain_dropout
        self.retain_history = retain_history
        self.adaptive = adaptive
        self.models = models

    def generate_batched_itr(
        self,
        data_itr,
        maxlen_a=None,
        maxlen_b=None,
        cuda=False,
        timer=None,
        prefix_size=0,
    ):
        """Iterate over a batched dataset and yield individual translations.

        Note that ``maxlen_a``, ``maxlen_b`` and ``cuda`` are accepted but
        not referenced in this method's body.

        Args:
            maxlen_a/b: generate sequences of maximum length ax + b,
                where x is the source sentence length.
            cuda: use GPU for generation
            timer: StopwatchMeter for timing generations.
            prefix_size: if > 0, the first ``prefix_size`` target tokens
                are passed to :meth:`generate` as ``prefix_tokens``

        Yields:
            tuples ``(id, src, ref, hypos)`` with padding stripped from
            the source and reference tokens.
        """

        for sample in data_itr:
            # samples without network input are skipped entirely
            if "net_input" not in sample:
                continue
            if timer is not None:
                timer.start()
            with torch.no_grad():
                hypos = self.generate(
                    self.models,
                    sample,
                    prefix_tokens=sample["target"][:, :prefix_size]
                    if prefix_size > 0
                    else None,
                )
            if timer is not None:
                timer.stop(sample["ntokens"])
            for i, id in enumerate(sample["id"]):
                # remove padding
                src = utils.strip_pad(sample["net_input"]["src_tokens"][i, :], self.pad)
                ref = utils.strip_pad(sample["target"][i, :], self.pad)
                yield id, src, ref, hypos[i]

    @torch.no_grad()
    def generate(self, models, sample, prefix_tokens=None, constraints=None):
        """Decode one batch with iterative refinement.

        Args:
            models: list of models; ``models[0]`` does the decoding. When
                ``self.reranking`` is set, ``models[-1]`` is the reranker
                and is removed from the list.
            sample: batch dict; ``sample["net_input"]`` must provide
                ``src_tokens`` and ``src_lengths``.
            prefix_tokens: accepted but not referenced in this method.
            constraints: must be ``None``.

        Returns:
            a list with one entry per source sentence; each entry is a
            single-element list holding a hypothesis dict with the keys
            "steps", "tokens", "positional_scores", "score", "hypo_attn",
            "alignment" (plus "history" when ``self.retain_history``).

        Raises:
            NotImplementedError: if ``constraints`` is not ``None``.
        """
        if constraints is not None:
            raise NotImplementedError(
                "Constrained decoding with the IterativeRefinementGenerator is not supported"
            )

        # TODO: iterative refinement generator does not support ensemble for now.
        if not self.retain_dropout:
            for model in models:
                model.eval()

        model, reranker = models[0], None
        if self.reranking:
            assert len(models) > 1, "Assuming the last checkpoint is the reranker"
            assert (
                self.beam_size > 1
            ), "Reranking requires multiple translation for each example"

            reranker = models[-1]
            models = models[:-1]

        # with several remaining models, let the first one ensemble them
        # (only if it exposes ``enable_ensemble``)
        if len(models) > 1 and hasattr(model, "enable_ensemble"):
            assert model.allow_ensemble, "{} does not support ensembling".format(
                model.__class__.__name__
            )
            model.enable_ensemble(models)

        # TODO: better encoder inputs?
        src_tokens = sample["net_input"]["src_tokens"]
        src_lengths = sample["net_input"]["src_lengths"]
        bsz, src_len = src_tokens.size()

        # initialize
        encoder_out = model.forward_encoder([src_tokens, src_lengths])
        prev_decoder_out = model.initialize_output_tokens(encoder_out, src_tokens)

        if self.beam_size > 1:
            assert (
                model.allow_length_beam
            ), "{} does not support decoding with length beam.".format(
                model.__class__.__name__
            )

            # regenerate data based on length-beam
            length_beam_order = (
                utils.new_arange(src_tokens, self.beam_size, bsz).t().reshape(-1)
            )
            encoder_out = model.encoder.reorder_encoder_out(
                encoder_out, length_beam_order
            )
            prev_decoder_out = model.regenerate_length_beam(
                prev_decoder_out, self.beam_size
            )
            # from here on every beam candidate counts as its own sentence
            bsz = bsz * self.beam_size

        # sent_idxs maps rows of the shrinking working batch back to their
        # position in ``finalized``
        sent_idxs = torch.arange(bsz)
        prev_output_tokens = prev_decoder_out.output_tokens.clone()

        if self.retain_history:
            prev_decoder_out = prev_decoder_out._replace(history=[prev_output_tokens])

        finalized = [[] for _ in range(bsz)]

        def is_a_loop(x, y, s, a):
            # x: tokens from the previous iteration, y: newly decoded tokens,
            # s: new scores, a: new attention (may be None).
            # The shorter of x / y is padded along dim 1 so both can be
            # compared; s and a are padded together with y. Returns a
            # per-row flag telling whether the output did not change,
            # together with the (possibly padded) y, s and a.
            b, l_x, l_y = x.size(0), x.size(1), y.size(1)
            if l_x > l_y:
                y = torch.cat([y, x.new_zeros(b, l_x - l_y).fill_(self.pad)], 1)
                s = torch.cat([s, s.new_zeros(b, l_x - l_y)], 1)
                if a is not None:
                    a = torch.cat([a, a.new_zeros(b, l_x - l_y, a.size(2))], 1)
            elif l_x < l_y:
                x = torch.cat([x, y.new_zeros(b, l_y - l_x).fill_(self.pad)], 1)
            return (x == y).all(1), y, s, a

        def finalized_hypos(step, prev_out_token, prev_out_score, prev_out_attn):
            # Build the hypothesis dict for one sentence, dropping every
            # position whose token equals the padding index.
            cutoff = prev_out_token.ne(self.pad)
            tokens = prev_out_token[cutoff]
            if prev_out_score is None:
                scores, score = None, None
            else:
                scores = prev_out_score[cutoff]
                # sentence score is the mean of the positional scores
                score = scores.mean()

            if prev_out_attn is None:
                hypo_attn, alignment = None, None
            else:
                hypo_attn = prev_out_attn[cutoff]
                # index of the maximum attention entry along dim 1
                alignment = hypo_attn.max(dim=1)[1]
            return {
                "steps": step,
                "tokens": tokens,
                "positional_scores": scores,
                "score": score,
                "hypo_attn": hypo_attn,
                "alignment": alignment,
            }

        for step in range(self.max_iter + 1):

            decoder_options = {
                "eos_penalty": self.eos_penalty,
                "max_ratio": self.max_ratio,
                "decoding_format": self.decoding_format,
            }
            prev_decoder_out = prev_decoder_out._replace(
                step=step,
                max_step=self.max_iter + 1,
            )

            decoder_out = model.forward_decoder(
                prev_decoder_out, encoder_out, **decoder_options
            )

            if self.adaptive:
                # terminate if there is a loop
                terminated, out_tokens, out_scores, out_attn = is_a_loop(
                    prev_output_tokens,
                    decoder_out.output_tokens,
                    decoder_out.output_scores,
                    decoder_out.attn,
                )
                decoder_out = decoder_out._replace(
                    output_tokens=out_tokens,
                    output_scores=out_scores,
                    attn=out_attn,
                )

            else:
                # without early stopping nothing terminates before the
                # last iteration
                terminated = decoder_out.output_tokens.new_zeros(
                    decoder_out.output_tokens.size(0)
                ).bool()

            if step == self.max_iter:  # reach last iteration, terminate
                terminated.fill_(1)

            # collect finalized sentences
            finalized_idxs = sent_idxs[terminated.to(sent_idxs.device)]
            finalized_tokens = decoder_out.output_tokens[terminated]
            finalized_scores = decoder_out.output_scores[terminated]
            finalized_attn = (
                None
                if (decoder_out.attn is None or decoder_out.attn.size(0) == 0)
                else decoder_out.attn[terminated]
            )

            if self.retain_history:
                finalized_history_tokens = [h[terminated] for h in decoder_out.history]

            for i in range(finalized_idxs.size(0)):
                finalized[finalized_idxs[i]] = [
                    finalized_hypos(
                        step,
                        finalized_tokens[i],
                        finalized_scores[i],
                        None if finalized_attn is None else finalized_attn[i],
                    )
                ]

                if self.retain_history:
                    # one token-only hypothesis per recorded iteration
                    finalized[finalized_idxs[i]][0]["history"] = []
                    for j in range(len(finalized_history_tokens)):
                        finalized[finalized_idxs[i]][0]["history"].append(
                            finalized_hypos(
                                step, finalized_history_tokens[j][i], None, None
                            )
                        )

            # check if all terminated
            if terminated.sum() == terminated.size(0):
                break

            # for next step
            # keep only the rows that are still being refined
            not_terminated = ~terminated
            prev_decoder_out = decoder_out._replace(
                output_tokens=decoder_out.output_tokens[not_terminated],
                output_scores=decoder_out.output_scores[not_terminated],
                attn=decoder_out.attn[not_terminated]
                if (decoder_out.attn is not None and decoder_out.attn.size(0) > 0)
                else None,
                history=[h[not_terminated] for h in decoder_out.history]
                if decoder_out.history is not None
                else None,
            )
            encoder_out = model.encoder.reorder_encoder_out(
                encoder_out, not_terminated.nonzero(as_tuple=False).squeeze()
            )
            sent_idxs = sent_idxs[not_terminated.to(sent_idxs.device)]
            prev_output_tokens = prev_decoder_out.output_tokens.clone()

        if self.beam_size > 1:
            if reranker is not None:
                finalized = self.rerank(
                    reranker, finalized, [src_tokens, src_lengths], self.beam_size
                )

            # aggregate information from length beam
            # (for each source sentence keep the candidate with the
            # highest "score" among its beam_size consecutive entries)
            finalized = [
                finalized[
                    np.argmax(
                        [
                            finalized[self.beam_size * i + j][0]["score"]
                            for j in range(self.beam_size)
                        ]
                    )
                    + self.beam_size * i
                ]
                for i in range(len(finalized) // self.beam_size)
            ]

        return finalized

    def rerank(self, reranker, finalized, encoder_input, beam_size):
        """Overwrite the "score" of every hypothesis using *reranker*.

        Args:
            reranker: model providing ``encoder``, ``decoder`` and
                ``get_normalized_probs``.
            finalized: list of single-element hypothesis lists as built
                by :meth:`generate`; modified in place.
            encoder_input: positional arguments for ``reranker.encoder``.
            beam_size: number of hypotheses per source sentence.

        Returns:
            ``finalized`` with each hypothesis' "score" replaced by the
            reranker score averaged over non-padding positions.
        """

        def rebuild_batch(finalized):
            # stack the variable-length token tensors into one batch,
            # right-padded with the padding index
            finalized_tokens = [f[0]["tokens"] for f in finalized]
            finalized_maxlen = max(f.size(0) for f in finalized_tokens)
            final_output_tokens = (
                finalized_tokens[0]
                .new_zeros(len(finalized_tokens), finalized_maxlen)
                .fill_(self.pad)
            )
            for i, f in enumerate(finalized_tokens):
                final_output_tokens[i, : f.size(0)] = f
            return final_output_tokens

        final_output_tokens = rebuild_batch(finalized)
        final_output_tokens[
            :, 0
        ] = self.eos  # autoregressive model assumes starting with EOS

        reranker_encoder_out = reranker.encoder(*encoder_input)
        # NOTE(review): encoder_out.size(1) is presumably the batch size - confirm
        length_beam_order = (
            utils.new_arange(
                final_output_tokens, beam_size, reranker_encoder_out.encoder_out.size(1)
            )
            .t()
            .reshape(-1)
        )
        reranker_encoder_out = reranker.encoder.reorder_encoder_out(
            reranker_encoder_out, length_beam_order
        )
        # NOTE(review): the positional ``True`` presumably requests
        # log-probabilities - confirm against get_normalized_probs
        reranking_scores = reranker.get_normalized_probs(
            reranker.decoder(final_output_tokens[:, :-1], reranker_encoder_out),
            True,
            None,
        )
        # pick the score assigned to each actual next token
        reranking_scores = reranking_scores.gather(2, final_output_tokens[:, 1:, None])
        reranking_masks = final_output_tokens[:, 1:].ne(self.pad)
        # zero out padding positions, then average over the real ones
        reranking_scores = (
            reranking_scores[:, :, 0].masked_fill_(~reranking_masks, 0).sum(1)
        )
        reranking_scores = reranking_scores / reranking_masks.sum(1).type_as(
            reranking_scores
        )

        for i in range(len(finalized)):
            finalized[i][0]["score"] = reranking_scores[i]

        return finalized
class SumMeter(Meter):
    """Accumulates a running sum of every value passed to :meth:`update`."""

    def __init__(self, round: Optional[int] = None):
        self.round = round
        self.reset()

    def reset(self):
        """Set the running sum back to zero."""
        self.sum = 0  # sum from all updates

    def update(self, val):
        """Add *val* to the running sum; ``None`` values are ignored."""
        if val is None:
            return
        self.sum = type_as(self.sum, val) + val

    def state_dict(self):
        """Return the sum and rounding setting as a plain dict."""
        return dict(sum=self.sum, round=self.round)

    def load_state_dict(self, state_dict):
        """Restore the meter; a missing "round" entry becomes ``None``."""
        self.sum = state_dict["sum"]
        self.round = state_dict.get("round")

    @property
    def smoothed_value(self) -> float:
        """The running sum, rounded when a rounding precision is set."""
        total = self.sum
        if self.round is None or total is None:
            return total
        return safe_round(total, self.round)
class MetersDict(OrderedDict):
    """A sorted dictionary of :class:`Meters`.

    Meters are sorted according to a priority that is given when the
    meter is first added to the dictionary.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # sorted list of (priority, insertion index, key) tuples
        self.priorities = []

    def __setitem__(self, key, value):
        """Insert a new meter; *value* must be a ``(priority, meter)`` pair.

        Existing keys cannot be reassigned.
        """
        assert key not in self, "MetersDict doesn't support reassignment"
        priority, value = value
        # the current length is the second tuple element, so meters with
        # equal priority keep the order in which they were added
        bisect.insort(self.priorities, (priority, len(self.priorities), key))
        super().__setitem__(key, value)
        for _, _, key in self.priorities:  # reorder dict to match priorities
            self.move_to_end(key)

    def add_meter(self, key, meter, priority):
        """Register *meter* under *key* with the given *priority*."""
        self.__setitem__(key, (priority, meter))

    def state_dict(self):
        """Return a list of ``(priority, key, class name, meter state)``.

        Derived meters are left out.
        """
        return [
            (pri, key, self[key].__class__.__name__, self[key].state_dict())
            for pri, _, key in self.priorities
            # can't serialize DerivedMeter instances
            if not isinstance(self[key], MetersDict._DerivedMeter)
        ]

    def load_state_dict(self, state_dict):
        """Replace all meters with those described by *state_dict*."""
        self.clear()
        self.priorities.clear()
        for pri, key, meter_cls, meter_state in state_dict:
            # the meter class is looked up by name among this module's
            # globals and constructed without arguments
            meter = globals()[meter_cls]()
            meter.load_state_dict(meter_state)
            self.add_meter(key, meter, pri)

    def get_smoothed_value(self, key: str) -> float:
        """Get a single smoothed value."""
        meter = self[key]
        if isinstance(meter, MetersDict._DerivedMeter):
            # derived meters compute their value from this whole dict
            return meter.fn(self)
        else:
            return meter.smoothed_value

    def get_smoothed_values(self) -> Dict[str, float]:
        """Get all smoothed values."""
        # keys starting with an underscore are excluded from the result
        return OrderedDict(
            [
                (key, self.get_smoothed_value(key))
                for key in self.keys()
                if not key.startswith("_")
            ]
        )

    def reset(self):
        """Reset Meter instances."""
        for meter in self.values():
            if isinstance(meter, MetersDict._DerivedMeter):
                continue
            meter.reset()

    class _DerivedMeter(Meter):
        """A Meter whose values are derived from other Meters."""

        def __init__(self, fn):
            # fn is called with the owning MetersDict (see get_smoothed_value)
            self.fn = fn

        def reset(self):
            pass
@contextlib.contextmanager
def aggregate(name: Optional[str] = None, new_root: bool = False):
    """Context manager to aggregate metrics under a given name.

    Aggregations can be nested. If *new_root* is ``False``, then logged
    metrics will be recorded along the entire stack of nested
    aggregators, including a global "default" aggregator. If *new_root*
    is ``True``, then this aggregator will be the root of a new
    aggregation stack, thus bypassing any parent aggregators.

    Note that aggregation contexts are uniquely identified by their
    *name* (e.g., train, valid). Creating a context with an existing
    name will reuse the corresponding :class:`MetersDict` instance.
    If no name is given, then a temporary aggregator will be created.

    The aggregator is deactivated (and, for *new_root*, the previous
    aggregation stack is restored) when the context exits, including
    when the body raises an exception.

    Usage::

        with metrics.aggregate("train"):
            for step, batch in enumerate(epoch):
                with metrics.aggregate("train_inner") as agg:
                    metrics.log_scalar("loss", get_loss(batch))
                    if step % log_interval == 0:
                        print(agg.get_smoothed_value("loss"))
                        agg.reset()
        print(metrics.get_smoothed_values("train")["loss"])

    Args:
        name (str): name of the aggregation. Defaults to a
            random/temporary name if not given explicitly.
        new_root (bool): make this aggregation the root of a new
            aggregation stack.

    Yields:
        MetersDict: the aggregator that is active inside the context.
    """
    if name is None:
        # generate a temporary name
        name = str(uuid.uuid4())
        assert name not in _aggregators
        # temporary aggregators are not registered in _aggregators
        agg = MetersDict()
    else:
        assert name != "default"
        agg = _aggregators.setdefault(name, MetersDict())

    if new_root:
        # set the current stack aside so that nothing outside this
        # context observes values logged inside it
        backup_aggregators = _active_aggregators.copy()
        _active_aggregators.clear()
        backup_aggregators_cnt = _active_aggregators_cnt.copy()
        _active_aggregators_cnt.clear()

    _active_aggregators[name] = agg
    _active_aggregators_cnt[name] += 1

    try:
        yield agg
    finally:
        # Always undo the activation: without this an exception raised in
        # the body would leave the aggregator active (and a new_root
        # stack unrestored) for all subsequent logging.
        _active_aggregators_cnt[name] -= 1
        if _active_aggregators_cnt[name] == 0 and name in _active_aggregators:
            del _active_aggregators[name]

        if new_root:
            _active_aggregators.clear()
            _active_aggregators.update(backup_aggregators)
            _active_aggregators_cnt.clear()
            _active_aggregators_cnt.update(backup_aggregators_cnt)
def log_concat_tensor(
    key: str,
    value: "torch.Tensor",
    priority: int = 10,
    dim: int = 0,
):
    """Log a tensor that is concatenated with the tensors logged before it.

    Args:
        key (str): name of the field to log
        value (torch.Tensor): tensor to append to what has been logged
            under *key* so far
        priority (int): smaller values are logged earlier in the output
        dim (int): dimension along which the tensors are concatenated;
            only used when the meter for *key* is first created
    """
    # The annotation on *value* is a string on purpose: the meters module
    # tolerates a missing torch install by binding ``torch = None``, and an
    # eagerly evaluated ``torch.Tensor`` would then fail at import time.
    for agg in get_active_aggregators():
        if key not in agg:
            agg.add_meter(key, ConcatTensorMeter(dim=dim), priority)
        agg[key].update(value)
def log_stop_time(key: str, weight: float = 0.0, prehook=None):
    """Stop the stopwatch registered under *key* in every active aggregator.

    The measured duration runs from the matching :func:`log_start_time`
    call. Aggregators that have no meter for *key* are left untouched.
    Set weight > 0 to report the average time instead of the sum.

    Args:
        key (str): name of the field to log
        weight (float): weight that this time contributes to the average
        prehook (function, no arguments): will be called before the timer
        is stopped. For example, use prehook=torch.cuda.synchronize to
        make sure all gpu operations are done before timer is stopped.
    """
    for active in get_active_aggregators():
        if key not in active:
            continue
        stopwatch = active[key]
        stopwatch.stop(weight, prehook)
def get_meter(name: str, key: str) -> Meter:
    """Look up the Meter logged under aggregator *name* and field *key*.

    Returns:
        Meter or None if no metrics have been logged under *name* and *key*.
    """
    meters = _aggregators.get(name)
    # compare against None explicitly: an empty MetersDict is falsy
    if meters is None:
        return None
    return meters.get(key)
def xla_metrics_report():
    """Print the torch_xla metrics report if torch_xla can be imported.

    Does nothing (and returns ``None``) when an ``ImportError`` is raised
    while importing or producing the report.
    """
    with contextlib.suppress(ImportError):
        import torch_xla.debug.metrics as met

        print(met.metrics_report())
def format_stat(stat):
    """Convert a logged statistic into a printable value.

    Numbers are formatted with ``{:g}``; an AverageMeter shows its
    average with three decimals; a TimeMeter its rounded average; a
    StopwatchMeter its rounded sum; a tensor becomes a (nested) list.
    Anything else is returned unchanged.
    """
    if isinstance(stat, Number):
        return "{:g}".format(stat)
    if isinstance(stat, AverageMeter):
        return "{:.3f}".format(stat.avg)
    if isinstance(stat, TimeMeter):
        return "{:g}".format(round(stat.avg))
    if isinstance(stat, StopwatchMeter):
        return "{:g}".format(round(stat.sum))
    if torch.is_tensor(stat):
        return stat.tolist()
    return stat
@contextmanager
def rename_logger(logger, new_name):
    """Temporarily change ``logger.name`` for the duration of a ``with`` block.

    Args:
        logger: object with a writable ``name`` attribute
        new_name: name to use inside the block; if ``None`` the name is
            left unchanged

    Yields:
        the same *logger* object.

    The original name is restored when the block exits, including when
    the block raises, so a failing log call cannot leave the logger
    permanently renamed.
    """
    old_name = logger.name
    if new_name is not None:
        logger.name = new_name
    try:
        yield logger
    finally:
        logger.name = old_name
class NoopProgressBar(BaseProgressBar):
    """Progress bar that yields the wrapped items and discards all stats."""

    def __init__(self, iterable, epoch=None, prefix=None):
        super().__init__(iterable, epoch, prefix)

    def __iter__(self):
        # Pass every item of the wrapped iterable through unchanged.
        yield from self.iterable

    def log(self, stats, tag=None, step=None):
        """Accept intermediate stats and do nothing with them."""

    def print(self, stats, tag=None, step=None):
        """Accept end-of-epoch stats and do nothing with them."""
class AimProgressBarWrapper(BaseProgressBar):
    """Log to Aim, then delegate to the wrapped progress bar."""

    def __init__(self, wrapped_bar, aim_repo, aim_run_hash, aim_param_checkpoint_dir):
        """Attach an Aim run to *wrapped_bar*.

        Args:
            wrapped_bar: progress bar that receives every call after Aim.
            aim_repo: Aim repository location passed to ``AimRepo`` /
                ``get_aim_run``.
            aim_run_hash: hash of an existing run; when falsy, a run whose
                ``run.checkpoint.save_dir`` equals
                *aim_param_checkpoint_dir* is looked up instead.
            aim_param_checkpoint_dir: checkpoint directory used in that
                lookup query.
        """
        self.wrapped_bar = wrapped_bar

        if get_aim_run is None:
            # The aim package could not be imported: Aim logging is disabled.
            self.run = None
            logger.warning("Aim not found, please install with: pip install aim")
        else:
            logger.info(f"Storing logs at Aim repo: {aim_repo}")

            if not aim_run_hash:
                # Find run based on save_dir parameter
                query = f"run.checkpoint.save_dir == '{aim_param_checkpoint_dir}'"
                try:
                    runs_generator = AimRepo(aim_repo).query_runs(query)
                    run = next(runs_generator.iter_runs())
                    aim_run_hash = run.run.hash
                except Exception:
                    # Best-effort lookup: on any failure (including no
                    # matching run) keep the falsy hash and carry on.
                    pass

            if aim_run_hash:
                logger.info(f"Appending to run: {aim_run_hash}")

            self.run = get_aim_run(aim_repo, aim_run_hash)

    def __iter__(self):
        return iter(self.wrapped_bar)

    def log(self, stats, tag=None, step=None):
        """Log intermediate stats to Aim."""
        self._log_to_aim(stats, tag, step)
        self.wrapped_bar.log(stats, tag=tag, step=step)

    def print(self, stats, tag=None, step=None):
        """Print end-of-epoch stats."""
        self._log_to_aim(stats, tag, step)
        self.wrapped_bar.print(stats, tag=tag, step=step)

    def update_config(self, config):
        """Log latest configuration."""
        if self.run is not None:
            for key in config:
                self.run.set(key, config[key], strict=False)
        self.wrapped_bar.update_config(config)

    def _log_to_aim(self, stats, tag=None, step=None):
        """Track every entry of *stats* except ``num_updates`` on the run.

        Args:
            stats: mapping of metric name to value.
            tag: optional label; tags containing ``"train"`` or ``"val"``
                additionally set a ``subset`` entry in the Aim context.
            step: step to record; defaults to ``stats["num_updates"]``.
        """
        if self.run is None:
            return

        if step is None:
            step = stats["num_updates"]

        # ``tag`` defaults to None, and a substring test against None raises
        # TypeError; test against an empty string instead. The context still
        # carries the caller's original ``tag`` value.
        tag_text = tag or ""
        if "train" in tag_text:
            context = {"tag": tag, "subset": "train"}
        elif "val" in tag_text:
            context = {"tag": tag, "subset": "val"}
        else:
            context = {"tag": tag}

        for key in stats.keys() - {"num_updates"}:
            self.run.track(stats[key], name=key, step=step, context=context)
class WandBProgressBarWrapper(BaseProgressBar):
    """Send stats to Weights & Biases, then delegate to the wrapped bar."""

    def __init__(self, wrapped_bar, wandb_project, run_name=None):
        self.wrapped_bar = wrapped_bar
        if wandb is not None:
            # reinit=False to ensure if wandb.init() is called multiple times
            # within one process it still references the same run
            wandb.init(project=wandb_project, reinit=False, name=run_name)
        else:
            logger.warning("wandb not found, pip install wandb")

    def __iter__(self):
        return iter(self.wrapped_bar)

    def log(self, stats, tag=None, step=None):
        """Log intermediate stats to Weights & Biases."""
        self._log_to_wandb(stats, tag, step)
        self.wrapped_bar.log(stats, tag=tag, step=step)

    def print(self, stats, tag=None, step=None):
        """Log end-of-epoch stats to Weights & Biases and print them."""
        self._log_to_wandb(stats, tag, step)
        self.wrapped_bar.print(stats, tag=tag, step=step)

    def update_config(self, config):
        """Push the latest configuration to wandb and the wrapped bar."""
        if wandb is not None:
            wandb.config.update(config)
        self.wrapped_bar.update_config(config)

    def _log_to_wandb(self, stats, tag=None, step=None):
        """Log each numeric entry of *stats* except ``num_updates``.

        ``AverageMeter`` values are logged through their ``val`` attribute,
        plain numbers as-is; every other value type is skipped. Metric names
        are prefixed with ``"<tag>/"`` when *tag* is given. *step* defaults
        to ``stats["num_updates"]``.
        """
        if wandb is None:
            return
        if step is None:
            step = stats["num_updates"]

        prefix = "" if tag is None else tag + "/"

        for name in stats.keys() - {"num_updates"}:
            value = stats[name]
            if isinstance(value, AverageMeter):
                value = value.val
            elif not isinstance(value, Number):
                continue
            wandb.log({prefix + name: value}, step=step)
class AzureMLProgressBarWrapper(BaseProgressBar):
    """Send stats to Azure ML, then delegate to the wrapped bar."""

    def __init__(self, wrapped_bar):
        self.wrapped_bar = wrapped_bar
        if Run is not None:
            self.run = Run.get_context()
        else:
            # azureml.core is unavailable; ``self.run`` is left unset and
            # every Azure code path below is guarded by ``Run is None``.
            logger.warning("azureml.core not found, pip install azureml-core")

    def __exit__(self, *exc):
        # Mark the run complete on context exit; exceptions still propagate.
        if Run is not None:
            self.run.complete()
        return False

    def __iter__(self):
        return iter(self.wrapped_bar)

    def log(self, stats, tag=None, step=None):
        """Log intermediate stats to AzureML."""
        self._log_to_azureml(stats, tag, step)
        self.wrapped_bar.log(stats, tag=tag, step=step)

    def print(self, stats, tag=None, step=None):
        """Log end-of-epoch stats to AzureML and print them."""
        self._log_to_azureml(stats, tag, step)
        self.wrapped_bar.print(stats, tag=tag, step=step)

    def update_config(self, config):
        """Forward the latest configuration to the wrapped bar."""
        self.wrapped_bar.update_config(config)

    def _log_to_azureml(self, stats, tag=None, step=None):
        """Log each numeric entry of *stats* except ``num_updates`` as a row.

        ``AverageMeter`` values are logged through ``val``, plain numbers
        as-is, anything else is skipped. The row name is ``"<tag>/<key>"``
        when *tag* is given. *step* defaults to ``stats["num_updates"]``.
        """
        if Run is None:
            return
        if step is None:
            step = stats["num_updates"]

        prefix = "" if tag is None else tag + "/"

        for key in stats.keys() - {"num_updates"}:
            value = stats[key]
            if isinstance(value, AverageMeter):
                value = value.val
            elif not isinstance(value, Number):
                continue
            name = prefix + key
            self.run.log_row(name=name, **{"step": step, key: value})
@register_criterion("vocab_parallel_cross_entropy")
class VocabParallelCrossEntropyCriterion(FairseqCriterion):
    """Cross-entropy criterion built on ``vocab_parallel_cross_entropy``."""

    def __init__(self, task, sentence_avg):
        super().__init__(task)
        self.sentence_avg = sentence_avg
        if has_megatron_submodule:
            return
        # The megatron import at module load failed; refuse to construct.
        raise ImportError(
            "\n\nPlease install the megatron submodule:"
            "\n\n git submodule update --init "
            "fairseq/model_parallel/megatron"
        )

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """
        net_output = model(**sample["net_input"])
        target = sample["target"]

        token_losses = vocab_parallel_cross_entropy(net_output[0].float(), target)
        # Zero out the positions whose target is the padding index, then sum.
        not_padding = target != self.padding_idx
        loss = (token_losses * not_padding).sum()

        nsentences = target.size(0)
        if self.sentence_avg:
            sample_size = nsentences
        else:
            sample_size = sample["ntokens"]

        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": nsentences,
            "sample_size": sample_size,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
        """Aggregate logging outputs from data parallel training."""

        def _total(field):
            # Sum one field over all workers' outputs, missing entries as 0.
            return sum(log.get(field, 0) for log in logging_outputs)

        loss_sum = _total("loss")
        ntokens = _total("ntokens")
        sample_size = _total("sample_size")

        # Dividing by log(2) reports the loss in base 2.
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
        if sample_size == ntokens:
            metrics.log_derived(
                "ppl", lambda meters: utils.get_perplexity(meters["loss"].avg)
            )
            return

        # The loss is not normalised per token here, so also log a per-token
        # loss and derive perplexity from that.
        metrics.log_scalar(
            "nll_loss", loss_sum / ntokens / math.log(2), ntokens, round=3
        )
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
        )

    @staticmethod
    def logging_outputs_can_be_summed() -> bool:
        """
        Whether the logging outputs returned by `forward` can be summed
        across workers prior to calling `reduce_metrics`. Setting this
        to True improves distributed training speed.
        """
        return True
class MegatronTrainer(Trainer):
    """Main class for model parallel with data parallel training."""

    def __init__(self, cfg: FairseqConfig, task, model, criterion, **kwargs):
        if not has_megatron_submodule:
            # The megatron import at module load failed; refuse to construct.
            raise ImportError(
                "\n\nPlease install the megatron submodule:"
                "\n\n git submodule update --init "
                "fairseq/model_parallel/megatron"
            )
        super().__init__(cfg, task, model, criterion, **kwargs)

    def clip_grad_norm(self, clip_norm):
        """Clip gradients, combining the norm across the model-parallel group."""

        def _aggregate_model_parallel_grad_norm(total_norm):
            # Square the local norm, all-reduce it over the model-parallel
            # group, then take the square root of the reduced value.
            squared = total_norm**2
            distributed_utils.all_reduce(
                squared, group=distributed_utils.get_model_parallel_group()
            )
            return squared**0.5

        return self.optimizer.clip_grad_norm(
            clip_norm,
            aggregate_norm_fn=_aggregate_model_parallel_grad_norm,
        )

    def save_checkpoint(self, filename, extra_state):
        """Save all training state in a checkpoint file."""
        # Store the CUDA RNG tracker states alongside the regular state.
        tracker = get_cuda_rng_tracker()
        extra_state["rng_tracker_states"] = tracker.get_states()
        super().save_checkpoint(filename, extra_state)

    def load_checkpoint(
        self,
        filename,
        reset_optimizer=False,
        reset_lr_scheduler=False,
        optimizer_overrides=None,
        reset_meters=False,
    ):
        """Load a checkpoint and restore any saved RNG tracker states.

        Returns whatever the parent ``load_checkpoint`` returns.
        """
        restored = super().load_checkpoint(
            filename,
            reset_optimizer=reset_optimizer,
            reset_lr_scheduler=reset_lr_scheduler,
            optimizer_overrides=optimizer_overrides,
            reset_meters=reset_meters,
        )
        has_rng_states = restored is not None and "rng_tracker_states" in restored
        if has_rng_states:
            get_cuda_rng_tracker().set_states(restored["rng_tracker_states"])
        return restored
# Automatically import every model module or package in this directory.
# Entries starting with "_" or "." are skipped; "*.py" files are imported by
# the name before ".py", directories by their own name.
models_dir = os.path.dirname(__file__)
for file in os.listdir(models_dir):
    path = os.path.join(models_dir, file)
    if file.startswith("_") or file.startswith("."):
        continue
    if file.endswith(".py"):
        model_name = file[: file.find(".py")]
    elif os.path.isdir(path):
        model_name = file
    else:
        # Neither a Python source file nor a directory: nothing to import.
        continue
    module = importlib.import_module("fairseq.model_parallel.models." + model_name)
class TransformerEncoderEmbedding(nn.Module):
    """Encoder Embedding + Positional Embedding"""

    def __init__(self, args, embed_tokens):
        """Build the embedding stage.

        Args:
            args: namespace providing ``dropout``, ``max_source_positions``,
                ``no_token_positional_embeddings`` and, when positional
                embeddings are enabled, ``encoder_learned_pos``; an optional
                ``layernorm_embedding`` flag adds a LayerNorm.
            embed_tokens: a single embedding module, or an ``nn.ModuleList``
                of embedding modules whose outputs are concatenated along
                the last dimension.
        """
        super().__init__()
        self.dropout = args.dropout
        self.max_source_positions = args.max_source_positions
        self.embed_tokens = embed_tokens

        if isinstance(embed_tokens, nn.ModuleList):
            # The padding index comes from the first table; the total width
            # is the sum of the individual embedding widths.
            self.padding_idx = embed_tokens[0].padding_idx
            embed_dim = sum(table.embedding_dim for table in embed_tokens)
        else:
            self.padding_idx = embed_tokens.padding_idx
            embed_dim = embed_tokens.embedding_dim
        self.embed_scale = math.sqrt(embed_dim)

        if args.no_token_positional_embeddings:
            self.embed_positions = None
        else:
            self.embed_positions = PositionalEmbedding(
                args.max_source_positions,
                embed_dim,
                self.padding_idx,
                learned=args.encoder_learned_pos,
            )

        use_layernorm = getattr(args, "layernorm_embedding", False)
        self.layernorm_embedding = LayerNorm(embed_dim) if use_layernorm else None

    def forward(self, input):
        """Embed ``input[0]`` and pass ``input[2]`` through untouched.

        Returns:
            tuple ``(x, encoder_padding_mask, prev_output_tokens)`` where
            ``x`` is the embedded input with its first two dimensions
            swapped and the mask marks positions equal to ``padding_idx``.
        """
        src_tokens = input[0]
        prev_output_tokens = input[2]

        # embed tokens and positions
        if isinstance(self.embed_tokens, nn.ModuleList):
            pieces = [table(src_tokens) for table in self.embed_tokens]
            embedded = torch.cat(pieces, dim=-1)
        else:
            embedded = self.embed_tokens(src_tokens)

        x = self.embed_scale * embedded
        if self.embed_positions is not None:
            x = x + self.embed_positions(src_tokens)
        if self.layernorm_embedding:
            x = self.layernorm_embedding(x)
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # compute padding mask
        encoder_padding_mask = src_tokens.eq(self.padding_idx)
        return (x, encoder_padding_mask, prev_output_tokens)
class TransformerDecoderOutputLayer(nn.Module):
    """Final decoder stage: optional LayerNorm, projection and vocabulary logits."""

    def __init__(self, args, embed_tokens, dictionary):
        """Build the output stage.

        Args:
            args: namespace with the ``decoder_*``, ``adaptive_softmax_*``,
                ``tie_adaptive_*`` and ``share_decoder_input_output_embed``
                settings read below.
            embed_tokens: input embedding module (or ``nn.ModuleList`` of
                them) reused as output weights when embeddings are shared.
            dictionary: only ``len(dictionary)`` is used, as the output
                vocabulary size.
        """
        super().__init__()
        self.share_input_output_embed = args.share_decoder_input_output_embed
        self.embed_tokens = embed_tokens
        self.output_embed_dim = args.decoder_output_dim
        embed_dim = args.decoder_embed_dim

        needs_projection = (
            embed_dim != self.output_embed_dim and not args.tie_adaptive_weights
        )
        if needs_projection:
            self.project_out_dim = Linear(embed_dim, self.output_embed_dim, bias=False)
        else:
            self.project_out_dim = None

        self.adaptive_softmax = None
        if args.adaptive_softmax_cutoff is not None:
            assert not isinstance(embed_tokens, nn.ModuleList)
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                self.output_embed_dim,
                options.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif not self.share_input_output_embed:
            # Untied output weights: replace the module reference with a
            # freshly initialised parameter matrix.
            weight = torch.Tensor(len(dictionary), self.output_embed_dim)
            self.embed_tokens = nn.Parameter(weight)
            nn.init.normal_(
                self.embed_tokens, mean=0, std=self.output_embed_dim**-0.5
            )

        final_norm = args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False
        )
        if final_norm:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

    def forward(self, input, apply_final_proj=True):
        """Normalise, transpose and optionally project to the vocabulary.

        *input* is either the feature tensor itself or a tuple whose first
        element is that tensor.
        """
        x = input[0] if isinstance(input, tuple) else input

        if self.layer_norm:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return self.output_layer(x) if apply_final_proj else x

    def output_layer(self, features, **kwargs):
        """Project features to the vocabulary size."""
        if self.adaptive_softmax is not None:
            # With adaptive softmax the features are returned unprojected.
            return features

        # project back to size of vocabulary
        if not self.share_input_output_embed:
            return F.linear(features, self.embed_tokens)

        if not isinstance(self.embed_tokens, nn.ModuleList):
            return F.linear(features, self.embed_tokens.weight)

        # Shared embeddings given as a list: each table projects its own
        # slice of the feature dimension and the partial logits are summed.
        output = None
        for i, emb in enumerate(self.embed_tokens):
            sidx = i * emb.embedding_dim
            eidx = (i + 1) * emb.embedding_dim
            partial = F.linear(features[:, :, sidx:eidx], emb.weight)
            if output is None:
                output = partial
            else:
                output += partial
        return output
def upgrade_state_dict_named(self, state_dict, name):
    """
    Rename layer norm states from `...layer_norms.0.weight` to
    `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to
    `...final_layer_norm.weight`

    The same renaming is applied to the ``bias`` entries. *state_dict* is
    modified in place; keys that are absent are simply skipped.
    """
    renames = (("0", "self_attn_layer_norm"), ("1", "final_layer_norm"))
    for old_index, new_attr in renames:
        for param in ("weight", "bias"):
            old_key = "{}.layer_norms.{}.{}".format(name, old_index, param)
            if old_key not in state_dict:
                continue
            new_key = "{}.{}.{}".format(name, new_attr, param)
            # Move the value: insert under the new key, drop the old one.
            state_dict[new_key] = state_dict[old_key]
            del state_dict[old_key]
def maybe_layer_norm(self, layer_norm, x, before=False, after=False):
    """Apply *layer_norm* to *x* only at the configured position.

    Exactly one of *before* / *after* must be true. With
    ``self.normalize_before`` set, the norm runs on the ``before`` call;
    otherwise it runs on the ``after`` call. In the other case *x* is
    returned unchanged.
    """
    assert before ^ after
    apply_norm = after ^ self.normalize_before
    return layer_norm(x) if apply_norm else x
def __init__(
    self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
):
    """Build the decoder layer's attention, feed-forward and norm modules.

    Args:
        args: namespace with the ``decoder_*``, ``dropout`` and
            ``attention_dropout`` settings read below.
        no_encoder_attn: when true, no encoder attention (and no layer
            norm for it) is created.
        add_bias_kv, add_zero_attn: forwarded to the self-attention module.
    """
    super().__init__()
    embed_dim = args.decoder_embed_dim
    self.embed_dim = embed_dim
    self.self_attn = MultiheadAttention(
        embed_dim=embed_dim,
        num_heads=args.decoder_attention_heads,
        dropout=args.attention_dropout,
        add_bias_kv=add_bias_kv,
        add_zero_attn=add_zero_attn,
        self_attention=True,
    )
    self.dropout = args.dropout
    self.activation_fn = utils.get_activation_fn(
        activation=getattr(args, "activation_fn", "relu")
    )

    activation_dropout = getattr(args, "activation_dropout", 0)
    if activation_dropout == 0:
        # for backwards compatibility with models that use args.relu_dropout
        activation_dropout = getattr(args, "relu_dropout", 0)
    self.activation_dropout = activation_dropout
    self.normalize_before = args.decoder_normalize_before

    # use layerNorm rather than FusedLayerNorm for exporting.
    # char_inputs can be used to determine this.
    # TODO remove this once we update apex with the fix
    export = getattr(args, "char_inputs", False)
    self.self_attn_layer_norm = LayerNorm(embed_dim, export=export)

    if no_encoder_attn:
        self.encoder_attn = None
        self.encoder_attn_layer_norm = None
    else:
        # Keys and values use the encoder width when args defines one.
        encoder_dim = getattr(args, "encoder_embed_dim", None)
        self.encoder_attn = MultiheadAttention(
            embed_dim,
            args.decoder_attention_heads,
            kdim=encoder_dim,
            vdim=encoder_dim,
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
        )
        self.encoder_attn_layer_norm = LayerNorm(embed_dim, export=export)

    ffn_dim = args.decoder_ffn_embed_dim
    self.fc1 = Linear(embed_dim, ffn_dim)
    self.fc2 = Linear(ffn_dim, embed_dim)

    self.final_layer_norm = LayerNorm(embed_dim, export=export)
    self.need_attn = True

    self.onnx_trace = False
binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. Returns: output (Tuple): output[0] (Tensor): encoded output of shape `(batch, src_len, embed_dim)` output[1] (ByteTensor/FloatTensor): encoder padding mask output[2] (LongTensor): previous decoder outputs """ # Note: incremental state is not yet supported mt_task = False if isinstance(input, tuple): x = input[0] encoder_out = input[1] encoder_padding_mask = input[2] incremental_state = None mt_task = True else: x = input encoder_out = None encoder_padding_mask = None incremental_state = None if incremental_state is None: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None # TODO: add back prev_self_attn_state, prev_attn_state, # self_attn_padding_mask prev_self_attn_state = None prev_attn_state = None self_attn_padding_mask = None residual = x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True) if prev_self_attn_state is not None: if incremental_state is None: incremental_state = {} prev_key, prev_value = prev_self_attn_state saved_state = {"prev_key": prev_key, "prev_value": prev_value} self.self_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.self_attn( query=x, key=x, value=x, key_padding_mask=self_attn_padding_mask, incremental_state=incremental_state, need_weights=False, attn_mask=self_attn_mask, ) x = F.dropout(x, p=self.dropout, training=self.training) x = residual + x x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True) if self.encoder_attn is not None: residual = x x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True) if prev_attn_state is not None: if incremental_state is None: incremental_state = {} prev_key, prev_value = prev_attn_state saved_state = {"prev_key": prev_key, "prev_value": prev_value} self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, 
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with fairseq-style initialisation.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation ``embedding_dim ** -0.5``; the row at ``padding_idx`` is then
    set to zero.

    Args:
        num_embeddings (int): number of rows in the table.
        embedding_dim (int): size of each embedding vector.
        padding_idx (int or None): index of the padding row. ``None`` means
            there is no padding row.

    Returns:
        nn.Embedding: the initialised module.
    """
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
    # ``m.weight[None]`` is a view of the *whole* matrix, so zeroing it would
    # wipe every embedding; only zero a row when one is actually designated.
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m
Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import torch import torch.nn as nn import torch.nn.functional as F from fairseq import utils from fairseq.model_parallel.models.pipeline_parallel_transformer.layers import ( Embedding, TransformerDecoderEmbedding, TransformerDecoderLayer, TransformerDecoderOutputLayer, TransformerEncoderEmbedding, TransformerEncoderLayer, TransformerEncoderLayerNorm, ) from fairseq.models import ( BaseFairseqModel, FairseqDecoder, FairseqEncoder, register_model, register_model_architecture, ) from fairseq.models.fairseq_encoder import EncoderOut from fairseq.models.transformer import ( base_architecture, transformer_iwslt_de_en, transformer_wmt_en_de_big, ) from fairseq.modules import SinusoidalPositionalEmbedding logger = logging.getLogger(__name__) DEFAULT_MAX_SOURCE_POSITIONS = 1024 DEFAULT_MAX_TARGET_POSITIONS = 1024 TORCH_PIPE = False RPC_INIT = False def import_pipe(): global TORCH_PIPE global RPC_INIT try: from torch.distributed.pipeline.sync import Pipe # noqa global Pipe from torch.distributed.pipeline.sync.utils import partition_model global partition_model from torch.distributed import rpc import tempfile TORCH_PIPE = True # Initialize single process RPC agent since TORCH_PIPE requires # RRef. RRef depends on RPC being initialized and as a result we initialize # RPC with a single node. 
def forward(self, src_tokens, src_lengths, prev_output_tokens):
    """Run the model on a batch.

    In training mode the three inputs are moved to the first pipeline
    device and pushed through the single ``Pipe`` holding encoder and
    decoder modules. In evaluation mode the separate ``encoder`` and
    ``decoder`` created by ``prepare_for_inference_()`` are used instead.

    Args:
        src_tokens: source tokens, passed to the first pipeline stage /
            the encoder.
        src_lengths: source lengths, passed alongside ``src_tokens``.
        prev_output_tokens: previous decoder outputs.

    Returns:
        The pipeline output in training mode; in evaluation mode, whatever
        ``self.decoder`` returns.

    Raises:
        AssertionError: in evaluation mode if ``prepare_for_inference_()``
            has not populated ``self.encoder`` and ``self.decoder``.
    """
    if self.training:
        first_device = self.devices[0]
        pipe_input = tuple(
            t.to(first_device, non_blocking=True)
            for t in (src_tokens, src_lengths, prev_output_tokens)
        )
        if TORCH_PIPE:
            # torch's Pipe returns an RRef; fetch the value held locally.
            return self.model(pipe_input).local_value()
        return self.model(pipe_input)

    assert self.encoder is not None and self.decoder is not None, (
        "encoder and decoder need to be initialized by "
        + "calling the `prepare_for_inference_()` method"
    )
    # The previous code passed the (unassigned) training-branch variable
    # `input` here, which raised UnboundLocalError. Call the encoder and
    # decoder with the arguments their `forward` methods declare.
    encoder_out = self.encoder(src_tokens, src_lengths)
    return self.decoder(prev_output_tokens, encoder_out)
parser.add_argument('--encoder-attention-heads', type=int, metavar='N', help='num encoder attention heads') parser.add_argument('--encoder-normalize-before', action='store_true', help='apply layernorm before each encoder block') parser.add_argument('--encoder-learned-pos', action='store_true', help='use learned positional embeddings in the encoder') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-ffn-embed-dim', type=int, metavar='N', help='decoder embedding dimension for FFN') parser.add_argument('--decoder-layers', type=int, metavar='N', help='num decoder layers') parser.add_argument('--decoder-attention-heads', type=int, metavar='N', help='num decoder attention heads') parser.add_argument('--decoder-learned-pos', action='store_true', help='use learned positional embeddings in the decoder') parser.add_argument('--decoder-normalize-before', action='store_true', help='apply layernorm before each decoder block') parser.add_argument('--share-decoder-input-output-embed', action='store_true', help='share decoder input and output embeddings') parser.add_argument('--share-all-embeddings', action='store_true', help='share encoder, decoder and output embeddings' ' (requires shared dictionary and embed dim)') parser.add_argument('--no-token-positional-embeddings', default=False, action='store_true', help='if set, disables positional embeddings (outside self attention)') parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR', help='comma separated list of adaptive softmax cutoff points. 
@classmethod
def build_model_base(cls, args, task):
    """Build the encoder and decoder used to assemble the pipeline model.

    Args:
        args: parsed arguments; missing values are filled in by
            ``base_architecture`` and the default max-position constants.
        task: object exposing ``source_dictionary`` and
            ``target_dictionary``.

    Returns:
        tuple: ``(encoder, decoder)`` built via ``cls.build_encoder`` and
        ``cls.build_decoder``.

    Raises:
        ValueError: if ``--share-all-embeddings`` is set but the
            dictionaries, embedding dimensions or embedding paths are
            incompatible.
        AssertionError: if the embedding-chunk settings are unsupported.
    """
    # make sure all arguments are present in older models
    base_architecture(args)

    if not hasattr(args, "max_source_positions"):
        args.max_source_positions = DEFAULT_MAX_SOURCE_POSITIONS
    if not hasattr(args, "max_target_positions"):
        args.max_target_positions = DEFAULT_MAX_TARGET_POSITIONS

    src_dict, tgt_dict = task.source_dictionary, task.target_dictionary

    def build_embedding(dictionary, embed_dim, path=None, num_embed_chunks=1):
        # Each chunk gets ``embed_dim // num_embed_chunks`` columns, so the
        # embedding dimension must be a multiple of the chunk count.
        assert embed_dim % num_embed_chunks == 0, (
            f"Embedding dimension = {embed_dim} should be divisible by the "
            + f"number of embedding chunks = {num_embed_chunks}"
        )
        assert path is None or num_embed_chunks == 1, (
            "Loading embedding from a path with number of embedding chunks > 1"
            + " is not yet supported"
        )
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()
        # if provided, load from preloaded dictionaries
        if path:
            emb = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(path)
            utils.load_embedding(embed_dict, dictionary, emb)
        else:
            embed_chunk_dim = embed_dim // num_embed_chunks
            emb = nn.ModuleList()
            for _ in range(num_embed_chunks):
                emb.append(Embedding(num_embeddings, embed_chunk_dim, padding_idx))
        return emb

    num_embed_chunks = args.num_embedding_chunks
    if args.share_all_embeddings:
        if src_dict != tgt_dict:
            raise ValueError("--share-all-embeddings requires a joined dictionary")
        if args.encoder_embed_dim != args.decoder_embed_dim:
            raise ValueError(
                "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
            )
        if args.decoder_embed_path and (
            args.decoder_embed_path != args.encoder_embed_path
        ):
            raise ValueError(
                "--share-all-embeddings not compatible with --decoder-embed-path"
            )
        encoder_embed_tokens = build_embedding(
            src_dict,
            args.encoder_embed_dim,
            args.encoder_embed_path,
            num_embed_chunks,
        )
        # One table serves encoder input, decoder input and decoder output.
        decoder_embed_tokens = encoder_embed_tokens
        args.share_decoder_input_output_embed = True
    else:
        assert args.share_decoder_input_output_embed or num_embed_chunks == 1, (
            "Not sharing decoder I/O embeddings is not yet supported with number of "
            + "embedding chunks > 1"
        )
        encoder_embed_tokens = build_embedding(
            src_dict,
            args.encoder_embed_dim,
            args.encoder_embed_path,
            num_embed_chunks,
        )
        decoder_embed_tokens = build_embedding(
            tgt_dict,
            args.decoder_embed_dim,
            args.decoder_embed_path,
            num_embed_chunks,
        )

    encoder = cls.build_encoder(args, src_dict, encoder_embed_tokens)
    decoder = cls.build_decoder(args, tgt_dict, decoder_embed_tokens)
    return (encoder, decoder)
def get_normalized_probs(self, net_output, log_probs, sample=None):
    """Turn a network output into (log-)probabilities.

    Args:
        net_output: a tensor of logits, or a sequence whose first element
            is that tensor.
        log_probs (bool): return log-probabilities when true, otherwise
            probabilities.
        sample (dict, optional): when given, must contain ``"target"``;
            only consulted on the adaptive-softmax path.

    Returns:
        The normalized scores.
    """
    adaptive = getattr(self, "adaptive_softmax", None)
    if adaptive is not None:
        target = None
        if sample is not None:
            assert "target" in sample
            target = sample["target"]
        scores = adaptive.get_log_prob(net_output, target=target)
        if log_probs:
            return scores
        return scores.exp_()

    # A Pipe() module returns a tuple of tensors as the output.
    # In this case, the tuple has one element - the output tensor of logits
    if isinstance(net_output, torch.Tensor):
        logits = net_output
    else:
        logits = net_output[0]

    normalize = utils.log_softmax if log_probs else utils.softmax
    return normalize(logits, dim=-1, onnx_trace=False)
def convert_to_pipeline_parallel_state_dict(self, state_dict):
    """Re-key a regular transformer state dict for the partitioned model.

    Starts from this model's own ``state_dict()`` and overwrites the
    entries of every recognised module in ``self.model.partitions`` with
    the matching ``encoder.*`` / ``decoder.*`` tensors from *state_dict*.

    Args:
        state_dict (dict): checkpoint keyed like a non-pipelined
            transformer (``encoder.layers.N...``, ``decoder.layers.N...``).

    Returns:
        dict: state dict keyed as ``model.partitions.<pid>.<mid>.<param>``.
    """

    def attention_suffixes(attn_name):
        # Projection weights/biases followed by the attention's layer norm.
        names = [
            f"{attn_name}.{proj}.{param}"
            for proj in ("k_proj", "v_proj", "q_proj", "out_proj")
            for param in ("weight", "bias")
        ]
        names.append(f"{attn_name}_layer_norm.weight")
        names.append(f"{attn_name}_layer_norm.bias")
        return names

    ffn_suffixes = [
        "fc1.weight",
        "fc1.bias",
        "fc2.weight",
        "fc2.bias",
        "final_layer_norm.weight",
        "final_layer_norm.bias",
    ]
    encoder_key_suffixes = attention_suffixes("self_attn") + ffn_suffixes
    decoder_key_suffixes = (
        attention_suffixes("self_attn")
        + attention_suffixes("encoder_attn")
        + ffn_suffixes
    )

    converted = self.state_dict()
    next_encoder_layer = 0
    next_decoder_layer = 0
    for pid, partition in enumerate(self.model.partitions):
        logger.info(f"Begin Partition {pid}")
        for mid, module in enumerate(partition):
            dst = f"model.partitions.{pid}.{mid}"

            if isinstance(module, TransformerEncoderEmbedding):
                converted[f"{dst}.embed_tokens.weight"] = state_dict[
                    "encoder.embed_tokens.weight"
                ]

            if isinstance(module, TransformerEncoderLayer):
                src = f"encoder.layers.{next_encoder_layer}"
                for suffix in encoder_key_suffixes:
                    converted[f"{dst}.{suffix}"] = state_dict[f"{src}.{suffix}"]
                next_encoder_layer += 1

            if isinstance(module, TransformerDecoderLayer):
                src = f"decoder.layers.{next_decoder_layer}"
                for suffix in decoder_key_suffixes:
                    converted[f"{dst}.{suffix}"] = state_dict[f"{src}.{suffix}"]
                next_decoder_layer += 1

            if isinstance(module, TransformerEncoderLayerNorm):
                # Only copied when the checkpoint has a final encoder norm.
                if "encoder.layer_norm.weight" in state_dict:
                    converted[f"{dst}.layer_norm.weight"] = state_dict[
                        "encoder.layer_norm.weight"
                    ]
                    converted[f"{dst}.layer_norm.bias"] = state_dict[
                        "encoder.layer_norm.bias"
                    ]

            if isinstance(module, TransformerDecoderEmbedding):
                converted[f"{dst}.embed_tokens.weight"] = state_dict[
                    "decoder.embed_tokens.weight"
                ]

            if isinstance(module, TransformerDecoderOutputLayer):
                converted[f"{dst}.output_projection.weight"] = state_dict[
                    "decoder.output_projection.weight"
                ]
    return converted
def __init__(self, args, dictionary, embed_tokens, encoder_module_list=None):
    """Build the encoder either from scratch or from existing modules.

    Args:
        args: configuration object. Without *encoder_module_list* it
            supplies ``encoder_layers``; with it, the
            ``pipeline_encoder_balance``, ``pipeline_encoder_devices``,
            ``pipeline_chunks`` and ``pipeline_checkpoint`` settings.
        dictionary: passed to the base class constructor.
        embed_tokens: input embedding, or an ``nn.ModuleList`` of embedding
            chunks (unused when *encoder_module_list* is given).
        encoder_module_list (list, optional): already-built modules to wrap
            in a ``Pipe``; switches the encoder to pipeline mode.
    """
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    import_pipe()
    self.use_pipeline = encoder_module_list is not None

    if self.use_pipeline:
        balance = utils.eval_str_list(args.pipeline_encoder_balance, type=int)
        devices = utils.eval_str_list(args.pipeline_encoder_devices, type=int)
        assert sum(balance) == len(encoder_module_list), (
            f"Sum of encoder_balance={balance} is not equal "
            + f"to num_encoder_modules={len(encoder_module_list)}"
        )
        stack = nn.Sequential(*encoder_module_list)
        if TORCH_PIPE:
            self.model = Pipe(
                module=partition_model(stack, balance, devices),
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        else:
            self.model = Pipe(
                module=stack,
                balance=balance,
                devices=devices,
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        return

    self.embedding_layer = TransformerEncoderEmbedding(args, embed_tokens)
    layers = [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
    self.encoder_layers = nn.Sequential(*layers)
    # With chunked embeddings the full width is the sum over the chunks.
    if isinstance(embed_tokens, nn.ModuleList):
        emb_dim = sum(chunk.embedding_dim for chunk in embed_tokens)
    else:
        emb_dim = embed_tokens.embedding_dim
    self.final_layer_norm = TransformerEncoderLayerNorm(args, emb_dim)
def reorder_encoder_out(self, encoder_out, new_order):
    """
    Reorder encoder output according to *new_order*.

    ``encoder_out`` and every entry of ``encoder_states`` are selected
    along dimension 1; ``encoder_padding_mask`` and ``encoder_embedding``
    along dimension 0. Fields that are ``None`` are left untouched.

    Args:
        encoder_out: output from the ``forward()`` method
        new_order (LongTensor): desired order

    Returns:
        *encoder_out* rearranged according to *new_order*
    """
    replacements = {}
    if encoder_out.encoder_out is not None:
        replacements["encoder_out"] = encoder_out.encoder_out.index_select(
            1, new_order
        )
    if encoder_out.encoder_padding_mask is not None:
        replacements[
            "encoder_padding_mask"
        ] = encoder_out.encoder_padding_mask.index_select(0, new_order)
    if encoder_out.encoder_embedding is not None:
        replacements[
            "encoder_embedding"
        ] = encoder_out.encoder_embedding.index_select(0, new_order)
    if replacements:
        encoder_out = encoder_out._replace(**replacements)

    states = encoder_out.encoder_states
    if states is not None:
        # The list is updated in place, element by element.
        for position, hidden in enumerate(states):
            states[position] = hidden.index_select(1, new_order)
    return encoder_out
def __init__(
    self,
    args,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    decoder_module_list=None,
):
    """Build the decoder either from scratch or from existing modules.

    Args:
        args: configuration object. Without *decoder_module_list* it
            supplies ``decoder_layers``; with it, the
            ``pipeline_decoder_balance``, ``pipeline_decoder_devices``,
            ``pipeline_chunks`` and ``pipeline_checkpoint`` settings.
        dictionary: passed to the base class constructor and to the output
            layer.
        embed_tokens: embedding handed to the embedding and output layers
            (unused when *decoder_module_list* is given).
        no_encoder_attn (bool, optional): forwarded to every decoder layer.
        decoder_module_list (list, optional): already-built modules to wrap
            in a ``Pipe``; switches the decoder to pipeline mode.
    """
    super().__init__(dictionary)
    self.register_buffer("version", torch.Tensor([3]))
    import_pipe()
    self.use_pipeline = decoder_module_list is not None

    if self.use_pipeline:
        balance = utils.eval_str_list(args.pipeline_decoder_balance, type=int)
        devices = utils.eval_str_list(args.pipeline_decoder_devices, type=int)
        assert sum(balance) == len(decoder_module_list), (
            f"Sum of decoder_balance={balance} is not equal "
            + f"to num_decoder_modules={len(decoder_module_list)}"
        )
        stack = nn.Sequential(*decoder_module_list)
        if TORCH_PIPE:
            self.model = Pipe(
                module=partition_model(stack, balance, devices),
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        else:
            self.model = Pipe(
                module=stack,
                balance=balance,
                devices=devices,
                chunks=args.pipeline_chunks,
                checkpoint=args.pipeline_checkpoint,
            )
        return

    self.embedding_layer = TransformerDecoderEmbedding(args, embed_tokens)
    layers = [
        TransformerDecoderLayer(args, no_encoder_attn)
        for _ in range(args.decoder_layers)
    ]
    self.decoder_layers = nn.Sequential(*layers)
    self.decoder_output_layer = TransformerDecoderOutputLayer(
        args, embed_tokens, dictionary
    )
def upgrade_state_dict_named(self, state_dict, name):
    """Upgrade a (possibly old) state dict for new versions of fairseq.

    Renames ``<name>.layers.<i>.layer_norms.<0|1|2>.*`` entries to their
    named layer-norm equivalents and, for checkpoints whose version is at
    most 2, disables the final layer norm and resets the stored version.

    Args:
        state_dict (dict): checkpoint to upgrade; modified in place.
        name (str): prefix under which this module's entries are stored.

    Returns:
        dict: the same *state_dict* object.
    """
    # This class keeps its layers in ``decoder_layers`` (and has none of
    # its own in pipeline mode); the previous ``self.layers`` lookup raised
    # AttributeError. ``layers`` is still honoured if a subclass defines it.
    layers = getattr(self, "layers", None)
    if layers is None:
        layers = getattr(self, "decoder_layers", ())

    # update layer norms
    layer_norm_map = {
        "0": "self_attn_layer_norm",
        "1": "encoder_attn_layer_norm",
        "2": "final_layer_norm",
    }
    for i in range(len(layers)):
        for old, new in layer_norm_map.items():
            for m in ("weight", "bias"):
                k = "{}.layers.{}.layer_norms.{}.{}".format(name, i, old, m)
                if k in state_dict:
                    state_dict[
                        "{}.layers.{}.{}.{}".format(name, i, new, m)
                    ] = state_dict[k]
                    del state_dict[k]

    version_key = "{}.version".format(name)
    if utils.item(state_dict.get(version_key, torch.Tensor([1]))[0]) <= 2:
        # earlier checkpoints did not normalize after the stack of layers
        self.layer_norm = None
        self.normalize = False
        state_dict[version_key] = torch.Tensor([1])

    return state_dict
@register_model("model_parallel_roberta")
class ModelParallelRobertaModel(RobertaModel):
    """RoBERTa model built on the model-parallel encoder and heads."""

    def __init__(self, args, encoder):
        super().__init__(args, encoder)

        self.classification_heads = nn.ModuleDict()

    @staticmethod
    def add_args(parser):
        """Add the RoBERTa arguments plus ``--no-final-layer-norm``."""
        RobertaModel.add_args(parser)
        parser.add_argument(
            "--no-final-layer-norm",
            action="store_true",
            help=(
                "don't add final layernorm (only applicable when "
                "--encoder-normalize-before=True)"
            ),
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Raises:
            NotImplementedError: if ``untie_weights_roberta`` is set.
        """

        # make sure all arguments are present
        base_architecture(args)

        # Both dictionaries are padded to a multiple of
        # ``8 * model_parallel_size`` before the encoder is created.
        task.source_dictionary.pad_to_multiple_(args.model_parallel_size * 8)
        task.target_dictionary.pad_to_multiple_(args.model_parallel_size * 8)

        if not hasattr(args, "max_positions"):
            args.max_positions = args.tokens_per_sample

        if getattr(args, "untie_weights_roberta", False):
            raise NotImplementedError(
                "--untie-weights-roberta is not supported in model parallel mode"
            )

        encoder = ModelParallelRobertaEncoder(args, task.source_dictionary)
        return cls(args, encoder)

    def forward(
        self,
        src_tokens,
        features_only=False,
        return_all_hiddens=False,
        classification_head_name=None,
        **kwargs
    ):
        """Run the encoder and, optionally, a registered classification head.

        Naming a classification head forces ``features_only`` so the head
        receives encoder features.

        Returns:
            tuple: ``(x, extra)`` where ``x`` is the encoder output or the
            head output and ``extra`` is what the encoder returned
            alongside it.
        """
        if classification_head_name is not None:
            features_only = True

        x, extra = self.encoder(src_tokens, features_only, return_all_hiddens, **kwargs)

        if classification_head_name is not None:
            x = self.classification_heads[classification_head_name](x)
        return x, extra

    def register_classification_head(
        self, name, num_classes=None, inner_dim=None, **kwargs
    ):
        """Register a classification head.

        Registering an existing *name* replaces the head; a warning is
        logged when the class count or inner dimension differs.
        """
        if name in self.classification_heads:
            prev_num_classes = self.classification_heads[name].out_proj.out_features
            prev_inner_dim = self.classification_heads[name].dense.out_features
            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
                logger.warning(
                    're-registering head "{}" with num_classes {} (prev: {}) '
                    "and inner_dim {} (prev: {})".format(
                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
                    )
                )
        self.classification_heads[name] = ModelParallelRobertaClassificationHead(
            self.args.encoder_embed_dim,
            inner_dim or self.args.encoder_embed_dim,
            num_classes,
            self.args.pooler_activation_fn,
            self.args.pooler_dropout,
        )
class ModelParallelRobertaEncoder(RobertaEncoder):
    """RoBERTa encoder whose embedding, encoder stack and LM head are the
    model-parallel variants."""

    def __init__(self, args, dictionary):
        super().__init__(args, dictionary)
        # Untied output weights are not supported here. Read the flag with
        # a default, as ``build_model`` does, so an args object that never
        # defined it is treated as "tied" instead of raising AttributeError.
        assert not getattr(self.args, "untie_weights_roberta", False)

    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
        """Return a ``VocabParallelEmbedding`` for the token embeddings."""
        return VocabParallelEmbedding(vocab_size, embedding_dim, padding_idx)

    def build_encoder(self, args, dictionary, embed_tokens):
        """Return the model-parallel transformer encoder."""
        return ModelParallelTransformerEncoder(args, dictionary, embed_tokens)

    def build_lm_head(self, embed_dim, output_dim, activation_fn, weight):
        """Return the model-parallel masked-LM head."""
        return ModelParallelRobertaLMHead(embed_dim, output_dim, activation_fn, weight)
@register_model("model_parallel_transformer")
class ModelParallelTransformerModel(TransformerModel):
    """
    Model parallel Transformer model.
    """

    @classmethod
    def build_embedding(cls, args, dictionary, embed_dim, path=None):
        """Build a vocabulary-parallel embedding for ``dictionary``.

        The dictionary is padded in place to a multiple of
        ``args.model_parallel_size * 8`` before its size is read.

        Args:
            args: model arguments; ``model_parallel_size`` is read.
            dictionary: dictionary to embed (padded in place).
            embed_dim: embedding dimension.
            path: must be falsy; preloaded embeddings are not supported.

        Raises:
            ImportError: if the megatron submodule is not installed.
            NotImplementedError: if ``path`` is given.
        """
        if not has_megatron_submodule:
            raise ImportError(
                "\n\nPlease install the megatron submodule:"
                "\n\n git submodule update --init "
                "fairseq/model_parallel/megatron"
            )
        # Reject the unsupported option before mutating the dictionary or
        # allocating the embedding.
        if path:
            raise NotImplementedError(
                "Loading of embedding from path is not supported for model parallel"
            )

        dictionary.pad_to_multiple_(args.model_parallel_size * 8)
        num_embeddings = len(dictionary)
        padding_idx = dictionary.pad()

        def _vocab_init(tensor, **kwargs):
            nn.init.normal_(tensor, mean=0, std=num_embeddings**-0.5)
            # Zero the row of the actual padding symbol instead of assuming
            # it sits at index 1.
            # NOTE(review): assumes ``tensor`` is indexed by global vocabulary
            # id, as the previous hard-coded ``tensor[1]`` did -- confirm
            # against VocabParallelEmbedding's use of ``init_method``.
            nn.init.constant_(tensor[padding_idx], 0)

        return VocabParallelEmbedding(
            num_embeddings, embed_dim, padding_idx, init_method=_vocab_init
        )

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        """Return the model-parallel encoder."""
        return ModelParallelTransformerEncoder(args, src_dict, embed_tokens)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        """Return the model-parallel decoder.

        Encoder attention is disabled when ``args.no_cross_attention`` is set.
        """
        return ModelParallelTransformerDecoder(
            args,
            tgt_dict,
            embed_tokens,
            no_encoder_attn=getattr(args, "no_cross_attention", False),
        )
""" def build_decoder_layer(self, args, no_encoder_attn=False): return ModelParallelTransformerDecoderLayer(args, no_encoder_attn) def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" if not self.share_input_output_embed: raise NotImplementedError( "Model parallel training currently requires --share-decoder-input-output-embed" ) features = copy_to_model_parallel_region(features) # project back to size of vocabulary x = self.output_projection(features) if getattr(self.args, "criterion") != "vocab_parallel_cross_entropy": x = gather_from_model_parallel_region(x).contiguous() return x ================================================ FILE: fairseq/model_parallel/models/transformer_lm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch.nn as nn from fairseq.model_parallel.models.transformer import ModelParallelTransformerDecoder from fairseq.models import register_model, register_model_architecture from fairseq.models.transformer_lm import TransformerLanguageModel try: from fairseq.model_parallel.megatron.mpu import VocabParallelEmbedding has_megatron_submodule = True except (ImportError, ModuleNotFoundError): has_megatron_submodule = False DEFAULT_MAX_TARGET_POSITIONS = 1024 @register_model("model_parallel_transformer_lm") class ModelParallelTransformerLanguageModel(TransformerLanguageModel): @staticmethod def add_args(parser): TransformerLanguageModel.add_args(parser) @classmethod def build_model(cls, args, task): """Build a new model instance.""" if not has_megatron_submodule: raise ImportError( "\n\nPlease install the megatron submodule:" "\n\n git submodule update --init " "fairseq/model_parallel/megatron" ) # make sure all arguments are present in older models base_lm_architecture(args) 
def base_lm_architecture(args):
    """Fill in every language-model hyper-parameter missing from ``args``.

    Values already present on ``args`` are kept. The only exceptions are
    ``decoder_normalize_before``, which is always forced to ``True``, and the
    legacy-checkpoint fields translated in the first step.
    """

    def _fill(name, fallback):
        # Keep an existing value; otherwise install the fallback.
        setattr(args, name, getattr(args, name, fallback))

    # backward compatibility for older model checkpoints: the mere presence of
    # --no-tie-adaptive-proj marks an "old" checkpoint
    if hasattr(args, "no_tie_adaptive_proj"):
        args.no_decoder_final_norm = True  # old models always set this to True
        if args.no_tie_adaptive_proj is False:
            args.tie_adaptive_proj = True
    if hasattr(args, "decoder_final_norm"):
        args.no_decoder_final_norm = not args.decoder_final_norm

    for name, fallback in (
        ("activation_fn", "relu"),
        ("dropout", 0.1),
        ("attention_dropout", 0.0),
        ("activation_dropout", 0.0),
        ("relu_dropout", 0.0),
        ("decoder_embed_dim", 512),
    ):
        _fill(name, fallback)

    # These two fall back to whatever embedding width was settled above.
    _fill("decoder_output_dim", args.decoder_embed_dim)
    _fill("decoder_input_dim", args.decoder_embed_dim)

    for name, fallback in (
        ("decoder_ffn_embed_dim", 2048),
        ("decoder_layers", 6),
        ("decoder_attention_heads", 8),
    ):
        _fill(name, fallback)

    # Model training is not stable without this
    args.decoder_normalize_before = True

    for name, fallback in (
        ("no_decoder_final_norm", False),
        ("adaptive_softmax_cutoff", None),
        ("adaptive_softmax_dropout", 0),
        ("adaptive_softmax_factor", 4),
        ("no_token_positional_embeddings", False),
        ("share_decoder_input_output_embed", False),
        ("character_embeddings", False),
        (
            "character_filters",
            "[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
        ),
        ("character_embedding_dim", 4),
        ("char_embedder_highway_layers", 2),
        ("adaptive_input", False),
        ("adaptive_input_factor", 4),
        ("adaptive_input_cutoff", None),
        ("tie_adaptive_weights", False),
        ("tie_adaptive_proj", False),
        ("decoder_learned_pos", False),
        ("decoder_layerdrop", 0.0),
        ("decoder_layers_to_keep", None),
        ("layernorm_embedding", False),
        ("no_scale_embedding", False),
        ("quant_noise_pq", 0.0),
        ("quant_noise_pq_block_size", 8),
        ("quant_noise_scalar", 0.0),
        ("add_bos_token", False),
    ):
        _fill(name, fallback)
@with_incremental_state
class ModelParallelMultiheadAttention(nn.Module):
    """Model parallel Multi-headed attention.
    This performs the Multi-headed attention over multiple gpus.

    See "Megatron-LM: https://arxiv.org/pdf/1909.08053.pdf" for more details.
    """

    def __init__(
        self,
        embed_dim,
        num_heads,
        kdim=None,
        vdim=None,
        dropout=0.0,
        bias=True,
        self_attention=False,
        encoder_decoder_attention=False,
    ):
        """Create the projections for this rank's share of the heads.

        Args:
            embed_dim: width of the query and of the output.
            num_heads: total number of heads; must be divisible by the model
                parallel world size and must divide ``embed_dim``.
            kdim, vdim: key/value input widths (default: ``embed_dim``).
            dropout: dropout probability applied to the attention weights.
            bias: whether the four projections carry a bias.
            self_attention: project q, k and v from the query.
            encoder_decoder_attention: project k and v from ``key``.

        Raises:
            ImportError: if the megatron submodule is not installed.
        """
        super().__init__()
        if not has_megatron_submodule:
            raise ImportError(
                "\n\nPlease install the megatron submodule:"
                "\n\n git submodule update --init "
                "fairseq/model_parallel/megatron"
            )
        self.embed_dim = embed_dim
        self.kdim = kdim if kdim is not None else embed_dim
        self.vdim = vdim if vdim is not None else embed_dim
        self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim

        self.model_parallel_size = get_model_parallel_world_size()

        # Each rank owns an equal share of the heads.
        self.num_heads_partition = num_heads // self.model_parallel_size
        assert (
            self.num_heads_partition * self.model_parallel_size == num_heads
        ), "Number of heads must be divisible by model parallel size"

        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.head_dim = embed_dim // num_heads
        assert (
            self.head_dim * num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"
        self.scaling = self.head_dim**-0.5

        self.self_attention = self_attention
        self.encoder_decoder_attention = encoder_decoder_attention

        assert (
            not self.self_attention or self.qkv_same_dim
        ), "Self-attention requires query, key and value to be of the same size"

        self.k_proj = ColumnParallelLinear(
            self.kdim, embed_dim, bias=bias, gather_output=False
        )
        self.v_proj = ColumnParallelLinear(
            self.vdim, embed_dim, bias=bias, gather_output=False
        )
        self.q_proj = ColumnParallelLinear(
            embed_dim, embed_dim, bias=bias, gather_output=False
        )
        self.out_proj = RowParallelLinear(
            embed_dim, embed_dim, bias=bias, input_is_parallel=True
        )

    def forward(
        self,
        query,
        key: Optional[Tensor],
        value: Optional[Tensor],
        key_padding_mask: Optional[Tensor] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        static_kv: bool = False,
        attn_mask: Optional[Tensor] = None,
        **unused_kwargs,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """Input shape: Time x Batch x Channel

        Args:
            key_padding_mask (ByteTensor, optional): mask to exclude
                keys that are pads, of shape `(batch, src_len)`, where
                padding elements are indicated by 1s.
            attn_mask (ByteTensor, optional): typically used to
                implement causal attention, where the mask prevents the
                attention from looking forward in time (default: None).

        Returns:
            A tuple ``(attn, None)``; the attention weights are never
            returned, the second element only keeps the return type aligned
            with the single gpu multihead attention.
        """
        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        is_tpu = query.device.type == "xla"

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert self.encoder_decoder_attention and not self.self_attention
                    key = value = None
        else:
            saved_state = None

        if self.self_attention:
            q = self.q_proj(query)
            k = self.k_proj(query)
            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
            # encoder-decoder attention
            q = self.q_proj(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k = self.k_proj(key)
                v = self.v_proj(key)
        else:
            assert key is not None and value is not None
            q = self.q_proj(query)
            k = self.k_proj(key)
            v = self.v_proj(value)
        q *= self.scaling

        # Fold this rank's heads into the batch dimension for torch.bmm.
        q = (
            q.contiguous()
            .view(tgt_len, bsz * self.num_heads_partition, self.head_dim)
            .transpose(0, 1)
        )
        if k is not None:
            k = (
                k.contiguous()
                .view(-1, bsz * self.num_heads_partition, self.head_dim)
                .transpose(0, 1)
            )
        if v is not None:
            v = (
                v.contiguous()
                .view(-1, bsz * self.num_heads_partition, self.head_dim)
                .transpose(0, 1)
            )

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads_partition, seq_len, head_dim)
            if "prev_key" in saved_state:
                _prev_key = saved_state["prev_key"]
                assert _prev_key is not None
                prev_key = _prev_key.view(
                    bsz * self.num_heads_partition, -1, self.head_dim
                )
                if static_kv:
                    k = prev_key
                else:
                    assert k is not None
                    k = torch.cat([prev_key, k], dim=1)
            if "prev_value" in saved_state:
                _prev_value = saved_state["prev_value"]
                assert _prev_value is not None
                prev_value = _prev_value.view(
                    bsz * self.num_heads_partition, -1, self.head_dim
                )
                if static_kv:
                    v = prev_value
                else:
                    assert v is not None
                    v = torch.cat([prev_value, v], dim=1)
            prev_key_padding_mask: Optional[Tensor] = None
            if "prev_key_padding_mask" in saved_state:
                prev_key_padding_mask = saved_state["prev_key_padding_mask"]
            assert k is not None and v is not None
            key_padding_mask = (
                ModelParallelMultiheadAttention._append_prev_key_padding_mask(
                    key_padding_mask=key_padding_mask,
                    prev_key_padding_mask=prev_key_padding_mask,
                    batch_size=bsz,
                    src_len=k.size(1),
                    static_kv=static_kv,
                )
            )

            saved_state["prev_key"] = k.view(
                bsz, self.num_heads_partition, -1, self.head_dim
            )
            saved_state["prev_value"] = v.view(
                bsz, self.num_heads_partition, -1, self.head_dim
            )
            saved_state["prev_key_padding_mask"] = key_padding_mask
            # In this branch incremental_state is never None
            assert incremental_state is not None
            incremental_state = self._set_input_buffer(incremental_state, saved_state)
        assert k is not None
        src_len = k.size(1)

        # This is part of a workaround to get around fork/join parallelism
        # not supporting Optional types.
        if key_padding_mask is not None and key_padding_mask.dim() == 0:
            key_padding_mask = None

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        attn_weights = torch.bmm(q, k.transpose(1, 2))

        assert list(attn_weights.size()) == [
            bsz * self.num_heads_partition,
            tgt_len,
            src_len,
        ]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(
                bsz, self.num_heads_partition, tgt_len, src_len
            )
            if not is_tpu:
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool),
                    float("-inf"),
                )
            else:
                attn_weights = attn_weights.transpose(0, 2)
                attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf"))
                attn_weights = attn_weights.transpose(0, 2)
            attn_weights = attn_weights.view(
                bsz * self.num_heads_partition, tgt_len, src_len
            )

        attn_weights_float = utils.softmax(attn_weights, dim=-1)
        attn_weights = attn_weights_float.type_as(attn_weights)

        # Dropout is applied under the forked model-parallel CUDA RNG state.
        with get_cuda_rng_tracker().fork():
            attn_probs = self.dropout_module(attn_weights)

        assert v is not None
        attn = torch.bmm(attn_probs, v)
        assert list(attn.size()) == [
            bsz * self.num_heads_partition,
            tgt_len,
            self.head_dim,
        ]
        embed_dim_partition = embed_dim // self.model_parallel_size
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim_partition)
        attn = self.out_proj(attn)
        # return attn_weights None to keep the return type same as single gpu multihead attention
        # This will be deprecated.
        attn_weights: Optional[Tensor] = None

        return attn, attn_weights

    @staticmethod
    def _append_prev_key_padding_mask(
        key_padding_mask: Optional[Tensor],
        prev_key_padding_mask: Optional[Tensor],
        batch_size: int,
        src_len: int,
        static_kv: bool,
    ) -> Optional[Tensor]:
        """Merge the cached and the current key padding masks.

        Returns a float mask of shape ``(batch_size, src_len)`` whenever a new
        tensor has to be built, the cached mask itself when ``static_kv`` is
        set, or ``None`` when neither mask is available. A missing side is
        replaced by zeros (i.e. "not padding") created on the same device as
        the mask that is present.
        """
        # saved key padding masks have shape (bsz, seq_len)
        if prev_key_padding_mask is not None and static_kv:
            new_key_padding_mask = prev_key_padding_mask
        elif prev_key_padding_mask is not None and key_padding_mask is not None:
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1
            )
        # During incremental decoding, as the padding token enters and
        # leaves the frame, there will be a time when prev or current
        # is None
        elif prev_key_padding_mask is not None:
            # Allocate directly on the mask's device so the concatenation also
            # works on XLA and on a non-current CUDA device.
            filler = torch.zeros(
                (batch_size, src_len - prev_key_padding_mask.size(1)),
                device=prev_key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat(
                [prev_key_padding_mask.float(), filler.float()], dim=1
            )
        elif key_padding_mask is not None:
            filler = torch.zeros(
                (batch_size, src_len - key_padding_mask.size(1)),
                device=key_padding_mask.device,
            )
            new_key_padding_mask = torch.cat(
                [filler.float(), key_padding_mask.float()], dim=1
            )
        else:
            new_key_padding_mask = prev_key_padding_mask
        return new_key_padding_mask

    def reorder_incremental_state(
        self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order
    ):
        """Reorder buffered internal state (for incremental generation)."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            for k in input_buffer.keys():
                if input_buffer[k] is not None:
                    input_buffer[k] = input_buffer[k].index_select(0, new_order)
            incremental_state = self._set_input_buffer(incremental_state, input_buffer)
        return incremental_state

    def _get_input_buffer(
        self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
    ) -> Dict[str, Optional[Tensor]]:
        """Return the cached ``attn_state`` dict, or an empty dict if unset."""
        result = self.get_incremental_state(incremental_state, "attn_state")
        if result is not None:
            return result
        else:
            empty_result: Dict[str, Optional[Tensor]] = {}
            return empty_result

    def _set_input_buffer(
        self,
        incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
        buffer: Dict[str, Optional[Tensor]],
    ):
        """Store ``buffer`` as this module's ``attn_state``."""
        return self.set_incremental_state(incremental_state, "attn_state", buffer)
""" def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): if q_noise > 0: raise NotImplementedError return ColumnParallelLinear(input_dim, output_dim, gather_output=False) def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): if q_noise > 0: raise NotImplementedError return RowParallelLinear(input_dim, output_dim, input_is_parallel=True) def build_self_attention(self, embed_dim, args, **unused_kwargs): return ModelParallelMultiheadAttention( embed_dim=embed_dim, num_heads=args.decoder_attention_heads, dropout=args.attention_dropout, self_attention=not getattr(args, "cross_self_attention", False), ) def build_encoder_attention(self, embed_dim, args, **unused_kwargs): return ModelParallelMultiheadAttention( embed_dim=embed_dim, num_heads=args.decoder_attention_heads, kdim=getattr(args, "encoder_embed_dim", None), vdim=getattr(args, "encoder_embed_dim", None), dropout=args.attention_dropout, encoder_decoder_attention=True, ) ================================================ FILE: fairseq/models/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def build_model(cfg: FairseqDataclass, task, from_checkpoint=False):
    """Resolve the model class for ``cfg`` and build an instance of it.

    The model type is read from ``cfg._name`` or ``cfg.arch``. If neither is
    set and ``cfg`` holds exactly one entry, that entry's key is used as the
    model type and its value as the actual config.

    Args:
        cfg: model configuration (argparse namespace or config object).
        task: task handed through to the model's ``build_model``.
        from_checkpoint: forwarded to ``merge_with_parent`` when defaults are
            taken from the registered dataclass.

    Raises:
        Exception: if the single nested key is not a registered dataclass model.
        AssertionError: if no model class could be resolved.
    """
    model = None
    model_type = getattr(cfg, "_name", None) or getattr(cfg, "arch", None)

    if not model_type and len(cfg) == 1:
        # this is hit if config object is nested in directory that is named after model type
        model_type = next(iter(cfg))
        if model_type in MODEL_DATACLASS_REGISTRY:
            cfg = cfg[model_type]
        else:
            raise Exception(
                "Could not infer model type from directory. Please add _name field to indicate model type. "
                f"Available models: {MODEL_DATACLASS_REGISTRY.keys()}"
                f" Requested model type: {model_type}"
            )

    if model_type in ARCH_MODEL_REGISTRY:
        # case 1: legacy models
        model = ARCH_MODEL_REGISTRY[model_type]
    elif model_type in MODEL_DATACLASS_REGISTRY:
        # case 2: config-driven models
        model = MODEL_REGISTRY[model_type]

    if model_type in MODEL_DATACLASS_REGISTRY:
        # set defaults from dataclass. note that arch name and model name can be the same
        dc = MODEL_DATACLASS_REGISTRY[model_type]

        if isinstance(cfg, argparse.Namespace):
            cfg = dc.from_namespace(cfg)
        else:
            cfg = merge_with_parent(dc(), cfg, from_checkpoint)
    else:
        if model_type in ARCH_CONFIG_REGISTRY:
            with open_dict(cfg) if OmegaConf.is_config(cfg) else ExitStack():
                # this calls the different "arch" functions (like base_architecture()) that you indicate
                # if you specify --arch on the command line. this is only applicable to the old argparse based models
                # hydra models should expose different architectures via different config files
                # it will modify the cfg object and default parameters according to the arch
                ARCH_CONFIG_REGISTRY[model_type](cfg)

    if model is None:
        # Build the message from f-strings only. Calling str.format() on text
        # that already contains the rendered cfg breaks as soon as the cfg
        # prints with curly braces. An explicit raise also keeps the check
        # alive under ``python -O``.
        raise AssertionError(
            f"Could not infer model type from {cfg}. "
            f"Available models: {MODEL_DATACLASS_REGISTRY.keys()}"
            f" Requested model type: {model_type}"
        )

    return model.build_model(cfg, task)
Args: name (str): the name of the model """ def register_model_cls(cls): if name in MODEL_REGISTRY: return MODEL_REGISTRY[name] if not issubclass(cls, BaseFairseqModel): raise ValueError( "Model ({}: {}) must extend BaseFairseqModel".format(name, cls.__name__) ) MODEL_REGISTRY[name] = cls if dataclass is not None and not issubclass(dataclass, FairseqDataclass): raise ValueError( "Dataclass {} must extend FairseqDataclass".format(dataclass) ) cls.__dataclass = dataclass if dataclass is not None: MODEL_DATACLASS_REGISTRY[name] = dataclass cs = ConfigStore.instance() node = dataclass() node._name = name cs.store(name=name, group="model", node=node, provider="fairseq") @register_model_architecture(name, name) def noop(_): pass return cls return register_model_cls def register_model_architecture(model_name, arch_name): """ New model architectures can be added to fairseq with the :func:`register_model_architecture` function decorator. After registration, model architectures can be selected with the ``--arch`` command-line argument. For example:: @register_model_architecture('lstm', 'lstm_luong_wmt_en_de') def lstm_luong_wmt_en_de(cfg): args.encoder_embed_dim = getattr(cfg.model, 'encoder_embed_dim', 1000) (...) The decorated function should take a single argument *cfg*, which is a :class:`omegaconf.DictConfig`. The decorated function should modify these arguments in-place to match the desired architecture. 
def import_models(models_dir, namespace):
    """Import every model module or package found in ``models_dir``.

    Entries starting with ``_`` or ``.`` are skipped. For every imported
    module whose name is a registered model, an argparse parser listing its
    architectures and extra arguments is stored in this module's globals as
    ``<model_name>_parser`` (used by sphinx).

    Args:
        models_dir: directory to scan.
        namespace: dotted package name the modules are imported under.
    """
    py_suffix = ".py"
    for file in os.listdir(models_dir):
        path = os.path.join(models_dir, file)
        if file.startswith(("_", ".")):
            continue
        is_module = file.endswith(py_suffix)
        if not (is_module or os.path.isdir(path)):
            continue

        # Strip only the trailing suffix. Searching for the first ".py"
        # would cut names that contain ".py" earlier in the file name.
        model_name = file[: -len(py_suffix)] if is_module else file
        importlib.import_module(namespace + "." + model_name)

        # extra `model_parser` for sphinx
        if model_name in MODEL_REGISTRY:
            parser = argparse.ArgumentParser(add_help=False)
            group_archs = parser.add_argument_group("Named architectures")
            group_archs.add_argument(
                "--arch", choices=ARCH_MODEL_INV_REGISTRY[model_name]
            )
            group_args = parser.add_argument_group(
                "Additional command-line arguments"
            )
            MODEL_REGISTRY[model_name].add_args(group_args)
            globals()[model_name + "_parser"] = parser
class BARTHubInterface(GeneratorHubInterface):
    """A simple PyTorch Hub interface to BART.

    Usage: https://github.com/pytorch/fairseq/tree/main/examples/bart
    """

    def __init__(self, cfg, task, model):
        super().__init__(cfg, task, [model])
        self.model = self.models[0]

    def encode(
        self, sentence: str, *addl_sentences, no_separator=True
    ) -> torch.LongTensor:
        """
        BPE-encode a sentence (or multiple sentences).

        Every sequence begins with a beginning-of-sentence (`<s>`) symbol.
        Every sentence ends with an end-of-sentence (`</s>`).

        Example (single sentence): `<s> a b c </s>`
        Example (sentence pair): `<s> d e f </s> 1 2 3 </s>`

        Only the first sentence is truncated (to ``min(self.max_positions) - 2``
        BPE tokens, leaving room for `<s>` and `</s>`); additional sentences
        are appended untruncated.  When *no_separator* is False an extra
        `</s>` is inserted before each additional sentence.

        The BPE encoding follows GPT-2. One subtle detail is that the GPT-2 BPE
        requires leading spaces. For example::

            >>> bart.encode('Hello world').tolist()
            [0, 31414, 232, 2]
            >>> bart.encode(' world').tolist()
            [0, 232, 2]
            >>> bart.encode('world').tolist()
            [0, 8331, 2]
        """
        tokens = self.bpe.encode(sentence)
        # reserve two positions for the surrounding <s> ... </s>
        max_bpe_tokens = min(self.max_positions) - 2
        pieces = tokens.split(" ")
        if len(pieces) > max_bpe_tokens:
            tokens = " ".join(pieces[:max_bpe_tokens])
        bpe_sentence = "<s> " + tokens + " </s>"
        for s in addl_sentences:
            bpe_sentence += " </s>" if not no_separator else ""
            bpe_sentence += " " + self.bpe.encode(s) + " </s>"
        tokens = self.task.source_dictionary.encode_line(bpe_sentence, append_eos=False)
        return tokens.long()

    def decode(self, tokens: torch.LongTensor):
        """Turn a 1-D tensor of token ids back into text.

        A leading bos token is dropped.  The sequence is split wherever two
        eos tokens are adjacent; each piece is converted to a string with the
        source dictionary and BPE-decoded.

        Returns:
            a single string if there is one piece, otherwise a list of strings
        """
        assert tokens.dim() == 1
        tokens = tokens.cpu().numpy()
        if tokens[0] == self.task.source_dictionary.bos():
            tokens = tokens[1:]  # remove <s>
        eos_mask = tokens == self.task.source_dictionary.eos()
        # True at i when positions i and i+1 are both eos (document boundary)
        doc_mask = eos_mask[1:] & eos_mask[:-1]
        sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
        sentences = [
            self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences
        ]
        if len(sentences) == 1:
            return sentences[0]
        return sentences

    def _build_sample(self, src_tokens: List[torch.LongTensor]):
        """Collate *src_tokens* into one batch and move it to ``self.device``."""
        # assert torch.is_tensor(src_tokens)
        dataset = self.task.build_dataset_for_inference(
            src_tokens,
            [x.numel() for x in src_tokens],
        )
        sample = dataset.collater(dataset)
        sample = utils.apply_to_sample(lambda tensor: tensor.to(self.device), sample)
        return sample

    def generate(
        self,
        tokenized_sentences: List[torch.LongTensor],
        *args,
        inference_step_args=None,
        skip_invalid_size_inputs=False,
        **kwargs
    ) -> List[List[Dict[str, torch.Tensor]]]:
        """Generate hypotheses for each tokenized sentence.

        Every batch is decoded with a single bos token forced as the prefix,
        and the results are returned in the order given by the batch ids.

        Args:
            tokenized_sentences: encoded inputs (see :meth:`encode`)
            inference_step_args (dict, optional): extra arguments forwarded to
                the parent ``generate``; must not contain ``prefix_tokens``.
                The dict passed by the caller is never modified.
            skip_invalid_size_inputs (bool): forwarded to batching and to the
                parent ``generate``

        Raises:
            NotImplementedError: if ``prefix_tokens`` is supplied
        """
        # Work on a private copy: ``prefix_tokens`` is injected below and must
        # not leak into the caller's dict, otherwise a second call reusing the
        # same dict would trip the check that follows.
        inference_step_args = dict(inference_step_args or {})
        if "prefix_tokens" in inference_step_args:
            raise NotImplementedError("prefix generation not implemented for BART")
        res = []
        for batch in self._build_batches(tokenized_sentences, skip_invalid_size_inputs):
            src_tokens = batch["net_input"]["src_tokens"]
            # force every hypothesis to start with <s>
            inference_step_args["prefix_tokens"] = src_tokens.new_full(
                (src_tokens.size(0), 1), fill_value=self.task.source_dictionary.bos()
            ).to(device=self.device)
            results = super().generate(
                src_tokens,
                *args,
                inference_step_args=inference_step_args,
                skip_invalid_size_inputs=skip_invalid_size_inputs,
                **kwargs
            )
            for sample_id, hypos in zip(batch["id"].tolist(), results):
                res.append((sample_id, hypos))
        # restore the order given by the batch ids
        res = [hypos for _, hypos in sorted(res, key=lambda x: x[0])]
        return res

    def extract_features(
        self, tokens: torch.LongTensor, return_all_hiddens: bool = False
    ) -> torch.Tensor:
        """Run the model in ``features_only`` mode on *tokens*.

        Args:
            tokens: 1-D or 2-D tensor of token ids; a 1-D input is treated as
                a batch of one
            return_all_hiddens: if True, return ``extra["inner_states"]`` with
                the first two dims of each state swapped instead of the final
                features

        Raises:
            ValueError: if the last dimension of *tokens* exceeds
                ``min(self.model.max_positions())``
        """
        if tokens.dim() == 1:
            tokens = tokens.unsqueeze(0)
        if tokens.size(-1) > min(self.model.max_positions()):
            raise ValueError(
                "tokens exceeds maximum length: {} > {}".format(
                    tokens.size(-1), self.model.max_positions()
                )
            )
        # Tensor.to() is not in-place: keep the returned tensor so the inputs
        # really live on the model's device.
        tokens = tokens.to(device=self.device)
        prev_output_tokens = tokens.clone()

        # Decoder input = source rotated right by one: slot 0 receives the
        # last non-pad token of each row, the rest is the source shifted.
        prev_output_tokens[:, 0] = tokens.gather(
            1,
            (tokens.ne(self.task.source_dictionary.pad()).sum(dim=1) - 1).unsqueeze(-1),
        ).squeeze(-1)

        prev_output_tokens[:, 1:] = tokens[:, :-1]
        features, extra = self.model(
            src_tokens=tokens,
            src_lengths=None,
            prev_output_tokens=prev_output_tokens,
            features_only=True,
            return_all_hiddens=return_all_hiddens,
        )
        if return_all_hiddens:
            # convert from T x B x C -> B x T x C
            inner_states = extra["inner_states"]
            return [inner_state.transpose(0, 1) for inner_state in inner_states]
        else:
            return features  # just the last layer's features

    def register_classification_head(
        self, name: str, num_classes: int = None, embedding_size: int = None, **kwargs
    ):
        """Forward to ``self.model.register_classification_head``."""
        self.model.register_classification_head(
            name, num_classes=num_classes, embedding_size=embedding_size, **kwargs
        )

    def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
        """Apply classification head *head* to *tokens*.

        The sentence representation is the feature vector at the last eos
        position of each row.

        Returns:
            raw logits if *return_logits* is True, otherwise log-probabilities
        """
        if tokens.dim() == 1:
            tokens = tokens.unsqueeze(0)
        features = self.extract_features(tokens.to(device=self.device))
        # NOTE(review): the view() below presumably requires every row to
        # contain the same number of eos tokens - confirm with callers.
        sentence_representation = features[
            tokens.eq(self.task.source_dictionary.eos()), :
        ].view(features.size(0), -1, features.size(-1))[:, -1, :]

        logits = self.model.classification_heads[head](sentence_representation)
        if return_logits:
            return logits
        return F.log_softmax(logits, dim=-1)

    def fill_mask(
        self,
        masked_inputs: List[str],
        topk: int = 5,
        match_source_len: bool = True,
        **generate_kwargs
    ):
        """Fill in the ``<mask>`` spans of each input by generation.

        Args:
            masked_inputs: strings each containing at least one ``<mask>``
            topk: number of hypotheses returned per input
            match_source_len: forwarded to :meth:`generate`
            **generate_kwargs: forwarded to :meth:`generate`; ``beam`` is
                raised to at least *topk*

        Returns:
            for each input, a list of up to *topk* ``(text, score)`` pairs
        """
        masked_token = "<mask>"
        batch_tokens = []
        for masked_input in masked_inputs:
            assert (
                masked_token in masked_input
            ), "please add one {} token for the input".format(masked_token)

            # BPE-encode the text around each mask, then re-insert the mask
            text_spans = masked_input.split(masked_token)
            text_spans_bpe = (
                (" {0} ".format(masked_token))
                .join([self.bpe.encode(text_span.rstrip()) for text_span in text_spans])
                .strip()
            )
            tokens = self.task.source_dictionary.encode_line(
                "<s> " + text_spans_bpe + " </s>",
                append_eos=False,
                add_if_not_exist=False,
            ).long()
            batch_tokens.append(tokens)

        # ensure beam size is at least as big as topk
        generate_kwargs["beam"] = max(
            topk,
            generate_kwargs.get("beam", -1),
        )
        generate_kwargs["match_source_len"] = match_source_len
        batch_hypos = self.generate(batch_tokens, **generate_kwargs)

        return [
            [(self.decode(hypo["tokens"]), hypo["score"]) for hypo in hypos[:topk]]
            for hypos in batch_hypos
        ]
@register_model("bart")
class BARTModel(TransformerModel):
    """BART model: a :class:`TransformerModel` with optional classification heads.

    Classification heads are kept in ``self.classification_heads`` (an
    ``nn.ModuleDict`` keyed by head name) and are applied in :meth:`forward`
    when ``classification_head_name`` is given.
    """

    __jit_unused_properties__ = ["supported_targets"]

    @classmethod
    def hub_models(cls):
        """Return the mapping from hub model name to archive URL."""
        return {
            "bart.base": "http://dl.fbaipublicfiles.com/fairseq/models/bart.base.tar.gz",
            "bart.large": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.tar.gz",
            "bart.large.mnli": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.mnli.tar.gz",
            "bart.large.cnn": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.cnn.tar.gz",
            "bart.large.xsum": "http://dl.fbaipublicfiles.com/fairseq/models/bart.large.xsum.tar.gz",
        }

    def __init__(self, args, encoder, decoder):
        super().__init__(args, encoder, decoder)

        # We follow BERT's random weight initialization
        self.apply(init_bert_params)

        self.classification_heads = nn.ModuleDict()
        # ``self.eos`` is only set when the encoder exposes a dictionary;
        # forward() reads it unconditionally.
        if hasattr(self.encoder, "dictionary"):
            self.eos: int = self.encoder.dictionary.eos()

    @staticmethod
    def add_args(parser):
        """Add the parent model's arguments plus the BART pooler/head options."""
        super(BARTModel, BARTModel).add_args(parser)
        parser.add_argument(
            "--pooler-dropout",
            type=float,
            metavar="D",
            help="dropout probability in the masked_lm pooler layers",
        )
        parser.add_argument(
            "--pooler-activation-fn",
            choices=utils.get_available_activation_fns(),
            help="activation function to use for pooler layer",
        )
        parser.add_argument(
            "--spectral-norm-classification-head",
            action="store_true",
            help="Apply spectral normalization on the classification head",
        )

    @property
    def supported_targets(self):
        return {"self"}

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        features_only: bool = False,
        classification_head_name: Optional[str] = None,
        token_embeddings: Optional[torch.Tensor] = None,
        return_all_hiddens: bool = True,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
    ):
        """Run encoder and decoder, optionally followed by a classification head.

        When *classification_head_name* is given, ``features_only`` is forced
        to True and the first returned value is the output of that head
        applied to the decoder feature at the last eos position of each
        source row.  If no registered head has that name, the decoder
        features are returned unchanged.

        Returns:
            tuple: ``(x, extra)`` where ``extra`` is the decoder's second output
        """
        if classification_head_name is not None:
            features_only = True

        encoder_out = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            token_embeddings=token_embeddings,
            return_all_hiddens=return_all_hiddens,
        )
        x, extra = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
            features_only=features_only,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
            src_lengths=src_lengths,
            return_all_hiddens=return_all_hiddens,
        )
        eos: int = self.eos
        if classification_head_name is not None:
            # Select features at eos positions, regroup them per batch row and
            # keep the last one.
            # NOTE(review): the view() presumably requires every row to hold
            # the same number of eos tokens - confirm against the data pipeline.
            sentence_representation = x[src_tokens.eq(eos), :].view(
                x.size(0), -1, x.size(-1)
            )[:, -1, :]
            for k, head in self.classification_heads.items():
                # for torch script only supports iteration
                if k == classification_head_name:
                    x = head(sentence_representation)
                    break
        return x, extra

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="gpt2",
        sample_break_mode="eos",
        **kwargs,
    ):
        """Load a checkpoint via ``hub_utils`` and wrap it in a :class:`BARTHubInterface`.

        Classification heads stored in the checkpoint are loaded
        (``load_checkpoint_heads=True``).
        """
        # imported lazily inside the method rather than at module import time
        from fairseq import hub_utils

        x = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            sample_break_mode=sample_break_mode,
            **kwargs,
        )
        return BARTHubInterface(x["args"], x["task"], x["models"][0])

    def register_classification_head(
        self, name, num_classes=None, inner_dim=None, **kwargs
    ):
        """Register a classification head.

        An existing head with the same name is replaced; a warning is logged
        when the replacement changes ``num_classes`` or ``inner_dim``.
        *inner_dim* falls back to ``args.encoder_embed_dim`` when falsy.
        Extra ``**kwargs`` are accepted but not used here.
        """
        logger.info("Registering classification head: {0}".format(name))
        if name in self.classification_heads:
            prev_num_classes = self.classification_heads[name].out_proj.out_features
            prev_inner_dim = self.classification_heads[name].dense.out_features
            if num_classes != prev_num_classes or inner_dim != prev_inner_dim:
                logger.warning(
                    're-registering head "{}" with num_classes {} (prev: {}) '
                    "and inner_dim {} (prev: {})".format(
                        name, num_classes, prev_num_classes, inner_dim, prev_inner_dim
                    )
                )
        self.classification_heads[name] = BARTClassificationHead(
            input_dim=self.args.encoder_embed_dim,
            inner_dim=inner_dim or self.args.encoder_embed_dim,
            num_classes=num_classes,
            activation_fn=self.args.pooler_activation_fn,
            pooler_dropout=self.args.pooler_dropout,
            do_spectral_norm=getattr(
                self.args, "spectral_norm_classification_head", False
            ),
        )

    def upgrade_state_dict_named(self, state_dict, name):
        """Adapt *state_dict* (in place) so that it can be loaded into this model.

        Steps, in order:

        1. Reconcile classification heads found in the checkpoint with the
           heads of the current model (register them, or drop their keys).
        2. Drop the last embedding row when the checkpoint has exactly one
           more entry than the current dictionary and the dictionary has no
           ``<mask>`` symbol.
        3. For the ``multilingual_denoising`` task, grow the embedding
           matrices when the current dictionary is larger than the checkpoint's.
        4. Copy heads that exist only in the current model into *state_dict*.

        Args:
            state_dict (dict): checkpoint state, modified in place
            name (str): prefix of this module's keys (``""`` for none)
        """
        super().upgrade_state_dict_named(state_dict, name)

        prefix = name + "." if name != "" else ""
        current_head_names = (
            []
            if not hasattr(self, "classification_heads")
            else self.classification_heads.keys()
        )

        # Handle new classification heads present in the state dict.
        keys_to_delete = []
        for k in state_dict.keys():
            if not k.startswith(prefix + "classification_heads."):
                continue

            # key layout: <prefix>classification_heads.<head_name>.<param path>
            head_name = k[len(prefix + "classification_heads.") :].split(".")[0]
            num_classes = state_dict[
                prefix + "classification_heads." + head_name + ".out_proj.weight"
            ].size(0)
            inner_dim = state_dict[
                prefix + "classification_heads." + head_name + ".dense.weight"
            ].size(0)

            if getattr(self.args, "load_checkpoint_heads", False):
                # adopt heads from the checkpoint that the model does not have yet
                if head_name not in current_head_names:
                    self.register_classification_head(head_name, num_classes, inner_dim)
            else:
                # otherwise drop checkpoint heads that are unknown or mismatched
                if head_name not in current_head_names:
                    logger.warning(
                        "deleting classification head ({}) from checkpoint "
                        "not present in current model: {}".format(head_name, k)
                    )
                    keys_to_delete.append(k)
                elif (
                    num_classes
                    != self.classification_heads[head_name].out_proj.out_features
                    or inner_dim
                    != self.classification_heads[head_name].dense.out_features
                ):
                    logger.warning(
                        "deleting classification head ({}) from checkpoint "
                        "with different dimensions than current model: {}".format(
                            head_name, k
                        )
                    )
                    keys_to_delete.append(k)
        # deletion is deferred so the dict is not mutated while iterating it
        for k in keys_to_delete:
            del state_dict[k]

        def truncate_emb(key):
            # drop the last row of the matrix stored under ``key``, if present
            if key in state_dict:
                state_dict[key] = state_dict[key][:-1, :]

        # When finetuning on translation task, remove last row of
        # embedding matrix that corresponds to mask_idx token.
        # NOTE(review): the embedding keys below are looked up without
        # ``prefix`` - presumably this is only called with name == ""; confirm.
        loaded_dict_size = state_dict["encoder.embed_tokens.weight"].size(0)
        if (
            loaded_dict_size == len(self.encoder.dictionary) + 1
            and "<mask>" not in self.encoder.dictionary
        ):
            truncate_emb("encoder.embed_tokens.weight")
            truncate_emb("decoder.embed_tokens.weight")
            truncate_emb("encoder.output_projection.weight")
            truncate_emb("decoder.output_projection.weight")

        # When continued pretraining on new set of languages for mbart,
        # add extra lang embeddings at the end of embed_tokens.
        # Note: newly added languages are assumed to have been added at the end.
        if self.args.task == "multilingual_denoising" and loaded_dict_size < len(
            self.encoder.dictionary
        ):
            logger.info(
                "Adding extra language embeddings not found in pretrained model for "
                "continued pretraining of MBART on new set of languages."
            )
            # the last row is kept last: new rows are inserted just before it
            loaded_mask_token_embedding = state_dict["encoder.embed_tokens.weight"][
                -1, :
            ]

            num_langids_to_add = len(self.encoder.dictionary) - loaded_dict_size
            embed_dim = state_dict["encoder.embed_tokens.weight"].size(1)

            # new rows are drawn from N(0, embed_dim ** -0.5) and cast to the
            # checkpoint's dtype
            new_lang_embed_to_add = torch.zeros(num_langids_to_add, embed_dim)
            nn.init.normal_(new_lang_embed_to_add, mean=0, std=embed_dim**-0.5)
            new_lang_embed_to_add = new_lang_embed_to_add.to(
                dtype=state_dict["encoder.embed_tokens.weight"].dtype,
            )

            state_dict["encoder.embed_tokens.weight"] = torch.cat(
                [
                    state_dict["encoder.embed_tokens.weight"][
                        : loaded_dict_size - 1, :
                    ],
                    new_lang_embed_to_add,
                    loaded_mask_token_embedding.unsqueeze(0),
                ]
            )
            # the decoder matrix receives the same new rows and the encoder's
            # last row
            state_dict["decoder.embed_tokens.weight"] = torch.cat(
                [
                    state_dict["decoder.embed_tokens.weight"][
                        : loaded_dict_size - 1, :
                    ],
                    new_lang_embed_to_add,
                    loaded_mask_token_embedding.unsqueeze(0),
                ]
            )

        # Copy any newly-added classification heads into the state dict
        # with their current weights.
        if hasattr(self, "classification_heads"):
            cur_state = self.classification_heads.state_dict()
            for k, v in cur_state.items():
                if prefix + "classification_heads." + k not in state_dict:
                    logger.info("Overwriting " + prefix + "classification_heads." + k)
                    state_dict[prefix + "classification_heads." + k] = v

    def set_beam_size(self, beam):
        """Set beam size for efficient beamable enc-dec attention."""
        beamable = False
        for layer in self.decoder.layers:
            if layer.encoder_attn is not None:
                if hasattr(layer.encoder_attn, "set_beam_size"):
                    layer.encoder_attn.set_beam_size(beam)
                    beamable = True
        # when at least one layer accepted the beam size, switch the encoder
        # to its ``_reorder_encoder_out`` implementation
        if beamable:
            self.encoder.reorder_encoder_out = self.encoder._reorder_encoder_out
@register_model_architecture("bart", "bart_base")
def bart_base_architecture(args):
    """Fill in the ``bart_base`` defaults on *args*.

    Only attributes missing from *args* receive the base-size value; the
    remaining defaults are then supplied by ``bart_large_architecture``.
    """
    base_defaults = (
        ("encoder_embed_dim", 768),
        ("encoder_ffn_embed_dim", 4 * 768),
        ("encoder_layers", 6),
        ("encoder_attention_heads", 12),
        ("decoder_layers", 6),
        ("decoder_attention_heads", 12),
    )
    for field, fallback in base_defaults:
        setattr(args, field, getattr(args, field, fallback))
    bart_large_architecture(args)
class CompositeEncoder(FairseqEncoder):
    """
    Bundle several named :class:`FairseqEncoder` objects behind one encoder.

    Each call is fanned out to every wrapped encoder and the results are
    collected in a dictionary under the same keys. The dictionary of the
    first encoder is the one handed to the base class.

    Args:
        encoders (dict): a dictionary of :class:`FairseqEncoder` objects.
    """

    def __init__(self, encoders):
        first_encoder = next(iter(encoders.values()))
        super().__init__(first_encoder.dictionary)
        self.encoders = encoders
        # register every encoder as a submodule under its key
        for name, encoder in self.encoders.items():
            self.add_module(name, encoder)

    def forward(self, src_tokens, src_lengths):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of shape
                `(batch)`

        Returns:
            dict:
                the outputs from each Encoder
        """
        return {
            name: encoder(src_tokens, src_lengths)
            for name, encoder in self.encoders.items()
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder each encoder's output in *encoder_out* (in place) by *new_order*."""
        for name, encoder in self.encoders.items():
            encoder_out[name] = encoder.reorder_encoder_out(
                encoder_out[name], new_order
            )
        return encoder_out

    def max_positions(self):
        """Return the smallest ``max_positions()`` among the wrapped encoders."""
        return min(encoder.max_positions() for encoder in self.encoders.values())

    def upgrade_state_dict(self, state_dict):
        """Let every wrapped encoder upgrade *state_dict*, then return it."""
        for encoder in self.encoders.values():
            encoder.upgrade_state_dict(state_dict)
        return state_dict
def DistributedFairseqModel(args, model, process_group, device):
    """
    Wrap a *model* to support distributed data parallel training.

    This is similar to the built-in DistributedDataParallel, but allows
    additional configuration of the DistributedDataParallel class to
    use, and also provides easier access to the wrapped model by
    forwarding requests for missing attributes to the wrapped model.

    The wrapper is selected by ``args.tpu`` first and ``args.ddp_backend``
    otherwise (``c10d``/``pytorch_ddp``, ``no_c10d``/``legacy_ddp``,
    ``slowmo`` or ``fully_sharded``).  If ``args.heartbeat_timeout`` is
    positive, the result is additionally wrapped in a
    ``DistributedTimeoutWrapper``.

    Args:
        args (argparse.Namespace): fairseq args
        model (BaseFairseqModel): model to wrap
        process_group: the c10d process group to be used for distributed data
            parallel all-reduction.
        device: device to move model to

    Returns:
        the wrapped model

    Raises:
        ImportError: if the selected backend needs fairscale (or the DDP
            communication hooks) and the import fails
        ValueError: if ``args.ddp_backend`` is not a known backend
    """
    assert isinstance(model, nn.Module)
    if args.tpu:
        wrapped_model = TPUDistributedDataParallel(
            module=model.to(device),
            process_group=process_group,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend in {"c10d", "pytorch_ddp"}:
        wrapped_model = DistributedDataParallel(
            module=model.to(device),
            device_ids=[args.device_id],
            output_device=args.device_id,
            broadcast_buffers=args.broadcast_buffers,
            bucket_cap_mb=args.bucket_cap_mb,
            process_group=process_group,
            find_unused_parameters=args.find_unused_parameters,
            gradient_as_bucket_view=args.gradient_as_bucket_view,
        )
        if args.ddp_comm_hook == "fp16":
            logger.info("enable fp16 communication hook in DDP")
            try:
                from torch.distributed.algorithms.ddp_comm_hooks import (
                    DDPCommHookType,
                    register_ddp_comm_hook,
                )
            except ImportError:
                # only a failed import is reported here; anything else
                # (e.g. KeyboardInterrupt) propagates untouched
                logger.error(
                    "Could not import from torch.distributed.algorithms.ddp_comm_hooks; you may need to update your pytorch version"
                )
                raise

            register_ddp_comm_hook(DDPCommHookType.FP16_COMPRESS, wrapped_model)
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend in {"no_c10d", "legacy_ddp"}:
        wrapped_model = LegacyDistributedDataParallel(
            module=model.to(device),
            buffer_size=2**28,
            process_group=process_group,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend == "slowmo":
        if _SLOWMO_DDP_DISABLED:
            raise ImportError(
                "Cannot find SlowMoDistributedDataParallel. "
                "Please install fairscale with: pip install fairscale"
            )

        # The values of slowmo_momentum below were obtained by tuning on the
        # En-De 16 dataset by training the transformer_wmt_en_de_large model
        if args.slowmo_momentum is None:
            if args.distributed_world_size <= 16:
                args.slowmo_momentum = 0.0
            elif args.distributed_world_size <= 32:
                args.slowmo_momentum = 0.2
            elif args.distributed_world_size <= 64:
                args.slowmo_momentum = 0.5
            else:
                args.slowmo_momentum = 0.6
        slowmo_base_algorithm = SlowMoBaseAlgorithm[args.slowmo_base_algorithm.upper()]

        wrapped_model = SlowMoDistributedDataParallel(
            module=model.to(device),
            broadcast_buffers=args.broadcast_buffers,
            nprocs_per_node=args.nprocs_per_node,
            slowmo_momentum=args.slowmo_momentum,
            slowmo_base_algorithm=slowmo_base_algorithm,
            localsgd_frequency=args.localsgd_frequency,
        )
        # forward missing getattr and state_dict/load_state_dict to orig model
        wrapped_model = ModuleProxyWrapper(wrapped_model)
    elif args.ddp_backend == "fully_sharded":
        try:
            from fairscale.nn.data_parallel import FullyShardedDataParallel as FSDP
        except ImportError as err:
            # keep the original failure attached as the cause
            raise ImportError(
                "Cannot find FullyShardedDataParallel. "
                "Please install fairscale with: pip install fairscale"
            ) from err
        assert isinstance(model, FSDP), "expected model to already be wrapped in FSDP"
        wrapped_model = model
        if args.memory_efficient_fp16:
            wrapped_model = wrapped_model.half()
        if not args.cpu_offload:
            wrapped_model = wrapped_model.to(device=device)
    else:
        raise ValueError("Unknown --ddp-backend: " + args.ddp_backend)

    # kill hung distributed jobs after a timeout
    heartbeat_timeout = getattr(args, "heartbeat_timeout", -1)
    if heartbeat_timeout > 0:
        wrapped_model = DistributedTimeoutWrapper(
            wrapped_model, timeout=heartbeat_timeout
        )

    return wrapped_model
class EMA(object):
    """Exponential Moving Average of Fairseq Models

    Holds a deep copy of a model whose state is an exponentially decayed
    average of the states passed to :meth:`step`. The averaged state covers
    everything in the model's ``state_dict`` (parameters and buffers alike).

    The copy lives on the model's device unless *device* is given, and keeps
    the model's precision. With ``ema_fp32=True`` an additional fp32 copy of
    the state (``self.fp32_params``) is maintained and used for the update
    arithmetic.

    Configuration fields read from *config*:
        1) ema_decay - The decay of EMA
        2) ema_update_freq - EMA is updated every this many model updates.
        3) ema_start_update - Start EMA update after this many model updates
        4) ema_fp32 - keep an fp32 copy of the EMA state for the update step
        5) ema_seed_model - optional checkpoint to initialise the EMA from

    Key methods:
        1) step - One update of EMA using new model
        2) restore - Update EMA from a state dict
        3) reverse - Load EMA into a model
        4) get_decay, _set_decay - Used to get or set the decay.
            Note _set_decay is called from step.
        5) build_fp32_params - Used to initialize or update the fp32 copy
            of EMA params. Note this is enabled only when ema_fp32=True
    """

    def __init__(self, model, config, device=None, skip_keys=None):
        """
        @param model model to initialize the EMA with
        @param config EMAConfig object with configuration like
            ema_decay, ema_update_freq, ema_fp32
        @param device If provided, copy EMA to this device (e.g. gpu).
            Otherwise EMA is in the same device as the model.
        @param skip_keys state-dict keys that are copied from the new model
            instead of being averaged
        """
        self.decay = config.ema_decay
        self.model = copy.deepcopy(model)
        self.model.requires_grad_(False)
        self.config = config
        self.skip_keys = skip_keys or set()
        self.fp32_params = {}

        seed_path = self.config.ema_seed_model
        if seed_path is not None:
            seed_state = checkpoint_utils.load_ema_from_checkpoint(seed_path)
            self.model.load_state_dict(seed_state["model"], strict=True)

        if device is not None:
            logging.info(f"Copying EMA model to device {device}")
            self.model = self.model.to(device=device)

        if self.config.ema_fp32:
            self.build_fp32_params()

        self.update_freq_counter = 0

    def get_model(self):
        return self.model

    def build_fp32_params(self, state_dict=None):
        """
        Create or refresh the fp32 copy of the EMA state.

        Values come from *state_dict* when one is given, otherwise from the
        EMA model's own state dict. Entries already present are overwritten
        in place; new entries are stored as float (non floating-point
        tensors are stored as they are).
        """
        if not self.config.ema_fp32:
            raise RuntimeError(
                "build_fp32_params should not be called if ema_fp32=False. "
                "Use ema_fp32=True if this is really intended."
            )

        source = self.model.state_dict() if state_dict is None else state_dict

        for name, value in source.items():
            if name in self.fp32_params:
                self.fp32_params[name].copy_(value)
                continue
            self.fp32_params[name] = (
                value.float() if torch.is_floating_point(value) else value
            )

    def restore(self, state_dict, build_fp32_params=False):
        """Load data from a model spec into EMA model"""
        self.model.load_state_dict(state_dict, strict=False)
        if build_fp32_params:
            self.build_fp32_params(state_dict)

    def _set_decay(self, decay):
        self.decay = decay

    def get_decay(self):
        return self.decay

    def _step_internal(self, new_model, updates=None):
        """One update of the EMA model based on new model weights"""
        decay = self.decay

        if self.config.ema_fp32:
            current_ema = self.fp32_params
        else:
            current_ema = self.model.state_dict()

        averaged = {}
        for key, param in new_model.state_dict().items():
            if isinstance(param, dict):
                continue

            if key in current_ema:
                ema_param = current_ema[key]
            elif param.ndim == 1:
                # key unknown to the EMA so far: start from the new value
                ema_param = param.float().clone()
            else:
                ema_param = copy.deepcopy(param)

            if param.shape != ema_param.shape:
                raise ValueError(
                    "incompatible tensor shapes between model param and ema param"
                    + "{} vs. {}".format(param.shape, ema_param.shape)
                )

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue

            if key in self.skip_keys:
                # skipped keys are copied over verbatim
                ema_param = param.to(dtype=ema_param.dtype).clone()
            else:
                # ema <- decay * ema + (1 - decay) * param, done in place
                ema_param.mul_(decay)
                ema_param.add_(param.to(dtype=ema_param.dtype), alpha=1 - decay)
            averaged[key] = ema_param

        self.restore(averaged, build_fp32_params=False)

    def step(self, new_model, updates=None):
        """
        One update of EMA which is done every self.config.ema_update_freq
        updates of the model.

        @param updates The current number of model updates done.
        Decay is set of 0 if model updates < ema_start_update, which means
        the model will be simply copied over to the EMA.
        When model updates >= ema_start_updates, then EMA is updated with
        a decay of self.config.ema_decay.
        """
        if updates is not None:
            if updates < self.config.ema_start_update:
                self._set_decay(0)
            else:
                self._set_decay(self.config.ema_decay)

        if not self.config.ema_update_freq > 1:
            # no throttling: average on every call
            self._step_internal(new_model, updates)
            return

        self.update_freq_counter += 1
        if self.update_freq_counter >= self.config.ema_update_freq:
            self._step_internal(new_model, updates)
            self.update_freq_counter = 0

    def reverse(self, model):
        """
        Load the model parameters from EMA model.
        Useful for inference or fine-tuning from the EMA model.
        """
        ema_state = self.model.state_dict()
        # the "_ema" entry, when present, is not loaded into the target
        ema_state.pop("_ema", None)

        model.load_state_dict(ema_state, strict=False)
        return model
from typing import Dict, List, Optional, Tuple import torch.nn as nn from fairseq import utils from torch import Tensor class FairseqDecoder(nn.Module): """Base class for decoders.""" def __init__(self, dictionary): super().__init__() self.dictionary = dictionary self.onnx_trace = False self.adaptive_softmax = None def forward(self, prev_output_tokens, encoder_out=None, **kwargs): """ Args: prev_output_tokens (LongTensor): shifted output tokens of shape `(batch, tgt_len)`, for teacher forcing encoder_out (dict, optional): output from the encoder, used for encoder-side attention Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ x, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, **kwargs ) x = self.output_layer(x) return x, extra def extract_features(self, prev_output_tokens, encoder_out=None, **kwargs): """ Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ raise NotImplementedError def output_layer(self, features, **kwargs): """ Project features to the default output size, e.g., vocabulary size. Args: features (Tensor): features returned by *extract_features*. """ raise NotImplementedError def get_normalized_probs( self, net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], log_probs: bool, sample: Optional[Dict[str, Tensor]] = None, ): """Get normalized probabilities (or log probs) from a net's output.""" return self.get_normalized_probs_scriptable(net_output, log_probs, sample) # TorchScript doesn't support super() method so that the scriptable Subclass # can't access the base class model in Torchscript. # Current workaround is to add a helper function with different name and # call the helper function from scriptable Subclass. 
def get_normalized_probs_scriptable( self, net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], log_probs: bool, sample: Optional[Dict[str, Tensor]] = None, ): """Get normalized probabilities (or log probs) from a net's output.""" if hasattr(self, "adaptive_softmax") and self.adaptive_softmax is not None: if sample is not None: assert "target" in sample target = sample["target"] else: target = None out = self.adaptive_softmax.get_log_prob(net_output[0], target=target) return out.exp_() if not log_probs else out logits = net_output[0] if log_probs: return utils.log_softmax(logits, dim=-1, onnx_trace=self.onnx_trace) else: return utils.softmax(logits, dim=-1, onnx_trace=self.onnx_trace) def max_positions(self): """Maximum input length supported by the decoder.""" return 1e6 # an arbitrary large number def upgrade_state_dict_named(self, state_dict, name): """Upgrade old state dicts to work with newer code.""" return state_dict def prepare_for_onnx_export_(self): self.onnx_trace = True ================================================ FILE: fairseq/models/fairseq_encoder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from typing import Dict, List, NamedTuple, Optional import torch import torch.nn as nn from torch import Tensor EncoderOut = NamedTuple( "EncoderOut", [ ("encoder_out", Tensor), # T x B x C ("encoder_padding_mask", Optional[Tensor]), # B x T ("encoder_embedding", Optional[Tensor]), # B x T x C ("encoder_states", Optional[List[Tensor]]), # List[T x B x C] ("src_tokens", Optional[Tensor]), # B x T ("src_lengths", Optional[Tensor]), # B x 1 ], ) class FairseqEncoder(nn.Module): """Base class for encoders.""" def __init__(self, dictionary): super().__init__() self.dictionary = dictionary def forward(self, src_tokens, src_lengths=None, **kwargs): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (LongTensor): lengths of each source sentence of shape `(batch)` """ raise NotImplementedError def forward_torchscript(self, net_input: Dict[str, Tensor]): """A TorchScript-compatible version of forward. Encoders which use additional arguments may want to override this method for TorchScript compatibility. """ if torch.jit.is_scripting(): return self.forward( src_tokens=net_input["src_tokens"], src_lengths=net_input["src_lengths"], ) else: return self.forward_non_torchscript(net_input) @torch.jit.unused def forward_non_torchscript(self, net_input: Dict[str, Tensor]): encoder_input = { k: v for k, v in net_input.items() if k != "prev_output_tokens" } return self.forward(**encoder_input) def reorder_encoder_out(self, encoder_out, new_order): """ Reorder encoder output according to `new_order`. 
Args: encoder_out: output from the ``forward()`` method new_order (LongTensor): desired order Returns: `encoder_out` rearranged according to `new_order` """ raise NotImplementedError def max_positions(self): """Maximum input length supported by the encoder.""" return 1e6 # an arbitrary large number def upgrade_state_dict_named(self, state_dict, name): """Upgrade old state dicts to work with newer code.""" return state_dict def set_num_updates(self, num_updates): """State from trainer to pass along to model at every update.""" def _apply(m): if hasattr(m, "set_num_updates") and m != self: m.set_num_updates(num_updates) self.apply(_apply) ================================================ FILE: fairseq/models/fairseq_incremental_decoder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging from typing import Dict, Optional from fairseq.incremental_decoding_utils import with_incremental_state from fairseq.models import FairseqDecoder from torch import Tensor logger = logging.getLogger(__name__) @with_incremental_state class FairseqIncrementalDecoder(FairseqDecoder): """Base class for incremental decoders. Incremental decoding is a special mode at inference time where the Model only receives a single timestep of input corresponding to the previous output token (for teacher forcing) and must produce the next output *incrementally*. Thus the model must cache any long-term state that is needed about the sequence, e.g., hidden states, convolutional states, etc. Compared to the standard :class:`FairseqDecoder` interface, the incremental decoder interface allows :func:`forward` functions to take an extra keyword argument (*incremental_state*) that can be used to cache state across time-steps. 
The :class:`FairseqIncrementalDecoder` interface also defines the :func:`reorder_incremental_state` method, which is used during beam search to select and reorder the incremental state based on the selection of beams. To learn more about how incremental decoding works, refer to `this blog <http://www.telesens.co/2019/04/21/understanding-incremental-decoding-in-fairseq/>`_. """ def __init__(self, dictionary): super().__init__(dictionary) def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs ): """ Args: prev_output_tokens (LongTensor): shifted output tokens of shape `(batch, tgt_len)`, for teacher forcing encoder_out (dict, optional): output from the encoder, used for encoder-side attention incremental_state (dict, optional): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ raise NotImplementedError def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs ): """ Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ raise NotImplementedError def reorder_incremental_state( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor, ): """Reorder incremental state. This will be called when the order of the input has changed from the previous time step. A typical use case is beam search, where the input order changes between time steps based on the selection of beams. """ pass def reorder_incremental_state_scripting( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor, ): """Main entry point for reordering the incremental state. Due to limitations in TorchScript, we call this function in :class:`fairseq.sequence_generator.SequenceGenerator` instead of calling :func:`reorder_incremental_state` directly. 
""" for module in self.modules(): if hasattr(module, "reorder_incremental_state"): result = module.reorder_incremental_state(incremental_state, new_order) if result is not None: incremental_state = result def set_beam_size(self, beam_size): """Sets the beam size in the decoder and all children.""" if getattr(self, "_beam_size", -1) != beam_size: seen = set() def apply_set_beam_size(module): if ( module != self and hasattr(module, "set_beam_size") and module not in seen ): seen.add(module) module.set_beam_size(beam_size) self.apply(apply_set_beam_size) self._beam_size = beam_size ================================================ FILE: fairseq/models/fairseq_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Base classes for various fairseq models. """ import logging from argparse import Namespace from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn import torch.nn.functional as F from fairseq import utils from fairseq.data import Dictionary from fairseq.dataclass.utils import ( convert_namespace_to_omegaconf, gen_parser_from_dataclass, ) from fairseq.models import FairseqDecoder, FairseqEncoder from omegaconf import DictConfig from torch import Tensor logger = logging.getLogger(__name__) def check_type(module, expected_type): if hasattr(module, "unwrapped_module"): assert isinstance( module.unwrapped_module, expected_type ), f"{type(module.unwrapped_module)} != {expected_type}" else: assert isinstance(module, expected_type), f"{type(module)} != {expected_type}" class BaseFairseqModel(nn.Module): """Base class for fairseq models.""" def __init__(self): super().__init__() self._is_generation_fast = False @classmethod def add_args(cls, parser): """Add model-specific arguments to the parser.""" dc = getattr(cls, "__dataclass", None) if dc is not None: # do 
not set defaults so that settings defaults from various architectures still works gen_parser_from_dataclass(parser, dc(), delete_default=True) @classmethod def build_model(cls, args, task): """Build a new model instance.""" raise NotImplementedError("Model must implement the build_model method") def get_targets(self, sample, net_output): """Get targets from either the sample or the net's output.""" return sample["target"] def get_normalized_probs( self, net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], log_probs: bool, sample: Optional[Dict[str, Tensor]] = None, ): """Get normalized probabilities (or log probs) from a net's output.""" return self.get_normalized_probs_scriptable(net_output, log_probs, sample) # TorchScript doesn't support super() method so that the scriptable Subclass # can't access the base class model in Torchscript. # Current workaround is to add a helper function with different name and # call the helper function from scriptable Subclass. def get_normalized_probs_scriptable( self, net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], log_probs: bool, sample: Optional[Dict[str, Tensor]] = None, ): """Scriptable helper function for get_normalized_probs in ~BaseFairseqModel""" if hasattr(self, "decoder"): return self.decoder.get_normalized_probs(net_output, log_probs, sample) elif torch.is_tensor(net_output): # syntactic sugar for simple models which don't have a decoder # (e.g., the classification tutorial) logits = net_output.float() if log_probs: return F.log_softmax(logits, dim=-1) else: return F.softmax(logits, dim=-1) raise NotImplementedError def extract_features(self, *args, **kwargs): """Similar to *forward* but only return features.""" return self(*args, **kwargs) def max_positions(self): """Maximum length supported by the model.""" return None def load_state_dict( self, state_dict, strict=True, model_cfg: Optional[DictConfig] = None, args: Optional[Namespace] = None, ): """Copies parameters and 
buffers from *state_dict* into this module and its descendants. Overrides the method in :class:`nn.Module`. Compared with that method this additionally "upgrades" *state_dicts* from old checkpoints. """ if model_cfg is None and args is not None: logger.warn( "using 'args' is deprecated, please update your code to use dataclass config" ) model_cfg = convert_namespace_to_omegaconf(args).model self.upgrade_state_dict(state_dict) from fairseq.checkpoint_utils import prune_state_dict new_state_dict = prune_state_dict(state_dict, model_cfg) return super().load_state_dict(new_state_dict, strict) def upgrade_state_dict(self, state_dict): """Upgrade old state dicts to work with newer code.""" self.upgrade_state_dict_named(state_dict, "") def upgrade_state_dict_named(self, state_dict, name): """Upgrade old state dicts to work with newer code. Args: state_dict (dict): state dictionary to upgrade, in place name (str): the state dict key corresponding to the current module """ assert state_dict is not None def do_upgrade(m, prefix): if len(prefix) > 0: prefix += "." 
for n, c in m.named_children(): name = prefix + n if hasattr(c, "upgrade_state_dict_named"): c.upgrade_state_dict_named(state_dict, name) elif hasattr(c, "upgrade_state_dict"): c.upgrade_state_dict(state_dict) do_upgrade(c, name) do_upgrade(self, name) def set_num_updates(self, num_updates): """State from trainer to pass along to model at every update.""" for m in self.modules(): if hasattr(m, "set_num_updates") and m != self: m.set_num_updates(num_updates) def set_epoch(self, epoch): for m in self.modules(): if hasattr(m, "set_epoch") and m != self: m.set_epoch(epoch) def prepare_for_inference_(self, cfg: DictConfig): """Prepare model for inference.""" kwargs = {} kwargs["beamable_mm_beam_size"] = ( None if getattr(cfg.generation, "no_beamable_mm", False) else getattr(cfg.generation, "beam", 5) ) kwargs["need_attn"] = getattr(cfg.generation, "print_alignment", False) if getattr(cfg.generation, "retain_dropout", False): kwargs["retain_dropout"] = cfg.generation.retain_dropout kwargs["retain_dropout_modules"] = cfg.generation.retain_dropout_modules self.make_generation_fast_(**kwargs) def make_generation_fast_(self, **kwargs): """ Legacy entry point to optimize model for faster generation. Prefer prepare_for_inference_. """ if self._is_generation_fast: return # only apply once self._is_generation_fast = True # remove weight norm from all modules in the network def apply_remove_weight_norm(module): try: nn.utils.remove_weight_norm(module) except (AttributeError, ValueError): # this module didn't have weight norm return self.apply(apply_remove_weight_norm) def apply_make_generation_fast_(module, prefix): if len(prefix) > 0: prefix += "." 
base_func = BaseFairseqModel.make_generation_fast_ for n, m in module.named_modules(): if ( m != self and hasattr(m, "make_generation_fast_") # don't call this implementation again, e.g., if # children modules also inherit from BaseFairseqModel and m.make_generation_fast_.__func__ is not base_func ): name = prefix + n m.make_generation_fast_(name=name, **kwargs) apply_make_generation_fast_(self, "") def train(mode=True): if mode: raise RuntimeError("cannot train after make_generation_fast") # this model should no longer be used for training self.eval() self.train = train def prepare_for_onnx_export_(self, **kwargs): """Make model exportable via ONNX trace.""" seen = set() def apply_prepare_for_onnx_export_(module): if ( module != self and hasattr(module, "prepare_for_onnx_export_") and module not in seen ): seen.add(module) module.prepare_for_onnx_export_(**kwargs) self.apply(apply_prepare_for_onnx_export_) @classmethod def from_pretrained( cls, model_name_or_path, checkpoint_file="model.pt", data_name_or_path=".", **kwargs, ): """ Load a :class:`~fairseq.models.FairseqModel` from a pre-trained model file. Downloads and caches the pre-trained model file if needed. The base implementation returns a :class:`~fairseq.hub_utils.GeneratorHubInterface`, which can be used to generate translations or sample from language models. The underlying :class:`~fairseq.models.FairseqModel` can be accessed via the *generator.models* attribute. Other models may override this to implement custom hub interfaces. Args: model_name_or_path (str): either the name of a pre-trained model to load or a path/URL to a pre-trained model state dict checkpoint_file (str, optional): colon-separated list of checkpoint files in the model archive to ensemble (default: 'model.pt') data_name_or_path (str, optional): point args.data to the archive at the given path/URL. Can start with '.' or './' to reuse the model archive path. 
""" from fairseq import hub_utils x = hub_utils.from_pretrained( model_name_or_path, checkpoint_file, data_name_or_path, archive_map=cls.hub_models(), **kwargs, ) logger.info(x["args"]) return hub_utils.GeneratorHubInterface(x["args"], x["task"], x["models"]) @classmethod def hub_models(cls): return {} class FairseqEncoderDecoderModel(BaseFairseqModel): """Base class for encoder-decoder models. Args: encoder (FairseqEncoder): the encoder decoder (FairseqDecoder): the decoder """ def __init__(self, encoder, decoder): super().__init__() self.encoder = encoder self.decoder = decoder check_type(self.encoder, FairseqEncoder) check_type(self.decoder, FairseqDecoder) def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): """ Run the forward pass for an encoder-decoder model. First feed a batch of source tokens through the encoder. Then, feed the encoder output and previous decoder outputs (i.e., teacher forcing) to the decoder to produce the next outputs:: encoder_out = self.encoder(src_tokens, src_lengths) return self.decoder(prev_output_tokens, encoder_out) Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (LongTensor): source sentence lengths of shape `(batch)` prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) decoder_out = self.decoder( prev_output_tokens, encoder_out=encoder_out, **kwargs ) return decoder_out def forward_decoder(self, prev_output_tokens, **kwargs): return self.decoder(prev_output_tokens, **kwargs) def extract_features(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): """ Similar to *forward* but only return features. 
Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) features = self.decoder.extract_features( prev_output_tokens, encoder_out=encoder_out, **kwargs ) return features def output_layer(self, features, **kwargs): """Project features to the default output size (typically vocabulary size).""" return self.decoder.output_layer(features, **kwargs) def max_positions(self): """Maximum length supported by the model.""" return (self.encoder.max_positions(), self.decoder.max_positions()) def max_decoder_positions(self): """Maximum length supported by the decoder.""" return self.decoder.max_positions() class FairseqModel(FairseqEncoderDecoderModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) utils.deprecation_warning( "FairseqModel is deprecated, please use FairseqEncoderDecoderModel " "or BaseFairseqModel instead", stacklevel=4, ) class FairseqMultiModel(BaseFairseqModel): """Base class for combining multiple encoder-decoder models.""" def __init__(self, encoders, decoders): super().__init__() assert encoders.keys() == decoders.keys() self.keys = list(encoders.keys()) for key in self.keys: check_type(encoders[key], FairseqEncoder) check_type(decoders[key], FairseqDecoder) self.models = nn.ModuleDict( { key: FairseqEncoderDecoderModel(encoders[key], decoders[key]) for key in self.keys } ) @staticmethod def build_shared_embeddings( dicts: Dict[str, Dictionary], langs: List[str], embed_dim: int, build_embedding: callable, pretrained_embed_path: Optional[str] = None, ): """ Helper function to build shared embeddings for a set of languages after checking that all dicts corresponding to those languages are equivalent. 
Args: dicts: Dict of lang_id to its corresponding Dictionary langs: languages that we want to share embeddings for embed_dim: embedding dimension build_embedding: callable function to actually build the embedding pretrained_embed_path: Optional path to load pretrained embeddings """ shared_dict = dicts[langs[0]] if any(dicts[lang] != shared_dict for lang in langs): raise ValueError( "--share-*-embeddings requires a joined dictionary: " "--share-encoder-embeddings requires a joined source " "dictionary, --share-decoder-embeddings requires a joined " "target dictionary, and --share-all-embeddings requires a " "joint source + target dictionary." ) return build_embedding(shared_dict, embed_dim, pretrained_embed_path) def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): raise NotImplementedError def max_positions(self): """Maximum length supported by the model.""" return { key: ( self.models[key].encoder.max_positions(), self.models[key].decoder.max_positions(), ) for key in self.keys } def max_decoder_positions(self): """Maximum length supported by the decoder.""" return min(model.decoder.max_positions() for model in self.models.values()) @property def encoder(self): return self.models[self.keys[0]].encoder @property def decoder(self): return self.models[self.keys[0]].decoder def forward_decoder(self, prev_output_tokens, **kwargs): return self.decoder(prev_output_tokens, **kwargs) def load_state_dict( self, state_dict, strict=True, model_cfg=None, args: Optional[Namespace] = None, ): """Copies parameters and buffers from *state_dict* into this module and its descendants. Overrides the method in :class:`nn.Module`. Compared with that method this additionally "upgrades" *state_dicts* from old checkpoints. 
""" if model_cfg is None and args is not None: logger.warn( "using 'args' is deprecated, please update your code to use dataclass config" ) model_cfg = convert_namespace_to_omegaconf(args).model self.upgrade_state_dict(state_dict) from fairseq.checkpoint_utils import prune_state_dict new_state_dict = prune_state_dict(state_dict, model_cfg) return super().load_state_dict(new_state_dict, strict) class FairseqLanguageModel(BaseFairseqModel): """Base class for decoder-only models. Args: decoder (FairseqDecoder): the decoder """ def __init__(self, decoder): super().__init__() self.decoder = decoder check_type(self.decoder, FairseqDecoder) def forward(self, src_tokens, **kwargs): """ Run the forward pass for a decoder-only model. Feeds a batch of tokens through the decoder to predict the next tokens. Args: src_tokens (LongTensor): tokens on which to condition the decoder, of shape `(batch, tgt_len)` src_lengths (LongTensor): source sentence lengths of shape `(batch)` Returns: tuple: - the decoder's output of shape `(batch, seq_len, vocab)` - a dictionary with any model-specific outputs """ return self.decoder(src_tokens, **kwargs) def forward_decoder(self, prev_output_tokens, **kwargs): return self.decoder(prev_output_tokens, **kwargs) def extract_features(self, src_tokens, **kwargs): """ Similar to *forward* but only return features. 
Returns: tuple: - the decoder's features of shape `(batch, seq_len, embed_dim)` - a dictionary with any model-specific outputs """ return self.decoder.extract_features(src_tokens, **kwargs) def output_layer(self, features, **kwargs): """Project features to the default output size (typically vocabulary size).""" return self.decoder.output_layer(features, **kwargs) def max_positions(self): """Maximum length supported by the model.""" return self.decoder.max_positions() def max_decoder_positions(self): """Maximum length supported by the decoder.""" return self.decoder.max_positions() @property def supported_targets(self): return {"future"} class FairseqEncoderModel(BaseFairseqModel): """Base class for encoder-only models. Args: encoder (FairseqEncoder): the encoder """ def __init__(self, encoder): super().__init__() self.encoder = encoder check_type(self.encoder, FairseqEncoder) def forward(self, src_tokens, src_lengths, **kwargs): """ Run the forward pass for a encoder-only model. Feeds a batch of tokens through the encoder to generate features. Args: src_tokens (LongTensor): input tokens of shape `(batch, src_len)` src_lengths (LongTensor): source sentence lengths of shape `(batch)` Returns: the encoder's output, typically of shape `(batch, src_len, features)` """ return self.encoder(src_tokens, src_lengths, **kwargs) def get_normalized_probs(self, net_output, log_probs, sample=None): """Get normalized probabilities (or log probs) from a net's output.""" encoder_out = net_output["encoder_out"] if torch.is_tensor(encoder_out): logits = encoder_out.float() if log_probs: return F.log_softmax(logits, dim=-1) else: return F.softmax(logits, dim=-1) raise NotImplementedError def max_positions(self): """Maximum length supported by the model.""" return self.encoder.max_positions() ================================================ FILE: fairseq/models/fconv.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math import torch import torch.nn as nn import torch.nn.functional as F from fairseq import utils from fairseq.models import ( FairseqEncoder, FairseqEncoderDecoderModel, FairseqIncrementalDecoder, register_model, register_model_architecture, ) from fairseq.modules import ( AdaptiveSoftmax, BeamableMM, FairseqDropout, GradMultiply, LearnedPositionalEmbedding, LinearizedConvolution, ) @register_model("fconv") class FConvModel(FairseqEncoderDecoderModel): """ A fully convolutional model, i.e. a convolutional encoder and a convolutional decoder, as described in `"Convolutional Sequence to Sequence Learning" (Gehring et al., 2017) <https://arxiv.org/abs/1705.03122>`_. Args: encoder (FConvEncoder): the encoder decoder (FConvDecoder): the decoder The Convolutional model provides the following named architectures and command-line arguments: .. argparse:: :ref: fairseq.models.fconv_parser :prog: """ @classmethod def hub_models(cls): def moses_subword(path): return { "path": path, "tokenizer": "moses", "bpe": "subword_nmt", } return { "conv.wmt14.en-fr": moses_subword( "https://dl.fbaipublicfiles.com/fairseq/models/wmt14.v2.en-fr.fconv-py.tar.bz2" ), "conv.wmt14.en-de": moses_subword( "https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-de.fconv-py.tar.bz2" ), "conv.wmt17.en-de": moses_subword( "https://dl.fbaipublicfiles.com/fairseq/models/wmt17.v2.en-de.fconv-py.tar.bz2" ), } def __init__(self, encoder, decoder): super().__init__(encoder, decoder) self.encoder.num_attention_layers = sum( layer is not None for layer in decoder.attention ) @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" # fmt: off parser.add_argument('--dropout', type=float, metavar='D', help='dropout probability') parser.add_argument('--encoder-embed-dim', type=int, metavar='N', help='encoder embedding dimension') 
parser.add_argument('--encoder-embed-path', type=str, metavar='STR', help='path to pre-trained encoder embedding') parser.add_argument('--encoder-layers', type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]') parser.add_argument('--decoder-embed-dim', type=int, metavar='N', help='decoder embedding dimension') parser.add_argument('--decoder-embed-path', type=str, metavar='STR', help='path to pre-trained decoder embedding') parser.add_argument('--decoder-layers', type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]') parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N', help='decoder output embedding dimension') parser.add_argument('--decoder-attention', type=str, metavar='EXPR', help='decoder attention [True, ...]') parser.add_argument('--share-input-output-embed', action='store_true', help='share input and output embeddings (requires' ' --decoder-out-embed-dim and --decoder-embed-dim' ' to be equal)') # fmt: on @classmethod def build_model(cls, args, task): """Build a new model instance.""" # make sure that all args are properly defaulted (in case there are any new ones) base_architecture(args) encoder_embed_dict = None if args.encoder_embed_path: encoder_embed_dict = utils.parse_embedding(args.encoder_embed_path) utils.print_embed_overlap(encoder_embed_dict, task.source_dictionary) decoder_embed_dict = None if args.decoder_embed_path: decoder_embed_dict = utils.parse_embedding(args.decoder_embed_path) utils.print_embed_overlap(decoder_embed_dict, task.target_dictionary) encoder = FConvEncoder( dictionary=task.source_dictionary, embed_dim=args.encoder_embed_dim, embed_dict=encoder_embed_dict, convolutions=eval(args.encoder_layers), dropout=args.dropout, max_positions=args.max_source_positions, ) decoder = FConvDecoder( dictionary=task.target_dictionary, embed_dim=args.decoder_embed_dim, embed_dict=decoder_embed_dict, convolutions=eval(args.decoder_layers), out_embed_dim=args.decoder_out_embed_dim, 
attention=eval(args.decoder_attention), dropout=args.dropout, max_positions=args.max_target_positions, share_embed=args.share_input_output_embed, ) return FConvModel(encoder, decoder) class FConvEncoder(FairseqEncoder): """ Convolutional encoder consisting of `len(convolutions)` layers. Args: dictionary (~fairseq.data.Dictionary): encoding dictionary embed_dim (int, optional): embedding dimension embed_dict (str, optional): filename from which to load pre-trained embeddings max_positions (int, optional): maximum supported input sequence length convolutions (list, optional): the convolutional layer structure. Each list item `i` corresponds to convolutional layer `i`. Layers are given as ``(out_channels, kernel_width, [residual])``. Residual connections are added between layers when ``residual=1`` (which is the default behavior). dropout (float, optional): dropout to be applied before each conv layer """ def __init__( self, dictionary, embed_dim=512, embed_dict=None, max_positions=1024, convolutions=((512, 3),) * 20, dropout=0.1, ): super().__init__(dictionary) self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__ ) self.num_attention_layers = None num_embeddings = len(dictionary) self.padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx) if embed_dict: self.embed_tokens = utils.load_embedding( embed_dict, self.dictionary, self.embed_tokens ) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, self.padding_idx, ) convolutions = extend_conv_spec(convolutions) in_channels = convolutions[0][0] self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.residuals = [] layer_in_channels = [in_channels] for _, (out_channels, kernel_size, residual) in enumerate(convolutions): if residual == 0: residual_dim = out_channels else: residual_dim = layer_in_channels[-residual] self.projections.append( 
Linear(residual_dim, out_channels) if residual_dim != out_channels else None ) if kernel_size % 2 == 1: padding = kernel_size // 2 else: padding = 0 self.convolutions.append( ConvTBC( in_channels, out_channels * 2, kernel_size, dropout=dropout, padding=padding, ) ) self.residuals.append(residual) in_channels = out_channels layer_in_channels.append(out_channels) self.fc2 = Linear(in_channels, embed_dim) def forward(self, src_tokens, src_lengths): """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (LongTensor): lengths of each source sentence of shape `(batch)` Returns: dict: - **encoder_out** (tuple): a tuple with two elements, where the first element is the last encoder layer's output and the second element is the same quantity summed with the input embedding (used for attention). The shape of both tensors is `(batch, src_len, embed_dim)`. - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) x = self.dropout_module(x) input_embedding = x # project to size of convolution x = self.fc1(x) # used to mask padding in input encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B if not encoder_padding_mask.any(): encoder_padding_mask = None # B x T x C -> T x B x C x = x.transpose(0, 1) residuals = [x] # temporal convolutions for proj, conv, res_layer in zip( self.projections, self.convolutions, self.residuals ): if res_layer > 0: residual = residuals[-res_layer] residual = residual if proj is None else proj(residual) else: residual = None if encoder_padding_mask is not None: x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) x = self.dropout_module(x) if conv.kernel_size[0] % 2 == 1: # padding is implicit in the conv x = conv(x) else: padding_l = (conv.kernel_size[0] - 1) // 2 padding_r = conv.kernel_size[0] // 2 x = F.pad(x, (0, 0, 0, 0, padding_l, 
class AttentionLayer(nn.Module):
    """Dot-product attention of decoder states over the encoder outputs.

    Args:
        conv_channels (int): channel size of the incoming decoder state
        embed_dim (int): embedding dimension in which attention is computed
        bmm (callable, optional): batched matrix multiply to use;
            ``torch.bmm`` when omitted
    """

    def __init__(self, conv_channels, embed_dim, bmm=None):
        super().__init__()
        # conv_channels -> embed_dim, applied to the decoder state
        self.in_projection = Linear(conv_channels, embed_dim)
        # embed_dim -> conv_channels, applied to the attended context
        self.out_projection = Linear(embed_dim, conv_channels)

        self.bmm = torch.bmm if bmm is None else bmm

    def forward(self, x, target_embedding, encoder_out, encoder_padding_mask):
        """
        Args:
            x: decoder state — presumably `(batch, tgt_len, conv_channels)`;
                TODO confirm against the caller
            target_embedding: added to the projected state before scoring
            encoder_out (tuple): ``encoder_out[0]`` is multiplied with the
                query to obtain the scores, ``encoder_out[1]`` is averaged
                with the attention weights
            encoder_padding_mask: mask of source padding positions (or
                ``None``); masked positions receive zero attention weight

        Returns:
            tuple: the attended state (same channel size as ``x``) and the
            attention weights
        """
        skip = x

        # attention
        query = (self.in_projection(x) + target_embedding) * math.sqrt(0.5)
        scores = self.bmm(query, encoder_out[0])

        # don't attend over padding
        if encoder_padding_mask is not None:
            # FP16 support: cast to float and back
            masked = scores.float().masked_fill(
                encoder_padding_mask.unsqueeze(1), float("-inf")
            )
            scores = masked.type_as(scores)

        # softmax over last dim
        shape = scores.size()
        flat = scores.view(shape[0] * shape[1], shape[2])
        attn_scores = F.softmax(flat, dim=1).view(shape)

        context = self.bmm(attn_scores, encoder_out[1])

        # scale attention output (respecting potentially different lengths)
        src_len = encoder_out[1].size(1)
        if encoder_padding_mask is None:
            scale = src_len * math.sqrt(1.0 / src_len)
        else:
            # exclude padding
            valid = src_len - encoder_padding_mask.type_as(context).sum(
                dim=1, keepdim=True
            )
            valid = valid.unsqueeze(-1)
            scale = valid * valid.rsqrt()
        context = context * scale

        # project back
        out = (self.out_projection(context) + skip) * math.sqrt(0.5)
        return out, attn_scores

    def make_generation_fast_(self, beamable_mm_beam_size=None, **kwargs):
        """Replace torch.bmm with BeamableMM."""
        if beamable_mm_beam_size is None:
            return
        # ``bmm`` is dropped as a plain attribute first so that it can be
        # registered again as a submodule.
        del self.bmm
        self.add_module("bmm", BeamableMM(beamable_mm_beam_size))
) num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) if embed_dict: self.embed_tokens = utils.load_embedding( embed_dict, self.dictionary, self.embed_tokens ) self.embed_positions = ( PositionalEmbedding( max_positions, embed_dim, padding_idx, ) if positional_embeddings else None ) self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.residuals = [] layer_in_channels = [in_channels] for i, (out_channels, kernel_size, residual) in enumerate(convolutions): if residual == 0: residual_dim = out_channels else: residual_dim = layer_in_channels[-residual] self.projections.append( Linear(residual_dim, out_channels) if residual_dim != out_channels else None ) self.convolutions.append( LinearizedConv1d( in_channels, out_channels * 2, kernel_size, padding=(kernel_size - 1), dropout=dropout, ) ) self.attention.append( AttentionLayer(out_channels, embed_dim) if attention[i] else None ) self.residuals.append(residual) in_channels = out_channels layer_in_channels.append(out_channels) self.adaptive_softmax = None self.fc2 = self.fc3 = None if adaptive_softmax_cutoff is not None: assert not share_embed self.adaptive_softmax = AdaptiveSoftmax( num_embeddings, in_channels, adaptive_softmax_cutoff, dropout=adaptive_softmax_dropout, ) else: self.fc2 = Linear(in_channels, out_embed_dim) if share_embed: assert out_embed_dim == embed_dim, ( "Shared embed weights implies same dimensions " " out_embed_dim={} vs embed_dim={}".format(out_embed_dim, embed_dim) ) self.fc3 = nn.Linear(out_embed_dim, num_embeddings) self.fc3.weight = self.embed_tokens.weight else: self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout) def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused ): if encoder_out is not None: encoder_padding_mask = 
encoder_out["encoder_padding_mask"] encoder_out = encoder_out["encoder_out"] # split and transpose encoder outputs encoder_a, encoder_b = self._split_encoder_out( encoder_out, incremental_state ) if self.embed_positions is not None: pos_embed = self.embed_positions(prev_output_tokens, incremental_state) else: pos_embed = 0 if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] x = self._embed_tokens(prev_output_tokens, incremental_state) # embed tokens and combine with positional embeddings x += pos_embed x = self.dropout_module(x) target_embedding = x # project to size of convolution x = self.fc1(x) # B x T x C -> T x B x C x = self._transpose_if_training(x, incremental_state) # temporal convolutions avg_attn_scores = None num_attn_layers = len(self.attention) residuals = [x] for proj, conv, attention, res_layer in zip( self.projections, self.convolutions, self.attention, self.residuals ): if res_layer > 0: residual = residuals[-res_layer] residual = residual if proj is None else proj(residual) else: residual = None x = self.dropout_module(x) x = conv(x, incremental_state) x = F.glu(x, dim=2) # attention if attention is not None: x = self._transpose_if_training(x, incremental_state) x, attn_scores = attention( x, target_embedding, (encoder_a, encoder_b), encoder_padding_mask ) if not self.training and self.need_attn: attn_scores = attn_scores / num_attn_layers if avg_attn_scores is None: avg_attn_scores = attn_scores else: avg_attn_scores.add_(attn_scores) x = self._transpose_if_training(x, incremental_state) # residual if residual is not None: x = (x + residual) * math.sqrt(0.5) residuals.append(x) # T x B x C -> B x T x C x = self._transpose_if_training(x, incremental_state) # project back to size of vocabulary if not using adaptive softmax if self.fc2 is not None and self.fc3 is not None: x = self.fc2(x) x = self.dropout_module(x) x = self.fc3(x) return x, avg_attn_scores def reorder_incremental_state(self, incremental_state, 
new_order): super().reorder_incremental_state(incremental_state, new_order) encoder_out = utils.get_incremental_state( self, incremental_state, "encoder_out" ) if encoder_out is not None: encoder_out = tuple(eo.index_select(0, new_order) for eo in encoder_out) utils.set_incremental_state( self, incremental_state, "encoder_out", encoder_out ) def max_positions(self): """Maximum output length supported by the decoder.""" return ( self.embed_positions.max_positions if self.embed_positions is not None else float("inf") ) def upgrade_state_dict(self, state_dict): if utils.item(state_dict.get("decoder.version", torch.Tensor([1]))[0]) < 2: # old models use incorrect weight norm dimension for i, conv in enumerate(self.convolutions): # reconfigure weight norm nn.utils.remove_weight_norm(conv) self.convolutions[i] = nn.utils.weight_norm(conv, dim=0) state_dict["decoder.version"] = torch.Tensor([1]) return state_dict def make_generation_fast_(self, need_attn=False, **kwargs): self.need_attn = need_attn def _embed_tokens(self, tokens, incremental_state): if incremental_state is not None: # keep only the last token for incremental forward pass tokens = tokens[:, -1:] return self.embed_tokens(tokens) def _split_encoder_out(self, encoder_out, incremental_state): """Split and transpose encoder outputs. This is cached when doing incremental inference. 
""" cached_result = utils.get_incremental_state( self, incremental_state, "encoder_out" ) if cached_result is not None: return cached_result # transpose only once to speed up attention layers encoder_a, encoder_b = encoder_out encoder_a = encoder_a.transpose(1, 2).contiguous() result = (encoder_a, encoder_b) if incremental_state is not None: utils.set_incremental_state(self, incremental_state, "encoder_out", result) return result def _transpose_if_training(self, x, incremental_state): if incremental_state is None: x = x.transpose(0, 1) return x def extend_conv_spec(convolutions): """ Extends convolutional spec that is a list of tuples of 2 or 3 parameters (kernel size, dim size and optionally how many layers behind to look for residual) to default the residual propagation param if it is not specified """ extended = [] for spec in convolutions: if len(spec) == 3: extended.append(spec) elif len(spec) == 2: extended.append(spec + (1,)) else: raise Exception( "invalid number of parameters in convolution spec " + str(spec) + ". 
expected 2 or 3" ) return tuple(extended) def Embedding(num_embeddings, embedding_dim, padding_idx): m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx) nn.init.normal_(m.weight, 0, 0.1) nn.init.constant_(m.weight[padding_idx], 0) return m def PositionalEmbedding(num_embeddings, embedding_dim, padding_idx): m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx) nn.init.normal_(m.weight, 0, 0.1) nn.init.constant_(m.weight[padding_idx], 0) return m def Linear(in_features, out_features, dropout=0.0): """Weight-normalized Linear layer (input: N x T x C)""" m = nn.Linear(in_features, out_features) nn.init.normal_(m.weight, mean=0, std=math.sqrt((1 - dropout) / in_features)) nn.init.constant_(m.bias, 0) return nn.utils.weight_norm(m) def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs): """Weight-normalized Conv1d layer optimized for decoding""" m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs) std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) nn.init.normal_(m.weight, mean=0, std=std) nn.init.constant_(m.bias, 0) return nn.utils.weight_norm(m, dim=2) def ConvTBC(in_channels, out_channels, kernel_size, dropout=0.0, **kwargs): """Weight-normalized Conv1d layer""" from fairseq.modules import ConvTBC m = ConvTBC(in_channels, out_channels, kernel_size, **kwargs) std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels)) nn.init.normal_(m.weight, mean=0, std=std) nn.init.constant_(m.bias, 0) return nn.utils.weight_norm(m, dim=2) @register_model_architecture("fconv", "fconv") def base_architecture(args): args.dropout = getattr(args, "dropout", 0.1) args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) args.encoder_embed_path = getattr(args, "encoder_embed_path", None) args.encoder_layers = getattr(args, "encoder_layers", "[(512, 3)] * 20") args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) args.decoder_embed_path = 
@register_model_architecture("fconv", "fconv_iwslt_de_en")
def fconv_iwslt_de_en(args):
    """Architecture preset: 256-dim embeddings, 4 encoder / 3 decoder layers.

    Only options that are not already present on ``args`` are filled in;
    everything else is then defaulted by ``base_architecture``.
    """
    presets = (
        ("encoder_embed_dim", 256),
        ("encoder_layers", "[(256, 3)] * 4"),
        ("decoder_embed_dim", 256),
        ("decoder_layers", "[(256, 3)] * 3"),
        ("decoder_out_embed_dim", 256),
    )
    for option, default in presets:
        setattr(args, option, getattr(args, option, default))
    base_architecture(args)


@register_model_architecture("fconv", "fconv_wmt_en_ro")
def fconv_wmt_en_ro(args):
    """Architecture preset: base architecture with a 512-dim output embedding."""
    option = "decoder_out_embed_dim"
    setattr(args, option, getattr(args, option, 512))
    base_architecture(args)


@register_model_architecture("fconv", "fconv_wmt_en_de")
def fconv_wmt_en_de(args):
    """Architecture preset: 15 layers widening from 512 to 2048 units.

    The same layer spec is used as the default for encoder and decoder.
    """
    convs = " + ".join(
        (
            "[(512, 3)] * 9",  # first 9 layers have 512 units
            "[(1024, 3)] * 4",  # next 4 layers have 1024 units
            "[(2048, 1)] * 2",  # final 2 layers use 1x1 convolutions
        )
    )

    presets = (
        ("encoder_embed_dim", 768),
        ("encoder_layers", convs),
        ("decoder_embed_dim", 768),
        ("decoder_layers", convs),
        ("decoder_out_embed_dim", 512),
    )
    for option, default in presets:
        setattr(args, option, getattr(args, option, default))
    base_architecture(args)
@register_model("fconv_lm")
class FConvLanguageModel(FairseqLanguageModel):
    """Language model that wraps a single ``FConvDecoder``."""

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        option_table = (
            (
                "--dropout",
                dict(type=float, metavar="D", help="dropout probability"),
            ),
            (
                "--decoder-embed-dim",
                dict(type=int, metavar="N", help="decoder embedding dimension"),
            ),
            (
                "--decoder-layers",
                dict(
                    type=str,
                    metavar="EXPR",
                    help="decoder layers [(dim, kernel_size), ...]",
                ),
            ),
            (
                "--decoder-out-embed-dim",
                dict(
                    type=int,
                    metavar="N",
                    help="decoder output embedding dimension",
                ),
            ),
            (
                "--adaptive-softmax-cutoff",
                dict(
                    metavar="EXPR",
                    help="comma separated list of adaptive softmax cutoff points. "
                    "Must be used with adaptive_loss criterion",
                ),
            ),
            (
                "--adaptive-softmax-dropout",
                dict(
                    type=float,
                    metavar="D",
                    help="sets adaptive softmax dropout for the tail projections",
                ),
            ),
            (
                "--decoder-attention",
                dict(
                    type=str,
                    metavar="EXPR",
                    help="decoder attention [True, ...]",
                ),
            ),
        )
        for flag, options in option_table:
            parser.add_argument(flag, **options)

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Args:
            args: parsed argument namespace; defaulted in place by
                ``base_lm_architecture``.
            task: task object exposing ``target_dictionary``.

        Returns:
            FConvLanguageModel: the language model around a new decoder.
        """
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        # Namespaces that only carry ``max_target_positions`` get it copied
        # to ``tokens_per_sample``, which is what the decoder is built with.
        has_max_target = safe_hasattr(args, "max_target_positions")
        if has_max_target and not safe_hasattr(args, "tokens_per_sample"):
            args.tokens_per_sample = args.max_target_positions

        # NOTE(review): ``decoder_layers`` / ``decoder_attention`` are Python
        # expressions passed to eval(); only trusted input must reach here.
        decoder_options = dict(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.tokens_per_sample,
            share_embed=False,
            positional_embeddings=False,
            # the cutoff list is only parsed for the adaptive_loss criterion
            adaptive_softmax_cutoff=(
                None
                if args.criterion != "adaptive_loss"
                else utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
            ),
            adaptive_softmax_dropout=args.adaptive_softmax_dropout,
        )
        decoder = FConvDecoder(**decoder_options)
        return FConvLanguageModel(decoder)
@register_model_architecture("fconv_lm", "fconv_lm_dauphin_gbw")
def fconv_lm_dauphin_gbw(args):
    """Architecture preset built from repeated three-layer groups.

    Fills in the decoder layer spec, embedding size, attention flag and
    adaptive softmax cutoffs when they are not already set on ``args``,
    then applies ``base_lm_architecture`` for the remaining defaults.
    """
    layers = " + ".join(
        (
            "[(512, 5)]",
            "[(128, 1, 0), (128, 5, 0), (512, 1, 3)] * 3",
            "[(512, 1, 0), (512, 5, 0), (1024, 1, 3)] * 3",
            "[(1024, 1, 0), (1024, 5, 0), (2048, 1, 3)] * 6",
            "[(1024, 1, 0), (1024, 5, 0), (4096, 1, 3)]",
        )
    )

    presets = (
        ("decoder_embed_dim", 128),
        ("decoder_layers", layers),
        ("decoder_attention", "False"),
        ("adaptive_softmax_cutoff", "10000,50000,200000"),
    )
    for option, default in presets:
        setattr(args, option, getattr(args, option, default))
    base_lm_architecture(args)
@register_model("fconv_self_att")
class FConvModelSelfAtt(FairseqEncoderDecoderModel):
    """Convolutional encoder-decoder model with optional model fusion.

    When a pretrained encoder is supplied, it is wrapped together with the
    trainable encoder in a ``CompositeEncoder``.
    """

    @classmethod
    def hub_models(cls):
        """Return the mapping of hub model names to their download specs."""
        stories_archive = (
            "https://dl.fbaipublicfiles.com/fairseq/models/stories_checkpoint.tar.gz"
        )
        return {
            "conv.stories.pretrained": {
                "path": stories_archive,
                "checkpoint_file": "pretrained_checkpoint.pt",
                "tokenizer": "nltk",
            },
            "conv.stories": {
                "path": stories_archive,
                "checkpoint_file": "fusion_checkpoint.pt",
                "tokenizer": "nltk",
                "pretrained": "True",
                "pretrained_checkpoint": "./pretrained_checkpoint.pt",
            },
            # Test set containing dictionaries
            "data.stories": "https://dl.fbaipublicfiles.com/fairseq/data/stories_test.tar.bz2",
        }

    def __init__(self, encoder, decoder, pretrained_encoder=None):
        super().__init__(encoder, decoder)
        # The encoder's gradient scaling needs the number of decoder layers
        # that actually carry an attention module.
        attending_layers = [
            layer for layer in decoder.attention if layer is not None
        ]
        self.encoder.num_attention_layers = len(attending_layers)

        self.pretrained_encoder = pretrained_encoder
        encoders = {"encoder": encoder}
        if self.pretrained_encoder is not None:
            encoders["pretrained"] = self.pretrained_encoder
        # for fusion model, CompositeEncoder contains both pretrained and training encoders
        # these are forwarded and then combined in the decoder
        self.encoder = CompositeEncoder(encoders)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        option_table = (
            ('--dropout', dict(type=float, metavar='D', help='dropout probability')),
            ('--encoder-embed-dim', dict(type=int, metavar='N', help='encoder embedding dimension')),
            ('--encoder-layers', dict(type=str, metavar='EXPR', help='encoder layers [(dim, kernel_size), ...]')),
            ('--decoder-embed-dim', dict(type=int, metavar='N', help='decoder embedding dimension')),
            ('--decoder-layers', dict(type=str, metavar='EXPR', help='decoder layers [(dim, kernel_size), ...]')),
            ('--decoder-out-embed-dim', dict(type=int, metavar='N', help='decoder output embedding dimension')),
            ('--decoder-attention', dict(type=str, metavar='EXPR', help='decoder attention [True, ...]')),
            ('--self-attention', dict(type=str, metavar='EXPR', help='decoder self-attention layers, ex: [True] + [False]*5')),
            ('--multihead-attention-nheads', dict(type=int, help='Number of heads to use in attention')),
            ('--multihead-self-attention-nheads', dict(type=int, help='Number of heads to use in self-attention')),
            ('--encoder-attention', dict(type=str, metavar='EXPR', help='encoder attention [True, ...]')),
            ('--encoder-attention-nheads', dict(type=int, help='Number of heads to use in encoder attention')),
            ('--project-input', dict(type=str, metavar='EXPR', help='Use projections in self-attention [True, ...]')),
            ('--gated-attention', dict(type=str, metavar='EXPR', help='Use GLU layers in self-attention projections [True, ...]')),
            ('--downsample', dict(type=str, metavar='EXPR', help='Use downsampling in self-attention [True, ...]')),
            ('--pretrained-checkpoint', dict(metavar='DIR', help='path to load checkpoint from pretrained model')),
            ('--pretrained', dict(type=str, metavar='EXPR', help='use pretrained model when training [True, ...]')),
        )
        # fmt: on
        for flag, options in option_table:
            parser.add_argument(flag, **options)

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        When ``args.pretrained`` evaluates to a true value, the checkpoint
        named by ``args.pretrained_checkpoint`` is loaded, its encoder and
        decoder are frozen, and both are handed to the new model.
        """
        trained_encoder = trained_decoder = None

        # NOTE(review): this and the layer/attention options below are Python
        # expressions passed to eval(); only trusted input must reach here.
        pretrained = eval(args.pretrained)
        if pretrained:
            logger.info("loading pretrained model")
            if not os.path.exists(args.pretrained_checkpoint):
                # fall back to a path relative to the data directory
                fallback = os.path.join(args.data, args.pretrained_checkpoint)
                if os.path.exists(fallback):
                    args.pretrained_checkpoint = fallback
            ensemble = checkpoint_utils.load_model_ensemble(
                filenames=[args.pretrained_checkpoint],
                task=task,
            )
            trained_model = ensemble[0][0]
            # NOTE(review): relies on the child order of the loaded model —
            # presumably encoder first, decoder second; confirm.
            submodules = list(trained_model.children())
            trained_decoder = submodules[1]
            trained_encoder = submodules[0]

            # freeze pretrained model
            for frozen in (trained_decoder, trained_encoder):
                for param in frozen.parameters():
                    param.requires_grad = False

        encoder = FConvEncoder(
            task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            convolutions=eval(args.encoder_layers),
            dropout=args.dropout,
            max_positions=args.max_source_positions,
            attention=eval(args.encoder_attention),
            attention_nheads=args.encoder_attention_nheads,
        )

        decoder = FConvDecoder(
            task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            convolutions=eval(args.decoder_layers),
            out_embed_dim=args.decoder_out_embed_dim,
            attention=eval(args.decoder_attention),
            dropout=args.dropout,
            max_positions=args.max_target_positions,
            selfattention=eval(args.self_attention),
            attention_nheads=args.multihead_attention_nheads,
            selfattention_nheads=args.multihead_self_attention_nheads,
            project_input=eval(args.project_input),
            gated_attention=eval(args.gated_attention),
            downsample=eval(args.downsample),
            pretrained=pretrained,
            trained_decoder=trained_decoder,
        )

        return FConvModelSelfAtt(encoder, decoder, trained_encoder)

    @property
    def pretrained(self):
        """Whether this model was built with a pretrained encoder."""
        return self.pretrained_encoder is not None
module_name=self.__class__.__name__ ) self.num_attention_layers = None num_embeddings = len(dictionary) self.padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, self.padding_idx) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, self.padding_idx, ) def expand_bool_array(val): if isinstance(val, bool): # expand True into [True, True, ...] and do the same with False return [val] * len(convolutions) return val attention = expand_bool_array(attention) in_channels = convolutions[0][0] self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.attproj = nn.ModuleList() for i, (out_channels, kernel_size) in enumerate(convolutions): self.projections.append( Linear(in_channels, out_channels) if in_channels != out_channels else None ) self.convolutions.append( ConvTBC(in_channels, out_channels * 2, kernel_size, dropout=dropout) ) self.attention.append( SelfAttention(out_channels, embed_dim, attention_nheads) if attention[i] else None ) in_channels = out_channels self.fc2 = Linear(in_channels, embed_dim) def forward(self, src_tokens, src_lengths): # embed tokens and positions x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens) x = self.dropout_module(x) input_embedding = x.transpose(0, 1) # project to size of convolution x = self.fc1(x) encoder_padding_mask = src_tokens.eq(self.padding_idx).t() # -> T x B if not encoder_padding_mask.any(): encoder_padding_mask = None # B x T x C -> T x B x C x = x.transpose(0, 1) # temporal convolutions for proj, conv, attention in zip( self.projections, self.convolutions, self.attention ): residual = x if proj is None else proj(x) if encoder_padding_mask is not None: x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) x = self.dropout_module(x) padding_l = (conv.kernel_size[0] - 1) // 2 padding_r = conv.kernel_size[0] // 2 x = F.pad(x, (0, 0, 0, 0, 
padding_l, padding_r)) x = conv(x) x = F.glu(x, dim=2) if attention is not None: x = attention(x) x = (x + residual) * math.sqrt(0.5) # T x B x C -> B x T x C x = x.transpose(1, 0) # project back to size of embedding x = self.fc2(x) if encoder_padding_mask is not None: encoder_padding_mask = encoder_padding_mask.t() # -> B x T x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0) # scale gradients (this only affects backward, not forward) x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers)) # add output to input embedding for attention y = (x + input_embedding.transpose(0, 1)) * math.sqrt(0.5) return { "encoder_out": (x, y), "encoder_padding_mask": encoder_padding_mask, # B x T } def reorder_encoder_out(self, encoder_out, new_order): encoder_out["encoder_out"] = tuple( eo.index_select(0, new_order) for eo in encoder_out["encoder_out"] ) if encoder_out["encoder_padding_mask"] is not None: encoder_out["encoder_padding_mask"] = encoder_out[ "encoder_padding_mask" ].index_select(0, new_order) if "pretrained" in encoder_out: encoder_out["pretrained"]["encoder_out"] = tuple( eo.index_select(0, new_order) for eo in encoder_out["pretrained"]["encoder_out"] ) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" return self.embed_positions.max_positions @with_incremental_state class FConvDecoder(FairseqDecoder): """Convolutional decoder""" def __init__( self, dictionary, embed_dim=512, out_embed_dim=256, max_positions=1024, convolutions=((512, 3),) * 8, attention=True, dropout=0.1, selfattention=False, attention_nheads=1, selfattention_nheads=1, project_input=False, gated_attention=False, downsample=False, pretrained=False, trained_decoder=None, ): super().__init__(dictionary) self.register_buffer("version", torch.Tensor([2])) self.pretrained = pretrained self.pretrained_decoder = trained_decoder self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__ ) self.need_attn = True in_channels = 
convolutions[0][0] def expand_bool_array(val): if isinstance(val, bool): # expand True into [True, True, ...] and do the same with False return [val] * len(convolutions) return val attention = expand_bool_array(attention) selfattention = expand_bool_array(selfattention) if not isinstance(attention, list) or len(attention) != len(convolutions): raise ValueError( "Attention is expected to be a list of booleans of " "length equal to the number of layers." ) num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) self.embed_positions = PositionalEmbedding( max_positions, embed_dim, padding_idx, ) self.fc1 = Linear(embed_dim, in_channels, dropout=dropout) self.projections = nn.ModuleList() self.convolutions = nn.ModuleList() self.attention = nn.ModuleList() self.selfattention = nn.ModuleList() self.attproj = nn.ModuleList() for i, (out_channels, kernel_size) in enumerate(convolutions): self.projections.append( Linear(in_channels, out_channels) if in_channels != out_channels else None ) self.convolutions.append( LinearizedConv1d( in_channels, out_channels * 2, kernel_size, padding=(kernel_size - 1), dropout=dropout, ) ) self.attention.append( DownsampledMultiHeadAttention( out_channels, embed_dim, attention_nheads, project_input=project_input, gated=False, downsample=False, ) if attention[i] else None ) self.attproj.append( Linear(out_channels, embed_dim, dropout=dropout) if attention[i] else None ) self.selfattention.append( SelfAttention( out_channels, embed_dim, selfattention_nheads, project_input=project_input, gated=gated_attention, downsample=downsample, ) if selfattention[i] else None ) in_channels = out_channels self.fc2 = Linear(in_channels, out_embed_dim) self.fc3 = Linear(out_embed_dim, num_embeddings, dropout=dropout) # model fusion if self.pretrained: # independent gates are learned from the concatenated input self.gate1 = nn.Sequential( Linear(out_embed_dim * 2, out_embed_dim), 
def forward(self, prev_output_tokens, encoder_out):
    """Decode ``prev_output_tokens`` against the encoder output.

    Args:
        prev_output_tokens: token indices fed to ``embed_tokens`` and
            ``embed_positions`` (B x T according to the shape comments
            below).
        encoder_out (dict): must provide ``["encoder"]["encoder_out"]``, a
            pair of tensors consumed by the attention layers. When
            ``self.pretrained`` is set it must also provide
            ``["pretrained"]``, which is handed to the pretrained decoder.

    Returns:
        tuple: ``(output, avg_attn_scores)``. Without fusion, ``output`` is
        ``fc3(dropout(fc2(x)))``; with fusion it is ``fc3`` applied to the
        gated combination of this decoder's and the pretrained decoder's
        ``fc2`` outputs. ``avg_attn_scores`` is ``None`` unless the module
        is in eval mode, ``need_attn`` is set and at least one attention
        layer exists.
    """
    trained_encoder_out = encoder_out["pretrained"] if self.pretrained else None
    encoder_out = encoder_out["encoder"]["encoder_out"]

    encoder_a, encoder_b = self._split_encoder_out(encoder_out)

    # embed positions
    positions = self.embed_positions(prev_output_tokens)

    # embed tokens and positions
    x = self.embed_tokens(prev_output_tokens) + positions
    x = self.dropout_module(x)
    # kept (transposed) so every attention layer can add it to its query
    target_embedding = x.transpose(0, 1)

    # project to size of convolution
    x = self.fc1(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # temporal convolutions
    avg_attn_scores = None
    for proj, conv, attention, selfattention, attproj in zip(
        self.projections,
        self.convolutions,
        self.attention,
        self.selfattention,
        self.attproj,
    ):
        # proj is None when the layer keeps the channel count, so the input
        # itself serves as the residual
        residual = x if proj is None else proj(x)

        x = self.dropout_module(x)
        x = conv(x)
        # gated linear unit over the channel dimension
        x = F.glu(x, dim=2)

        # attention
        if attention is not None:
            r = x
            x, attn_scores = attention(
                attproj(x) + target_embedding, encoder_a, encoder_b
            )
            x = x + r
            if not self.training and self.need_attn:
                # NOTE: despite the name, the scores are accumulated in place
                # (summed) across layers and never divided by the layer count
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

        if selfattention is not None:
            x = selfattention(x)

        x = (x + residual) * math.sqrt(0.5)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    # project to the output embedding size (fc2); fc3 then maps to the
    # vocabulary, either directly here or after fusion below
    x = self.fc2(x)
    x = self.dropout_module(x)
    if not self.pretrained:
        x = self.fc3(x)

    # fusion gating
    if self.pretrained:
        # The return value is unused: the call is made for its side effect.
        # The forward hook registered on pretrained_decoder.fc2 in __init__
        # stores that layer's output in self.pretrained_outputs["out"].
        trained_x, _ = self.pretrained_decoder.forward(
            prev_output_tokens, trained_encoder_out
        )
        # both gates are computed from the concatenated hidden states
        y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
        gate1 = self.gate1(y)
        gate2 = self.gate2(y)
        gated_x1 = gate1 * x
        gated_x2 = gate2 * self.pretrained_outputs["out"]
        fusion = torch.cat([gated_x1, gated_x2], dim=-1)
        fusion = self.joining(fusion)
        fusion_output = self.fc3(fusion)
        return fusion_output, avg_attn_scores
    else:
        return x, avg_attn_scores
def Linear(in_features, out_features, dropout=0.0):
    """Create an ``nn.Linear`` with dropout-aware Gaussian initialisation.

    The weight is drawn from ``N(0, (1 - dropout) / in_features)`` and the
    bias is zeroed. No weight-normalisation wrapper is applied here.

    Args:
        in_features (int): size of each input sample.
        out_features (int): size of each output sample.
        dropout (float): dropout probability used to scale the weight
            variance.

    Returns:
        nn.Linear: the initialised layer.
    """
    layer = nn.Linear(in_features, out_features)
    variance = (1 - dropout) / in_features
    layer.weight.data.normal_(mean=0, std=math.sqrt(variance))
    layer.bias.data.zero_()
    return layer
@register_model_architecture("fconv_self_att", "fconv_self_att_wp")
def fconv_self_att_wp(args):
    """Fill in the ``fconv_self_att_wp`` architecture defaults on ``args``.

    Each option below is set only when ``args`` does not already define it;
    the remaining options are then filled in by ``base_architecture``.
    """
    preset = (
        ("encoder_embed_dim", 256),
        ("encoder_layers", "[(128, 3)] * 2 + [(512,3)] * 1"),
        ("decoder_embed_dim", 256),
        (
            "decoder_layers",
            "[(512, 4)] * 4 + [(768, 4)] * 2 + [(1024, 4)] * 1",
        ),
        ("decoder_out_embed_dim", 256),
        ("self_attention", "True"),
        ("multihead_self_attention_nheads", 4),
        ("project_input", "True"),
        ("gated_attention", "True"),
        ("downsample", "True"),
    )
    for option, fallback in preset:
        # keep a value already present on args, otherwise use the preset
        setattr(args, option, getattr(args, option, fallback))

    base_architecture(args)
@dataclass
class HubertConfig(FairseqDataclass):
    """Configuration for the ``hubert`` model (consumed by ``HubertModel``).

    Every ``help`` string below is the user-facing description of its option.
    """

    # interpolated from the task configuration
    label_rate: float = II("task.label_rate")

    extractor_mode: EXTRACTOR_MODE_CHOICES = field(
        default="default",
        metadata={
            "help": "mode for feature extractor. default has a single group "
            "norm with d groups in the first conv block, whereas layer_norm "
            "has layer norms in every block (meant to use with normalize=True)"
        },
    )
    encoder_layers: int = field(
        default=12, metadata={"help": "num encoder layers in the transformer"}
    )
    encoder_embed_dim: int = field(
        default=768, metadata={"help": "encoder embedding dimension"}
    )
    encoder_ffn_embed_dim: int = field(
        default=3072, metadata={"help": "encoder embedding dimension for FFN"}
    )
    encoder_attention_heads: int = field(
        default=12, metadata={"help": "num encoder attention heads"}
    )
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="gelu", metadata={"help": "activation function to use"}
    )
    layer_type: LAYER_TYPE_CHOICES = field(
        default="transformer", metadata={"help": "layer type in encoder"}
    )

    # dropouts
    dropout: float = field(
        default=0.1,
        metadata={"help": "dropout probability for the transformer"},
    )
    attention_dropout: float = field(
        default=0.1,
        metadata={"help": "dropout probability for attention weights"},
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout probability after activation in FFN"},
    )
    encoder_layerdrop: float = field(
        default=0.0,
        metadata={"help": "probability of dropping a transformer layer"},
    )
    dropout_input: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the input (after feat extr)"},
    )
    dropout_features: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the features (after feat extr)"},
    )

    final_dim: int = field(
        default=0,
        metadata={
            "help": "project final representations and targets to this many "
            "dimensions. set to encoder_embed_dim if <= 0"
        },
    )
    untie_final_proj: bool = field(
        default=False,
        metadata={"help": "use separate projection for each target"},
    )
    layer_norm_first: bool = field(
        default=False,
        metadata={"help": "apply layernorm first in the transformer"},
    )
    conv_feature_layers: str = field(
        default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
        metadata={
            "help": "string describing convolutional feature extraction "
            "layers in form of a python list that contains "
            "[(dim, kernel_size, stride), ...]"
        },
    )
    conv_bias: bool = field(
        default=False, metadata={"help": "include bias in conv encoder"}
    )
    logit_temp: float = field(
        default=0.1, metadata={"help": "temperature to divide logits by"}
    )
    target_glu: bool = field(
        default=False, metadata={"help": "adds projection + glu to targets"}
    )
    feature_grad_mult: float = field(
        default=1.0,
        metadata={"help": "multiply feature extractor var grads by this"},
    )

    # masking
    mask_length: int = field(default=10, metadata={"help": "mask length"})
    mask_prob: float = field(
        default=0.65,
        metadata={"help": "probability of replacing a token with mask"},
    )
    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose mask length"}
    )
    mask_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument "
            "(used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_overlap: bool = field(
        default=False, metadata={"help": "whether to allow masks to overlap"}
    )
    mask_min_space: int = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )

    # channel masking
    mask_channel_length: int = field(
        default=10,
        metadata={"help": "length of the mask for features (channels)"},
    )
    mask_channel_prob: float = field(
        default=0.0,
        metadata={"help": "probability of replacing a feature with 0"},
    )
    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static",
        metadata={"help": "how to choose mask length for channel masking"},
    )
    mask_channel_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument "
            "(used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_channel_overlap: bool = field(
        default=False,
        metadata={"help": "whether to allow channel masks to overlap"},
    )
    mask_channel_min_space: int = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )

    # positional embeddings
    conv_pos: int = field(
        default=128,
        metadata={"help": "number of filters for convolutional positional embeddings"},
    )
    conv_pos_groups: int = field(
        default=16,
        metadata={"help": "number of groups for convolutional positional embedding"},
    )
    conv_pos_batch_norm: bool = field(
        default=False,
        metadata={
            "help": "use batch norm instead of weight norm in conv_pos (for bf16 models)"
        },
    )

    latent_temp: Tuple[float, float, float] = field(
        default=(2, 0.5, 0.999995),
        metadata={"help": "legacy (to be removed)"},
    )

    # loss computation
    skip_masked: bool = field(
        default=False,
        metadata={"help": "skip computing losses over masked frames"},
    )
    skip_nomask: bool = field(
        default=False,
        metadata={"help": "skip computing losses over unmasked frames"},
    )

    checkpoint_activations: bool = field(
        default=False,
        metadata={"help": "recompute activations and save memory for extra compute"},
    )

    # FP16 optimization
    required_seq_len_multiple: int = field(
        default=2,
        metadata={
            "help": "pad the input to encoder such that the sequence length is divisible by multiple"
        },
    )

    # Conformer
    depthwise_conv_kernel_size: int = field(
        default=31,
        metadata={
            "help": "depthwise-conv-kernel-size for convolution in conformer layer"
        },
    )
    attn_type: str = field(
        default="",
        metadata={"help": "if espnet use ESPNET MHA"},
    )
    pos_enc_type: str = field(
        default="abs",
        metadata={"help": "Positional encoding type to use in conformer"},
    )
    fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"})
HubertModel(BaseFairseqModel): def __init__( self, cfg: HubertConfig, task_cfg: HubertPretrainingConfig, dictionaries: List[Dictionary], ) -> None: super().__init__() logger.info(f"HubertModel Config: {cfg}") feature_enc_layers = eval(cfg.conv_feature_layers) # noqa self.embed = feature_enc_layers[-1][0] self.feature_extractor = ConvFeatureExtractionModel( conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias, ) feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate self.post_extract_proj = ( nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None ) self.mask_prob = cfg.mask_prob self.mask_selection = cfg.mask_selection self.mask_other = cfg.mask_other self.mask_length = cfg.mask_length self.no_mask_overlap = cfg.no_mask_overlap self.mask_min_space = cfg.mask_min_space self.mask_channel_prob = cfg.mask_channel_prob self.mask_channel_selection = cfg.mask_channel_selection self.mask_channel_other = cfg.mask_channel_other self.mask_channel_length = cfg.mask_channel_length self.no_mask_channel_overlap = cfg.no_mask_channel_overlap self.mask_channel_min_space = cfg.mask_channel_min_space self.dropout_input = nn.Dropout(cfg.dropout_input) self.dropout_features = nn.Dropout(cfg.dropout_features) self.feature_grad_mult = cfg.feature_grad_mult self.logit_temp = cfg.logit_temp self.skip_masked = cfg.skip_masked self.skip_nomask = cfg.skip_nomask final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim self.mask_emb = nn.Parameter( torch.FloatTensor(cfg.encoder_embed_dim).uniform_() ) self.encoder = TransformerEncoder(cfg) self.layer_norm = LayerNorm(self.embed) self.target_glu = None if cfg.target_glu: self.target_glu = nn.Sequential( nn.Linear(final_dim, final_dim * 2), nn.GLU() ) self.untie_final_proj = cfg.untie_final_proj if self.untie_final_proj: self.final_proj = nn.Linear( 
def apply_mask(self, x, padding_mask, target_list):
    """Mask time steps and/or channels of ``x`` in place.

    Args:
        x: 3-D feature tensor, unpacked as ``(B, T, C)``.
        padding_mask: forwarded to ``compute_mask_indices`` for the time mask.
        target_list: accepted but not used by this method.

    Returns:
        tuple: ``(x, mask_indices)``. ``x`` is the input tensor, modified in
        place. ``mask_indices`` is the boolean time mask moved to
        ``x.device``, or ``None`` when ``self.mask_prob`` is not positive.
    """
    batch, frames, channels = x.shape

    mask_indices = None
    if self.mask_prob > 0:
        time_mask = compute_mask_indices(
            (batch, frames),
            padding_mask,
            self.mask_prob,
            self.mask_length,
            self.mask_selection,
            self.mask_other,
            min_masks=2,
            no_overlap=self.no_mask_overlap,
            min_space=self.mask_min_space,
        )
        mask_indices = torch.from_numpy(time_mask).to(x.device)
        # masked frames are overwritten with the learned mask embedding
        x[mask_indices] = self.mask_emb

    if self.mask_channel_prob > 0:
        channel_mask = compute_mask_indices(
            (batch, channels),
            None,
            self.mask_channel_prob,
            self.mask_channel_length,
            self.mask_channel_selection,
            self.mask_channel_other,
            no_overlap=self.no_mask_channel_overlap,
            min_space=self.mask_channel_min_space,
        )
        channel_mask = torch.from_numpy(channel_mask).to(x.device)
        # the same channels are masked at every time step of a sample
        channel_mask = channel_mask.unsqueeze(1).expand(-1, frames, -1)
        x[channel_mask] = 0

    return x, mask_indices
def forward_padding_mask(
    self,
    features: torch.Tensor,
    padding_mask: torch.Tensor,
) -> torch.Tensor:
    """Reduce ``padding_mask`` to one entry per feature frame.

    Trailing columns of the mask that do not fill a whole frame are dropped.
    The remaining columns are split into ``features.size(1)`` equal groups,
    and a frame counts as padding only if every entry of its group is set.

    Args:
        features: tensor whose dimension 1 gives the number of frames.
        padding_mask: 2-D boolean mask with at least that many columns.

    Returns:
        Boolean tensor of shape ``(padding_mask.size(0), features.size(1))``.
    """
    num_frames = features.size(1)
    remainder = padding_mask.size(1) % num_frames
    trimmed = padding_mask[:, :-remainder] if remainder > 0 else padding_mask
    per_frame = trimmed.view(trimmed.size(0), num_frames, -1)
    return per_frame.all(-1)
self.post_extract_proj is not None: features = self.post_extract_proj(features) features = self.dropout_input(features) unmasked_features = self.dropout_features(unmasked_features) if mask: x, mask_indices = self.apply_mask(features, padding_mask, target_list) else: x = features mask_indices = None # feature: (B, T, D), float # target: (B, T), long # x: (B, T, D), float # padding_mask: (B, T), bool # mask_indices: (B, T), bool x, _ = self.encoder( x, padding_mask=padding_mask, layer=None if output_layer is None else output_layer - 1, ) if features_only: return {"x": x, "padding_mask": padding_mask, "features": features} def compute_pred(proj_x, target, label_embs): # compute logits for the i-th label set y = torch.index_select(label_embs, 0, target.long()) negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) if self.target_glu: y = self.target_glu(y) negs = self.target_glu(negs) # proj_x: (S, D) # y: (S, D) # negs: (Neg, S, D) return self.compute_nce(proj_x, y, negs) label_embs_list = self.label_embs_concat.split(self.num_classes, 0) if not self.skip_masked: masked_indices = torch.logical_and(~padding_mask, mask_indices) proj_x_m = self.final_proj(x[masked_indices]) if self.untie_final_proj: proj_x_m_list = proj_x_m.chunk(len(target_list), dim=-1) else: proj_x_m_list = [proj_x_m for _ in range(len(target_list))] logit_m_list = [ compute_pred(proj_x_m, t[masked_indices], label_embs_list[i]) for i, (proj_x_m, t) in enumerate(zip(proj_x_m_list, target_list)) ] else: logit_m_list = [None for _ in target_list] if not self.skip_nomask: nomask_indices = torch.logical_and(~padding_mask, ~mask_indices) proj_x_u = self.final_proj(x[nomask_indices]) if self.untie_final_proj: proj_x_u_list = proj_x_u.chunk(len(target_list), dim=-1) else: proj_x_u_list = [proj_x_u for _ in range(len(target_list))] logit_u_list = [ compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i]) for i, (proj_x_u, t) in enumerate(zip(proj_x_u_list, target_list)) ] else: logit_u_list = [None 
def get_extra_losses(self, net_output):
    """Collect auxiliary losses present in ``net_output``.

    Args:
        net_output (dict): model output; only the ``"features_pen"`` entry is
            looked at.

    Returns:
        tuple: ``(extra_losses, names)``, two parallel lists that are both
        empty when ``net_output`` has no ``"features_pen"`` entry.
    """
    key = "features_pen"
    if key not in net_output:
        return [], []
    return [net_output[key]], [key]
import contextlib import copy import logging import math from argparse import Namespace from dataclasses import dataclass, field from typing import Any, Optional import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from omegaconf import II, MISSING, open_dict from fairseq import checkpoint_utils, tasks, utils from fairseq.dataclass import FairseqDataclass from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.models import ( BaseFairseqModel, FairseqEncoder, FairseqEncoderDecoderModel, FairseqIncrementalDecoder, register_model, ) from fairseq.models.hubert.hubert import MASKING_DISTRIBUTION_CHOICES from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerDecoderLayer from fairseq.tasks import FairseqTask logger = logging.getLogger(__name__) @dataclass class HubertAsrConfig(FairseqDataclass): w2v_path: str = field(default=MISSING, metadata={"help": "path to hubert model"}) no_pretrained_weights: bool = field( default=False, metadata={"help": "if true, does not load pretrained weights"}, ) dropout_input: float = field( default=0.0, metadata={"help": "dropout to apply to the input (after feat extr)"}, ) final_dropout: float = field( default=0.0, metadata={"help": "dropout after transformer and before final projection"}, ) dropout: float = field( default=0.0, metadata={"help": "dropout probability inside hubert model"}, ) attention_dropout: float = field( default=0.0, metadata={ "help": "dropout probability for attention weights " "inside hubert model" }, ) activation_dropout: float = field( default=0.0, metadata={ "help": "dropout probability after activation in FFN " "inside hubert model" }, ) encoder_embed_dim: Optional[int] = field( default=768, metadata={"help": "encoder embedding dimension"} ) # masking apply_mask: bool = field( default=False, metadata={"help": "apply masking during fine-tuning"} ) mask_length: int = field( default=10, metadata={"help": "repeat the mask indices multiple 
times"} ) mask_prob: float = field( default=0.5, metadata={ "help": "probability of replacing a token with mask " "(normalized by length)" }, ) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose masks"} ) mask_other: float = field( default=0, metadata={ "help": "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indices" }, ) no_mask_overlap: bool = field( default=False, metadata={"help": "whether to allow masks to overlap"} ) # channel masking mask_channel_length: int = field( default=10, metadata={"help": "length of the mask for features (channels)"}, ) mask_channel_prob: float = field( default=0.0, metadata={"help": "probability of replacing a feature with 0"}, ) mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ "help": "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indices" }, ) no_mask_channel_overlap: bool = field( default=False, metadata={"help": "whether to allow channel masks to overlap"}, ) freeze_finetune_updates: int = field( default=0, metadata={"help": "dont finetune hubert for this many updates"}, ) feature_grad_mult: float = field( default=0.0, metadata={"help": "reset feature grad mult in hubert to this"}, ) layerdrop: float = field( default=0.0, metadata={"help": "probability of dropping a layer in hubert"}, ) normalize: bool = II("task.normalize") data: str = II("task.data") # this holds the loaded hubert args w2v_args: Any = None @dataclass class HubertCtcConfig(HubertAsrConfig): pass @register_model("hubert_ctc", dataclass=HubertCtcConfig) class HubertCtc(BaseFairseqModel): def __init__(self, cfg: HubertCtcConfig, w2v_encoder: BaseFairseqModel): super().__init__() self.cfg = cfg self.w2v_encoder = w2v_encoder def upgrade_state_dict_named(self, 
def get_logits(self, net_output):
    """Return the encoder logits with padded frames overwritten.

    For every padded (time, batch) position the logit vector is replaced by
    ``[0, -inf, -inf, ...]``, so only class index 0 keeps a finite score
    there (presumably the CTC blank; confirm against the target dictionary).

    Args:
        net_output (dict): must contain ``"encoder_out"`` (logits laid out as
            T x B x C) and ``"encoder_padding_mask"`` (B x T boolean mask or
            ``None``).

    Returns:
        Tensor: the ``"encoder_out"`` tensor itself, modified in place.
    """
    logits = net_output["encoder_out"]
    padding = net_output["encoder_padding_mask"]
    if padding is not None and padding.any():
        # B x T -> T x B so the mask lines up with the logits' leading dims
        padding = padding.T
        # Assign through a single indexing operation. The previous chained
        # form `logits[padding][..., 0] = 0` wrote into the temporary copy
        # that boolean-mask indexing creates and left `logits` untouched.
        masking_tensor = logits.new_full((logits.size(-1),), float("-inf"))
        masking_tensor[0] = 0
        logits[padding] = masking_tensor
    return logits
@register_model("hubert_seq2seq", dataclass=HubertSeq2SeqConfig)
class HubertSeq2SeqModel(FairseqEncoderDecoderModel):
    """Encoder-decoder ASR model pairing a ``HubertEncoder`` with a
    ``TransformerDecoder``."""

    # State-dict entries tied to the target dictionary; they are discarded
    # when the dictionary is reset.
    _DICT_DEPENDENT_KEYS = ("decoder.embed_out", "decoder.embed_tokens.weight")

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @classmethod
    def build_model(cls, cfg: HubertSeq2SeqConfig, task: FairseqTask):
        """Build a new model instance.

        When ``cfg.seq2seq_path`` is set, weights are loaded non-strictly
        from that checkpoint. With ``cfg.reset_dict`` the dictionary-dependent
        decoder embeddings are dropped from the checkpoint first.
        """
        assert (
            cfg.autoregressive
        ), "Please set task.autoregressive=true for seq2seq asr models"

        tgt_dict = task.target_dictionary

        def build_embedding(dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            emb = Embedding(num_embeddings, embed_dim, padding_idx)
            return emb

        decoder_embed_tokens = build_embedding(tgt_dict, cfg.decoder_embed_dim)

        encoder = cls.build_encoder(cfg, task)
        decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens)

        model = HubertSeq2SeqModel(encoder, decoder)

        if cfg["seq2seq_path"]:
            state = checkpoint_utils.load_checkpoint_to_cpu(cfg.seq2seq_path)
            state = state["model"]
            if cfg["reset_dict"]:
                for key in cls._DICT_DEPENDENT_KEYS:
                    # tolerate checkpoints that lack one of the entries
                    state.pop(key, None)
            model.load_state_dict(state, strict=False)
        return model

    @classmethod
    def build_encoder(cls, cfg: HubertAsrConfig, task):
        """Create the HuBERT encoder for ``cfg`` and ``task``."""
        return HubertEncoder(cfg, task)

    @classmethod
    def build_decoder(cls, cfg: HubertSeq2SeqConfig, tgt_dict, embed_tokens):
        """Create the transformer decoder over ``tgt_dict``."""
        return TransformerDecoder(cfg, tgt_dict, embed_tokens)

    def forward(self, **kwargs):
        """Run the encoder, then the decoder with the same keyword arguments
        plus ``encoder_out``."""
        encoder_out = self.encoder(**kwargs)
        decoder_out = self.decoder(encoder_out=encoder_out, **kwargs)
        return decoder_out

    def upgrade_state_dict_named(self, state_dict, name):
        """Return ``state_dict`` unchanged (no upgrades are applied)."""
        return state_dict

    def load_state_dict(
        self,
        state_dict,
        strict=True,
        model_cfg=None,
        args: Optional[Namespace] = None,
    ):
        """Load ``state_dict``, optionally dropping dictionary-dependent weights.

        Args:
            state_dict: parameters to load. Modified in place when the
                dictionary is reset.
            strict: forwarded to the parent implementation unless the
                dictionary is reset, in which case loading is non-strict.
            model_cfg: optional model config. A true ``reset_dict`` attribute
                triggers the reset. ``None`` (the default) means no reset.
            args: forwarded unchanged to the parent implementation.

        Returns:
            Whatever the parent ``load_state_dict`` returns.
        """
        # model_cfg defaults to None (build_model calls it that way), so it
        # must not be dereferenced unconditionally.
        reset_dict = model_cfg is not None and getattr(
            model_cfg, "reset_dict", False
        )
        if reset_dict:
            logger.warning("Overriding loading strict state dict!")
            for key in self._DICT_DEPENDENT_KEYS:
                # build_model may already have removed these entries
                state_dict.pop(key, None)
            return super().load_state_dict(state_dict, False, model_cfg, args)
        return super().load_state_dict(state_dict, strict, model_cfg, args)
" "Please check that --normalize is set or unset for " "both pre-training and here" ) w2v_args.task.data = cfg.data pretrain_task = tasks.setup_task(w2v_args.task) if state is not None and "task_state" in state: # This will load the stored "dictionaries" object pretrain_task.load_state_dict(state["task_state"]) else: pretrain_task.load_state_dict(task.state_dict()) model = pretrain_task.build_model(w2v_args.model, from_checkpoint=True) if state is not None and not cfg.no_pretrained_weights: # set strict=False because we omit some modules model.load_state_dict(state["model"], strict=False) model.remove_pretraining_modules() super().__init__(pretrain_task.source_dictionary) d = w2v_args.model.encoder_embed_dim self.w2v_model = model self.final_dropout = nn.Dropout(cfg.final_dropout) self.freeze_finetune_updates = cfg.freeze_finetune_updates self.num_updates = 0 if task.target_dictionary is not None and not cfg.autoregressive: self.proj = Linear(d, len(task.target_dictionary)) elif getattr(cfg, "decoder_embed_dim", d) != d: self.proj = Linear(d, cfg.decoder_embed_dim) else: self.proj = None def set_num_updates(self, num_updates): """Set the number of parameters updates.""" super().set_num_updates(num_updates) self.num_updates = num_updates def forward(self, source, padding_mask, tbc=True, **kwargs): w2v_args = { "source": source, "padding_mask": padding_mask, "mask": self.apply_mask and self.training, } ft = self.freeze_finetune_updates <= self.num_updates with torch.no_grad() if not ft else contextlib.ExitStack(): x, padding_mask = self.w2v_model.extract_features(**w2v_args) if tbc: # B x T x C -> T x B x C x = x.transpose(0, 1) x = self.final_dropout(x) if self.proj: x = self.proj(x) return { "encoder_out": x, # T x B x C "encoder_padding_mask": padding_mask, # B x T "padding_mask": padding_mask, } def reorder_encoder_out(self, encoder_out, new_order): if encoder_out["encoder_out"] is not None: encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( 1, 
new_order ) if encoder_out["encoder_padding_mask"] is not None: encoder_out["encoder_padding_mask"] = encoder_out[ "encoder_padding_mask" ].index_select(0, new_order) if encoder_out["padding_mask"] is not None: encoder_out["padding_mask"] = encoder_out["padding_mask"].index_select( 0, new_order ) return encoder_out def max_positions(self): """Maximum input length supported by the encoder.""" return None def upgrade_state_dict_named(self, state_dict, name): return state_dict class TransformerDecoder(FairseqIncrementalDecoder): """ Transformer decoder consisting of *args.decoder_layers* layers. Each layer is a :class:`TransformerDecoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): decoding dictionary embed_tokens (torch.nn.Embedding): output embedding no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__( self, cfg: HubertSeq2SeqConfig, dictionary, embed_tokens, no_encoder_attn=False, ): super().__init__(dictionary) self.dropout = cfg.decoder_dropout self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = cfg.decoder_embed_dim self.output_embed_dim = cfg.decoder_embed_dim self.layerdrop = cfg.decoder_layerdrop self.padding_idx = embed_tokens.padding_idx self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( cfg.max_target_positions, embed_dim, self.padding_idx, learned=cfg.decoder_learned_pos, ) if not cfg.no_token_positional_embeddings else None ) # TODO: update this when transformer gets converted to dataclass configs transformer_cfg = copy.deepcopy(cfg) with open_dict(transformer_cfg): transformer_cfg.dropout = 
transformer_cfg.decoder_dropout transformer_cfg.attention_dropout = ( transformer_cfg.decoder_attention_dropout ) transformer_cfg.activation_dropout = ( transformer_cfg.decoder_activation_dropout ) self.layers = nn.ModuleList([]) self.layers.extend( [ TransformerDecoderLayer(transformer_cfg, no_encoder_attn) for _ in range(transformer_cfg.decoder_layers) ] ) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if transformer_cfg.decoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ if type(prev_output_tokens) == list: max_len = max((len(x) for x in prev_output_tokens)) tmp = torch.zeros( [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device ) for (i, p) in enumerate(prev_output_tokens): tmp[i, : len(p)] = p prev_output_tokens = tmp prev_output_tokens = prev_output_tokens.long() x, extra = self.extract_features( prev_output_tokens, encoder_out, incremental_state ) x = self.output_layer(x) return x, extra def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused ): """ Similar to *forward* but only return features. 
Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ # embed positions positions = ( self.embed_positions( prev_output_tokens, incremental_state=incremental_state ) if self.embed_positions is not None else None ) if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers self_attn_padding_mask = None if prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) for layer in self.layers: dropout_probability = np.random.random() if not self.training or (dropout_probability > self.layerdrop): x, attn, _ = layer( x, encoder_out["encoder_out"] if encoder_out is not None else None, encoder_out["padding_mask"] if encoder_out is not None else None, incremental_state, self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None, self_attn_padding_mask=self_attn_padding_mask, ) inner_states.append(x) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) return x, {"attn": attn, "inner_states": inner_states} def output_layer(self, features, **kwargs): """Project features to the vocabulary size.""" # project back to size of vocabulary if self.share_input_output_embed: return F.linear(features, self.embed_tokens.weight) else: return F.linear(features, self.embed_out) def max_positions(self): """Maximum output length supported by the decoder.""" if self.embed_positions is None: return self.max_target_positions return min(self.max_target_positions, 
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with scaled-normal weights.

    Weights are drawn from a normal distribution with standard deviation
    ``embedding_dim ** -0.5``; the row at ``padding_idx`` is then zeroed.
    """
    table = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    weight = table.weight
    nn.init.normal_(weight, mean=0, std=embedding_dim**-0.5)
    nn.init.constant_(weight[padding_idx], 0)
    return table


def Linear(in_features, out_features, bias=True):
    """Create an ``nn.Linear`` with Xavier-uniform weights and a zero bias."""
    layer = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(layer.weight)
    if not bias:
        return layer
    nn.init.constant_(layer.bias, 0.0)
    return layer
@register_model("hf_gpt2")
class HuggingFaceGPT2LanguageModel(FairseqLanguageModel):
    """Language model registered as ``hf_gpt2`` that wraps a ``HuggingFaceGPT2Decoder``."""

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # One row per option: (flag, value type, metavar, help text).
        option_table = (
            ('--embed-dim', int, 'N', 'embedding dimension'),
            ('--num-attention-heads', int, 'N', 'num attention heads'),
            ('--num-layers', int, 'N', 'num layers'),
            (
                '--dropout', float, 'D',
                'dropout probability for all fully connected layers '
                'in the embeddings, encoder, and pooler',
            ),
            (
                '--attention-dropout', float, 'D',
                'dropout probability for attention weights',
            ),
        )
        for flag, value_type, placeholder, description in option_table:
            parser.add_argument(
                flag, type=value_type, metavar=placeholder, help=description
            )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        # Fill in any architecture defaults missing from ``args`` first.
        default_architecture(args)
        decoder = HuggingFaceGPT2Decoder(args, task)
        return cls(decoder)
self.model.transformer.wte.weight.data[self.pad_idx].zero_() self.model.transformer.wpe.weight.data[0].zero_() def forward( self, prev_output_tokens, src_lengths=None, incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, encoder_out=None, ): features = self.extract_features(prev_output_tokens, incremental_state) lm_logits = self.model.lm_head(features) return (lm_logits,) def extract_features( self, prev_output_tokens, incremental_state: Optional[Dict[str, List[torch.Tensor]]] = None, ): if incremental_state: past = self.get_incremental_state("past") else: past = None # don't attend to padding symbols attention_mask = prev_output_tokens.ne(self.pad_idx).int() # set position ids to exclude padding symbols position_ids = attention_mask * ( torch.arange(1, 1 + prev_output_tokens.size(1)) .to(prev_output_tokens) .repeat(prev_output_tokens.size(0), 1) ) outputs = self.model.transformer( input_ids=prev_output_tokens, past=past, attention_mask=attention_mask, position_ids=position_ids, ) last_hidden_states = outputs[0] if incremental_state: self.set_incremental_state(incremental_state, "past", outputs[1]) return last_hidden_states def max_positions(self): return self.model.config.n_positions - 1 @register_model_architecture("hf_gpt2", "hf_gpt2") def default_architecture(args): if getattr(args, "max_target_positions", None) is None: args.max_target_positions = getattr( args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS ) args.embed_dim = getattr(args, "embed_dim", 768) args.num_attention_heads = getattr(args, "num_attention_heads", 12) args.num_layers = getattr(args, "num_layers", 12) args.dropout = getattr(args, "dropout", 0.1) args.attention_dropout = getattr(args, "attention_dropout", 0.1) @register_model_architecture("hf_gpt2", "hf_gpt2_medium") def hf_gpt2_medium(args): args.embed_dim = getattr(args, "embed_dim", 1024) args.num_attention_heads = getattr(args, "num_attention_heads", 16) args.num_layers = getattr(args, "num_layers", 24) 
@register_model_architecture("hf_gpt2", "hf_gpt2_large")
def hf_gpt2_large(args):
    """Fill in the ``hf_gpt2_large`` sizes, then the shared defaults.

    Attributes already present on ``args`` are kept; only missing ones
    receive the values below.
    """
    size_defaults = (
        ("embed_dim", 1280),
        ("num_attention_heads", 20),
        ("num_layers", 36),
    )
    for option, fallback in size_defaults:
        setattr(args, option, getattr(args, option, fallback))
    default_architecture(args)
To use LightConv please set ``--encoder-conv-type lightweight --decoder-conv-type lightweight`` To use DynamicConv please set ``--encoder-conv-type dynamic --decoder-conv-type dynamic`` Args: encoder (LightConvEncoder): the encoder decoder (LightConvDecoder): the decoder The LightConv model provides the following named architectures and command-line arguments: .. argparse:: :ref: fairseq.models.lightconv_parser :prog: """ @classmethod def hub_models(cls): # fmt: off def moses_subword(path): return { 'path': path, 'tokenizer': 'moses', 'bpe': 'subword_nmt', } return { 'lightconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.lightconv.tar.gz'), 'dynamicconv.no_glu.iwslt14.de-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/iwslt14.de-en.dynamicconv.tar.gz'), 'lightconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv.tar.gz'), 'dynamicconv.no_glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv.tar.gz'), 'lightconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'), 'dynamicconv.glu.wmt16.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'), 'lightconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.lightconv-glu.tar.gz'), 'dynamicconv.glu.wmt17.en-de': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt16.en-de.joined-dict.dynamicconv-glu.tar.gz'), 'lightconv.glu.wmt14.en-fr': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.lightconv-glu.tar.gz'), 'dynamicconv.glu.wmt14.en-fr': 
moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt14.en-fr.joined-dict.dynamicconv-glu.tar.gz'), 'lightconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.lightconv-glu.tar.gz'), 'dynamicconv.glu.wmt17.zh-en': moses_subword('https://dl.fbaipublicfiles.com/fairseq/models/dynamicconv/wmt17.zh-en.dynamicconv-glu.tar.gz'), } # fmt: on def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--relu-dropout", type=float, metavar="D", help="dropout probability after ReLU in FFN", ) parser.add_argument( "--input-dropout", type=float, metavar="D", help="dropout probability of the inputs", ) parser.add_argument( "--encoder-embed-path", type=str, metavar="STR", help="path to pre-trained encoder embedding", ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-conv-dim", type=int, metavar="N", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-layers", type=int, metavar="N", help="num encoder layers" ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads or LightConv/DynamicConv heads", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--encoder-learned-pos", action="store_true", help="use learned positional embeddings in the encoder", ) parser.add_argument( "--decoder-embed-path", type=str, 
metavar="STR", help="path to pre-trained decoder embedding", ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-conv-dim", type=int, metavar="N", help="decoder embedding dimension", ) parser.add_argument( "--decoder-ffn-embed-dim", type=int, metavar="N", help="decoder embedding dimension for FFN", ) parser.add_argument( "--decoder-layers", type=int, metavar="N", help="num decoder layers" ) parser.add_argument( "--decoder-attention-heads", type=int, metavar="N", help="num decoder attention heads or LightConv/DynamicConv heads", ) parser.add_argument( "--decoder-learned-pos", action="store_true", help="use learned positional embeddings in the decoder", ) parser.add_argument( "--decoder-normalize-before", action="store_true", help="apply layernorm before each decoder block", ) parser.add_argument( "--share-decoder-input-output-embed", action="store_true", help="share decoder input and output embeddings", ) parser.add_argument( "--share-all-embeddings", action="store_true", help="share encoder, decoder and output embeddings" " (requires shared dictionary and embed dim)", ) parser.add_argument( "--adaptive-softmax-cutoff", metavar="EXPR", help="comma separated list of adaptive softmax cutoff points. 
" "Must be used with adaptive_loss criterion", ), parser.add_argument( "--adaptive-softmax-dropout", type=float, metavar="D", help="sets adaptive softmax dropout for the tail projections", ) """LightConv and DynamicConv arguments""" parser.add_argument( "--encoder-kernel-size-list", type=lambda x: utils.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31,31]")', ) parser.add_argument( "--decoder-kernel-size-list", type=lambda x: utils.eval_str_list(x, int), help='list of kernel size (default: "[3,7,15,31,31,31]")', ) parser.add_argument( "--encoder-glu", type=utils.eval_bool, help="glu after in proj" ) parser.add_argument( "--decoder-glu", type=utils.eval_bool, help="glu after in proj" ) parser.add_argument( "--encoder-conv-type", default="dynamic", type=str, choices=["dynamic", "lightweight"], help="type of convolution", ) parser.add_argument( "--decoder-conv-type", default="dynamic", type=str, choices=["dynamic", "lightweight"], help="type of convolution", ) parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool) parser.add_argument( "--weight-dropout", type=float, metavar="D", help="dropout probability for conv weights", ) @classmethod def build_model(cls, args, task): """Build a new model instance.""" # make sure all arguments are present in older models base_architecture(args) if not safe_hasattr(args, "max_source_positions"): args.max_source_positions = 1024 if not safe_hasattr(args, "max_target_positions"): args.max_target_positions = 1024 src_dict, tgt_dict = task.source_dictionary, task.target_dictionary def build_embedding(dictionary, embed_dim, path=None): num_embeddings = len(dictionary) padding_idx = dictionary.pad() emb = Embedding(num_embeddings, embed_dim, padding_idx) # if provided, load from preloaded dictionaries if path: embed_dict = utils.parse_embedding(path) utils.load_embedding(embed_dict, dictionary, emb) return emb if args.share_all_embeddings: if src_dict != tgt_dict: raise RuntimeError( 
"--share-all-embeddings requires a joined dictionary" ) if args.encoder_embed_dim != args.decoder_embed_dim: raise RuntimeError( "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim" ) if args.decoder_embed_path and ( args.decoder_embed_path != args.encoder_embed_path ): raise RuntimeError( "--share-all-embeddings not compatible with --decoder-embed-path" ) encoder_embed_tokens = build_embedding( src_dict, args.encoder_embed_dim, args.encoder_embed_path ) decoder_embed_tokens = encoder_embed_tokens args.share_decoder_input_output_embed = True else: encoder_embed_tokens = build_embedding( src_dict, args.encoder_embed_dim, args.encoder_embed_path ) decoder_embed_tokens = build_embedding( tgt_dict, args.decoder_embed_dim, args.decoder_embed_path ) encoder = LightConvEncoder(args, src_dict, encoder_embed_tokens) decoder = LightConvDecoder(args, tgt_dict, decoder_embed_tokens) return LightConvModel(encoder, decoder) def forward( self, src_tokens: Tensor, src_lengths: Tensor, prev_output_tokens: Tensor, ): """ (The forward method inherited from the base class has a **kwargs argument in its input, which is not supported in torchscript. This method overwrites the forward method definition without **kwargs.) Run the forward pass for an encoder-decoder model. First feed a batch of source tokens through the encoder. 
Then, feed the encoder output and previous decoder outputs (i.e., teacher forcing) to the decoder to produce the next outputs:: encoder_out = self.encoder(src_tokens, src_lengths) return self.decoder(prev_output_tokens, encoder_out) Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` src_lengths (LongTensor): source sentence lengths of shape `(batch)` prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ encoder_out = self.encoder(src_tokens, src_lengths) decoder_out = self.decoder(prev_output_tokens, encoder_out=encoder_out) return decoder_out class LightConvEncoder(FairseqEncoder): """ LightConv encoder consisting of *args.encoder_layers* layers. Each layer is a :class:`LightConvEncoderLayer`. Args: args (argparse.Namespace): parsed command-line arguments dictionary (~fairseq.data.Dictionary): encoding dictionary embed_tokens (torch.nn.Embedding): input embedding """ def __init__(self, args, dictionary, embed_tokens): super().__init__(dictionary) self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) embed_dim = embed_tokens.embedding_dim self.padding_idx = embed_tokens.padding_idx self.max_source_positions = args.max_source_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) self.embed_positions = ( PositionalEmbedding( args.max_source_positions, embed_dim, self.padding_idx, learned=args.encoder_learned_pos, ) if not args.no_token_positional_embeddings else None ) self.layers = nn.ModuleList([]) self.layers.extend( [ LightConvEncoderLayer( args, kernel_size=args.encoder_kernel_size_list[i] ) for i in range(args.encoder_layers) ] ) self.register_buffer("version", torch.Tensor([2])) self.normalize = args.encoder_normalize_before if self.normalize: self.layer_norm = LayerNorm(embed_dim) 
else: self.layer_norm = None def forward( self, src_tokens: Tensor, src_lengths: Optional[Tensor] = None ) -> Dict[str, List[Tensor]]: """ Args: src_tokens (LongTensor): tokens in the source language of shape `(batch, src_len)` Returns: dict: - **encoder_out** (Tensor): the last encoder layer's output of shape `(src_len, batch, embed_dim)` - **encoder_padding_mask** (ByteTensor): the positions of padding elements of shape `(batch, src_len)` """ # embed tokens and positions x = self.embed_scale * self.embed_tokens(src_tokens) if self.embed_positions is not None: x += self.embed_positions(src_tokens) x = self.dropout_module(x) # B x T x C -> T x B x C x = x.transpose(0, 1) # compute padding mask encoder_padding_mask = src_tokens.eq(self.padding_idx) # B x T if not encoder_padding_mask.any(): encoder_mask = None else: encoder_mask = encoder_padding_mask # encoder layers for layer in self.layers: x = layer(x, encoder_mask) if self.layer_norm is not None: x = self.layer_norm(x) output_dict: Dict[str, List[Tensor]] = {} if src_lengths is not None: output_dict["src_lengths"] = [src_lengths] output_dict["encoder_out"] = [x] # T x B x C if encoder_mask is not None: output_dict["encoder_padding_mask"] = [encoder_mask] # B x T return output_dict @torch.jit.export def reorder_encoder_out( self, encoder_out: Dict[str, List[Tensor]], new_order: Tensor ): """ Reorder encoder output according to *new_order*. 
class LightConvDecoder(FairseqIncrementalDecoder):
    """
    LightConv decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`LightConvDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): token embedding; its weight is
            also reused as the output projection when
            ``args.share_decoder_input_output_embed`` is set
        no_encoder_attn (bool, optional): if ``True``, the layers are built
            without encoder attention. Default: ``False``
        final_norm (bool, optional): apply a layer norm after the last layer;
            only takes effect together with ``args.decoder_normalize_before``.
            Default: ``True``
    """

    def __init__(
        self, args, dictionary, embed_tokens, no_encoder_attn=False, final_norm=True
    ):
        super().__init__(dictionary)
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.share_input_output_embed = args.share_decoder_input_output_embed

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        output_embed_dim = args.decoder_output_dim

        padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens
        self.embed_scale = math.sqrt(embed_dim)  # todo: try with input_embed_dim

        # Only project when the embedding width differs from the model width.
        self.project_in_dim = (
            Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )

        self.embed_positions = (
            PositionalEmbedding(
                args.max_target_positions,
                embed_dim,
                padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        self.layers = nn.ModuleList([])
        self.layers.extend(
            [
                LightConvDecoderLayer(
                    args,
                    no_encoder_attn,
                    kernel_size=args.decoder_kernel_size_list[i],
                    dictionary=dictionary,
                )
                for i in range(args.decoder_layers)
            ]
        )

        self.adaptive_softmax = None
        self.output_projection = None

        self.project_out_dim = (
            Linear(embed_dim, output_embed_dim, bias=False)
            if embed_dim != output_embed_dim and not args.tie_adaptive_weights
            else None
        )

        # Exactly one of adaptive_softmax / output_projection ends up set.
        if args.adaptive_softmax_cutoff is not None:
            self.adaptive_softmax = AdaptiveSoftmax(
                len(dictionary),
                output_embed_dim,
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int),
                dropout=args.adaptive_softmax_dropout,
                adaptive_inputs=embed_tokens if args.tie_adaptive_weights else None,
                factor=args.adaptive_softmax_factor,
                tie_proj=args.tie_adaptive_proj,
            )
        elif self.share_input_output_embed:
            # Tie the output projection to the input embedding matrix.
            self.output_projection = nn.Linear(
                self.embed_tokens.weight.shape[1],
                self.embed_tokens.weight.shape[0],
                bias=False,
            )
            self.output_projection.weight = self.embed_tokens.weight
        else:
            self.output_projection = nn.Linear(
                output_embed_dim, len(dictionary), bias=False
            )
            nn.init.normal_(
                self.output_projection.weight, mean=0, std=output_embed_dim**-0.5
            )
        self.register_buffer("version", torch.Tensor([2]))
        self.normalize = args.decoder_normalize_before and final_norm
        if self.normalize:
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

    def forward(
        self,
        prev_output_tokens: Tensor,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        src_lengths: Optional[Any] = None,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (dict, optional): encoder output; the first entries of
                its ``"encoder_out"`` and ``"encoder_padding_mask"`` lists
                (when present and non-empty) are passed to every layer
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            src_lengths: accepted for interface compatibility; not used here

        Returns:
            tuple:
                - the decoder output, batch-first. It is projected to the
                  vocabulary by ``output_projection`` unless adaptive softmax
                  is used, in which case the features are returned as-is
                - a dict with ``"attn"`` (single-element list holding the
                  last layer's attention, possibly ``None``) and
                  ``"inner_states"`` (the input to the first layer followed by
                  the output of every layer)
        """
        # embed positions
        positions = (
            self.embed_positions(
                prev_output_tokens,
                incremental_state=incremental_state,
            )
            if self.embed_positions is not None
            else None
        )

        # During incremental decoding only the newest step is processed.
        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens.contiguous())

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = self.dropout_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        inner_states: List[Optional[Tensor]] = [x]

        # The encoder tensors do not change between layers, so look them up
        # once instead of on every loop iteration.
        encoder: Optional[Tensor] = None
        encoder_padding_mask: Optional[Tensor] = None
        if encoder_out is not None:
            if len(encoder_out["encoder_out"]) > 0:
                encoder = encoder_out["encoder_out"][0]
            if (
                "encoder_padding_mask" in encoder_out
                and len(encoder_out["encoder_padding_mask"]) > 0
            ):
                encoder_padding_mask = encoder_out["encoder_padding_mask"][0]

        # decoder layers
        attn: Optional[Tensor] = None
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder,
                encoder_padding_mask,
                incremental_state,
            )
            inner_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            x = self.output_projection(x)

        return x, {"attn": [attn], "inner_states": inner_states}

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions)

    def buffered_future_mask(self, tensor):
        """Return a cached ``(dim, dim)`` mask with ``dim = tensor.size(0)``.

        The mask is the strict upper triangle of a tensor filled by
        ``utils.fill_with_neg_inf``. It is rebuilt when missing or on a
        different device than *tensor*, and regrown when too small.
        """
        dim = tensor.size(0)
        if (
            not hasattr(self, "_future_mask")
            or self._future_mask is None
            or self._future_mask.device != tensor.device
        ):
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
            )
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1
            )
        return self._future_mask[:dim, :dim]
class LightConvEncoderLayer(nn.Module):
    """Encoder layer block.

    A convolution sub-block (input projection, optional GLU, lightweight or
    dynamic convolution, output projection) followed by a feed-forward
    sub-block, each with a residual connection and a layer norm.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        kernel_size: kernel size of the convolution
    """

    def __init__(self, args, kernel_size=0):
        super().__init__()
        self.embed_dim = args.encoder_embed_dim
        self.conv_dim = args.encoder_conv_dim

        # Odd kernel sizes use a single padding value, even ones a pair.
        if kernel_size % 2 == 1:
            padding_l = kernel_size // 2
        else:
            padding_l = ((kernel_size - 1) // 2, kernel_size // 2)

        # With GLU the projection is twice as wide because GLU halves it.
        if args.encoder_glu:
            in_proj_dim, activation = 2 * self.conv_dim, nn.GLU()
        else:
            in_proj_dim, activation = self.conv_dim, None
        self.linear1 = Linear(self.embed_dim, in_proj_dim)
        self.act = activation

        conv_type = args.encoder_conv_type
        if conv_type == "lightweight":
            conv_cls = LightweightConv
        elif conv_type == "dynamic":
            conv_cls = DynamicConv
        else:
            raise NotImplementedError
        self.conv = conv_cls(
            self.conv_dim,
            kernel_size,
            padding_l=padding_l,
            weight_softmax=args.weight_softmax,
            num_heads=args.encoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        owner = self.__class__.__name__
        self.dropout_module = FairseqDropout(args.dropout, module_name=owner)
        self.relu_dropout_module = FairseqDropout(
            args.relu_dropout, module_name=owner
        )
        self.input_dropout_module = FairseqDropout(
            args.input_dropout, module_name=owner
        )
        self.normalize_before = args.encoder_normalize_before
        self.fc1 = Linear(self.embed_dim, args.encoder_ffn_embed_dim)
        self.fc2 = Linear(args.encoder_ffn_embed_dim, self.embed_dim)
        self.layer_norm1 = LayerNorm(self.embed_dim)
        self.layer_norm2 = LayerNorm(self.embed_dim)

    def forward(self, x, encoder_padding_mask: Optional[Tensor] = None) -> Tensor:
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output with the same layout as the input,
            `(seq_len, batch, embed_dim)`
        """
        # --- convolution sub-block ---
        shortcut = x
        if self.maybe_layer_norm(before=True):
            x = self.layer_norm1(x)
        x = self.linear1(self.input_dropout_module(x))
        if self.act is not None:
            x = self.act(x)
        if encoder_padding_mask is not None:
            # zero the padded time steps before they enter the convolution
            pad = encoder_padding_mask.transpose(0, 1).unsqueeze(2)
            x = x.masked_fill(pad, 0)
        x = self.linear2(self.conv(x))
        x = shortcut + self.dropout_module(x)
        if self.maybe_layer_norm(after=True):
            x = self.layer_norm1(x)

        # --- feed-forward sub-block ---
        shortcut = x
        if self.maybe_layer_norm(before=True):
            x = self.layer_norm2(x)
        x = self.relu_dropout_module(F.relu(self.fc1(x)))
        x = self.dropout_module(self.fc2(x))
        x = shortcut + x
        if self.maybe_layer_norm(after=True):
            x = self.layer_norm2(x)
        return x

    def maybe_layer_norm(self, before: bool = False, after: bool = False):
        """Tell whether layer norm applies at this position.

        Exactly one of *before* / *after* must be set. Returns ``True`` for
        ``before`` when ``normalize_before`` is set, and for ``after``
        otherwise.
        """
        assert before ^ after, "Incorrect arguments"
        return after ^ self.normalize_before

    def extra_repr(self):
        """Summarise the dropout rates and the norm placement."""
        return (
            f"dropout={self.dropout_module.p}, "
            f"relu_dropout={self.relu_dropout_module.p}, "
            f"input_dropout={self.input_dropout_module.p}, "
            f"normalize_before={self.normalize_before}"
        )
class LightConvDecoderLayer(nn.Module):
    """Decoder layer block.

    A convolution sub-block, an optional encoder-attention sub-block and a
    feed-forward sub-block, each with a residual connection and a layer norm.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): if ``True``, no encoder attention
            is built. Default: ``False``
        kernel_size: kernel size of the convolution
        dictionary: forwarded to the encoder attention module
    """

    def __init__(self, args, no_encoder_attn=False, kernel_size=0, dictionary=None):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.conv_dim = args.decoder_conv_dim

        # With GLU the projection is twice as wide because GLU halves it.
        if args.decoder_glu:
            in_proj_dim, activation = 2 * self.conv_dim, nn.GLU()
        else:
            in_proj_dim, activation = self.conv_dim, None
        self.linear1 = Linear(self.embed_dim, in_proj_dim)
        self.act = activation

        conv_type = args.decoder_conv_type
        if conv_type == "lightweight":
            conv_cls = LightweightConv
        elif conv_type == "dynamic":
            conv_cls = DynamicConv
        else:
            raise NotImplementedError
        self.conv = conv_cls(
            self.conv_dim,
            kernel_size,
            padding_l=kernel_size - 1,
            weight_softmax=args.weight_softmax,
            num_heads=args.decoder_attention_heads,
            weight_dropout=args.weight_dropout,
        )
        self.linear2 = Linear(self.conv_dim, self.embed_dim)

        owner = self.__class__.__name__
        self.dropout_module = FairseqDropout(args.dropout, module_name=owner)
        self.relu_dropout_module = FairseqDropout(
            args.relu_dropout, module_name=owner
        )
        self.input_dropout_module = FairseqDropout(
            args.input_dropout, module_name=owner
        )
        self.normalize_before = args.decoder_normalize_before

        self.conv_layer_norm = LayerNorm(self.embed_dim)

        if not no_encoder_attn:
            self.encoder_attn = MultiheadAttention(
                self.embed_dim,
                args.decoder_attention_heads,
                dropout=args.attention_dropout,
                encoder_decoder_attention=True,
                dictionary=dictionary,
            )
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
        else:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None

        self.fc1 = Linear(self.embed_dim, args.decoder_ffn_embed_dim)
        self.fc2 = Linear(args.decoder_ffn_embed_dim, self.embed_dim)

        self.final_layer_norm = LayerNorm(self.embed_dim)
        self.need_attn = True

    def forward(
        self,
        x: Tensor,
        encoder_out: Optional[Tensor],
        encoder_padding_mask: Optional[Tensor],
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
        prev_conv_state: Optional[Tensor] = None,
        prev_attn_state: Optional[Tuple[Tensor, Tensor]] = None,
        conv_mask: Optional[Tensor] = None,
        conv_padding_mask: Optional[Tensor] = None,
    ):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_out (Tensor, optional): key/value input of the encoder
                attention
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.
            incremental_state (dict, optional): state passed to the
                convolution and to the encoder attention
            prev_conv_state (Tensor, optional): if given, written into the
                convolution's input buffer before it runs
            prev_attn_state (tuple, optional): ``(prev_key, prev_value)``; if
                given, written into the encoder attention's input buffer
            conv_mask, conv_padding_mask: accepted but not used by this layer

        Returns:
            tuple of the layer output (same layout as *x*) and the encoder
            attention weights (``None`` when there is no encoder attention)
        """
        # --- convolution sub-block ---
        shortcut = x
        if self.maybe_layer_norm(before=True):
            x = self.conv_layer_norm(x)
        if prev_conv_state is not None:
            self.conv._set_input_buffer(incremental_state, prev_conv_state)
        x = self.linear1(self.input_dropout_module(x))
        if self.act is not None:
            x = self.act(x)
        x = self.conv(x, incremental_state=incremental_state)
        x = self.dropout_module(self.linear2(x))
        x = shortcut + x
        if self.maybe_layer_norm(after=True):
            x = self.conv_layer_norm(x)

        # --- encoder attention sub-block (optional) ---
        attn: Optional[Tensor] = None
        if self.encoder_attn is not None:
            shortcut = x
            if self.maybe_layer_norm(before=True):
                x = self.encoder_attn_layer_norm(x)

            if prev_attn_state is not None:
                saved_state: Dict[str, Optional[Tensor]] = {
                    "prev_key": prev_attn_state[0],
                    "prev_value": prev_attn_state[1],
                }
                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
            # attention weights are only requested at inference time
            want_weights = not self.training and self.need_attn
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=want_weights,
            )
            x = shortcut + self.dropout_module(x)
            if self.maybe_layer_norm(after=True):
                x = self.encoder_attn_layer_norm(x)

        # --- feed-forward sub-block ---
        shortcut = x
        if self.maybe_layer_norm(before=True):
            x = self.final_layer_norm(x)
        x = self.relu_dropout_module(F.relu(self.fc1(x)))
        x = self.dropout_module(self.fc2(x))
        x = shortcut + x
        if self.maybe_layer_norm(after=True):
            x = self.final_layer_norm(x)
        return x, attn

    def maybe_layer_norm(self, before: bool = False, after: bool = False):
        """Tell whether layer norm applies at this position.

        Exactly one of *before* / *after* must be set. Returns ``True`` for
        ``before`` when ``normalize_before`` is set, and for ``after``
        otherwise.
        """
        assert before ^ after, "Incorrect usage"
        return after ^ self.normalize_before

    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
        """Record whether attention weights should be returned."""
        self.need_attn = need_attn

    def extra_repr(self):
        """Summarise the dropout rates and the norm placement."""
        return (
            f"dropout={self.dropout_module.p}, "
            f"relu_dropout={self.relu_dropout_module.p}, "
            f"input_dropout={self.input_dropout_module.p}, "
            f"normalize_before={self.normalize_before}"
        )
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Build an ``nn.Embedding`` with normally distributed weights.

    Weights are drawn with mean 0 and standard deviation
    ``embedding_dim ** -0.5``; the padding row, if any, is reset to zero.

    Args:
        num_embeddings (int): number of rows in the table
        embedding_dim (int): size of each embedding vector
        padding_idx (int or None): index of the padding row, or ``None`` when
            the table has no padding row

    Returns:
        nn.Embedding: the initialised module
    """
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
    # Indexing with None yields a view of the *whole* table, so zeroing
    # ``m.weight[None]`` would wipe every embedding. Only touch a real row.
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m


def Linear(in_features, out_features, bias=True):
    """Build an ``nn.Linear`` with Xavier-uniform weights and zero bias.

    Args:
        in_features (int): input width
        out_features (int): output width
        bias (bool, optional): whether the layer has a bias. Default: ``True``

    Returns:
        nn.Linear: the initialised module
    """
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m
@register_model_architecture("lightconv", "lightconv")
def base_architecture(args):
    """Fill in every LightConv hyper-parameter that *args* does not define.

    Attributes already present on *args* are kept. Several defaults are
    derived from values set earlier in this function, so the order of the
    assignments matters.
    """

    def fill(name, fallback):
        # keep an existing attribute, otherwise install the fallback
        setattr(args, name, getattr(args, name, fallback))

    for name, fallback in (
        ("encoder_embed_path", None),
        ("encoder_embed_dim", 512),
        ("encoder_ffn_embed_dim", 2048),
        ("encoder_layers", 7),
        ("encoder_attention_heads", 8),
        ("encoder_normalize_before", False),
        ("encoder_learned_pos", False),
        ("decoder_embed_path", None),
    ):
        fill(name, fallback)

    # decoder sizes default to the encoder's
    fill("decoder_embed_dim", args.encoder_embed_dim)
    fill("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)

    for name, fallback in (
        ("decoder_layers", 6),
        ("decoder_attention_heads", 8),
        ("decoder_normalize_before", False),
        ("decoder_learned_pos", False),
        ("attention_dropout", 0.0),
        ("relu_dropout", 0.0),
        ("dropout", 0.1),
        ("adaptive_softmax_cutoff", None),
        ("adaptive_softmax_dropout", 0),
        ("share_decoder_input_output_embed", False),
        ("share_all_embeddings", False),
        ("no_token_positional_embeddings", False),
    ):
        fill(name, fallback)

    fill("decoder_output_dim", args.decoder_embed_dim)
    fill("decoder_input_dim", args.decoder_embed_dim)

    fill("encoder_conv_dim", args.encoder_embed_dim)
    fill("decoder_conv_dim", args.decoder_embed_dim)

    fill("encoder_kernel_size_list", [3, 7, 15, 31, 31, 31, 31])
    fill("decoder_kernel_size_list", [3, 7, 15, 31, 31, 31])

    # a single kernel size is repeated for every layer
    if len(args.encoder_kernel_size_list) == 1:
        args.encoder_kernel_size_list = (
            args.encoder_kernel_size_list * args.encoder_layers
        )
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = (
            args.decoder_kernel_size_list * args.decoder_layers
        )
    assert (
        len(args.encoder_kernel_size_list) == args.encoder_layers
    ), "encoder_kernel_size_list doesn't match encoder_layers"
    assert (
        len(args.decoder_kernel_size_list) == args.decoder_layers
    ), "decoder_kernel_size_list doesn't match decoder_layers"

    fill("encoder_glu", True)
    fill("decoder_glu", True)
    fill("input_dropout", 0.1)
    fill("weight_dropout", args.attention_dropout)
@register_model_architecture("lightconv", "lightconv_iwslt_de_en")
def lightconv_iwslt_de_en(args):
    """IWSLT De-En preset; unset values then fall back to the base defaults."""
    preset = {
        "encoder_embed_dim": 512,
        "encoder_ffn_embed_dim": 1024,
        "encoder_attention_heads": 4,
        "encoder_layers": 7,
        "decoder_embed_dim": 512,
        "decoder_ffn_embed_dim": 1024,
        "decoder_attention_heads": 4,
        "decoder_layers": 6,
        "attention_dropout": 0.1,
        "weight_dropout": 0.1,
        "encoder_glu": False,
        "decoder_glu": False,
        "input_dropout": 0.0,
    }
    for name, fallback in preset.items():
        # attributes already on args take precedence over the preset
        setattr(args, name, getattr(args, name, fallback))
    base_architecture(args)


@register_model_architecture("lightconv", "lightconv_wmt_en_de")
def lightconv_wmt_en_de(args):
    """WMT En-De preset; identical to the base architecture."""
    base_architecture(args)


@register_model_architecture("lightconv", "lightconv_wmt_en_de_big")
def lightconv_wmt_en_de_big(args):
    """Big WMT En-De preset; unset values then fall back to the base defaults."""
    preset = {
        "attention_dropout": 0.1,
        "encoder_embed_dim": 1024,
        "encoder_ffn_embed_dim": 4096,
        "encoder_attention_heads": 16,
        "encoder_normalize_before": False,
        "decoder_embed_dim": 1024,
        "decoder_ffn_embed_dim": 4096,
        "decoder_attention_heads": 16,
        "dropout": 0.3,
    }
    for name, fallback in preset.items():
        # attributes already on args take precedence over the preset
        setattr(args, name, getattr(args, name, fallback))
    base_architecture(args)


@register_model_architecture("lightconv", "lightconv_wmt_en_fr_big")
def lightconv_wmt_en_fr_big(args):
    """Big WMT En-Fr preset: the big En-De preset with a dropout default of 0.1."""
    if not hasattr(args, "dropout"):
        args.dropout = 0.1
    lightconv_wmt_en_de_big(args)
@register_model("lightconv_lm")
class LightConvLanguageModel(FairseqLanguageModel):
    """Decoder-only language model built on :class:`LightConvDecoder`.

    Args:
        decoder: the decoder wrapped by this model
    """

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--dropout",
            default=0.1,
            type=float,
            metavar="D",
            help="dropout probability",
        )
        parser.add_argument(
            "--attention-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--relu-dropout",
            default=0.0,
            type=float,
            metavar="D",
            help="dropout probability after ReLU in FFN",
        )
        parser.add_argument(
            "--input-dropout",
            type=float,
            metavar="D",
            help="dropout probability of the inputs",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-output-dim",
            type=int,
            metavar="N",
            help="decoder output dimension",
        )
        parser.add_argument(
            "--decoder-input-dim", type=int, metavar="N", help="decoder input dimension"
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads or LightConv/DynamicConv heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            default=False,
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--adaptive-softmax-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion",
        )
        parser.add_argument(
            "--adaptive-softmax-dropout",
            type=float,
            metavar="D",
            help="sets adaptive softmax dropout for the tail projections",
        )
        # The help text used to read "adaptive input factor", copied from
        # --adaptive-input-factor below.
        parser.add_argument(
            "--adaptive-softmax-factor",
            type=float,
            metavar="N",
            help="adaptive softmax factor",
        )
        parser.add_argument(
            "--no-token-positional-embeddings",
            default=False,
            action="store_true",
            help="if set, disables positional embeddings (outside self attention)",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            default=False,
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--character-embeddings",
            default=False,
            action="store_true",
            help="if set, uses character embedding convolutions to produce token embeddings",
        )
        # The help text used to duplicate the one of --character-embedding-dim.
        parser.add_argument(
            "--character-filters",
            type=str,
            metavar="LIST",
            default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
            help="list of filters for the character embedding convolutions",
        )
        parser.add_argument(
            "--character-embedding-dim",
            type=int,
            metavar="N",
            default=4,
            help="size of character embeddings",
        )
        parser.add_argument(
            "--char-embedder-highway-layers",
            type=int,
            metavar="N",
            default=2,
            help="number of highway layers for character token embeddder",
        )
        parser.add_argument(
            "--adaptive-input",
            default=False,
            action="store_true",
            help="if set, uses adaptive input",
        )
        parser.add_argument(
            "--adaptive-input-factor",
            type=float,
            metavar="N",
            help="adaptive input factor",
        )
        parser.add_argument(
            "--adaptive-input-cutoff",
            metavar="EXPR",
            help="comma separated list of adaptive input cutoff points.",
        )
        parser.add_argument(
            "--tie-adaptive-weights",
            action="store_true",
            help="if set, ties the weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--tie-adaptive-proj",
            action="store_true",
            help="if set, ties the projection weights of adaptive softmax and adaptive input",
        )
        parser.add_argument(
            "--decoder-learned-pos",
            action="store_true",
            help="use learned positional embeddings in the decoder",
        )

        # LightConv and DynamicConv arguments
        parser.add_argument(
            "--decoder-kernel-size-list",
            type=lambda x: utils.eval_str_list(x, int),
            help='list of kernel size (default: "[3,7,15,31,31,31]")',
        )
        parser.add_argument(
            "--decoder-glu", type=utils.eval_bool, help="glu after in proj"
        )
        parser.add_argument(
            "--decoder-conv-type",
            default="dynamic",
            type=str,
            choices=["dynamic", "lightweight"],
            help="type of convolution",
        )
        parser.add_argument("--weight-softmax", default=True, type=utils.eval_bool)
        parser.add_argument(
            "--weight-dropout",
            type=float,
            metavar="D",
            help="dropout probability for conv weights",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Args:
            args: parsed arguments; missing hyper-parameters are filled in by
                ``base_lm_architecture``
            task: supplies ``dictionary`` for the input embedding and
                ``output_dictionary`` for the decoder

        Returns:
            LightConvLanguageModel: model wrapping a decoder built without
            encoder attention and without a final layer norm
        """
        # make sure all arguments are present in older models
        base_lm_architecture(args)

        # position limits default to the training sample length
        if getattr(args, "max_source_positions", None) is None:
            args.max_source_positions = args.tokens_per_sample
        if getattr(args, "max_target_positions", None) is None:
            args.max_target_positions = args.tokens_per_sample

        if args.character_embeddings:
            # NOTE(security): eval() executes the --character-filters string
            # as Python code. Only safe while that value is trusted;
            # ast.literal_eval would be the restricted alternative.
            embed_tokens = CharacterTokenEmbedder(
                task.dictionary,
                eval(args.character_filters),
                args.character_embedding_dim,
                args.decoder_embed_dim,
                args.char_embedder_highway_layers,
            )
        elif args.adaptive_input:
            embed_tokens = AdaptiveInput(
                len(task.dictionary),
                task.dictionary.pad(),
                args.decoder_input_dim,
                args.adaptive_input_factor,
                args.decoder_embed_dim,
                utils.eval_str_list(args.adaptive_input_cutoff, type=int),
            )
        else:
            embed_tokens = Embedding(
                len(task.dictionary), args.decoder_input_dim, task.dictionary.pad()
            )

        # Tying adaptive softmax to adaptive input requires both sides to
        # use the same factor, cutoffs and dimension.
        if args.tie_adaptive_weights:
            assert args.adaptive_input
            assert args.adaptive_input_factor == args.adaptive_softmax_factor
            assert (
                args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
            ), "{} != {}".format(
                args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
            )
            assert args.decoder_input_dim == args.decoder_output_dim

        decoder = LightConvDecoder(
            args,
            task.output_dictionary,
            embed_tokens,
            no_encoder_attn=True,
            final_norm=False,
        )
        return LightConvLanguageModel(decoder)
@register_model_architecture("lightconv_lm", "lightconv_lm")
def base_lm_architecture(args):
    """Fill in every LightConv LM hyper-parameter that *args* does not define.

    Attributes already present on *args* are kept, except
    ``decoder_normalize_before``, which is always forced to ``True``.
    """

    def fill(name, fallback):
        # keep an existing attribute, otherwise install the fallback
        setattr(args, name, getattr(args, name, fallback))

    for name, fallback in (
        ("decoder_embed_dim", 512),
        ("decoder_ffn_embed_dim", 2048),
        ("decoder_layers", 6),
        ("decoder_attention_heads", 8),
        ("adaptive_softmax_cutoff", None),
        ("adaptive_softmax_dropout", 0),
        ("adaptive_softmax_factor", 4),
        ("decoder_learned_pos", False),
        ("character_embeddings", False),
    ):
        fill(name, fallback)

    # these dimensions default to the embedding dimension resolved above
    fill("decoder_output_dim", args.decoder_embed_dim)
    fill("decoder_input_dim", args.decoder_embed_dim)
    fill("decoder_conv_dim", args.decoder_embed_dim)

    # The model training is not stable without this
    args.decoder_normalize_before = True

    for name, fallback in (
        ("adaptive_input", False),
        ("adaptive_input_factor", 4),
        ("adaptive_input_cutoff", None),
        ("tie_adaptive_weights", False),
        ("tie_adaptive_proj", False),
    ):
        fill(name, fallback)

    fill("decoder_kernel_size_list", [3, 7, 15, 31, 31, 31])
    # a single kernel size is repeated for every layer
    if len(args.decoder_kernel_size_list) == 1:
        args.decoder_kernel_size_list = (
            args.decoder_kernel_size_list * args.decoder_layers
        )
    assert (
        len(args.decoder_kernel_size_list) == args.decoder_layers
    ), "decoder_kernel_size_list doesn't match decoder_layers"

    fill("decoder_glu", True)
    fill("input_dropout", 0.1)
    fill("weight_dropout", args.attention_dropout)
@register_model_architecture("lightconv_lm", "lightconv_lm_gbw")
def lightconv_lm_gbw(args):
    """GBW preset; unset values then fall back to the base LM defaults."""
    preset = {
        "decoder_embed_dim": 512,
        "dropout": 0.1,
        "attention_dropout": 0.1,
        "decoder_ffn_embed_dim": 4096,
        "decoder_attention_heads": 16,
    }
    for name, fallback in preset.items():
        # attributes already on args take precedence over the preset
        setattr(args, name, getattr(args, name, fallback))
    base_lm_architecture(args)
@register_model("lstm")
class LSTMModel(FairseqEncoderDecoderModel):
    """Encoder-decoder model pairing an :class:`LSTMEncoder` with an
    :class:`LSTMDecoder`.

    Args:
        encoder: the encoder
        decoder: the decoder
    """

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--encoder-embed-dim', type=int, metavar='N',
                            help='encoder embedding dimension')
        parser.add_argument('--encoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained encoder embedding')
        parser.add_argument('--encoder-freeze-embed', action='store_true',
                            help='freeze encoder embeddings')
        parser.add_argument('--encoder-hidden-size', type=int, metavar='N',
                            help='encoder hidden size')
        parser.add_argument('--encoder-layers', type=int, metavar='N',
                            help='number of encoder layers')
        parser.add_argument('--encoder-bidirectional', action='store_true',
                            help='make all layers of encoder bidirectional')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-freeze-embed', action='store_true',
                            help='freeze decoder embeddings')
        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
                            help='decoder hidden size')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='number of decoder layers')
        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
                            help='decoder output embedding dimension')
        parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
                            help='decoder attention')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--share-decoder-input-output-embed', default=False,
                            action='store_true',
                            help='share decoder input and output embeddings')
        parser.add_argument('--share-all-embeddings', default=False, action='store_true',
                            help='share encoder, decoder and output embeddings'
                                 ' (requires shared dictionary and embed dim)')

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument('--encoder-dropout-in', type=float, metavar='D',
                            help='dropout probability for encoder input embedding')
        parser.add_argument('--encoder-dropout-out', type=float, metavar='D',
                            help='dropout probability for encoder output')
        parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
                            help='dropout probability for decoder input embedding')
        parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
                            help='dropout probability for decoder output')
        # fmt: on

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Args:
            args: parsed arguments; missing values are filled in by
                ``base_architecture``
            task: supplies ``source_dictionary`` and ``target_dictionary``

        Returns:
            LSTMModel: the assembled encoder-decoder model

        Raises:
            ValueError: if the encoder and decoder layer counts differ, or if
                the embedding-sharing options are inconsistent
        """
        # make sure that all args are properly defaulted (in case there are any new ones)
        base_architecture(args)

        if args.encoder_layers != args.decoder_layers:
            raise ValueError("--encoder-layers must match --decoder-layers")

        max_source_positions = getattr(
            args, "max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS
        )
        max_target_positions = getattr(
            args, "max_target_positions", DEFAULT_MAX_TARGET_POSITIONS
        )

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            """Create an embedding table and load the vectors from *embed_path*."""
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        if args.encoder_embed_path:
            pretrained_encoder_embed = load_pretrained_embedding_from_file(
                args.encoder_embed_path, task.source_dictionary, args.encoder_embed_dim
            )
        else:
            num_embeddings = len(task.source_dictionary)
            pretrained_encoder_embed = Embedding(
                num_embeddings, args.encoder_embed_dim, task.source_dictionary.pad()
            )

        if args.share_all_embeddings:
            # double check all parameters combinations are valid
            if task.source_dictionary != task.target_dictionary:
                raise ValueError("--share-all-embeddings requires a joint dictionary")
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embed not compatible with --decoder-embed-path"
                )
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to "
                    "match --decoder-embed-dim"
                )
            pretrained_decoder_embed = pretrained_encoder_embed
            args.share_decoder_input_output_embed = True
        else:
            # separate decoder input embeddings
            pretrained_decoder_embed = None
            if args.decoder_embed_path:
                pretrained_decoder_embed = load_pretrained_embedding_from_file(
                    args.decoder_embed_path,
                    task.target_dictionary,
                    args.decoder_embed_dim,
                )
        # one last double check of parameter combinations
        if args.share_decoder_input_output_embed and (
            args.decoder_embed_dim != args.decoder_out_embed_dim
        ):
            # message now names the flag as it is registered in add_args
            raise ValueError(
                "--share-decoder-input-output-embed requires "
                "--decoder-embed-dim to match --decoder-out-embed-dim"
            )

        if args.encoder_freeze_embed:
            pretrained_encoder_embed.weight.requires_grad = False
        if args.decoder_freeze_embed:
            if pretrained_decoder_embed is None:
                # Neither a pretrained nor a shared table exists, so there is
                # nothing to freeze yet. Create the table here, the same way
                # the encoder branch above does, instead of dereferencing
                # None.
                pretrained_decoder_embed = Embedding(
                    len(task.target_dictionary),
                    args.decoder_embed_dim,
                    task.target_dictionary.pad(),
                )
            pretrained_decoder_embed.weight.requires_grad = False

        encoder = LSTMEncoder(
            dictionary=task.source_dictionary,
            embed_dim=args.encoder_embed_dim,
            hidden_size=args.encoder_hidden_size,
            num_layers=args.encoder_layers,
            dropout_in=args.encoder_dropout_in,
            dropout_out=args.encoder_dropout_out,
            bidirectional=args.encoder_bidirectional,
            pretrained_embed=pretrained_encoder_embed,
            max_source_positions=max_source_positions,
        )
        decoder = LSTMDecoder(
            dictionary=task.target_dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=utils.eval_bool(args.decoder_attention),
            encoder_output_units=encoder.output_units,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            # adaptive softmax is only configured for the adaptive_loss criterion
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=False,
        )
        return cls(encoder, decoder)

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    ):
        """Encode the source, then decode conditioned on the encoder output.

        Args:
            src_tokens: source tokens passed to the encoder
            src_lengths: source lengths passed to the encoder
            prev_output_tokens: previous target tokens passed to the decoder
            incremental_state (dict, optional): forwarded to the decoder

        Returns:
            whatever the decoder returns
        """
        encoder_out = self.encoder(src_tokens, src_lengths=src_lengths)
        decoder_out = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
        )
        return decoder_out
class LSTMEncoder(FairseqEncoder):
    """LSTM encoder.

    Embeds the source tokens, runs them through a (possibly bidirectional,
    multi-layer) LSTM as a packed sequence and returns the outputs together
    with the final states and the padding mask.
    """

    def __init__(
        self,
        dictionary,
        embed_dim=512,
        hidden_size=512,
        num_layers=1,
        dropout_in=0.1,
        dropout_out=0.1,
        bidirectional=False,
        left_pad=True,
        pretrained_embed=None,
        padding_idx=None,
        max_source_positions=DEFAULT_MAX_SOURCE_POSITIONS,
    ):
        super().__init__(dictionary)
        owner = self.__class__.__name__
        self.num_layers = num_layers
        self.dropout_in_module = FairseqDropout(dropout_in * 1.0, module_name=owner)
        self.dropout_out_module = FairseqDropout(dropout_out * 1.0, module_name=owner)
        self.bidirectional = bidirectional
        self.hidden_size = hidden_size
        self.max_source_positions = max_source_positions

        vocab_size = len(dictionary)
        # fall back to the dictionary's pad index when none is given
        if padding_idx is None:
            self.padding_idx = dictionary.pad()
        else:
            self.padding_idx = padding_idx

        if pretrained_embed is not None:
            self.embed_tokens = pretrained_embed
        else:
            self.embed_tokens = Embedding(vocab_size, embed_dim, self.padding_idx)

        # dropout between LSTM layers is only used with more than one layer
        inter_layer_dropout = self.dropout_out_module.p if num_layers > 1 else 0.0
        self.lstm = LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            bidirectional=bidirectional,
        )
        self.left_pad = left_pad

        # a bidirectional encoder emits both directions concatenated
        self.output_units = hidden_size * 2 if bidirectional else hidden_size

    def forward(
        self,
        src_tokens: Tensor,
        src_lengths: Tensor,
        enforce_sorted: bool = True,
    ):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of
                shape `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of
                shape `(batch)`
            enforce_sorted (bool, optional): if True, `src_tokens` is
                expected to contain sequences sorted by length in a
                decreasing order. If False, this condition is not
                required. Default: True.

        Returns:
            tuple of the LSTM outputs `(seq_len, batch, output_units)`, the
            final hidden states, the final cell states and the padding mask
            `(seq_len, batch)`
        """
        if self.left_pad:
            # nn.utils.rnn.pack_padded_sequence requires right-padding;
            # convert left-padding to right-padding
            pad_fill = torch.zeros_like(src_tokens).fill_(self.padding_idx)
            src_tokens = utils.convert_padding_direction(
                src_tokens,
                pad_fill,
                left_to_right=True,
            )

        bsz, seqlen = src_tokens.size()

        # embed tokens
        x = self.dropout_in_module(self.embed_tokens(src_tokens))

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # pack embedded source tokens into a PackedSequence
        packed_x = nn.utils.rnn.pack_padded_sequence(
            x, src_lengths.cpu(), enforce_sorted=enforce_sorted
        )

        # apply LSTM, starting from all-zero states
        if self.bidirectional:
            state_size = 2 * self.num_layers, bsz, self.hidden_size
        else:
            state_size = self.num_layers, bsz, self.hidden_size
        h0 = x.new_zeros(*state_size)
        c0 = x.new_zeros(*state_size)
        packed_outs, (final_hiddens, final_cells) = self.lstm(packed_x, (h0, c0))

        # unpack outputs and apply dropout
        x, _ = nn.utils.rnn.pad_packed_sequence(
            packed_outs, padding_value=self.padding_idx * 1.0
        )
        x = self.dropout_out_module(x)
        assert list(x.size()) == [seqlen, bsz, self.output_units]

        if self.bidirectional:
            final_hiddens = self.combine_bidir(final_hiddens, bsz)
            final_cells = self.combine_bidir(final_cells, bsz)

        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()

        return tuple(
            (
                x,  # seq_len x batch x hidden
                final_hiddens,  # num_layers x batch x num_directions*hidden
                final_cells,  # num_layers x batch x num_directions*hidden
                encoder_padding_mask,  # seq_len x batch
            )
        )

    def combine_bidir(self, outs, bsz: int):
        """Merge the two directions of each layer into one feature axis.

        Reshapes ``(num_layers * 2, bsz, hidden)`` states into
        ``(num_layers, bsz, 2 * hidden)``.
        """
        per_layer = outs.view(self.num_layers, 2, bsz, -1)
        batch_major = per_layer.transpose(1, 2).contiguous()
        return batch_major.view(self.num_layers, bsz, -1)

    def reorder_encoder_out(
        self, encoder_out: Tuple[Tensor, Tensor, Tensor, Tensor], new_order
    ):
        """Reorder all four encoder outputs along dimension 1 by *new_order*."""
        outs = encoder_out[0].index_select(1, new_order)
        hiddens = encoder_out[1].index_select(1, new_order)
        cells = encoder_out[2].index_select(1, new_order)
        padding_mask = encoder_out[3].index_select(1, new_order)
        return tuple((outs, hiddens, cells, padding_mask))

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return self.max_source_positions
class AttentionLayer(nn.Module):
    """Dot-product attention over encoder states with a tanh output projection.

    The decoder state is projected to the encoder-state size, scored against
    every source position, normalised with a softmax over source positions,
    and the resulting context is concatenated with the decoder state before
    the final projection.
    """

    def __init__(self, input_embed_dim, source_embed_dim, output_embed_dim, bias=False):
        super().__init__()

        # Projects the decoder state so it can be dotted with encoder states.
        self.input_proj = Linear(input_embed_dim, source_embed_dim, bias=bias)
        # Consumes the concatenation [context ; decoder state].
        concat_dim = input_embed_dim + source_embed_dim
        self.output_proj = Linear(concat_dim, output_embed_dim, bias=bias)

    def forward(self, input, source_hids, encoder_padding_mask):
        """Attend over ``source_hids`` using ``input`` as the query.

        Args:
            input: bsz x input_embed_dim
            source_hids: srclen x bsz x source_embed_dim
            encoder_padding_mask: mask of padded source positions, or None
                to attend over every position.

        Returns:
            A pair ``(output, attn_scores)`` where ``attn_scores`` is
            srclen x bsz.
        """
        # query: bsz x source_embed_dim
        query = self.input_proj(input)

        # Unnormalised score per (source position, batch item).
        scores = (source_hids * query.unsqueeze(0)).sum(dim=2)

        if encoder_padding_mask is not None:
            # FP16 support: mask in float32, then cast back.
            masked = scores.float().masked_fill_(encoder_padding_mask, float("-inf"))
            scores = masked.type_as(scores)

        attn_scores = F.softmax(scores, dim=0)  # srclen x bsz

        # Attention-weighted sum of the encoder states.
        context = (attn_scores.unsqueeze(2) * source_hids).sum(dim=0)

        combined = torch.cat((context, input), dim=1)
        return torch.tanh(self.output_proj(combined)), attn_scores
dropout_in=0.1, dropout_out=0.1, attention=True, encoder_output_units=512, pretrained_embed=None, share_input_output_embed=False, adaptive_softmax_cutoff=None, max_target_positions=DEFAULT_MAX_TARGET_POSITIONS, residuals=False, ): super().__init__(dictionary) self.dropout_in_module = FairseqDropout( dropout_in * 1.0, module_name=self.__class__.__name__ ) self.dropout_out_module = FairseqDropout( dropout_out * 1.0, module_name=self.__class__.__name__ ) self.hidden_size = hidden_size self.share_input_output_embed = share_input_output_embed self.need_attn = True self.max_target_positions = max_target_positions self.residuals = residuals self.num_layers = num_layers self.adaptive_softmax = None num_embeddings = len(dictionary) padding_idx = dictionary.pad() if pretrained_embed is None: self.embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx) else: self.embed_tokens = pretrained_embed self.encoder_output_units = encoder_output_units if encoder_output_units != hidden_size and encoder_output_units != 0: self.encoder_hidden_proj = Linear(encoder_output_units, hidden_size) self.encoder_cell_proj = Linear(encoder_output_units, hidden_size) else: self.encoder_hidden_proj = self.encoder_cell_proj = None # disable input feeding if there is no encoder # input feeding is described in arxiv.org/abs/1508.04025 input_feed_size = 0 if encoder_output_units == 0 else hidden_size self.layers = nn.ModuleList( [ LSTMCell( input_size=input_feed_size + embed_dim if layer == 0 else hidden_size, hidden_size=hidden_size, ) for layer in range(num_layers) ] ) if attention: # TODO make bias configurable self.attention = AttentionLayer( hidden_size, encoder_output_units, hidden_size, bias=False ) else: self.attention = None if hidden_size != out_embed_dim: self.additional_fc = Linear(hidden_size, out_embed_dim) if adaptive_softmax_cutoff is not None: # setting adaptive_softmax dropout to dropout_out for now but can be redefined self.adaptive_softmax = AdaptiveSoftmax( num_embeddings, 
hidden_size, adaptive_softmax_cutoff, dropout=dropout_out, ) elif not self.share_input_output_embed: self.fc_out = Linear(out_embed_dim, num_embeddings, dropout=dropout_out) def forward( self, prev_output_tokens, encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, src_lengths: Optional[Tensor] = None, ): x, attn_scores = self.extract_features( prev_output_tokens, encoder_out, incremental_state ) return self.output_layer(x), attn_scores def extract_features( self, prev_output_tokens, encoder_out: Optional[Tuple[Tensor, Tensor, Tensor, Tensor]] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, ): """ Similar to *forward* but only return features. """ # get outputs from encoder if encoder_out is not None: encoder_outs = encoder_out[0] encoder_hiddens = encoder_out[1] encoder_cells = encoder_out[2] encoder_padding_mask = encoder_out[3] else: encoder_outs = torch.empty(0) encoder_hiddens = torch.empty(0) encoder_cells = torch.empty(0) encoder_padding_mask = torch.empty(0) srclen = encoder_outs.size(0) if incremental_state is not None and len(incremental_state) > 0: prev_output_tokens = prev_output_tokens[:, -1:] bsz, seqlen = prev_output_tokens.size() # embed tokens x = self.embed_tokens(prev_output_tokens) x = self.dropout_in_module(x) # B x T x C -> T x B x C x = x.transpose(0, 1) # initialize previous states (or get from cache during incremental generation) if incremental_state is not None and len(incremental_state) > 0: prev_hiddens, prev_cells, input_feed = self.get_cached_state( incremental_state ) elif encoder_out is not None: # setup recurrent cells prev_hiddens = [encoder_hiddens[i] for i in range(self.num_layers)] prev_cells = [encoder_cells[i] for i in range(self.num_layers)] if self.encoder_hidden_proj is not None: prev_hiddens = [self.encoder_hidden_proj(y) for y in prev_hiddens] prev_cells = [self.encoder_cell_proj(y) for y in 
prev_cells] input_feed = x.new_zeros(bsz, self.hidden_size) else: # setup zero cells, since there is no encoder zero_state = x.new_zeros(bsz, self.hidden_size) prev_hiddens = [zero_state for i in range(self.num_layers)] prev_cells = [zero_state for i in range(self.num_layers)] input_feed = None assert ( srclen > 0 or self.attention is None ), "attention is not supported if there are no encoder outputs" attn_scores: Optional[Tensor] = ( x.new_zeros(srclen, seqlen, bsz) if self.attention is not None else None ) outs = [] for j in range(seqlen): # input feeding: concatenate context vector from previous time step if input_feed is not None: input = torch.cat((x[j, :, :], input_feed), dim=1) else: input = x[j] for i, rnn in enumerate(self.layers): # recurrent cell hidden, cell = rnn(input, (prev_hiddens[i], prev_cells[i])) # hidden state becomes the input to the next layer input = self.dropout_out_module(hidden) if self.residuals: input = input + prev_hiddens[i] # save state for next time step prev_hiddens[i] = hidden prev_cells[i] = cell # apply attention using the last layer's hidden state if self.attention is not None: assert attn_scores is not None out, attn_scores[:, j, :] = self.attention( hidden, encoder_outs, encoder_padding_mask ) else: out = hidden out = self.dropout_out_module(out) # input feeding if input_feed is not None: input_feed = out # save final output outs.append(out) # Stack all the necessary tensors together and store prev_hiddens_tensor = torch.stack(prev_hiddens) prev_cells_tensor = torch.stack(prev_cells) cache_state = torch.jit.annotate( Dict[str, Optional[Tensor]], { "prev_hiddens": prev_hiddens_tensor, "prev_cells": prev_cells_tensor, "input_feed": input_feed, }, ) self.set_incremental_state(incremental_state, "cached_state", cache_state) # collect outputs across time steps x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size) # T x B x C -> B x T x C x = x.transpose(1, 0) if hasattr(self, "additional_fc") and self.adaptive_softmax is 
None: x = self.additional_fc(x) x = self.dropout_out_module(x) # srclen x tgtlen x bsz -> bsz x tgtlen x srclen if not self.training and self.need_attn and self.attention is not None: assert attn_scores is not None attn_scores = attn_scores.transpose(0, 2) else: attn_scores = None return x, attn_scores def output_layer(self, x): """Project features to the vocabulary size.""" if self.adaptive_softmax is None: if self.share_input_output_embed: x = F.linear(x, self.embed_tokens.weight) else: x = self.fc_out(x) return x def get_cached_state( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], ) -> Tuple[List[Tensor], List[Tensor], Optional[Tensor]]: cached_state = self.get_incremental_state(incremental_state, "cached_state") assert cached_state is not None prev_hiddens_ = cached_state["prev_hiddens"] assert prev_hiddens_ is not None prev_cells_ = cached_state["prev_cells"] assert prev_cells_ is not None prev_hiddens = [prev_hiddens_[i] for i in range(self.num_layers)] prev_cells = [prev_cells_[j] for j in range(self.num_layers)] input_feed = cached_state[ "input_feed" ] # can be None for decoder-only language models return prev_hiddens, prev_cells, input_feed def reorder_incremental_state( self, incremental_state: Dict[str, Dict[str, Optional[Tensor]]], new_order: Tensor, ): if incremental_state is None or len(incremental_state) == 0: return prev_hiddens, prev_cells, input_feed = self.get_cached_state(incremental_state) prev_hiddens = [p.index_select(0, new_order) for p in prev_hiddens] prev_cells = [p.index_select(0, new_order) for p in prev_cells] if input_feed is not None: input_feed = input_feed.index_select(0, new_order) cached_state_new = torch.jit.annotate( Dict[str, Optional[Tensor]], { "prev_hiddens": torch.stack(prev_hiddens), "prev_cells": torch.stack(prev_cells), "input_feed": input_feed, }, ) self.set_incremental_state(incremental_state, "cached_state", cached_state_new), return def max_positions(self): """Maximum output length supported by 
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Build an ``nn.Embedding`` initialised from U(-0.1, 0.1).

    Args:
        num_embeddings: vocabulary size.
        embedding_dim: size of each embedding vector.
        padding_idx: index of the padding symbol, forwarded to
            ``nn.Embedding``; its row is reset to zero. May be ``None``, in
            which case no row is zeroed.

    Returns:
        The initialised ``nn.Embedding`` module.
    """
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.uniform_(m.weight, -0.1, 0.1)
    # Indexing with ``None`` selects the whole matrix (it only adds a leading
    # axis), so an unguarded reset would zero every embedding.
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m


def _uniform_init_rnn_(module):
    """Fill every weight/bias parameter of ``module`` in place from U(-0.1, 0.1)."""
    for name, param in module.named_parameters():
        if "weight" in name or "bias" in name:
            nn.init.uniform_(param, -0.1, 0.1)
    return module


def LSTM(input_size, hidden_size, **kwargs):
    """Build an ``nn.LSTM`` whose weights and biases are drawn from U(-0.1, 0.1).

    Extra keyword arguments are forwarded to ``nn.LSTM`` unchanged.
    """
    return _uniform_init_rnn_(nn.LSTM(input_size, hidden_size, **kwargs))


def LSTMCell(input_size, hidden_size, **kwargs):
    """Build an ``nn.LSTMCell`` whose weights and biases are drawn from U(-0.1, 0.1).

    Extra keyword arguments are forwarded to ``nn.LSTMCell`` unchanged.
    """
    return _uniform_init_rnn_(nn.LSTMCell(input_size, hidden_size, **kwargs))


def Linear(in_features, out_features, bias=True, dropout=0.0):
    """Linear layer (input: N x T x C) initialised from U(-0.1, 0.1).

    Args:
        in_features: size of each input feature vector.
        out_features: size of each output feature vector.
        bias: whether the layer has a bias term.
        dropout: accepted so existing call sites keep working; it is not
            used by this function.

    Returns:
        The initialised ``nn.Linear`` module.
    """
    m = nn.Linear(in_features, out_features, bias=bias)
    nn.init.uniform_(m.weight, -0.1, 0.1)
    if bias:
        nn.init.uniform_(m.bias, -0.1, 0.1)
    return m
@register_model_architecture("lstm", "lstm_wiseman_iwslt_de_en")
def lstm_wiseman_iwslt_de_en(args):
    """Fill in the ``lstm_wiseman_iwslt_de_en`` defaults, then the base ones.

    Only attributes that are missing from ``args`` receive a default; values
    already present are kept.
    """
    constant_defaults = (
        ("dropout", 0.1),
        ("encoder_embed_dim", 256),
        ("encoder_dropout_in", 0),
        ("encoder_dropout_out", 0),
        ("decoder_embed_dim", 256),
        ("decoder_out_embed_dim", 256),
        ("decoder_dropout_in", 0),
    )
    for attr, fallback in constant_defaults:
        setattr(args, attr, getattr(args, attr, fallback))
    # Resolved last: its fallback is the dropout value settled above.
    args.decoder_dropout_out = getattr(args, "decoder_dropout_out", args.dropout)
    base_architecture(args)


@register_model_architecture("lstm", "lstm_luong_wmt_en_de")
def lstm_luong_wmt_en_de(args):
    """Fill in the ``lstm_luong_wmt_en_de`` defaults, then the base ones.

    Only attributes that are missing from ``args`` receive a default; values
    already present are kept.
    """
    constant_defaults = (
        ("encoder_embed_dim", 1000),
        ("encoder_layers", 4),
        ("encoder_dropout_out", 0),
        ("decoder_embed_dim", 1000),
        ("decoder_layers", 4),
        ("decoder_out_embed_dim", 1000),
        ("decoder_dropout_out", 0),
    )
    for attr, fallback in constant_defaults:
        setattr(args, attr, getattr(args, attr, fallback))
    base_architecture(args)
@register_model("lstm_lm")
class LSTMLanguageModel(FairseqLanguageModel):
    """Decoder-only LSTM language model.

    The model wraps a single ``LSTMDecoder`` built without attention and with
    ``encoder_output_units=0`` (see ``build_model``).
    """

    def __init__(self, decoder):
        super().__init__(decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--dropout', type=float, metavar='D',
                            help='dropout probability')
        parser.add_argument('--decoder-embed-dim', type=int, metavar='N',
                            help='decoder embedding dimension')
        parser.add_argument('--decoder-embed-path', type=str, metavar='STR',
                            help='path to pre-trained decoder embedding')
        parser.add_argument('--decoder-hidden-size', type=int, metavar='N',
                            help='decoder hidden size')
        parser.add_argument('--decoder-layers', type=int, metavar='N',
                            help='number of decoder layers')
        parser.add_argument('--decoder-out-embed-dim', type=int, metavar='N',
                            help='decoder output embedding dimension')
        # NOTE: parsed for compatibility; build_model always passes
        # attention=False to the decoder.
        parser.add_argument('--decoder-attention', type=str, metavar='BOOL',
                            help='decoder attention')
        parser.add_argument('--adaptive-softmax-cutoff', metavar='EXPR',
                            help='comma separated list of adaptive softmax cutoff points. '
                                 'Must be used with adaptive_loss criterion')
        parser.add_argument('--residuals', default=False,
                            action='store_true',
                            help='applying residuals between LSTM layers')

        # Granular dropout settings (if not specified these default to --dropout)
        parser.add_argument('--decoder-dropout-in', type=float, metavar='D',
                            help='dropout probability for decoder input embedding')
        parser.add_argument('--decoder-dropout-out', type=float, metavar='D',
                            help='dropout probability for decoder output')
        parser.add_argument('--share-decoder-input-output-embed', default=False,
                            action='store_true',
                            help='share decoder input and output embeddings')
        # fmt: on

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        Args:
            args: parsed arguments; missing attributes are filled in by
                ``base_architecture``.
            task: task providing ``dictionary``, ``source_dictionary`` and
                ``target_dictionary``.

        Raises:
            ValueError: if ``--share-decoder-input-output-embed`` is set but
                the source and target dictionaries differ, or the decoder
                input and output embedding sizes differ.
        """
        # make sure all arguments are present in older models
        base_architecture(args)

        # An explicit max_target_positions wins; otherwise fall back to
        # tokens_per_sample, then to the module default.
        if getattr(args, "max_target_positions", None) is not None:
            max_target_positions = args.max_target_positions
        else:
            max_target_positions = getattr(
                args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
            )

        def load_pretrained_embedding_from_file(embed_path, dictionary, embed_dim):
            """Create an embedding table and load vectors from ``embed_path``."""
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            embed_tokens = Embedding(num_embeddings, embed_dim, padding_idx)
            embed_dict = utils.parse_embedding(embed_path)
            utils.print_embed_overlap(embed_dict, dictionary)
            return utils.load_embedding(embed_dict, dictionary, embed_tokens)

        pretrained_decoder_embed = None
        if args.decoder_embed_path:
            pretrained_decoder_embed = load_pretrained_embedding_from_file(
                args.decoder_embed_path, task.target_dictionary, args.decoder_embed_dim
            )

        if args.share_decoder_input_output_embed:
            # double check all parameters combinations are valid
            # (messages name the flag exactly as registered in add_args)
            if task.source_dictionary != task.target_dictionary:
                raise ValueError(
                    "--share-decoder-input-output-embed requires a joint dictionary"
                )

            if args.decoder_embed_dim != args.decoder_out_embed_dim:
                raise ValueError(
                    "--share-decoder-input-output-embed requires "
                    "--decoder-embed-dim to match --decoder-out-embed-dim"
                )

        decoder = LSTMDecoder(
            dictionary=task.dictionary,
            embed_dim=args.decoder_embed_dim,
            hidden_size=args.decoder_hidden_size,
            out_embed_dim=args.decoder_out_embed_dim,
            num_layers=args.decoder_layers,
            dropout_in=args.decoder_dropout_in,
            dropout_out=args.decoder_dropout_out,
            attention=False,  # decoder-only language model doesn't support attention
            encoder_output_units=0,
            pretrained_embed=pretrained_decoder_embed,
            share_input_output_embed=args.share_decoder_input_output_embed,
            # cutoffs are only parsed when the adaptive loss criterion is in use
            adaptive_softmax_cutoff=(
                utils.eval_str_list(args.adaptive_softmax_cutoff, type=int)
                if args.criterion == "adaptive_loss"
                else None
            ),
            max_target_positions=max_target_positions,
            residuals=args.residuals,
        )

        return cls(decoder)
import logging import torch import torch.nn as nn import torch.nn.functional as F from fairseq import utils from fairseq.models import ( FairseqEncoder, FairseqEncoderModel, register_model, register_model_architecture, ) from fairseq.modules import ( LayerNorm, SinusoidalPositionalEmbedding, TransformerSentenceEncoder, ) from fairseq.modules.transformer_sentence_encoder import init_bert_params from fairseq.utils import safe_hasattr logger = logging.getLogger(__name__) @register_model("masked_lm") class MaskedLMModel(FairseqEncoderModel): """ Class for training a Masked Language Model. It also supports an additional sentence level prediction if the sent-loss argument is set. """ def __init__(self, args, encoder): super().__init__(encoder) self.args = args # if specified then apply bert initialization on the model. We need # to explictly call this to make sure that the output embeddings # and projection layers are also correctly initialized if getattr(args, "apply_bert_init", False): self.apply(init_bert_params) @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" # Arguments related to dropout parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for" " attention weights", ) parser.add_argument( "--act-dropout", type=float, metavar="D", help="dropout probability after" " activation in FFN", ) # Arguments related to hidden states and self-attention parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="N", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-layers", type=int, metavar="N", help="num encoder layers" ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="N", help="num encoder attention heads", ) # Arguments related to input and output embeddings parser.add_argument( "--encoder-embed-dim", type=int, metavar="N", help="encoder embedding 
dimension", ) parser.add_argument( "--share-encoder-input-output-embed", action="store_true", help="share encoder input" " and output embeddings", ) parser.add_argument( "--encoder-learned-pos", action="store_true", help="use learned positional embeddings in the encoder", ) parser.add_argument( "--no-token-positional-embeddings", action="store_true", help="if set, disables positional embeddings" " (outside self attention)", ) parser.add_argument( "--num-segment", type=int, metavar="N", help="num segment in the input" ) parser.add_argument( "--max-positions", type=int, help="number of positional embeddings to learn" ) # Arguments related to sentence level prediction parser.add_argument( "--sentence-class-num", type=int, metavar="N", help="number of classes for sentence task", ) parser.add_argument( "--sent-loss", action="store_true", help="if set," " calculate sentence level predictions", ) # Arguments related to parameter initialization parser.add_argument( "--apply-bert-init", action="store_true", help="use custom param initialization for BERT", ) # misc params parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--pooler-activation-fn", choices=utils.get_available_activation_fns(), help="Which activation function to use for pooler layer.", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) def forward(self, src_tokens, segment_labels=None, **kwargs): return self.encoder(src_tokens, segment_labels=segment_labels, **kwargs) def max_positions(self): return self.encoder.max_positions @classmethod def build_model(cls, args, task): """Build a new model instance.""" # make sure all arguments are present in older models base_architecture(args) if not safe_hasattr(args, "max_positions"): args.max_positions = args.tokens_per_sample logger.info(args) encoder = MaskedLMEncoder(args, task.dictionary) return 
class MaskedLMEncoder(FairseqEncoder):
    """
    Encoder for Masked Language Modelling.

    Wraps a ``TransformerSentenceEncoder`` and adds a language-model head, a
    pooler over the sentence representation and, when ``args.sent_loss`` is
    set, a sentence-level projection.
    """

    def __init__(self, args, dictionary):
        super().__init__(dictionary)

        self.padding_idx = dictionary.pad()
        self.vocab_size = len(dictionary)
        # NOTE: this instance attribute shadows the ``max_positions`` method
        # defined below, so ``encoder.max_positions`` evaluates to this value
        # rather than to a bound method.
        self.max_positions = args.max_positions

        self.sentence_encoder = TransformerSentenceEncoder(
            padding_idx=self.padding_idx,
            vocab_size=self.vocab_size,
            num_encoder_layers=args.encoder_layers,
            embedding_dim=args.encoder_embed_dim,
            ffn_embedding_dim=args.encoder_ffn_embed_dim,
            num_attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            attention_dropout=args.attention_dropout,
            activation_dropout=args.act_dropout,
            max_seq_len=self.max_positions,
            num_segments=args.num_segment,
            use_position_embeddings=not args.no_token_positional_embeddings,
            encoder_normalize_before=args.encoder_normalize_before,
            apply_bert_init=args.apply_bert_init,
            activation_fn=args.activation_fn,
            learned_pos_embedding=args.encoder_learned_pos,
        )

        self.share_input_output_embed = args.share_encoder_input_output_embed
        # Optional heads; they stay None unless created below.
        self.embed_out = None
        self.sentence_projection_layer = None
        self.sentence_out_dim = args.sentence_class_num
        self.lm_output_learned_bias = None

        # Remove head is set to true during fine-tuning
        self.load_softmax = not getattr(args, "remove_head", False)

        self.masked_lm_pooler = nn.Linear(
            args.encoder_embed_dim, args.encoder_embed_dim
        )
        self.pooler_activation = utils.get_activation_fn(args.pooler_activation_fn)

        self.lm_head_transform_weight = nn.Linear(
            args.encoder_embed_dim, args.encoder_embed_dim
        )
        self.activation_fn = utils.get_activation_fn(args.activation_fn)
        self.layer_norm = LayerNorm(args.encoder_embed_dim)

        if self.load_softmax:
            self.lm_output_learned_bias = nn.Parameter(torch.zeros(self.vocab_size))

            if not self.share_input_output_embed:
                self.embed_out = nn.Linear(
                    args.encoder_embed_dim, self.vocab_size, bias=False
                )

            if args.sent_loss:
                self.sentence_projection_layer = nn.Linear(
                    args.encoder_embed_dim, self.sentence_out_dim, bias=False
                )

    def forward(self, src_tokens, segment_labels=None, masked_tokens=None, **unused):
        """
        Forward pass for Masked LM encoder. This first computes the token
        embedding using the token embedding matrix, position embeddings (if
        specified) and segment embeddings (if specified).

        Here we assume that the sentence representation corresponds to the
        output of the classification_token (see bert_task or cross_lingual_lm
        task for more details).

        Args:
            - src_tokens: B x T matrix representing sentences
            - segment_labels: B x T matrix representing segment label for tokens
            - masked_tokens: optional index/mask; when given, only the
              selected positions are projected by the LM head
        Returns:
            - a tuple of the following:
                - logits for predictions in format B x T x C to be used in
                  softmax afterwards
                - a dictionary of additional data, where 'pooled_output'
                  contains the representation for classification_token and
                  'inner_states' is a list of internal model states used to
                  compute the predictions (similar to ELMo).
                  'sentence_logits' is the prediction logit for the NSP task
                  and is None unless the sentence projection layer exists.
        """

        inner_states, sentence_rep = self.sentence_encoder(
            src_tokens,
            segment_labels=segment_labels,
        )

        x = inner_states[-1].transpose(0, 1)
        # project masked tokens only
        if masked_tokens is not None:
            x = x[masked_tokens, :]
        x = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(x)))

        pooled_output = self.pooler_activation(self.masked_lm_pooler(sentence_rep))

        # project back to size of vocabulary
        if self.share_input_output_embed and hasattr(
            self.sentence_encoder.embed_tokens, "weight"
        ):
            x = F.linear(x, self.sentence_encoder.embed_tokens.weight)
        elif self.embed_out is not None:
            x = self.embed_out(x)
        if self.lm_output_learned_bias is not None:
            x = x + self.lm_output_learned_bias
        sentence_logits = None
        if self.sentence_projection_layer is not None:
            sentence_logits = self.sentence_projection_layer(pooled_output)

        return x, {
            "inner_states": inner_states,
            "pooled_output": pooled_output,
            "sentence_logits": sentence_logits,
        }

    def max_positions(self):
        """Maximum output length supported by the encoder."""
        # NOTE: on instances built through __init__ this method is hidden by
        # the attribute of the same name assigned there.
        return self.max_positions

    def upgrade_state_dict_named(self, state_dict, name):
        """Drop output-head weights from ``state_dict`` when the head is removed.

        When ``self.load_softmax`` is False, every key containing the output
        embedding, sentence projection or learned LM bias is deleted in place.
        The (possibly modified) ``state_dict`` is returned.
        """
        if not self.load_softmax:
            for k in list(state_dict.keys()):
                if (
                    "embed_out.weight" in k
                    or "sentence_projection_layer.weight" in k
                    or "lm_output_learned_bias" in k
                ):
                    del state_dict[k]
        return state_dict
@register_model_architecture("masked_lm", "bert_base")
def bert_base_architecture(args):
    """Fill in the ``bert_base`` defaults, then the base ``masked_lm`` ones.

    Only attributes that are missing from ``args`` receive a default; values
    already present are kept.
    """
    defaults = {
        "encoder_embed_dim": 768,
        "share_encoder_input_output_embed": True,
        "no_token_positional_embeddings": False,
        "encoder_learned_pos": True,
        "num_segment": 2,
        "encoder_layers": 12,
        "encoder_attention_heads": 12,
        "encoder_ffn_embed_dim": 3072,
        "sentence_class_num": 2,
        "sent_loss": True,
        "apply_bert_init": True,
        "activation_fn": "gelu",
        "pooler_activation_fn": "tanh",
        "encoder_normalize_before": True,
    }
    for attr, fallback in defaults.items():
        setattr(args, attr, getattr(args, attr, fallback))
    base_architecture(args)
@register_model_architecture("masked_lm", "xlm_base")
def xlm_architecture(args):
    """Fill in the ``xlm_base`` defaults, then the base ``masked_lm`` ones.

    Only attributes that are missing from ``args`` receive a default; values
    already present are kept.
    """
    defaults = {
        "encoder_embed_dim": 1024,
        "share_encoder_input_output_embed": True,
        "no_token_positional_embeddings": False,
        "encoder_learned_pos": True,
        "num_segment": 1,
        "encoder_layers": 6,
        "encoder_attention_heads": 8,
        "encoder_ffn_embed_dim": 4096,
        "sent_loss": False,
        "activation_fn": "gelu",
        "encoder_normalize_before": False,
        "pooler_activation_fn": "tanh",
        "apply_bert_init": True,
    }
    for attr, fallback in defaults.items():
        setattr(args, attr, getattr(args, attr, fallback))
    base_architecture(args)
from typing import List, Optional

import torch
from torch import Tensor


@torch.jit.script
def script_skip_tensor_list(x: List[Tensor], mask):
    """Apply ``mask`` to every tensor in ``x`` and return the results as a list.

    A tensor whose first dimension matches ``mask.size(0)`` is indexed along
    dim 0; any other tensor is indexed along dim 1. If the masked result has
    no elements, the original (unmasked) tensor is returned in its place.
    """
    res = [xi[mask] if xi.size(0) == mask.size(0) else xi[:, mask] for xi in x]
    outputs = []
    for i, t in enumerate(res):
        if t.numel() != 0:
            outputs.append(t)
        else:
            # masking removed everything: fall back to the untouched input
            outputs.append(x[i])
    return outputs


@torch.jit.script
def script_skip_tensor(x: Tensor, mask):
    """Single-tensor counterpart of the list version above.

    Returns ``x`` unchanged when its first dimension is empty or when the
    masked result has no elements; otherwise returns ``x`` indexed by ``mask``
    along dim 0 (first dimensions match) or dim 1 (they do not).
    """
    # None case
    if x.size(0) == 0:
        return x
    res = x[mask] if x.size(0) == mask.size(0) else x[:, mask]
    if res.numel() == 0:
        return x
    else:
        return res


@torch.jit.script
def expand_2d_or_3d_tensor(x, trg_dim: int, padding_idx: int):
    """
    Expand 2D/3D tensor on dim=1

    ``x`` is padded on the right of dim 1 with ``padding_idx`` until that
    dimension has size ``trg_dim``. ``trg_dim`` must not be smaller than the
    current size (asserted); when it is equal, ``x`` is returned as-is.
    """
    if x is None:
        return None

    assert x.dim() == 2 or x.dim() == 3
    assert trg_dim >= x.size(1), (trg_dim, x.size())
    if trg_dim == x.size(1):
        return x

    # shape of the padding block appended along dim 1
    dims = [x.size(0), trg_dim - x.size(1)]
    if x.dim() == 3:
        dims.append(x.size(2))
    # .to(x) makes the padding block match x's dtype and device
    x = torch.cat([x, torch.zeros(dims).to(x).fill_(padding_idx)], 1)

    return x


@torch.jit.script
def coalesce(x: Optional[Tensor], y: Tensor) -> Tensor:
    """Return ``x`` when it is not None, otherwise ``y``."""
    return x if x is not None else y


@torch.jit.script
def fill_tensors(
    x: Optional[Tensor], mask, y: Optional[Tensor], padding_idx: int
) -> Optional[Tensor]:
    """
    Filling tensor x with y at masked positions (dim=0).

    ``x`` is returned untouched when it is None, has an empty first
    dimension, ``y`` is None, or ``mask`` selects nothing. When ``mask``
    selects every row, ``y`` itself is returned. Otherwise the selected rows
    of ``x`` are overwritten with ``y``; the assignments on ``x`` are done in
    place except when ``x`` first has to be widened along dim 1.
    """
    if x is None or x.size()[0] == 0 or y is None:
        return x
    assert x.dim() == y.dim() and mask.size(0) == x.size(0)
    assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2))

    n_selected = mask.sum()
    if n_selected == 0:
        return x
    assert n_selected == y.size(0)
    if n_selected == x.size(0):
        return y

    if x.size(1) < y.size(1):
        # y is wider: pad x out to y's width before writing the rows
        x = expand_2d_or_3d_tensor(x, y.size(1), padding_idx)
        x[mask] = y
    elif x.size(1) > y.size(1):
        # y is narrower: reset the selected rows to padding, then write y
        # into their leading columns
        x[mask] = torch.tensor(padding_idx).type_as(x)
        if x.dim() == 2:
            x[mask, : y.size(1)] = y
        else:
            x[mask, : y.size(1), :] = y
    else:
        x[mask] = y
    return x


================================================
FILE: fairseq/models/multilingual_transformer.py
================================================
# Copyright (c) Facebook, Inc.
# and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from collections import OrderedDict

from fairseq import utils
from fairseq.models import (
    FairseqMultiModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.transformer import (
    Embedding,
    TransformerDecoder,
    TransformerEncoder,
    TransformerModel,
    base_architecture,
)
from fairseq.utils import safe_hasattr


@register_model("multilingual_transformer")
class MultilingualTransformerModel(FairseqMultiModel):
    """Train Transformer models for multiple language pairs simultaneously.

    Requires `--task multilingual_translation`.

    We inherit all arguments from TransformerModel and assume that all language
    pairs use a single Transformer architecture. In addition, we provide several
    options that are specific to the multilingual setting.

    Args:
        --share-encoder-embeddings: share encoder embeddings across all source languages
        --share-decoder-embeddings: share decoder embeddings across all target languages
        --share-encoders: share all encoder params (incl. embeddings) across all source languages
        --share-decoders: share all decoder params (incl. embeddings) across all target languages
    """

    def __init__(self, encoders, decoders):
        super().__init__(encoders, decoders)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        # start from the full TransformerModel option set, then add the
        # four multilingual sharing switches
        TransformerModel.add_args(parser)
        parser.add_argument(
            "--share-encoder-embeddings",
            action="store_true",
            help="share encoder embeddings across languages",
        )
        parser.add_argument(
            "--share-decoder-embeddings",
            action="store_true",
            help="share decoder embeddings across languages",
        )
        parser.add_argument(
            "--share-encoders",
            action="store_true",
            help="share encoders across languages",
        )
        parser.add_argument(
            "--share-decoders",
            action="store_true",
            help="share decoders across languages",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance.

        One encoder and one decoder is registered per entry of
        ``task.model_lang_pairs``. Depending on the sharing flags on ``args``
        the same embedding table and/or the same encoder/decoder module is
        reused across languages. ``args`` is modified in place (defaults are
        filled in and some sharing flags are forced on).
        """
        # imported here rather than at module level
        from fairseq.tasks.multilingual_translation import MultilingualTranslationTask

        assert isinstance(task, MultilingualTranslationTask)

        # make sure all arguments are present in older models
        base_multilingual_architecture(args)

        if not safe_hasattr(args, "max_source_positions"):
            args.max_source_positions = 1024
        if not safe_hasattr(args, "max_target_positions"):
            args.max_target_positions = 1024

        # "src-tgt" pair names split into their two halves
        src_langs = [lang_pair.split("-")[0] for lang_pair in task.model_lang_pairs]
        tgt_langs = [lang_pair.split("-")[1] for lang_pair in task.model_lang_pairs]

        # sharing a whole encoder/decoder implies sharing its embeddings
        if args.share_encoders:
            args.share_encoder_embeddings = True
        if args.share_decoders:
            args.share_decoder_embeddings = True

        def build_embedding(dictionary, embed_dim, path=None):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            emb = Embedding(num_embeddings, embed_dim, padding_idx)
            # if provided, load from preloaded dictionaries
            if path:
                embed_dict = utils.parse_embedding(path)
                utils.load_embedding(embed_dict, dictionary, emb)
            return emb

        # build shared embeddings (if applicable)
        shared_encoder_embed_tokens, shared_decoder_embed_tokens = None, None
        if args.share_all_embeddings:
            if args.encoder_embed_dim != args.decoder_embed_dim:
                raise ValueError(
                    "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
                )
            if args.decoder_embed_path and (
                args.decoder_embed_path != args.encoder_embed_path
            ):
                raise ValueError(
                    "--share-all-embeddings not compatible with --decoder-embed-path"
                )
            shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                dicts=task.dicts,
                langs=task.langs,
                embed_dim=args.encoder_embed_dim,
                build_embedding=build_embedding,
                pretrained_embed_path=args.encoder_embed_path,
            )
            # a single table serves both sides
            shared_decoder_embed_tokens = shared_encoder_embed_tokens
            args.share_decoder_input_output_embed = True
        else:
            if args.share_encoder_embeddings:
                shared_encoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                    dicts=task.dicts,
                    langs=src_langs,
                    embed_dim=args.encoder_embed_dim,
                    build_embedding=build_embedding,
                    pretrained_embed_path=args.encoder_embed_path,
                )
            if args.share_decoder_embeddings:
                shared_decoder_embed_tokens = FairseqMultiModel.build_shared_embeddings(
                    dicts=task.dicts,
                    langs=tgt_langs,
                    embed_dim=args.decoder_embed_dim,
                    build_embedding=build_embedding,
                    pretrained_embed_path=args.decoder_embed_path,
                )

        # encoders/decoders for each language
        # (these dicts cache one module per language so that pairs with the
        # same source/target language reuse the same module)
        lang_encoders, lang_decoders = {}, {}

        def get_encoder(lang):
            if lang not in lang_encoders:
                if shared_encoder_embed_tokens is not None:
                    encoder_embed_tokens = shared_encoder_embed_tokens
                else:
                    encoder_embed_tokens = build_embedding(
                        task.dicts[lang],
                        args.encoder_embed_dim,
                        args.encoder_embed_path,
                    )
                lang_encoders[lang] = cls._get_module_class(
                    True, args, task.dicts[lang], encoder_embed_tokens, src_langs
                )
            return lang_encoders[lang]

        def get_decoder(lang):
            if lang not in lang_decoders:
                if shared_decoder_embed_tokens is not None:
                    decoder_embed_tokens = shared_decoder_embed_tokens
                else:
                    decoder_embed_tokens = build_embedding(
                        task.dicts[lang],
                        args.decoder_embed_dim,
                        args.decoder_embed_path,
                    )
                lang_decoders[lang] = cls._get_module_class(
                    False, args, task.dicts[lang], decoder_embed_tokens, tgt_langs
                )
            return lang_decoders[lang]

        # shared encoders/decoders (if applicable)
        # (built from the first source/target language's dictionary)
        shared_encoder, shared_decoder = None, None
        if args.share_encoders:
            shared_encoder = get_encoder(src_langs[0])
        if args.share_decoders:
            shared_decoder = get_decoder(tgt_langs[0])

        encoders, decoders = OrderedDict(), OrderedDict()
        for lang_pair, src, tgt in zip(task.model_lang_pairs, src_langs, tgt_langs):
            encoders[lang_pair] = (
                shared_encoder if shared_encoder is not None else get_encoder(src)
            )
            decoders[lang_pair] = (
                shared_decoder if shared_decoder is not None else get_decoder(tgt)
            )

        return MultilingualTransformerModel(encoders, decoders)

    @classmethod
    def _get_module_class(cls, is_encoder, args, lang_dict, embed_tokens, langs):
        """Instantiate a TransformerEncoder or TransformerDecoder.

        ``is_encoder`` selects the class. ``langs`` is accepted but not used
        by this implementation.
        """
        module_class = TransformerEncoder if is_encoder else TransformerDecoder
        return module_class(args, lang_dict, embed_tokens)

    def load_state_dict(self, state_dict, strict=True, model_cfg=None):
        """Load ``state_dict``, ignoring entries for unknown language pairs.

        Every key must start with ``"models."`` (asserted); the component
        after the first dot is taken as the language pair. Entries whose pair
        is not in ``self.models`` are dropped from a copy of ``state_dict``
        before delegating to the parent implementation. The caller's dict is
        not modified.
        """
        state_dict_subset = state_dict.copy()
        for k, _ in state_dict.items():
            assert k.startswith("models.")
            lang_pair = k.split(".")[1]
            if lang_pair not in self.models:
                del state_dict_subset[k]
        super().load_state_dict(state_dict_subset, strict=strict, model_cfg=model_cfg)


@register_model_architecture("multilingual_transformer", "multilingual_transformer")
def base_multilingual_architecture(args):
    """Apply the base Transformer defaults, then default all four
    multilingual sharing flags to False when they are not already set."""
    base_architecture(args)
    args.share_encoder_embeddings = getattr(args, "share_encoder_embeddings", False)
    args.share_decoder_embeddings = getattr(args, "share_decoder_embeddings", False)
    args.share_encoders = getattr(args, "share_encoders", False)
    args.share_decoders = getattr(args, "share_decoders", False)


@register_model_architecture(
    "multilingual_transformer", "multilingual_transformer_iwslt_de_en"
)
def multilingual_transformer_iwslt_de_en(args):
    # Each option keeps an already-set value and otherwise gets the default
    # given here. (The remainder of this function is on the next dump line.)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
    args.encoder_layers = getattr(args, "encoder_layers", 6)
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) args.decoder_layers = getattr(args, "decoder_layers", 6) base_multilingual_architecture(args) ================================================ FILE: fairseq/models/multires_hubert/__init__.py ================================================ from .multires_hubert import * # noqa from .multires_hubert_asr import * # noqa ================================================ FILE: fairseq/models/multires_hubert/multires_hubert.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging from dataclasses import dataclass, field from typing import Dict, List, Optional, Tuple import numpy as np import torch import math import torch.nn as nn from omegaconf import II from fairseq.models.wav2vec.wav2vec import norm_block from fairseq import utils from fairseq.data.data_utils import compute_mask_indices from fairseq.data.dictionary import Dictionary from fairseq.dataclass import ChoiceEnum, FairseqDataclass from fairseq.models import BaseFairseqModel, register_model from fairseq.models.wav2vec.wav2vec2 import ( EXTRACTOR_MODE_CHOICES, MASKING_DISTRIBUTION_CHOICES, LAYER_TYPE_CHOICES, ConvFeatureExtractionModel, TransformerEncoder, ) from omegaconf import II, MISSING, open_dict from fairseq.modules import GradMultiply, LayerNorm from fairseq.tasks.multires_hubert_pretraining import ( MultiresHubertPretrainingConfig, MultiresHubertPretrainingTask, ) logger = logging.getLogger(__name__) @dataclass class MultiresHubertConfig(FairseqDataclass): label_rate: float = II("task.label_rate") # label_rate: 1,2,2,5 # (imply (1,2), (2,5)) # if base label_rate = 50 # (1,2), (2,5) --> label rates 50, 25, 
10 label_rate_ratios: List[int] = field( default=MISSING, metadata={"help": "tuple for label rates e.g., [(1,2), (2,5)]"} ) extractor_mode: EXTRACTOR_MODE_CHOICES = field( default="default", metadata={ "help": "mode for feature extractor. default has a single group " "norm with d groups in the first conv block, whereas layer_norm " "has layer norms in every block (meant to use with normalize=True)" }, ) # the blocks for each label rate encoder_layers: int = field( default="2", metadata={ "help": "num encoder layers in the each block (one sub module of the U-net)" }, ) override_encoder_layers: str = field( default="", metadata={ "help": "specific layer numbers for each block (one sub module of the U-net) for the training" }, ) encoder_embed_dim: int = field( default=768, metadata={"help": "encoder embedding dimension"} ) encoder_ffn_embed_dim: int = field( default=3072, metadata={"help": "encoder embedding dimension for FFN"} ) encoder_attention_heads: int = field( default=12, metadata={"help": "num encoder attention heads"} ) activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field( default="gelu", metadata={"help": "activation function to use"} ) layer_type: LAYER_TYPE_CHOICES = field( default="transformer", metadata={"help": "layer type in encoder"} ) conv_adapator_kernal: int = field( default=7, metadata={"help": "kernal size for conv adaptor"} ) use_plain_updownsample: bool = field( default=False, metadata={"help": "whether to use plain up downsample"} ) # dropouts dropout: float = field( default=0.1, metadata={"help": "dropout probability for the transformer"}, ) attention_dropout: float = field( default=0.1, metadata={"help": "dropout probability for attention weights"}, ) activation_dropout: float = field( default=0.0, metadata={"help": "dropout probability after activation in FFN"}, ) encoder_layerdrop: float = field( default=0.0, metadata={"help": "probability of dropping a tarnsformer layer"}, ) dropout_input: float = field( default=0.0, 
metadata={"help": "dropout to apply to the input (after feat extr)"}, ) dropout_features: float = field( default=0.0, metadata={"help": "dropout to apply to the features (after feat extr)"}, ) final_dim: int = field( default=0, metadata={ "help": "project final representations and targets to this many " "dimensions. set to encoder_embed_dim is <= 0" }, ) untie_final_proj: bool = field( default=True, metadata={"help": "use separate projection for each target"}, ) layer_norm_first: bool = field( default=False, metadata={"help": "apply layernorm first in the transformer"}, ) conv_feature_layers: str = field( default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", metadata={ "help": "string describing convolutional feature extraction " "layers in form of a python list that contains " "[(dim, kernel_size, stride), ...]" }, ) conv_bias: bool = field( default=False, metadata={"help": "include bias in conv encoder"} ) logit_temp: float = field( default=0.1, metadata={"help": "temperature to divide logits by"} ) target_glu: bool = field( default=False, metadata={"help": "adds projection + glu to targets"} ) feature_grad_mult: float = field( default=1.0, metadata={"help": "multiply feature extractor var grads by this"}, ) use_single_target: bool = field( default=False, metadata={ "help": "whether to use single data (in that case, we will compute with the fixed label rate)" }, ) use_single_prediction: bool = field( default=False, metadata={ "help": "if true, we will not conduct mlm prediction in low resolution in the middle" }, ) use_multi_stream: bool = field( default=False, metadata={ "help": "whether to use multi-stream setting (in this setting, we have multiple streams with the same resolution)" }, ) # masking mask_length: int = field(default=10, metadata={"help": "mask length"}) mask_prob: float = field( default=0.65, metadata={"help": "probability of replacing a token with mask"}, ) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", 
metadata={"help": "how to choose mask length"} ) mask_other: float = field( default=0, metadata={ "help": "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indicesh" }, ) no_mask_overlap: bool = field( default=False, metadata={"help": "whether to allow masks to overlap"} ) mask_min_space: int = field( default=1, metadata={"help": "min space between spans (if no overlap is enabled)"}, ) # channel masking mask_channel_length: int = field( default=10, metadata={"help": "length of the mask for features (channels)"}, ) mask_channel_prob: float = field( default=0.0, metadata={"help": "probability of replacing a feature with 0"}, ) mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ "help": "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indicesh" }, ) no_mask_channel_overlap: bool = field( default=False, metadata={"help": "whether to allow channel masks to overlap"}, ) mask_channel_min_space: int = field( default=1, metadata={"help": "min space between spans (if no overlap is enabled)"}, ) # positional embeddings conv_pos: int = field( default=128, metadata={"help": "number of filters for convolutional positional embeddings"}, ) conv_pos_groups: int = field( default=16, metadata={"help": "number of groups for convolutional positional embedding"}, ) latent_temp: Tuple[float, float, float] = field( default=(2, 0.5, 0.999995), metadata={"help": "legacy (to be removed)"}, ) # loss computation skip_masked: bool = field( default=False, metadata={"help": "skip computing losses over masked frames"}, ) skip_nomask: bool = field( default=False, metadata={"help": "skip computing losses over unmasked frames"}, ) checkpoint_activations: bool = field( default=False, metadata={"help": "recompute activations and save memory for extra compute"}, 
) # FP16 optimization required_seq_len_multiple: int = field( default=2, metadata={ "help": "pad the input to encoder such that the sequence length is divisible by multiple" }, ) # Conformer depthwise_conv_kernel_size: int = field( default=31, metadata={ "help": "depthwise-conv-kernel-size for convolution in conformer layer" }, ) attn_type: str = field( default="", metadata={"help": "if espnet use ESPNET MHA"}, ) pos_enc_type: str = field( default="abs", metadata={"help": "Positional encoding type to use in conformer"}, ) fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"}) @register_model("multires_hubert", dataclass=MultiresHubertConfig) class MultiresHubertModel(BaseFairseqModel): def __init__( self, cfg: MultiresHubertConfig, task_cfg: MultiresHubertPretrainingConfig, dictionaries: List[Dictionary], ) -> None: super().__init__() logger.info(f"MultiresHubertModel Config: {cfg}") feature_enc_layers = eval(cfg.conv_feature_layers) # noqa self.embed = feature_enc_layers[-1][0] self.feature_extractor = ConvFeatureExtractionModel( conv_layers=feature_enc_layers, dropout=0.0, mode=cfg.extractor_mode, conv_bias=cfg.conv_bias, ) self.post_extract_proj = ( nn.Linear(self.embed, cfg.encoder_embed_dim) if self.embed != cfg.encoder_embed_dim else None ) # Estimate label rates assert ( cfg.label_rate_ratios != "None" ), "without ratios, the model is exactly as the Hubert model" self.label_rate_ratios = [] self.base_rate = cfg.label_rate self.label_rates = [] self.downsample_modules = nn.ModuleList() self.upsample_modules = nn.ModuleList() self.encoders = nn.ModuleList() self.decoders = nn.ModuleList() self.use_single_target = cfg.use_single_target self.use_single_prediction = cfg.use_single_prediction self.use_plain_updownsample = cfg.use_plain_updownsample # For decide the override encoder layers, so that the layer number is not equally distributed if cfg.override_encoder_layers != "": self.override_encoder_layers = eval(cfg.override_encoder_layers) 
assert ( len(self.override_encoder_layers) % 2 == 1 ), "must be odd number of layers if specify detailed layers" assert ( len(self.override_encoder_layers) // 2 == len(cfg.label_rate_ratios) // 2 ), "number of override encoder layers must match the label rate ratios information" self.len_encoder_modules = len(self.override_encoder_layers) else: self.override_encoder_layers = None self.len_encoder_modules = None # use different layers instead of equally distributed ones middle_override_encoder_layer = ( self.override_encoder_layers[self.len_encoder_modules // 2] if self.override_encoder_layers is not None else None ) skip_middle_pos_conv = False if len(cfg.label_rate_ratios) < 2 else True self.middle_encoder = TransformerEncoder( cfg, skip_pos_conv=skip_middle_pos_conv, override_encoder_layer=middle_override_encoder_layer, ) first_pos_conv = False # only enable pos_conv for the first encoder raw_label_rate_ratios = cfg.label_rate_ratios for i in range(len(raw_label_rate_ratios) // 2): # check if have override encoder layers if self.override_encoder_layers is not None: override_encoder_layer = self.override_encoder_layers[i] override_decoder_layer = self.override_encoder_layers[ self.len_encoder_modules - 1 - i ] else: override_encoder_layer, override_decoder_layer = None, None self.label_rate_ratios.append( (raw_label_rate_ratios[i * 2], raw_label_rate_ratios[i * 2 + 1]) ) if self.use_plain_updownsample: self.downsample_modules.append( ConvDownsampler( k=cfg.conv_adapator_kernal, label_rate=( ( raw_label_rate_ratios[i * 2], raw_label_rate_ratios[i * 2 + 1], ) ), dropout=0.0, channels=cfg.encoder_embed_dim, activation=nn.GELU(), log_compression=False, skip_connections=True, highway=True, residual_scale=0.4, ) ) else: self.downsample_modules.append( ConvAdapter( k=cfg.conv_adapator_kernal, label_rate=( ( raw_label_rate_ratios[i * 2], raw_label_rate_ratios[i * 2 + 1], ) ), dropout=0.0, channels=cfg.encoder_embed_dim, activation=nn.GELU(), log_compression=False, 
skip_connections=True, highway=True, residual_scale=0.4, ) ) if not first_pos_conv: self.encoders.append( TransformerEncoder( cfg, override_encoder_layer=override_encoder_layer ) ) # TODO(jiatong): add conformer options first_pos_conv = True else: self.encoders.append( TransformerEncoder( cfg, skip_pos_conv=True, override_encoder_layer=override_encoder_layer, ) ) if self.use_plain_updownsample: self.upsample_modules.append( ConvUpsampler( k=cfg.conv_adapator_kernal, label_rate=( ( raw_label_rate_ratios[i * 2 + 1], raw_label_rate_ratios[i * 2], ) ), dropout=0.0, channels=cfg.encoder_embed_dim, activation=nn.GELU(), log_compression=False, skip_connections=True, highway=True, residual_scale=0.4, ) ) else: self.upsample_modules.append( ConvAdapter( k=cfg.conv_adapator_kernal, label_rate=( ( raw_label_rate_ratios[i * 2 + 1], raw_label_rate_ratios[i * 2], ) ), dropout=0.0, channels=cfg.encoder_embed_dim, activation=nn.GELU(), log_compression=False, skip_connections=True, highway=True, residual_scale=0.4, ) ) self.decoders.append( TransformerEncoder( cfg, skip_pos_conv=True, override_encoder_layer=override_decoder_layer, ) ) base_ds_rate = np.prod([s for _, _, s in feature_enc_layers]) self.feature_ds_rates = [base_ds_rate] running_rate = self.base_rate if cfg.use_single_target or cfg.use_multi_stream: self.label_rates = self.base_rate else: self.label_rates.append(self.base_rate) for label_rate_ratio in self.label_rate_ratios: upsample_rate, downsample_rate = label_rate_ratio if (base_ds_rate * upsample_rate) % downsample_rate != 0: logger.warning( "base rate: {} cannot be ideally processed with downsample rate {}".format( base_ds_rate, downsample_rate ) ) base_ds_rate = base_ds_rate * downsample_rate // upsample_rate self.feature_ds_rates.append(base_ds_rate) if not cfg.use_single_target and not cfg.use_multi_stream: running_rate = running_rate * upsample_rate // downsample_rate self.label_rates.append(running_rate) self.label_nums = len( self.feature_ds_rates ) # the 
number of labels for prediction (activate at iter 2) if type(self.label_rates) == float: self.feat2tar_ratios = [ self.feature_ds_rates[i] * self.label_rates / task_cfg.sample_rate for i in range(len(self.feature_ds_rates)) ] else: self.feat2tar_ratios = [ self.feature_ds_rates[i] * self.label_rates[i] / task_cfg.sample_rate for i in range(len(self.feature_ds_rates)) ] # self.feat2tar_ratios = self.feat2tar_ratios[::-1] # An running example of the label rate: # base_ds_rate = 320 # self.label_rate_ratios = [(1, 2)] # self.feature_ds_rates = [320, 640] # self.label_rates = [50, 25] # self.feat2tar_ratios = [1, 1] # Another running example of the label rate: # base_ds_rate = 320 # self.label_rate_ratios = [(1, 2)] # self.feature_ds_rates = [320, 640] # self.label_rates = 100 # self.feat2tar_ratios = [4, 2] # self.use_sinlge_target = True logging.info( "ds_rates: {}, label_rates: {}, feat2tar_ratios: {}".format( self.feature_ds_rates, self.label_rates, self.feat2tar_ratios ) ) self.mask_prob = cfg.mask_prob self.mask_selection = cfg.mask_selection self.mask_other = cfg.mask_other self.mask_length = cfg.mask_length self.no_mask_overlap = cfg.no_mask_overlap self.mask_min_space = cfg.mask_min_space self.mask_channel_prob = cfg.mask_channel_prob self.mask_channel_selection = cfg.mask_channel_selection self.mask_channel_other = cfg.mask_channel_other self.mask_channel_length = cfg.mask_channel_length self.no_mask_channel_overlap = cfg.no_mask_channel_overlap self.mask_channel_min_space = cfg.mask_channel_min_space self.dropout_input = nn.Dropout(cfg.dropout_input) self.dropout_features = nn.Dropout(cfg.dropout_features) self.feature_grad_mult = cfg.feature_grad_mult self.logit_temp = cfg.logit_temp self.skip_masked = cfg.skip_masked self.skip_nomask = cfg.skip_nomask # Note(jiatong): different from hubert, we just set the final dim as encoder_embed_dim final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim self.mask_emb = nn.Parameter( 
torch.FloatTensor(cfg.encoder_embed_dim).uniform_() ) self.layer_norm = LayerNorm(self.embed) self.predictor_head_num = 1 if self.use_single_prediction else self.label_nums self.target_glu = None if cfg.target_glu: self.target_glus = nn.ModuleList() for i in range(self.predictor_head_num): self.target_glus.append( nn.Sequential(nn.Linear(final_dim, final_dim * 2), nn.GLU()) ) self.untie_final_proj = cfg.untie_final_proj self.final_projs = nn.ModuleList() # Note(jiatong): we do not have untie cases for multires hubert for i in range(self.predictor_head_num): self.final_projs.append(nn.Linear(cfg.encoder_embed_dim, final_dim)) # modules below are not needed during fine-tuning self.multires_classes = [] self.label_embs_concat = nn.ParameterList() for i in range(self.predictor_head_num): if self.use_single_target: num_classes = len(dictionaries[0]) else: num_classes = len(dictionaries[i]) self.multires_classes.append(num_classes) self.label_embs_concat.append( nn.Parameter(torch.FloatTensor(num_classes, final_dim)) ) nn.init.uniform_(self.label_embs_concat[i]) def upgrade_state_dict_named(self, state_dict, name): """Upgrade a (possibly old) state dict for new versions of fairseq.""" super().upgrade_state_dict_named(state_dict, name) return state_dict @classmethod def build_model( cls, cfg: MultiresHubertConfig, task: MultiresHubertPretrainingTask ): """Build a new model instance.""" model = MultiresHubertModel(cfg, task.cfg, task.dictionaries) return model def apply_mask(self, x, padding_mask, target_list): B, T, C = x.shape if self.mask_prob > 0: mask_indices = compute_mask_indices( (B, T), padding_mask, self.mask_prob, self.mask_length, self.mask_selection, self.mask_other, min_masks=2, no_overlap=self.no_mask_overlap, min_space=self.mask_min_space, ) mask_indices = torch.from_numpy(mask_indices).to(x.device) x[mask_indices] = self.mask_emb else: mask_indices = None if self.mask_channel_prob > 0: mask_channel_indices = compute_mask_indices( (B, C), None, 
self.mask_channel_prob, self.mask_channel_length, self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, min_space=self.mask_channel_min_space, ) mask_channel_indices = ( torch.from_numpy(mask_channel_indices) .to(x.device) .unsqueeze(1) .expand(-1, T, -1) ) x[mask_channel_indices] = 0 return x, mask_indices def compute_nce(self, x, pos, negs): neg_is_pos = (pos == negs).all(-1) pos = pos.unsqueeze(0) targets = torch.cat([pos, negs], dim=0) logits = torch.cosine_similarity(x.float(), targets.float(), dim=-1).type_as(x) logits /= self.logit_temp if neg_is_pos.any(): logits[1:][neg_is_pos] = float("-inf") logits = logits.transpose(0, 1) # (num_x, num_cls+1) return logits def forward_features(self, source: torch.Tensor) -> torch.Tensor: if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: features = GradMultiply.apply(features, self.feature_grad_mult) else: with torch.no_grad(): features = self.feature_extractor(source) return features def forward_targets( self, features: torch.Tensor, target: torch.Tensor, feat2tar_ratio: float, ) -> Tuple[torch.Tensor, torch.Tensor]: # Trim features to ensure labels exist and then get aligned labels feat_tsz = features.size(1) # skip if no target is provided if target is None: return features, None, None targ_tsz = target.size(1) if feat2tar_ratio * feat_tsz > targ_tsz: feat_tsz = int(targ_tsz / feat2tar_ratio) features = features[:, :feat_tsz] target_inds = torch.arange(feat_tsz).float() * feat2tar_ratio target = target[:, target_inds.long()] return features, target def forward_padding_mask( self, features: torch.Tensor, padding_mask: torch.Tensor, ) -> torch.Tensor: extra = padding_mask.size(1) % features.size(1) if extra > 0: padding_mask = padding_mask[:, :-extra] padding_mask = padding_mask.view(padding_mask.size(0), features.size(1), -1) padding_mask = padding_mask.all(-1) return padding_mask def forward( self, source: torch.Tensor, 
target_list: Optional[List[torch.Tensor]] = None, padding_mask: Optional[torch.Tensor] = None, mask: bool = True, features_only: bool = False, output_layer: Optional[int] = None, ) -> Dict[str, torch.Tensor]: """output layer is 1-based""" features = self.forward_features(source) features_pen = features.float().pow(2).mean() features = features.transpose(1, 2) features = self.layer_norm(features) unmasked_features = features.clone() if padding_mask is not None: padding_mask = self.forward_padding_mask(features, padding_mask) if self.post_extract_proj is not None: features = self.post_extract_proj(features) features = self.dropout_input(features) unmasked_features = self.dropout_features(unmasked_features) if mask: x, mask_indices = self.apply_mask(features, padding_mask, target_list) else: x = features mask_indices = None # feature: (B, T, D), float # target: (B, T), long # x: (B, T, D), float # padding_mask: (B, T), bool # mask_indices: (B, T), bool def align_size_sum(feat1, pad1, feat2): assert ( abs(feat1.size(1) - feat2.size(1)) < 10 ), "misaligned results for feat1 and feat2 of size {} - {}".format( feat1.size(1), feat2.size(1) ) common_size = min(feat1.size(1), feat2.size(1)) return ( feat1[:, :common_size] + feat2[:, :common_size], pad1[:, :common_size], ) # process encoders res_outputs = [] # final output for different resolution multi_mask_indices = [] # mask indices for different resolution residuals = [] # record the x in encoders padding_masks = [] # final padding masks # The encoder has (self.label_nums - 1) blocks for i in range(self.label_nums - 1): x, _ = self.encoders[i](x, padding_mask=padding_mask, layer=None) residuals.append(x) x, padding_mask, mask_indices = self.downsample_modules[i]( x, padding=padding_mask, mask_indices=mask_indices ) residual = self.middle_encoder(x, padding_mask=padding_mask, layer=None)[0] x = x + residual res_outputs.append(x) # process decoders # The encoder has (self.label_nums - 1) blocks 
padding_masks.append(padding_mask) multi_mask_indices.append(mask_indices) residuals.reverse() # NOTE(jiatong): reverse res_output to match corresponding input for i in range(self.label_nums - 1): x, padding_mask, mask_indices = self.upsample_modules[ self.label_nums - 2 - i ](x, padding=padding_mask, mask_indices=mask_indices) x, _ = self.decoders[i](x, padding_mask=padding_mask, layer=None) x, padding_mask = align_size_sum(x, padding_mask, residuals[i]) res_outputs.append(x) padding_masks.append(padding_mask) multi_mask_indices.append(mask_indices) # NOTE(jiatong): need reverse of target list to allow matched target-representation res_outputs.reverse() padding_masks.reverse() multi_mask_indices.reverse() if target_list is not None: new_target_list = [] for i in range(self.label_nums): if self.use_single_target: res_outputs[i], reformat_target_list = self.forward_targets( res_outputs[i], target_list[0], self.feat2tar_ratios[i] ) new_target_list.append(reformat_target_list) else: if target_list[i] is not None: res_outputs[i], reformat_target_list = self.forward_targets( res_outputs[i], target_list[i], self.feat2tar_ratios[i] ) new_target_list.append(reformat_target_list) else: # Append a None target list then it won't be used to calculate loss new_target_list.append(None) if padding_masks[i] is not None: padding_masks[i] = self.forward_padding_mask( res_outputs[i], padding_masks[i] ) if multi_mask_indices[i] is not None: multi_mask_indices[i] = self.forward_padding_mask( res_outputs[i], multi_mask_indices[i] ) if features_only: # NOTE(jiatong): need to reverse back res_outputs.reverse() return { "x": res_outputs, "padding_mask": padding_masks[0], "features": features, } def compute_pred(proj_x, target, label_embs): # compute logits for the i-th label set y = torch.index_select(label_embs, 0, target.long()) negs = label_embs.unsqueeze(1).expand(-1, proj_x.size(0), -1) if self.target_glu: y = self.target_glu(y) negs = self.target_glu(negs) # proj_x: (S, D) # y: (S, 
def extract_features(
    self,
    source: torch.Tensor,
    padding_mask: Optional[torch.Tensor] = None,
    mask: bool = False,
    ret_conv: bool = False,
    output_layer: Optional[int] = None,
    last_layer: Optional[bool] = False,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run ``forward`` in features-only mode and select what to return.

    Args:
        source: input passed unchanged to ``self.forward``.
        padding_mask: optional padding mask passed to ``self.forward``.
        mask: whether ``self.forward`` should apply masking.
        ret_conv: when true, return the ``"features"`` entry of the forward
            result instead of its ``"x"`` entry.
        output_layer: passed unchanged to ``self.forward``.
        last_layer: when true, keep only the last element (``[-1]``) of the
            selected entry.

    Returns:
        ``(feature, padding_mask)`` taken from the forward result.
    """
    outputs = self.forward(
        source,
        padding_mask=padding_mask,
        mask=mask,
        features_only=True,
        output_layer=output_layer,
    )

    # Only the requested entry is looked up, mirroring the conditional read.
    selected_key = "features" if ret_conv else "x"
    selected = outputs[selected_key]
    if last_layer:
        selected = selected[-1]
    return selected, outputs["padding_mask"]
def get_targets(self, net_output, is_masked=True):
    """Build an all-zero ``long`` target for every logits tensor.

    One target tensor is produced per entry returned by ``self.get_logits``;
    its length is that entry's first dimension.
    """
    targets = []
    for logits in self.get_logits(net_output, is_masked):
        targets.append(logits.new_zeros(logits.size(0), dtype=torch.long))
    return targets


def get_extra_losses(self, net_output):
    """Return ``(losses, names)`` for the auxiliary losses in ``net_output``.

    Only ``"features_pen"`` is recognised; both lists are empty when the key
    is absent.
    """
    if "features_pen" not in net_output:
        return [], []
    return [net_output["features_pen"]], ["features_pen"]


def remove_pretraining_modules(self):
    """Clear the attributes that are only needed during pretraining."""
    # NOTE(review): forward() reads ``self.final_projs`` (plural) while this
    # clears ``self.final_proj`` -- confirm which attribute is intended.
    for attribute in ("target_glu", "final_proj"):
        setattr(self, attribute, None)
def forward(self, x, padding=None, mask_indices=None):
    """Resample ``x`` along time with the upsample and downsample convolutions.

    Args:
        x: input laid out as (B, T, C).
        padding: optional padding mask. It is repeated ``upsample_rate``
            times along dim 1, strided by ``downsample_rate`` and trimmed to
            the output length so that it stays aligned with the returned
            ``x``.
        mask_indices: optional mask indices, resampled exactly like
            ``padding``.

    Returns:
        ``(x, padding, mask_indices)`` with ``x`` back in (B, T', C).
    """
    # Assume x = (B, T, C) as input; the convolutions work on (B, C, T).
    x = x.permute(0, 2, 1)
    residual_before_upsample = x
    x = self.upsample_conv(x)

    # The repeated input feeds both the skip connection and the highway
    # path, so it must exist whenever either is enabled.  Building it only
    # under ``skip_connections`` left the highway branch reading an
    # unassigned local when ``skip_connections`` was off.
    residual_upsample = None
    if self.skip_connections or self.highway:
        residual_upsample = torch.repeat_interleave(
            residual_before_upsample, self.upsample_rate, dim=2
        )

    if self.skip_connections:
        # The two paths can differ in length; add over the common prefix.
        upsample_size = min(x.size(2), residual_upsample.size(2))
        x = (
            x[..., :upsample_size] + residual_upsample[..., :upsample_size]
        ) * self.residual_scale

    residual_before_downsample = x
    x = self.downsample_conv(x)

    if self.skip_connections:
        residual_downsample = residual_before_downsample[
            ..., :: self.downsample_rate
        ]
        downsample_size = min(x.size(2), residual_downsample.size(2))
        x = (
            x[..., :downsample_size] + residual_downsample[..., :downsample_size]
        ) * self.residual_scale

    if self.highway:
        # Direct path from the (repeated, then strided) input to the output.
        residual_after_sample = residual_upsample[..., :: self.downsample_rate]
        final_size = min(x.size(2), residual_after_sample.size(2))
        x = (
            x[..., :final_size] + residual_after_sample[..., :final_size]
        ) * self.residual_scale

    if self.log_compression:
        x = x.abs()
        x = x + 1
        x = x.log()

    x = x.permute(0, 2, 1)

    # process padding
    if padding is not None:
        padding = torch.repeat_interleave(padding, self.upsample_rate, dim=1)
        padding = padding[..., :: self.downsample_rate]
        padding = padding[..., : x.size(1)]

    # process mask indices
    if mask_indices is not None:
        mask_indices = torch.repeat_interleave(
            mask_indices, self.upsample_rate, dim=1
        )
        mask_indices = mask_indices[..., :: self.downsample_rate]
        mask_indices = mask_indices[..., : x.size(1)]
    return x, padding, mask_indices
def forward(self, x, padding=None, mask_indices=None):
    """Downsample ``x`` along time by ``downsample_rate``.

    Args:
        x: input laid out as (B, T, C).
        padding: optional padding mask, strided by ``downsample_rate`` and
            trimmed to the output length.
        mask_indices: optional mask indices, treated exactly like
            ``padding``.

    Returns:
        ``(x, padding, mask_indices)`` with ``x`` back in (B, T', C).
    """
    stride = self.downsample_rate

    # The convolution works on (B, C, T).
    channels_first = x.permute(0, 2, 1)
    hidden = self.downsample_conv(channels_first)

    if self.skip_connections:
        # Strided copy of the input, added over the common prefix length.
        shortcut = channels_first[..., ::stride]
        keep = min(hidden.size(2), shortcut.size(2))
        hidden = (hidden[..., :keep] + shortcut[..., :keep]) * self.residual_scale

    if self.log_compression:
        hidden = (hidden.abs() + 1).log()

    x = hidden.permute(0, 2, 1)
    out_len = x.size(1)

    # Keep the masks aligned with the shortened time axis.
    if padding is not None:
        padding = padding[..., ::stride][..., :out_len]
    if mask_indices is not None:
        mask_indices = mask_indices[..., ::stride][..., :out_len]
    return x, padding, mask_indices
def __init__(
    self,
    k,
    label_rate,
    dropout,
    channels,
    activation,
    log_compression=False,
    skip_connections=True,
    highway=True,
    residual_scale=0.4,
    non_affine_group_norm=False,
):
    """Build the transposed-convolution upsampling block.

    Args:
        k: kernel size of the transposed convolution.
        label_rate: pair ``(upsample_rate, downsample_rate)``; the second
            entry must be 1 because this module only upsamples.
        dropout: probability for the ``nn.Dropout`` inside the block.
        channels: number of input and output channels.
        activation: module appended at the end of the block.
        log_compression: apply ``log(|x| + 1)`` to the output in forward.
        skip_connections: add the repeated input to the convolution output.
        highway: stored only; forward does not read it.
        residual_scale: its square root scales the skip-connection sum.
        non_affine_group_norm: passed (negated) as ``affine`` to
            ``norm_block``.
    """
    super().__init__()

    def upsample_block(channel, k, stride):
        return nn.Sequential(
            # No input padding is applied (the earlier (k - 1) // 2 padding
            # is disabled); output_padding=(stride - 1) lengthens the output.
            nn.ConvTranspose1d(
                channel,
                channel,
                k,
                stride=stride,
                bias=False,
                padding=0,
                output_padding=(stride - 1),
            ),
            nn.Dropout(p=dropout),
            norm_block(
                is_layer_norm=False, dim=channel, affine=not non_affine_group_norm
            ),
            activation,
        )

    assert len(label_rate) == 2, "label_rate should be sized two to apply fusion"

    # Lout = (Lin - 1) * stride - 2 * padding
    #        + dilation * (kernel_size - 1) + output_padding + 1
    self.upsample_conv = upsample_block(channels, k, label_rate[0])

    self.upsample_rate, downsample_rate = label_rate
    # Message fixed: this is the upsampler, so the *downsample* rate must be 1.
    assert downsample_rate == 1, "must be 1 to perform upsample only"
    self.log_compression = log_compression
    self.skip_connections = skip_connections
    self.highway = highway  # Unused by forward; kept for interface parity
    self.residual_scale = math.sqrt(residual_scale)


def forward(self, x, padding=None, mask_indices=None):
    """Upsample ``x`` along time by ``upsample_rate``.

    Args:
        x: input laid out as (B, T, C).
        padding: optional padding mask, repeated ``upsample_rate`` times
            along dim 1 and trimmed to the output length.
        mask_indices: optional mask indices, treated exactly like
            ``padding``.

    Returns:
        ``(x, padding, mask_indices)`` with ``x`` back in (B, T', C).
    """
    # Assume x = (B, T, C) as input; the convolution works on (B, C, T).
    x = x.permute(0, 2, 1)
    residual_before_upsample = x
    x = self.upsample_conv(x)
    upsample_size = x.size(2)

    # conduct upsample
    if self.skip_connections:
        residual_upsample = torch.repeat_interleave(
            residual_before_upsample, self.upsample_rate, dim=2
        )
        # The two paths can differ in length; add over the common prefix.
        upsample_size = min(upsample_size, residual_upsample.size(2))
        x = (
            x[..., :upsample_size] + residual_upsample[..., :upsample_size]
        ) * self.residual_scale

    if self.log_compression:
        x = x.abs()
        x = x + 1
        x = x.log()

    x = x.permute(0, 2, 1)

    # process padding
    if padding is not None:
        padding = torch.repeat_interleave(padding, self.upsample_rate, dim=1)
        padding = padding[..., : x.size(1)]

    # process mask indices
    if mask_indices is not None:
        mask_indices = torch.repeat_interleave(
            mask_indices, self.upsample_rate, dim=1
        )
        mask_indices = mask_indices[..., : x.size(1)]
    return x, padding, mask_indices
}, ) # masking apply_mask: bool = field( default=False, metadata={"help": "apply masking during fine-tuning"} ) mask_length: int = field( default=10, metadata={"help": "repeat the mask indices multiple times"} ) mask_prob: float = field( default=0.5, metadata={ "help": "probability of replacing a token with mask " "(normalized by length)" }, ) mask_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose masks"} ) mask_other: float = field( default=0, metadata={ "help": "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indices" }, ) no_mask_overlap: bool = field( default=False, metadata={"help": "whether to allow masks to overlap"} ) # channel masking mask_channel_length: int = field( default=10, metadata={"help": "length of the mask for features (channels)"}, ) mask_channel_prob: float = field( default=0.0, metadata={"help": "probability of replacing a feature with 0"}, ) mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field( default="static", metadata={"help": "how to choose mask length for channel masking"}, ) mask_channel_other: float = field( default=0, metadata={ "help": "secondary mask argument " "(used for more complex distributions), " "see help in compute_mask_indices" }, ) no_mask_channel_overlap: bool = field( default=False, metadata={"help": "whether to allow channel masks to overlap"}, ) freeze_finetune_updates: int = field( default=0, metadata={"help": "dont finetune hubert for this many updates"}, ) feature_grad_mult: float = field( default=0.0, metadata={"help": "reset feature grad mult in hubert to this"}, ) layerdrop: float = field( default=0.0, metadata={"help": "probability of dropping a layer in hubert"}, ) normalize: bool = II("task.normalize") data: str = II("task.data") # this holds the loaded hubert args multires_hubert_args: Any = None @dataclass class MultiresHubertCtcConfig(MultiresHubertAsrConfig): pass @register_model("multires_hubert_ctc", 
def get_logits(self, net_output):
    """Return the encoder logits with padded frames forced to index 0.

    For every padded frame the logit at index 0 is set to 0 and all other
    logits to ``-inf``.  The ``"encoder_out"`` tensor is modified in place
    and returned.

    Args:
        net_output: mapping with ``"encoder_out"`` (indexed as T x B x C
            here) and ``"encoder_padding_mask"`` (B x T, or ``None``).
    """
    logits = net_output["encoder_out"]
    padding = net_output["encoder_padding_mask"]
    if padding is not None and padding.any():
        padding = padding.T
        # ``logits[padding][..., 0] = 0`` would write into the temporary
        # copy created by boolean-mask indexing and leave ``logits``
        # untouched; assign one full row per padded frame instead.
        padded_row = logits.new_full((logits.size(-1),), float("-inf"))
        padded_row[0] = 0
        logits[padding] = padded_row
    return logits
def __init__(self, cfg: MultiresHubertAsrConfig, task):
    """Build the fine-tuning encoder around a multires HuBERT model.

    The pretraining config comes from ``cfg.multires_hubert_args`` when it is
    set; otherwise the checkpoint at ``cfg.multires_hubert_path`` is loaded
    and its stored config is used (and written back onto ``cfg``).  The
    pretraining task is then set up from that config, the model is built
    through it, and an optional output projection is added.

    Args:
        cfg: fine-tuning configuration; its dropout/masking/layerdrop fields
            are passed as overrides when the checkpoint is loaded.
        task: the fine-tuning task.  Its state dict seeds the pretraining
            task when the checkpoint carries no ``"task_state"``, and its
            ``target_dictionary`` decides the size of the projection.
    """
    self.apply_mask = cfg.apply_mask

    # Fine-tuning values that replace the ones stored with the checkpoint.
    arg_overrides = {
        "dropout": cfg.dropout,
        "activation_dropout": cfg.activation_dropout,
        "dropout_input": cfg.dropout_input,
        "attention_dropout": cfg.attention_dropout,
        "mask_length": cfg.mask_length,
        "mask_prob": cfg.mask_prob,
        "mask_selection": cfg.mask_selection,
        "mask_other": cfg.mask_other,
        "no_mask_overlap": cfg.no_mask_overlap,
        "mask_channel_length": cfg.mask_channel_length,
        "mask_channel_prob": cfg.mask_channel_prob,
        "mask_channel_selection": cfg.mask_channel_selection,
        "mask_channel_other": cfg.mask_channel_other,
        "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
        "encoder_layerdrop": cfg.layerdrop,
        "feature_grad_mult": cfg.feature_grad_mult,
    }

    if cfg.multires_hubert_args is None:
        # No cached pretraining config: read it from the checkpoint.
        state = checkpoint_utils.load_checkpoint_to_cpu(
            cfg.multires_hubert_path, arg_overrides
        )
        multires_hubert_args = state.get("cfg", None)
        if multires_hubert_args is None:
            # Checkpoint stores an "args" namespace rather than a "cfg".
            multires_hubert_args = convert_namespace_to_omegaconf(state["args"])
        cfg.multires_hubert_args = multires_hubert_args
    else:
        # Config already attached to cfg; no checkpoint is read, so no
        # pretrained weights are loaded further down.
        state = None
        multires_hubert_args = cfg.multires_hubert_args
        if isinstance(multires_hubert_args, Namespace):
            cfg.multires_hubert_args = (
                multires_hubert_args
            ) = convert_namespace_to_omegaconf(multires_hubert_args)

    assert cfg.normalize == multires_hubert_args.task.normalize, (
        "Fine-tuning works best when data normalization is the same. "
        "Please check that --normalize is set or unset for "
        "both pre-training and here"
    )

    # Point the pretraining task at the fine-tuning data location.
    multires_hubert_args.task.data = cfg.data
    pretrain_task = tasks.setup_task(multires_hubert_args.task)
    if state is not None and "task_state" in state:
        # This will load the stored "dictionaries" object
        pretrain_task.load_state_dict(state["task_state"])
    else:
        pretrain_task.load_state_dict(task.state_dict())

    model = pretrain_task.build_model(
        multires_hubert_args.model, from_checkpoint=True
    )
    if state is not None and not cfg.no_pretrained_weights:
        # set strict=False because we omit some modules
        model.load_state_dict(state["model"], strict=False)

    model.remove_pretraining_modules()

    super().__init__(pretrain_task.source_dictionary)

    d = multires_hubert_args.model.encoder_embed_dim

    self.multires_hubert_model = model

    self.final_dropout = nn.Dropout(cfg.final_dropout)
    self.freeze_finetune_updates = cfg.freeze_finetune_updates
    self.num_updates = 0

    # Project to the target vocabulary when the task has one; otherwise
    # project only if a decoder embedding size differing from d is configured.
    if task.target_dictionary is not None:
        self.proj = Linear(d, len(task.target_dictionary))
    elif getattr(cfg, "decoder_embed_dim", d) != d:
        self.proj = Linear(d, cfg.decoder_embed_dim)
    else:
        self.proj = None


def set_num_updates(self, num_updates):
    """Set the number of parameters updates."""
    super().set_num_updates(num_updates)
    # Kept locally as well; forward compares it with freeze_finetune_updates.
    self.num_updates = num_updates
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with scaled-normal initialisation.

    Weights are drawn from N(0, embedding_dim ** -0.5) and the padding row,
    if there is one, is reset to zero.

    Args:
        num_embeddings: number of rows in the table.
        embedding_dim: size of each embedding vector.
        padding_idx: index of the padding row, or ``None`` for no padding.
    """
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
    # ``m.weight[None]`` is a view of the whole matrix, so zeroing it would
    # wipe every embedding; only touch the row when an index is given.
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m


def Linear(in_features, out_features, bias=True):
    """Create an ``nn.Linear`` with Xavier-uniform weights and zero bias."""
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m
def _skeptical_unmasking(output_scores, output_masks, p):
    """Pick the lowest-scoring positions of each row for re-masking.

    The number picked per row is ``((count of true entries in output_masks)
    - 2) * p`` truncated to an integer; positions are ranked by ascending
    ``output_scores``.  Returns a boolean mask shaped like ``output_masks``.
    """
    # NOTE(review): the "- 2" presumably discounts BOS/EOS -- confirm.
    _, ascending_order = output_scores.sort(-1)
    valid_counts = output_masks.sum(1, keepdim=True).type_as(output_scores)
    num_to_remask = ((valid_counts - 2) * p).long()

    # True for the first ``num_to_remask`` ranks of each row ...
    lowest_ranks = new_arange(output_masks) < num_to_remask
    # ... moved from rank order back to the original token positions.
    return lowest_ranks.scatter(1, ascending_order, lowest_ranks)
def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs):
    """Run one mask-predict refinement step.

    Positions holding ``self.unk`` are filled with the decoder's best token
    and score.  Unless this is the final step, the least confident non-pad
    positions are reset to ``self.unk`` (score 0.0) for the next iteration.
    The token and score tensors of ``decoder_out`` are updated in place.

    ``decoding_format`` and ``**kwargs`` are accepted but not used.

    Returns:
        ``decoder_out`` with updated tokens, scores and history, and
        ``attn`` set to ``None``.
    """
    tokens = decoder_out.output_tokens
    scores = decoder_out.output_scores
    history = decoder_out.history
    step = decoder_out.step
    max_step = decoder_out.max_step

    # execute the decoder on the current hypothesis
    to_fill = tokens.eq(self.unk)
    decoder_output = self.decoder(
        normalize=True,
        prev_output_tokens=tokens,
        encoder_out=encoder_out,
    )
    best_scores, best_tokens = decoder_output.max(-1)
    tokens.masked_scatter_(to_fill, best_tokens[to_fill])
    scores.masked_scatter_(to_fill, best_scores[to_fill])
    if history is not None:
        history.append(tokens.clone())

    # skeptical decoding: the re-masked fraction shrinks as steps progress
    if step + 1 < max_step:
        remask = _skeptical_unmasking(
            scores, tokens.ne(self.pad), 1 - (step + 1) / max_step
        )
        tokens.masked_fill_(remask, self.unk)
        scores.masked_fill_(remask, 0.0)
        if history is not None:
            history.append(tokens.clone())

    return decoder_out._replace(
        output_tokens=tokens,
        output_scores=scores,
        attn=None,
        history=history,
    )
"share_decoder_input_output_embed", False ) args.share_all_embeddings = getattr(args, "share_all_embeddings", True) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) args.adaptive_input = getattr(args, "adaptive_input", False) args.apply_bert_init = getattr(args, "apply_bert_init", False) args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) # --- special arguments --- args.sg_length_pred = getattr(args, "sg_length_pred", False) args.pred_length_offset = getattr(args, "pred_length_offset", False) args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) args.ngram_predictor = getattr(args, "ngram_predictor", 1) args.src_embedding_copy = getattr(args, "src_embedding_copy", False) @register_model_architecture("cmlm_transformer", "cmlm_transformer_wmt_en_de") def cmlm_wmt_en_de(args): cmlm_base_architecture(args) ================================================ FILE: fairseq/models/nat/fairseq_nat_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import math import torch from fairseq.models.transformer import ( TransformerDecoder, TransformerEncoder, TransformerModel, ) from fairseq.modules.transformer_sentence_encoder import init_bert_params def ensemble_encoder(func): def wrapper(self, *args, **kwargs): if self.ensemble_models is None or len(self.ensemble_models) == 1: return func(self, *args, **kwargs) encoder_outs = [ func(model, *args, **kwargs, return_all_hiddens=True) for model in self.ensemble_models ] _encoder_out = encoder_outs[0].copy() def stack(key): outs = [e[key][0] for e in encoder_outs] return [torch.stack(outs, -1) if outs[0] is not None else None] _encoder_out["encoder_out"] = stack("encoder_out") _encoder_out["encoder_embedding"] = stack("encoder_embedding") num_layers = len(_encoder_out["encoder_states"]) if num_layers > 0: _encoder_out["encoder_states"] = [ torch.stack([e["encoder_states"][i] for e in encoder_outs], -1) for i in range(num_layers) ] return _encoder_out return wrapper def ensemble_decoder(func): def wrapper(self, normalize=False, encoder_out=None, *args, **kwargs): if self.ensemble_models is None or len(self.ensemble_models) == 1: return func( self, normalize=normalize, encoder_out=encoder_out, *args, **kwargs ) def _replace(encoder_out, new_val): new_encoder_out = encoder_out.copy() new_encoder_out["encoder_out"] = [new_val] return new_encoder_out action_outs = [ func( model, normalize=normalize, encoder_out=_replace( encoder_out, encoder_out["encoder_out"][0][:, :, :, i] ), *args, **kwargs ) for i, model in enumerate(self.ensemble_models) ] if not isinstance(action_outs[0], tuple): # return multiple values action_outs = [[a] for a in action_outs] else: action_outs = [list(a) for a in action_outs] ensembled_outs = [] for i in range(len(action_outs[0])): if i == 0 and normalize: ensembled_outs += [ torch.logsumexp( torch.stack([a[i] for a in action_outs], -1), dim=-1 ) - math.log(len(self.ensemble_models)) ] elif action_outs[0][i] is not None: ensembled_outs += 
[torch.stack([a[i] for a in action_outs], -1)] else: ensembled_outs += [None] if len(ensembled_outs) == 1: return ensembled_outs[0] return tuple(ensembled_outs) return wrapper class FairseqNATModel(TransformerModel): """ Abstract class for all nonautoregressive-based models """ def __init__(self, args, encoder, decoder): super().__init__(args, encoder, decoder) self.tgt_dict = decoder.dictionary self.bos = decoder.dictionary.bos() self.eos = decoder.dictionary.eos() self.pad = decoder.dictionary.pad() self.unk = decoder.dictionary.unk() self.ensemble_models = None @property def allow_length_beam(self): return False @property def allow_ensemble(self): return True def enable_ensemble(self, models): self.encoder.ensemble_models = [m.encoder for m in models] self.decoder.ensemble_models = [m.decoder for m in models] @staticmethod def add_args(parser): TransformerModel.add_args(parser) parser.add_argument( "--apply-bert-init", action="store_true", help="use custom param initialization for BERT", ) @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): decoder = FairseqNATDecoder(args, tgt_dict, embed_tokens) if getattr(args, "apply_bert_init", False): decoder.apply(init_bert_params) return decoder @classmethod def build_encoder(cls, args, src_dict, embed_tokens): encoder = FairseqNATEncoder(args, src_dict, embed_tokens) if getattr(args, "apply_bert_init", False): encoder.apply(init_bert_params) return encoder def forward_encoder(self, encoder_inputs): return self.encoder(*encoder_inputs) def forward_decoder(self, *args, **kwargs): return NotImplementedError def initialize_output_tokens(self, *args, **kwargs): return NotImplementedError def forward(self, *args, **kwargs): return NotImplementedError class FairseqNATEncoder(TransformerEncoder): def __init__(self, args, dictionary, embed_tokens): super().__init__(args, dictionary, embed_tokens) self.ensemble_models = None @ensemble_encoder def forward(self, *args, **kwargs): return super().forward(*args, 
**kwargs) class FairseqNATDecoder(TransformerDecoder): def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__(args, dictionary, embed_tokens, no_encoder_attn) self.ensemble_models = None ================================================ FILE: fairseq/models/nat/insertion_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import numpy as np import torch import torch.nn.functional as F from fairseq.models import register_model, register_model_architecture from fairseq.models.nat import ( FairseqNATModel, LevenshteinTransformerDecoder, LevenshteinTransformerModel, ensemble_decoder, ) from fairseq.models.transformer import Linear from fairseq.modules.transformer_sentence_encoder import init_bert_params from fairseq.utils import new_arange class NegativeDistanceScore(object): def __init__(self): # pre-compute some values self.scores = {} self.scores[0.5] = self.compute_score_full(50, 0.5) self.scores[1.0] = self.compute_score_full(50, 1.0) self.scores[2.0] = self.compute_score_full(50, 2.0) def __call__(self, i, L, tau): if (tau is None) or (tau > 1000): return 1 / L if tau in self.scores: if L < self.scores[tau].shape[0]: return self.scores[tau][L - 1, i] return self.compute_score(L, tau)[i] def compute_score(self, L, tau): s = np.array([-abs(L / 2 - i) / tau for i in range(L)]) s = np.exp(s - s.max()) return s / s.sum() def compute_score_full(self, L, tau): s = -abs(np.arange(0, L - 1)[:, None] / 2 - np.arange(L)[None, :]) / tau s = np.tril(s, 0) + np.triu(s - float("inf"), 1) s = np.exp(s - s.max(1, keepdims=True)) return s / s.sum(1, keepdims=True) neg_scorer = NegativeDistanceScore() def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx, vocab_size, tau=None): try: from fairseq import libnat except ImportError as e: import sys 
sys.stderr.write("ERROR: missing libnat. run `pip install --editable .`\n") raise e B = in_tokens.size(0) T = in_tokens.size(1) V = vocab_size with torch.cuda.device_of(in_tokens): in_tokens_list = [ [t for t in s if t != padding_idx] for i, s in enumerate(in_tokens.tolist()) ] out_tokens_list = [ [t for t in s if t != padding_idx] for i, s in enumerate(out_tokens.tolist()) ] full_labels = libnat.suggested_ed2_path( in_tokens_list, out_tokens_list, padding_idx ) insert_labels = [a[:-1] for a in full_labels] # numericalize1 insert_label_tensors = in_tokens.new_zeros(B * (T - 1) * V).float() insert_index, insert_labels = zip( *[ (w + (j + i * (T - 1)) * V, neg_scorer(k, len(label), tau)) for i, labels in enumerate(insert_labels) for j, label in enumerate(labels[1:-1]) for k, w in enumerate(label) ] ) # HACK 1:-1 insert_index, insert_labels = [ torch.tensor(list(a), device=in_tokens.device) for a in [insert_index, insert_labels] ] insert_label_tensors.scatter_(0, insert_index.long(), insert_labels) insert_label_tensors = insert_label_tensors.view(B, T - 1, V) return insert_label_tensors def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, padding_idx): padding_masks = in_tokens[:, 1:].eq(padding_idx) word_ins_scores.masked_fill_(padding_masks, 0.0) word_ins_pred.masked_fill_(padding_masks, padding_idx) in_coords = new_arange(in_tokens).type_as(in_scores) # shift all padding predictions to infinite out_coords = (in_coords[:, 1:] - 0.5).masked_fill( word_ins_pred.eq(padding_idx), float("inf") ) out_coords = torch.cat([in_coords, out_coords], 1).sort(-1)[1] out_tokens = torch.cat([in_tokens, word_ins_pred], 1).gather(1, out_coords) out_scores = torch.cat([in_scores, word_ins_scores], 1).gather(1, out_coords) return out_tokens, out_scores @register_model("insertion_transformer") class InsertionTransformerModel(LevenshteinTransformerModel): def __init__(self, args, encoder, decoder): super().__init__(args, encoder, decoder) @staticmethod def 
add_args(parser): FairseqNATModel.add_args(parser) parser.add_argument("--label-tau", default=None, type=float) @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): decoder = InsertionTransformerDecoder(args, tgt_dict, embed_tokens) if getattr(args, "apply_bert_init", False): decoder.apply(init_bert_params) return decoder def forward( self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs ): assert tgt_tokens is not None, "forward function only supports training." # encoding encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) # generate training labels for insertion word_ins_out = self.decoder.forward_word_ins( normalize=False, prev_output_tokens=prev_output_tokens, encoder_out=encoder_out, ) word_ins_tgt = _get_ins_targets( prev_output_tokens, tgt_tokens, self.pad, self.unk, len(self.tgt_dict), tau=self.decoder.label_tau, ).type_as(word_ins_out) word_ins_masks = prev_output_tokens[:, 1:].ne(self.pad) return { "word_ins": { "out": word_ins_out, "tgt": word_ins_tgt, "mask": word_ins_masks, "ls": self.args.label_smoothing, "nll_loss": True, } } def forward_decoder( self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs ): output_tokens = decoder_out.output_tokens output_scores = decoder_out.output_scores history = decoder_out.history # TODO: decoding for InsertionTransformer word_ins_score = self.decoder.forward_word_ins( normalize=True, prev_output_tokens=output_tokens, encoder_out=encoder_out ) if eos_penalty > 0.0: word_ins_score[:, :, self.pad] -= eos_penalty word_ins_score, word_ins_pred = word_ins_score.max(-1) output_tokens, output_scores = _apply_ins_words( output_tokens, output_scores, word_ins_pred, word_ins_score, self.pad ) # delete some unnecessary paddings cut_off = output_tokens.ne(self.pad).sum(1).max() output_tokens = output_tokens[:, :cut_off] output_scores = output_scores[:, :cut_off] if history is not None: history.append(output_tokens.clone()) return decoder_out._replace( 
output_tokens=output_tokens, output_scores=output_scores, attn=None, history=history, ) class InsertionTransformerDecoder(LevenshteinTransformerDecoder): def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): # use the TransformerDecoder's __init__ super(LevenshteinTransformerDecoder, self).__init__( args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn ) self.dictionary = dictionary self.bos = dictionary.bos() self.unk = dictionary.unk() self.eos = dictionary.eos() self.pool_out = Linear(self.output_embed_dim * 2, self.output_embed_dim) self.label_tau = getattr(args, "label_tau", None) @ensemble_decoder def forward_word_ins(self, normalize, encoder_out, prev_output_tokens): features = self.extract_features(prev_output_tokens, encoder_out=encoder_out)[0] features = self.pool_out( torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) ) decoder_out = self.output_layer(features) return F.log_softmax(decoder_out, -1) if normalize else decoder_out def forward_mask_ins(self, *args, **kwargs): raise NotImplementedError def forward_word_del(self, *args, **kwargs): raise NotImplementedError @register_model_architecture("insertion_transformer", "insertion_transformer") def insertion_base_architecture(args): args.encoder_embed_path = getattr(args, "encoder_embed_path", None) args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) args.encoder_layers = getattr(args, "encoder_layers", 6) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) args.decoder_embed_path = getattr(args, "decoder_embed_path", None) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) args.decoder_ffn_embed_dim = getattr( args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim ) 
args.decoder_layers = getattr(args, "decoder_layers", 6) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) args.attention_dropout = getattr(args, "attention_dropout", 0.0) args.activation_dropout = getattr(args, "activation_dropout", 0.0) args.activation_fn = getattr(args, "activation_fn", "relu") args.dropout = getattr(args, "dropout", 0.1) args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) args.share_decoder_input_output_embed = getattr( args, "share_decoder_input_output_embed", False ) args.share_all_embeddings = getattr(args, "share_all_embeddings", False) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) args.adaptive_input = getattr(args, "adaptive_input", False) args.apply_bert_init = getattr(args, "apply_bert_init", False) args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) # special for insertion transformer args.label_tau = getattr(args, "label_tau", None) ================================================ FILE: fairseq/models/nat/iterative_nonautoregressive_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch from fairseq.models import register_model, register_model_architecture from fairseq.models.nat import NATransformerModel def _sequential_poisoning(s, V, beta=0.33, bos=2, eos=3, pad=1): # s: input batch # V: vocabulary size rand_words = torch.randint(low=4, high=V, size=s.size(), device=s.device) choices = torch.rand(size=s.size(), device=s.device) choices.masked_fill_((s == pad) | (s == bos) | (s == eos), 1) replace = choices < beta / 3 repeat = (choices >= beta / 3) & (choices < beta * 2 / 3) swap = (choices >= beta * 2 / 3) & (choices < beta) safe = choices >= beta for i in range(s.size(1) - 1): rand_word = rand_words[:, i] next_word = s[:, i + 1] self_word = s[:, i] replace_i = replace[:, i] swap_i = swap[:, i] & (next_word != 3) repeat_i = repeat[:, i] & (next_word != 3) safe_i = safe[:, i] | ((next_word == 3) & (~replace_i)) s[:, i] = ( self_word * (safe_i | repeat_i).long() + next_word * swap_i.long() + rand_word * replace_i.long() ) s[:, i + 1] = ( next_word * (safe_i | replace_i).long() + self_word * (swap_i | repeat_i).long() ) return s def gumbel_noise(input, TINY=1e-8): return ( input.new_zeros(*input.size()) .uniform_() .add_(TINY) .log_() .neg_() .add_(TINY) .log_() .neg_() ) @register_model("iterative_nonautoregressive_transformer") class IterNATransformerModel(NATransformerModel): @staticmethod def add_args(parser): NATransformerModel.add_args(parser) parser.add_argument( "--train-step", type=int, help="number of refinement iterations during training", ) parser.add_argument( "--dae-ratio", type=float, help="the probability of switching to the denoising auto-encoder loss", ) parser.add_argument( "--stochastic-approx", action="store_true", help="sampling from the decoder as the inputs for next iteration", ) @classmethod def build_model(cls, args, task): model = super().build_model(args, task) model.train_step = getattr(args, "train_step", 4) model.dae_ratio = getattr(args, "dae_ratio", 0.5) model.stochastic_approx = getattr(args, 
"stochastic_approx", False) return model def forward( self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs ): B, T = prev_output_tokens.size() # encoding encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) # length prediction length_out = self.decoder.forward_length( normalize=False, encoder_out=encoder_out ) length_tgt = self.decoder.forward_length_prediction( length_out, encoder_out, tgt_tokens ) # decoding word_ins_outs, word_ins_tgts, word_ins_masks = [], [], [] for t in range(self.train_step): word_ins_out = self.decoder( normalize=False, prev_output_tokens=prev_output_tokens, encoder_out=encoder_out, step=t, ) word_ins_tgt = tgt_tokens word_ins_mask = word_ins_tgt.ne(self.pad) word_ins_outs.append(word_ins_out) word_ins_tgts.append(word_ins_tgt) word_ins_masks.append(word_ins_mask) if t < (self.train_step - 1): # prediction for next iteration if self.stochastic_approx: word_ins_prediction = ( word_ins_out + gumbel_noise(word_ins_out) ).max(-1)[1] else: word_ins_prediction = word_ins_out.max(-1)[1] prev_output_tokens = prev_output_tokens.masked_scatter( word_ins_mask, word_ins_prediction[word_ins_mask] ) if self.dae_ratio > 0: # we do not perform denoising for the first iteration corrputed = ( torch.rand(size=(B,), device=prev_output_tokens.device) < self.dae_ratio ) corrputed_tokens = _sequential_poisoning( tgt_tokens[corrputed], len(self.tgt_dict), 0.33, self.bos, self.eos, self.pad, ) prev_output_tokens[corrputed] = corrputed_tokens # concat everything word_ins_out = torch.cat(word_ins_outs, 0) word_ins_tgt = torch.cat(word_ins_tgts, 0) word_ins_mask = torch.cat(word_ins_masks, 0) return { "word_ins": { "out": word_ins_out, "tgt": word_ins_tgt, "mask": word_ins_mask, "ls": self.args.label_smoothing, "nll_loss": True, }, "length": { "out": length_out, "tgt": length_tgt, "factor": self.decoder.length_loss_factor, }, } @register_model_architecture( "iterative_nonautoregressive_transformer", 
"iterative_nonautoregressive_transformer" ) def inat_base_architecture(args): args.encoder_embed_path = getattr(args, "encoder_embed_path", None) args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) args.encoder_layers = getattr(args, "encoder_layers", 6) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) args.decoder_embed_path = getattr(args, "decoder_embed_path", None) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) args.decoder_ffn_embed_dim = getattr( args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim ) args.decoder_layers = getattr(args, "decoder_layers", 6) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) args.attention_dropout = getattr(args, "attention_dropout", 0.0) args.activation_dropout = getattr(args, "activation_dropout", 0.0) args.activation_fn = getattr(args, "activation_fn", "relu") args.dropout = getattr(args, "dropout", 0.1) args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) args.share_decoder_input_output_embed = getattr( args, "share_decoder_input_output_embed", False ) args.share_all_embeddings = getattr(args, "share_all_embeddings", False) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) args.adaptive_input = getattr(args, "adaptive_input", False) args.apply_bert_init = getattr(args, "apply_bert_init", False) args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) args.decoder_input_dim = 
getattr(args, "decoder_input_dim", args.decoder_embed_dim) # --- special arguments --- args.sg_length_pred = getattr(args, "sg_length_pred", False) args.pred_length_offset = getattr(args, "pred_length_offset", False) args.length_loss_factor = getattr(args, "length_loss_factor", 0.1) args.ngram_predictor = getattr(args, "ngram_predictor", 1) args.src_embedding_copy = getattr(args, "src_embedding_copy", False) args.train_step = getattr(args, "train_step", 4) args.dae_ratio = getattr(args, "dae_ratio", 0.5) args.stochastic_approx = getattr(args, "stochastic_approx", False) @register_model_architecture( "iterative_nonautoregressive_transformer", "iterative_nonautoregressive_transformer_wmt_en_de", ) def iter_nat_wmt_en_de(args): inat_base_architecture(args) ================================================ FILE: fairseq/models/nat/levenshtein_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch import torch.nn as nn import torch.nn.functional as F from fairseq.iterative_refinement_generator import DecoderOut from fairseq.models import register_model, register_model_architecture from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder from fairseq.models.transformer import Embedding from fairseq.modules import TransformerDecoderLayer from fairseq.modules.transformer_sentence_encoder import init_bert_params from .levenshtein_utils import ( _apply_del_words, _apply_ins_masks, _apply_ins_words, _fill, _get_del_targets, _get_ins_targets, _skip, _skip_encoder_out, ) @register_model("levenshtein_transformer") class LevenshteinTransformerModel(FairseqNATModel): @property def allow_length_beam(self): return False @staticmethod def add_args(parser): FairseqNATModel.add_args(parser) parser.add_argument( "--early-exit", default="6,6,6", type=str, help="number of decoder layers before word_del, mask_ins, word_ins", ) parser.add_argument( "--no-share-discriminator", action="store_true", help="separate parameters for discriminator", ) parser.add_argument( "--no-share-maskpredictor", action="store_true", help="separate parameters for mask-predictor", ) parser.add_argument( "--share-discriminator-maskpredictor", action="store_true", help="share the parameters for both mask-predictor and discriminator", ) parser.add_argument( "--sampling-for-deletion", action="store_true", help="instead of argmax, use sampling to predict the tokens", ) @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): decoder = LevenshteinTransformerDecoder(args, tgt_dict, embed_tokens) if getattr(args, "apply_bert_init", False): decoder.apply(init_bert_params) return decoder def forward( self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs ): assert tgt_tokens is not None, "forward function only supports training." 
# encoding encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) # generate training labels for insertion masked_tgt_masks, masked_tgt_tokens, mask_ins_targets = _get_ins_targets( prev_output_tokens, tgt_tokens, self.pad, self.unk ) mask_ins_targets = mask_ins_targets.clamp(min=0, max=255) # for safe prediction mask_ins_masks = prev_output_tokens[:, 1:].ne(self.pad) mask_ins_out, _ = self.decoder.forward_mask_ins( normalize=False, prev_output_tokens=prev_output_tokens, encoder_out=encoder_out, ) word_ins_out, _ = self.decoder.forward_word_ins( normalize=False, prev_output_tokens=masked_tgt_tokens, encoder_out=encoder_out, ) # make online prediction if self.decoder.sampling_for_deletion: word_predictions = torch.multinomial( F.softmax(word_ins_out, -1).view(-1, word_ins_out.size(-1)), 1 ).view(word_ins_out.size(0), -1) else: word_predictions = F.log_softmax(word_ins_out, dim=-1).max(2)[1] word_predictions.masked_scatter_( ~masked_tgt_masks, tgt_tokens[~masked_tgt_masks] ) # generate training labels for deletion word_del_targets = _get_del_targets(word_predictions, tgt_tokens, self.pad) word_del_out, _ = self.decoder.forward_word_del( normalize=False, prev_output_tokens=word_predictions, encoder_out=encoder_out, ) word_del_masks = word_predictions.ne(self.pad) return { "mask_ins": { "out": mask_ins_out, "tgt": mask_ins_targets, "mask": mask_ins_masks, "ls": 0.01, }, "word_ins": { "out": word_ins_out, "tgt": tgt_tokens, "mask": masked_tgt_masks, "ls": self.args.label_smoothing, "nll_loss": True, }, "word_del": { "out": word_del_out, "tgt": word_del_targets, "mask": word_del_masks, }, } def forward_decoder( self, decoder_out, encoder_out, eos_penalty=0.0, max_ratio=None, **kwargs ): output_tokens = decoder_out.output_tokens output_scores = decoder_out.output_scores attn = decoder_out.attn history = decoder_out.history bsz = output_tokens.size(0) if max_ratio is None: max_lens = torch.zeros_like(output_tokens).fill_(255) else: if not 
encoder_out["encoder_padding_mask"]: max_src_len = encoder_out["encoder_out"].size(0) src_lens = encoder_out["encoder_out"].new(bsz).fill_(max_src_len) else: src_lens = (~encoder_out["encoder_padding_mask"][0]).sum(1) max_lens = (src_lens * max_ratio).clamp(min=10).long() # delete words # do not delete tokens if it is <s> </s> can_del_word = output_tokens.ne(self.pad).sum(1) > 2 if can_del_word.sum() != 0: # we cannot delete, skip word_del_score, word_del_attn = self.decoder.forward_word_del( normalize=True, prev_output_tokens=_skip(output_tokens, can_del_word), encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_del_word), ) word_del_pred = word_del_score.max(-1)[1].bool() _tokens, _scores, _attn = _apply_del_words( output_tokens[can_del_word], output_scores[can_del_word], word_del_attn, word_del_pred, self.pad, self.bos, self.eos, ) output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad) output_scores = _fill(output_scores, can_del_word, _scores, 0) attn = _fill(attn, can_del_word, _attn, 0.0) if history is not None: history.append(output_tokens.clone()) # insert placeholders can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens if can_ins_mask.sum() != 0: mask_ins_score, _ = self.decoder.forward_mask_ins( normalize=True, prev_output_tokens=_skip(output_tokens, can_ins_mask), encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_mask), ) if eos_penalty > 0.0: mask_ins_score[:, :, 0] = mask_ins_score[:, :, 0] - eos_penalty mask_ins_pred = mask_ins_score.max(-1)[1] mask_ins_pred = torch.min( mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred) ) _tokens, _scores = _apply_ins_masks( output_tokens[can_ins_mask], output_scores[can_ins_mask], mask_ins_pred, self.pad, self.unk, self.eos, ) output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad) output_scores = _fill(output_scores, can_ins_mask, _scores, 0) if history is not None: history.append(output_tokens.clone()) # insert words can_ins_word = 
output_tokens.eq(self.unk).sum(1) > 0 if can_ins_word.sum() != 0: word_ins_score, word_ins_attn = self.decoder.forward_word_ins( normalize=True, prev_output_tokens=_skip(output_tokens, can_ins_word), encoder_out=_skip_encoder_out(self.encoder, encoder_out, can_ins_word), ) word_ins_score, word_ins_pred = word_ins_score.max(-1) _tokens, _scores = _apply_ins_words( output_tokens[can_ins_word], output_scores[can_ins_word], word_ins_pred, word_ins_score, self.unk, ) output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad) output_scores = _fill(output_scores, can_ins_word, _scores, 0) attn = _fill(attn, can_ins_word, word_ins_attn, 0.0) if history is not None: history.append(output_tokens.clone()) # delete some unnecessary paddings cut_off = output_tokens.ne(self.pad).sum(1).max() output_tokens = output_tokens[:, :cut_off] output_scores = output_scores[:, :cut_off] attn = None if attn is None else attn[:, :cut_off, :] return decoder_out._replace( output_tokens=output_tokens, output_scores=output_scores, attn=attn, history=history, ) def initialize_output_tokens(self, encoder_out, src_tokens): initial_output_tokens = src_tokens.new_zeros(src_tokens.size(0), 2) initial_output_tokens[:, 0] = self.bos initial_output_tokens[:, 1] = self.eos initial_output_scores = initial_output_tokens.new_zeros( *initial_output_tokens.size() ).type_as(encoder_out["encoder_out"][0]) return DecoderOut( output_tokens=initial_output_tokens, output_scores=initial_output_scores, attn=None, step=0, max_step=0, history=None, ) class LevenshteinTransformerDecoder(FairseqNATDecoder): def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__( args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn ) self.dictionary = dictionary self.bos = dictionary.bos() self.unk = dictionary.unk() self.eos = dictionary.eos() self.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) self.embed_mask_ins = Embedding(256, self.output_embed_dim * 2, 
None) self.embed_word_del = Embedding(2, self.output_embed_dim, None) # del_word, ins_mask, ins_word self.early_exit = [int(i) for i in args.early_exit.split(",")] assert len(self.early_exit) == 3 # copy layers for mask-predict/deletion self.layers_msk = None if getattr(args, "no_share_maskpredictor", False): self.layers_msk = nn.ModuleList( [ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(self.early_exit[1]) ] ) self.layers_del = None if getattr(args, "no_share_discriminator", False): self.layers_del = nn.ModuleList( [ TransformerDecoderLayer(args, no_encoder_attn) for _ in range(self.early_exit[0]) ] ) if getattr(args, "share_discriminator_maskpredictor", False): assert getattr( args, "no_share_discriminator", False ), "must set saperate discriminator" self.layers_msk = self.layers_del def extract_features( self, prev_output_tokens, encoder_out=None, early_exit=None, layers=None, **unused ): """ Similar to *forward* but only return features. Inputs: prev_output_tokens: Tensor(B, T) encoder_out: a dictionary of hidden states and masks Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs the LevenshteinTransformer decoder has full-attention to all generated tokens """ # embed positions positions = ( self.embed_positions(prev_output_tokens) if self.embed_positions is not None else None ) # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = self.dropout_module(x) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) layers = self.layers if layers is None else layers early_exit = len(layers) if early_exit is None else early_exit for _, layer in enumerate(layers[:early_exit]): x, attn, _ = layer( x, encoder_out["encoder_out"][0] if 
(encoder_out is not None and len(encoder_out["encoder_out"]) > 0) else None, encoder_out["encoder_padding_mask"][0] if ( encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0 ) else None, self_attn_mask=None, self_attn_padding_mask=decoder_padding_mask, ) inner_states.append(x) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {"attn": attn, "inner_states": inner_states} @ensemble_decoder def forward_mask_ins(self, normalize, encoder_out, prev_output_tokens, **unused): features, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[1], layers=self.layers_msk, **unused ) features_cat = torch.cat([features[:, :-1, :], features[:, 1:, :]], 2) decoder_out = F.linear(features_cat, self.embed_mask_ins.weight) if normalize: return F.log_softmax(decoder_out, -1), extra["attn"] return decoder_out, extra["attn"] @ensemble_decoder def forward_word_ins(self, normalize, encoder_out, prev_output_tokens, **unused): features, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[2], layers=self.layers, **unused ) decoder_out = self.output_layer(features) if normalize: return F.log_softmax(decoder_out, -1), extra["attn"] return decoder_out, extra["attn"] @ensemble_decoder def forward_word_del(self, normalize, encoder_out, prev_output_tokens, **unused): features, extra = self.extract_features( prev_output_tokens, encoder_out=encoder_out, early_exit=self.early_exit[0], layers=self.layers_del, **unused ) decoder_out = F.linear(features, self.embed_word_del.weight) if normalize: return F.log_softmax(decoder_out, -1), extra["attn"] return decoder_out, extra["attn"] @register_model_architecture("levenshtein_transformer", "levenshtein_transformer") def levenshtein_base_architecture(args): args.encoder_embed_path = getattr(args, "encoder_embed_path", None) 
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) args.encoder_layers = getattr(args, "encoder_layers", 6) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False) args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False) args.decoder_embed_path = getattr(args, "decoder_embed_path", None) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) args.decoder_ffn_embed_dim = getattr( args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim ) args.decoder_layers = getattr(args, "decoder_layers", 6) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) args.attention_dropout = getattr(args, "attention_dropout", 0.0) args.activation_dropout = getattr(args, "activation_dropout", 0.0) args.activation_fn = getattr(args, "activation_fn", "relu") args.dropout = getattr(args, "dropout", 0.1) args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) args.share_decoder_input_output_embed = getattr( args, "share_decoder_input_output_embed", False ) args.share_all_embeddings = getattr(args, "share_all_embeddings", False) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) args.adaptive_input = getattr(args, "adaptive_input", False) args.apply_bert_init = getattr(args, "apply_bert_init", False) args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) args.sampling_for_deletion = getattr(args, "sampling_for_deletion", False) args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) 
@register_model_architecture(
    "levenshtein_transformer", "levenshtein_transformer_wmt_en_de"
)
def levenshtein_transformer_wmt_en_de(args):
    """WMT En-De preset: the base architecture with no overrides."""
    levenshtein_base_architecture(args)


# similar parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
@register_model_architecture(
    "levenshtein_transformer", "levenshtein_transformer_vaswani_wmt_en_de_big"
)
def levenshtein_transformer_vaswani_wmt_en_de_big(args):
    """Fill in the "big" defaults, then fall through to the base defaults.

    Options already present on ``args`` are left untouched.
    """
    big_defaults = (
        ("encoder_embed_dim", 1024),
        ("encoder_ffn_embed_dim", 4096),
        ("encoder_attention_heads", 16),
        ("encoder_normalize_before", False),
        ("decoder_embed_dim", 1024),
        ("decoder_ffn_embed_dim", 4096),
        ("decoder_attention_heads", 16),
        ("dropout", 0.3),
    )
    for option, fallback in big_defaults:
        setattr(args, option, getattr(args, option, fallback))
    levenshtein_base_architecture(args)


# default parameters used in tensor2tensor implementation
@register_model_architecture(
    "levenshtein_transformer", "levenshtein_transformer_wmt_en_de_big"
)
def levenshtein_transformer_wmt_en_de_big_t2t(args):
    """Fill in the tensor2tensor-style defaults, then the "big" defaults.

    Options already present on ``args`` are left untouched.
    """
    t2t_defaults = (
        ("encoder_normalize_before", True),
        ("decoder_normalize_before", True),
        ("attention_dropout", 0.1),
        ("activation_dropout", 0.1),
    )
    for option, fallback in t2t_defaults:
        setattr(args, option, getattr(args, option, fallback))
    levenshtein_transformer_vaswani_wmt_en_de_big(args)
def load_libnat():
    """Import the libnat extension, preferring the CUDA build.

    Returns:
        tuple: ``(module, use_cuda)`` where ``use_cuda`` is True when the
        ``libnat_cuda`` extension was imported and False when the CPU
        ``libnat`` extension was used instead.

    Raises:
        ImportError: if neither extension can be imported.

    A successful result is remembered on the function object, so the
    fallback notice is written once rather than on every call.
    """
    import sys

    cached = getattr(load_libnat, "_cached", None)
    if cached is not None:
        return cached

    try:
        from fairseq import libnat_cuda

        result = (libnat_cuda, True)
    except ImportError as cuda_err:
        # Diagnostics go to stderr so they never mix with anything the
        # calling program writes to stdout.
        sys.stderr.write(str(cuda_err) + "... fall back to CPU version\n")
        try:
            from fairseq import libnat

            result = (libnat, False)
        except ImportError:
            sys.stderr.write(
                "ERROR: missing libnat. run `python setup.py build_ext --inplace`\n"
            )
            raise

    load_libnat._cached = result
    return result


def _get_ins_targets(in_tokens, out_tokens, padding_idx, unk_idx):
    """Compute insertion targets for turning ``in_tokens`` into ``out_tokens``.

    Returns:
        tuple ``(masked_tgt_masks, masked_tgt_tokens, mask_ins_targets)``:
            - masked_tgt_masks: boolean tensor marking the positions of
              ``out_tokens`` that are replaced by ``unk_idx``
            - masked_tgt_tokens: ``out_tokens`` with those positions set to
              ``unk_idx``
            - mask_ins_targets: integer tensor with ``in_tokens.size(1) - 1``
              columns (presumably the number of placeholders to insert
              between consecutive input tokens -- confirm against libnat)
    """
    libnat, use_cuda = load_libnat()

    def _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx):
        in_masks = in_tokens.ne(padding_idx)
        out_masks = out_tokens.ne(padding_idx)
        mask_ins_targets, masked_tgt_masks = libnat.generate_insertion_labels(
            out_tokens.int(),
            libnat.levenshtein_distance(
                in_tokens.int(),
                out_tokens.int(),
                in_masks.sum(1).int(),
                out_masks.sum(1).int(),
            ),
        )
        # never mark padding positions of the target as placeholders
        masked_tgt_masks = masked_tgt_masks.bool() & out_masks
        # drop the first column and zero the entries that follow input padding
        mask_ins_targets = mask_ins_targets.type_as(in_tokens)[
            :, 1 : in_masks.size(1)
        ].masked_fill_(~in_masks[:, 1:], 0)
        masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx)
        return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets

    def _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx):
        in_seq_len, out_seq_len = in_tokens.size(1), out_tokens.size(1)

        # strip padding: the CPU extension works on plain Python lists
        in_tokens_list = [
            [t for t in s if t != padding_idx] for s in in_tokens.tolist()
        ]
        out_tokens_list = [
            [t for t in s if t != padding_idx] for s in out_tokens.tolist()
        ]

        full_labels = libnat.suggested_ed2_path(
            in_tokens_list, out_tokens_list, padding_idx
        )
        mask_inputs = [
            [len(c) if c[0] != padding_idx else 0 for c in a[:-1]]
            for a in full_labels
        ]

        # generate labels
        masked_tgt_masks = []
        for mask_input in mask_inputs:
            mask_label = []
            for beam_size in mask_input[1:-1]:  # HACK 1:-1
                mask_label += [0] + [1] * beam_size
            masked_tgt_masks.append(
                mask_label + [0] * (out_seq_len - len(mask_label))
            )
        mask_ins_targets = []
        for mask_input in mask_inputs:
            inner = mask_input[1:-1]
            mask_ins_targets.append(inner + [0] * (in_seq_len - 1 - len(inner)))

        # transform to tensor
        masked_tgt_masks = torch.tensor(
            masked_tgt_masks, device=out_tokens.device
        ).bool()
        mask_ins_targets = torch.tensor(mask_ins_targets, device=in_tokens.device)
        masked_tgt_tokens = out_tokens.masked_fill(masked_tgt_masks, unk_idx)
        return masked_tgt_masks, masked_tgt_tokens, mask_ins_targets

    if use_cuda:
        return _get_ins_targets_cuda(in_tokens, out_tokens, padding_idx, unk_idx)
    return _get_ins_targets_cpu(in_tokens, out_tokens, padding_idx, unk_idx)
range(out_seq_len - len(labels))] for labels in word_del_targets ] # transform to tensor word_del_targets = torch.tensor(word_del_targets, device=out_tokens.device) return word_del_targets if use_cuda: return _get_del_targets_cuda(in_tokens, out_tokens, padding_idx) return _get_del_targets_cpu(in_tokens, out_tokens, padding_idx) def _apply_ins_masks( in_tokens, in_scores, mask_ins_pred, padding_idx, unk_idx, eos_idx ): in_masks = in_tokens.ne(padding_idx) in_lengths = in_masks.sum(1) # HACK: hacky way to shift all the paddings to eos first. in_tokens.masked_fill_(~in_masks, eos_idx) mask_ins_pred.masked_fill_(~in_masks[:, 1:], 0) out_lengths = in_lengths + mask_ins_pred.sum(1) out_max_len = out_lengths.max() out_masks = new_arange(out_lengths, out_max_len)[None, :] < out_lengths[:, None] reordering = (mask_ins_pred + in_masks[:, 1:].long()).cumsum(1) out_tokens = ( in_tokens.new_zeros(in_tokens.size(0), out_max_len) .fill_(padding_idx) .masked_fill_(out_masks, unk_idx) ) out_tokens[:, 0] = in_tokens[:, 0] out_tokens.scatter_(1, reordering, in_tokens[:, 1:]) out_scores = None if in_scores is not None: in_scores.masked_fill_(~in_masks, 0) out_scores = in_scores.new_zeros(*out_tokens.size()) out_scores[:, 0] = in_scores[:, 0] out_scores.scatter_(1, reordering, in_scores[:, 1:]) return out_tokens, out_scores def _apply_ins_words(in_tokens, in_scores, word_ins_pred, word_ins_scores, unk_idx): word_ins_masks = in_tokens.eq(unk_idx) out_tokens = in_tokens.masked_scatter(word_ins_masks, word_ins_pred[word_ins_masks]) if in_scores is not None: out_scores = in_scores.masked_scatter( word_ins_masks, word_ins_scores[word_ins_masks] ) else: out_scores = None return out_tokens, out_scores def _apply_del_words( in_tokens, in_scores, in_attn, word_del_pred, padding_idx, bos_idx, eos_idx ): # apply deletion to a tensor in_masks = in_tokens.ne(padding_idx) bos_eos_masks = in_tokens.eq(bos_idx) | in_tokens.eq(eos_idx) max_len = in_tokens.size(1) word_del_pred.masked_fill_(~in_masks, 
1) word_del_pred.masked_fill_(bos_eos_masks, 0) reordering = new_arange(in_tokens).masked_fill_(word_del_pred, max_len).sort(1)[1] out_tokens = in_tokens.masked_fill(word_del_pred, padding_idx).gather(1, reordering) out_scores = None if in_scores is not None: out_scores = in_scores.masked_fill(word_del_pred, 0).gather(1, reordering) out_attn = None if in_attn is not None: _mask = word_del_pred[:, :, None].expand_as(in_attn) _reordering = reordering[:, :, None].expand_as(in_attn) out_attn = in_attn.masked_fill(_mask, 0.0).gather(1, _reordering) return out_tokens, out_scores, out_attn def _skip(x, mask): """ Getting sliced (dim=0) tensor by mask. Supporting tensor and list/dict of tensors. """ if isinstance(x, int): return x if x is None: return None if isinstance(x, torch.Tensor): if x.size(0) == mask.size(0): return x[mask] elif x.size(1) == mask.size(0): return x[:, mask] if isinstance(x, list): return [_skip(x_i, mask) for x_i in x] if isinstance(x, dict): return {k: _skip(v, mask) for k, v in x.items()} raise NotImplementedError def _skip_encoder_out(encoder, encoder_out, mask): if not mask.any(): return encoder_out else: return encoder.reorder_encoder_out( encoder_out, mask.nonzero(as_tuple=False).squeeze() ) def _fill(x, mask, y, padding_idx): """ Filling tensor x with y at masked positions (dim=0). 
""" if x is None: return y assert x.dim() == y.dim() and mask.size(0) == x.size(0) assert x.dim() == 2 or (x.dim() == 3 and x.size(2) == y.size(2)) n_selected = mask.sum() assert n_selected == y.size(0) if n_selected == x.size(0): return y if x.size(1) < y.size(1): dims = [x.size(0), y.size(1) - x.size(1)] if x.dim() == 3: dims.append(x.size(2)) x = torch.cat([x, x.new_zeros(*dims).fill_(padding_idx)], 1) x[mask] = y elif x.size(1) > y.size(1): x[mask] = padding_idx if x.dim() == 2: x[mask, : y.size(1)] = y else: x[mask, : y.size(1), :] = y else: x[mask] = y return x ================================================ FILE: fairseq/models/nat/nat_crf_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from fairseq.models import register_model, register_model_architecture from fairseq.models.nat import NATransformerModel, base_architecture from fairseq.modules import DynamicCRF @register_model("nacrf_transformer") class NACRFTransformerModel(NATransformerModel): def __init__(self, args, encoder, decoder): super().__init__(args, encoder, decoder) self.crf_layer = DynamicCRF( num_embedding=len(self.tgt_dict), low_rank=args.crf_lowrank_approx, beam_size=args.crf_beam_approx, ) @property def allow_ensemble(self): return False @staticmethod def add_args(parser): NATransformerModel.add_args(parser) parser.add_argument( "--crf-lowrank-approx", type=int, help="the dimension of low-rank approximation of transition", ) parser.add_argument( "--crf-beam-approx", type=int, help="the beam size for apporixmating the normalizing factor", ) parser.add_argument( "--word-ins-loss-factor", type=float, help="weights on NAT loss used to co-training with CRF loss.", ) def forward( self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs ): # encoding encoder_out = self.encoder(src_tokens, 
src_lengths=src_lengths, **kwargs) # length prediction length_out = self.decoder.forward_length( normalize=False, encoder_out=encoder_out ) length_tgt = self.decoder.forward_length_prediction( length_out, encoder_out, tgt_tokens ) # decoding word_ins_out = self.decoder( normalize=False, prev_output_tokens=prev_output_tokens, encoder_out=encoder_out, ) word_ins_tgt, word_ins_mask = tgt_tokens, tgt_tokens.ne(self.pad) # compute the log-likelihood of CRF crf_nll = -self.crf_layer(word_ins_out, word_ins_tgt, word_ins_mask) crf_nll = (crf_nll / word_ins_mask.type_as(crf_nll).sum(-1)).mean() return { "word_ins": { "out": word_ins_out, "tgt": word_ins_tgt, "mask": word_ins_mask, "ls": self.args.label_smoothing, "nll_loss": True, "factor": self.args.word_ins_loss_factor, }, "word_crf": {"loss": crf_nll}, "length": { "out": length_out, "tgt": length_tgt, "factor": self.decoder.length_loss_factor, }, } def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): output_tokens = decoder_out.output_tokens output_scores = decoder_out.output_scores history = decoder_out.history # execute the decoder and get emission scores output_masks = output_tokens.ne(self.pad) word_ins_out = self.decoder( normalize=False, prev_output_tokens=output_tokens, encoder_out=encoder_out ) # run viterbi decoding through CRF _scores, _tokens = self.crf_layer.forward_decoder(word_ins_out, output_masks) output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) output_scores.masked_scatter_(output_masks, _scores[output_masks]) if history is not None: history.append(output_tokens.clone()) return decoder_out._replace( output_tokens=output_tokens, output_scores=output_scores, attn=None, history=history, ) @register_model_architecture("nacrf_transformer", "nacrf_transformer") def nacrf_base_architecture(args): args.crf_lowrank_approx = getattr(args, "crf_lowrank_approx", 32) args.crf_beam_approx = getattr(args, "crf_beam_approx", 64) args.word_ins_loss_factor = getattr(args, 
class EnsembleLevT(BasicEnsembleModel):
    """A wrapper around an ensemble of Levenshtein Transformer models.

    Each refinement step runs deletion, placeholder insertion and word
    insertion in sequence; the models' predictions are averaged after every
    sub-step because each sub-step depends on the output of the previous one.
    """

    def __init__(self, models):
        super().__init__(models)

    @staticmethod
    def _log_mean_exp(log_probs):
        """Average a list of log-probability tensors in probability space."""
        stacked = torch.stack(log_probs, dim=0)
        return torch.logsumexp(stacked, dim=0) - math.log(len(log_probs))

    @staticmethod
    def _mean_attn(attns):
        """Average a list of attention tensors; None if the models return none."""
        if attns[0] is None:
            return None
        return torch.stack(attns, dim=0).mean(dim=0)

    @torch.no_grad()
    def forward_decoder(
        self, decoder_out, encoder_outs, eos_penalty=0.0, max_ratio=None, **kwargs
    ):
        """Run one deletion / placeholder / word-insertion refinement step.

        Args:
            decoder_out: current decoding state (tokens, scores, attention)
            encoder_outs: one encoder output per ensemble member
            eos_penalty: value subtracted from the log-probability of
                inserting zero placeholders
            max_ratio: when given, caps the output length at
                ``max(10, src_len * max_ratio)``; otherwise the cap is 255

        Returns:
            ``decoder_out`` with updated tokens, scores and attention, and
            ``history`` reset to None.
        """
        # LevT ensembling
        # A pipeline of three steps: deletion, placeholder, and word insertion.
        # We need to average scores in each step in a pipeline way because of dependence.
        output_tokens = decoder_out.output_tokens
        output_scores = decoder_out.output_scores
        attn = decoder_out.attn

        bsz = output_tokens.size(0)
        if max_ratio is None:
            # one length cap per sentence
            max_lens = output_tokens.new_full((bsz,), 255)
        else:
            if not encoder_outs[0]["encoder_padding_mask"]:
                # no padding mask: every source has the full length T of
                # the T x B x C encoder output
                enc_states = encoder_outs[0]["encoder_out"][0]
                src_lens = enc_states.new(bsz).fill_(enc_states.size(0))
            else:
                src_lens = (~encoder_outs[0]["encoder_padding_mask"][0]).sum(1)
            max_lens = (src_lens * max_ratio).clamp(min=10).long()

        # delete words
        # do not delete tokens if it is <s> </s>
        can_del_word = output_tokens.ne(self.pad).sum(1) > 2
        if can_del_word.sum() != 0:  # otherwise nothing can be deleted, skip
            output_tokens, output_scores, attn = self.forward_word_del(
                encoder_outs,
                output_tokens,
                output_scores,
                attn,
                can_del_word,
            )

        # insert placeholders
        can_ins_mask = output_tokens.ne(self.pad).sum(1) < max_lens
        if can_ins_mask.sum() != 0:
            output_tokens, output_scores = self.forward_mask_ins(
                encoder_outs,
                output_tokens,
                output_scores,
                can_ins_mask,
                eos_penalty,
                max_lens,
            )

        # insert words
        can_ins_word = output_tokens.eq(self.unk).sum(1) > 0
        if can_ins_word.sum() != 0:
            output_tokens, output_scores, attn = self.forward_word_ins(
                encoder_outs,
                output_tokens,
                output_scores,
                attn,
                can_ins_word,
            )

        # delete some unnecessary paddings
        cut_off = output_tokens.ne(self.pad).sum(1).max()
        output_tokens = output_tokens[:, :cut_off]
        output_scores = output_scores[:, :cut_off]
        attn = None if attn is None else attn[:, :cut_off, :]
        return decoder_out._replace(
            output_tokens=output_tokens,
            output_scores=output_scores,
            attn=attn,
            history=None,
        )

    def forward_word_del(
        self, encoder_outs, output_tokens, output_scores, attn, can_del_word
    ):
        """Delete tokens from the sentences selected by ``can_del_word``."""
        word_del_scores = []
        word_del_attns = []
        for model, encoder_out in zip(self.models, encoder_outs):
            # keyword arguments: the decoder heads take
            # (normalize, encoder_out, prev_output_tokens)
            word_del_out, word_del_attn = model.decoder.forward_word_del(
                normalize=False,
                prev_output_tokens=_skip(output_tokens, can_del_word),
                encoder_out=_skip_encoder_out(
                    model.encoder, encoder_out, can_del_word
                ),
            )
            word_del_scores.append(F.log_softmax(word_del_out, 2))
            word_del_attns.append(word_del_attn)
        word_del_score_avg = self._log_mean_exp(word_del_scores)
        word_del_pred = word_del_score_avg.max(-1)[1].bool()
        word_del_attn_avg = self._mean_attn(word_del_attns)

        _tokens, _scores, _attn = _apply_del_words(
            output_tokens[can_del_word],
            output_scores[can_del_word],
            word_del_attn_avg,
            word_del_pred,
            self.pad,
            self.bos,
            self.eos,
        )
        output_tokens = _fill(output_tokens, can_del_word, _tokens, self.pad)
        output_scores = _fill(output_scores, can_del_word, _scores, 0)
        attn = _fill(attn, can_del_word, _attn, 0.0)
        return output_tokens, output_scores, attn

    def forward_mask_ins(
        self,
        encoder_outs,
        output_tokens,
        output_scores,
        can_ins_mask,
        eos_penalty,
        max_lens,
    ):
        """Insert placeholders into the sentences selected by ``can_ins_mask``."""
        mask_ins_scores = []
        for model, encoder_out in zip(self.models, encoder_outs):
            mask_ins_out, _ = model.decoder.forward_mask_ins(
                normalize=False,
                prev_output_tokens=_skip(output_tokens, can_ins_mask),
                encoder_out=_skip_encoder_out(
                    model.encoder, encoder_out, can_ins_mask
                ),
            )
            mask_ins_score = F.log_softmax(mask_ins_out, 2)
            if eos_penalty > 0.0:
                # class 0 means "insert nothing"
                mask_ins_score[:, :, 0] -= eos_penalty
            mask_ins_scores.append(mask_ins_score)
        mask_ins_score_avg = self._log_mean_exp(mask_ins_scores)
        mask_ins_pred = mask_ins_score_avg.max(-1)[1]
        mask_ins_pred = torch.min(
            mask_ins_pred, max_lens[can_ins_mask, None].expand_as(mask_ins_pred)
        )
        _tokens, _scores = _apply_ins_masks(
            output_tokens[can_ins_mask],
            output_scores[can_ins_mask],
            mask_ins_pred,
            self.pad,
            self.unk,
            self.eos,
        )
        output_tokens = _fill(output_tokens, can_ins_mask, _tokens, self.pad)
        output_scores = _fill(output_scores, can_ins_mask, _scores, 0)
        return output_tokens, output_scores

    def forward_word_ins(
        self, encoder_outs, output_tokens, output_scores, attn, can_ins_word
    ):
        """Fill the placeholders of the sentences selected by ``can_ins_word``."""
        word_ins_scores = []
        word_ins_attns = []
        for model, encoder_out in zip(self.models, encoder_outs):
            word_ins_out, word_ins_attn = model.decoder.forward_word_ins(
                normalize=False,
                prev_output_tokens=_skip(output_tokens, can_ins_word),
                encoder_out=_skip_encoder_out(
                    model.encoder, encoder_out, can_ins_word
                ),
            )
            word_ins_scores.append(F.log_softmax(word_ins_out, 2))
            word_ins_attns.append(word_ins_attn)
        word_ins_score_avg = self._log_mean_exp(word_ins_scores)
        word_ins_attn_avg = self._mean_attn(word_ins_attns)
        word_ins_score_max, word_ins_pred = word_ins_score_avg.max(-1)

        _tokens, _scores = _apply_ins_words(
            output_tokens[can_ins_word],
            output_scores[can_ins_word],
            word_ins_pred,
            word_ins_score_max,
            self.unk,
        )

        output_tokens = _fill(output_tokens, can_ins_word, _tokens, self.pad)
        output_scores = _fill(output_scores, can_ins_word, _scores, 0)
        # write back the ensemble average, not the last model's attention
        attn = _fill(attn, can_ins_word, word_ins_attn_avg, 0.0)
        return output_tokens, output_scores, attn

    def initialize_output_tokens(self, encoder_outs, src_tokens):
        # LevT doesn't do length prediction.
        return self.models[0].initialize_output_tokens(encoder_outs[0], src_tokens)
import torch import torch.nn.functional as F from fairseq import utils from fairseq.iterative_refinement_generator import DecoderOut from fairseq.models import register_model, register_model_architecture from fairseq.models.nat import FairseqNATDecoder, FairseqNATModel, ensemble_decoder from fairseq.models.transformer import Embedding from fairseq.modules.transformer_sentence_encoder import init_bert_params def _mean_pooling(enc_feats, src_masks): # enc_feats: T x B x C # src_masks: B x T or None if src_masks is None: enc_feats = enc_feats.mean(0) else: src_masks = (~src_masks).transpose(0, 1).type_as(enc_feats) enc_feats = ( (enc_feats / src_masks.sum(0)[None, :, None]) * src_masks[:, :, None] ).sum(0) return enc_feats def _argmax(x, dim): return (x == x.max(dim, keepdim=True)[0]).type_as(x) def _uniform_assignment(src_lens, trg_lens): max_trg_len = trg_lens.max() steps = (src_lens.float() - 1) / (trg_lens.float() - 1) # step-size # max_trg_len index_t = utils.new_arange(trg_lens, max_trg_len).float() index_t = steps[:, None] * index_t[None, :] # batch_size X max_trg_len index_t = torch.round(index_t).long().detach() return index_t @register_model("nonautoregressive_transformer") class NATransformerModel(FairseqNATModel): @property def allow_length_beam(self): return True @staticmethod def add_args(parser): FairseqNATModel.add_args(parser) # length prediction parser.add_argument( "--src-embedding-copy", action="store_true", help="copy encoder word embeddings as the initial input of the decoder", ) parser.add_argument( "--pred-length-offset", action="store_true", help="predicting the length difference between the target and source sentences", ) parser.add_argument( "--sg-length-pred", action="store_true", help="stop the gradients back-propagated from the length predictor", ) parser.add_argument( "--length-loss-factor", type=float, help="weights on the length prediction loss", ) @classmethod def build_decoder(cls, args, tgt_dict, embed_tokens): decoder = 
NATransformerDecoder(args, tgt_dict, embed_tokens) if getattr(args, "apply_bert_init", False): decoder.apply(init_bert_params) return decoder def forward( self, src_tokens, src_lengths, prev_output_tokens, tgt_tokens, **kwargs ): # encoding encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) # length prediction length_out = self.decoder.forward_length( normalize=False, encoder_out=encoder_out ) length_tgt = self.decoder.forward_length_prediction( length_out, encoder_out, tgt_tokens ) # decoding word_ins_out = self.decoder( normalize=False, prev_output_tokens=prev_output_tokens, encoder_out=encoder_out, ) return { "word_ins": { "out": word_ins_out, "tgt": tgt_tokens, "mask": tgt_tokens.ne(self.pad), "ls": self.args.label_smoothing, "nll_loss": True, }, "length": { "out": length_out, "tgt": length_tgt, "factor": self.decoder.length_loss_factor, }, } def forward_decoder(self, decoder_out, encoder_out, decoding_format=None, **kwargs): step = decoder_out.step output_tokens = decoder_out.output_tokens output_scores = decoder_out.output_scores history = decoder_out.history # execute the decoder output_masks = output_tokens.ne(self.pad) _scores, _tokens = self.decoder( normalize=True, prev_output_tokens=output_tokens, encoder_out=encoder_out, step=step, ).max(-1) output_tokens.masked_scatter_(output_masks, _tokens[output_masks]) output_scores.masked_scatter_(output_masks, _scores[output_masks]) if history is not None: history.append(output_tokens.clone()) return decoder_out._replace( output_tokens=output_tokens, output_scores=output_scores, attn=None, history=history, ) def initialize_output_tokens(self, encoder_out, src_tokens): # length prediction length_tgt = self.decoder.forward_length_prediction( self.decoder.forward_length(normalize=True, encoder_out=encoder_out), encoder_out=encoder_out, ) max_length = length_tgt.clamp_(min=2).max() idx_length = utils.new_arange(src_tokens, max_length) initial_output_tokens = src_tokens.new_zeros( 
src_tokens.size(0), max_length ).fill_(self.pad) initial_output_tokens.masked_fill_( idx_length[None, :] < length_tgt[:, None], self.unk ) initial_output_tokens[:, 0] = self.bos initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos) initial_output_scores = initial_output_tokens.new_zeros( *initial_output_tokens.size() ).type_as(encoder_out["encoder_out"][0]) return DecoderOut( output_tokens=initial_output_tokens, output_scores=initial_output_scores, attn=None, step=0, max_step=0, history=None, ) def regenerate_length_beam(self, decoder_out, beam_size): output_tokens = decoder_out.output_tokens length_tgt = output_tokens.ne(self.pad).sum(1) length_tgt = ( length_tgt[:, None] + utils.new_arange(length_tgt, 1, beam_size) - beam_size // 2 ) length_tgt = length_tgt.view(-1).clamp_(min=2) max_length = length_tgt.max() idx_length = utils.new_arange(length_tgt, max_length) initial_output_tokens = output_tokens.new_zeros( length_tgt.size(0), max_length ).fill_(self.pad) initial_output_tokens.masked_fill_( idx_length[None, :] < length_tgt[:, None], self.unk ) initial_output_tokens[:, 0] = self.bos initial_output_tokens.scatter_(1, length_tgt[:, None] - 1, self.eos) initial_output_scores = initial_output_tokens.new_zeros( *initial_output_tokens.size() ).type_as(decoder_out.output_scores) return decoder_out._replace( output_tokens=initial_output_tokens, output_scores=initial_output_scores ) class NATransformerDecoder(FairseqNATDecoder): def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False): super().__init__( args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn ) self.dictionary = dictionary self.bos = dictionary.bos() self.unk = dictionary.unk() self.eos = dictionary.eos() self.encoder_embed_dim = args.encoder_embed_dim self.sg_length_pred = getattr(args, "sg_length_pred", False) self.pred_length_offset = getattr(args, "pred_length_offset", False) self.length_loss_factor = getattr(args, "length_loss_factor", 0.1) 
self.src_embedding_copy = getattr(args, "src_embedding_copy", False) self.embed_length = Embedding(256, self.encoder_embed_dim, None) @ensemble_decoder def forward(self, normalize, encoder_out, prev_output_tokens, step=0, **unused): features, _ = self.extract_features( prev_output_tokens, encoder_out=encoder_out, embedding_copy=(step == 0) & self.src_embedding_copy, ) decoder_out = self.output_layer(features) return F.log_softmax(decoder_out, -1) if normalize else decoder_out @ensemble_decoder def forward_length(self, normalize, encoder_out): enc_feats = encoder_out["encoder_out"][0] # T x B x C if len(encoder_out["encoder_padding_mask"]) > 0: src_masks = encoder_out["encoder_padding_mask"][0] # B x T else: src_masks = None enc_feats = _mean_pooling(enc_feats, src_masks) if self.sg_length_pred: enc_feats = enc_feats.detach() length_out = F.linear(enc_feats, self.embed_length.weight) return F.log_softmax(length_out, -1) if normalize else length_out def extract_features( self, prev_output_tokens, encoder_out=None, early_exit=None, embedding_copy=False, **unused ): """ Similar to *forward* but only return features. 
Inputs: prev_output_tokens: Tensor(B, T) encoder_out: a dictionary of hidden states and masks Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs the LevenshteinTransformer decoder has full-attention to all generated tokens """ # embedding if embedding_copy: src_embd = encoder_out["encoder_embedding"][0] if len(encoder_out["encoder_padding_mask"]) > 0: src_mask = encoder_out["encoder_padding_mask"][0] else: src_mask = None src_mask = ( ~src_mask if src_mask is not None else prev_output_tokens.new_ones(*src_embd.size()[:2]).bool() ) x, decoder_padding_mask = self.forward_embedding( prev_output_tokens, self.forward_copying_source( src_embd, src_mask, prev_output_tokens.ne(self.padding_idx) ), ) else: x, decoder_padding_mask = self.forward_embedding(prev_output_tokens) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers for i, layer in enumerate(self.layers): # early exit from the decoder. 
if (early_exit is not None) and (i >= early_exit): break x, attn, _ = layer( x, encoder_out["encoder_out"][0] if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) else None, encoder_out["encoder_padding_mask"][0] if ( encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0 ) else None, self_attn_mask=None, self_attn_padding_mask=decoder_padding_mask, ) inner_states.append(x) if self.layer_norm: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) return x, {"attn": attn, "inner_states": inner_states} def forward_embedding(self, prev_output_tokens, states=None): # embed positions positions = ( self.embed_positions(prev_output_tokens) if self.embed_positions is not None else None ) # embed tokens and positions if states is None: x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) else: x = states if positions is not None: x += positions x = self.dropout_module(x) decoder_padding_mask = prev_output_tokens.eq(self.padding_idx) return x, decoder_padding_mask def forward_copying_source(self, src_embeds, src_masks, tgt_masks): length_sources = src_masks.sum(1) length_targets = tgt_masks.sum(1) mapped_inputs = _uniform_assignment(length_sources, length_targets).masked_fill( ~tgt_masks, 0 ) copied_embedding = torch.gather( src_embeds, 1, mapped_inputs.unsqueeze(-1).expand( *mapped_inputs.size(), src_embeds.size(-1) ), ) return copied_embedding def forward_length_prediction(self, length_out, encoder_out, tgt_tokens=None): enc_feats = encoder_out["encoder_out"][0] # T x B x C if len(encoder_out["encoder_padding_mask"]) > 0: src_masks = encoder_out["encoder_padding_mask"][0] # B x T else: src_masks = None if self.pred_length_offset: if src_masks is None: src_lengs = enc_feats.new_ones(enc_feats.size(1)).fill_( enc_feats.size(0) ) else: src_lengs = (~src_masks).transpose(0, 
@register_model_architecture(
    "nonautoregressive_transformer", "nonautoregressive_transformer"
)
def base_architecture(args):
    """Populate every missing hyper-parameter on ``args`` with its default.

    Attributes already present on ``args`` are kept as they are. The
    assignments run in a fixed order because some decoder defaults are read
    from values assigned earlier (for example ``decoder_embed_dim`` falls
    back to ``encoder_embed_dim``).
    """

    def fill(name, fallback):
        # Keep an existing value, otherwise install the fallback.
        setattr(args, name, getattr(args, name, fallback))

    # encoder
    fill("encoder_embed_path", None)
    fill("encoder_embed_dim", 512)
    fill("encoder_ffn_embed_dim", 2048)
    fill("encoder_layers", 6)
    fill("encoder_attention_heads", 8)
    fill("encoder_normalize_before", False)
    fill("encoder_learned_pos", False)

    # decoder (sizes mirror the encoder unless given explicitly)
    fill("decoder_embed_path", None)
    fill("decoder_embed_dim", args.encoder_embed_dim)
    fill("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)
    fill("decoder_layers", 6)
    fill("decoder_attention_heads", 8)
    fill("decoder_normalize_before", False)
    fill("decoder_learned_pos", False)

    # regularisation / activation
    fill("attention_dropout", 0.0)
    fill("activation_dropout", 0.0)
    fill("activation_fn", "relu")
    fill("dropout", 0.1)
    fill("adaptive_softmax_cutoff", None)
    fill("adaptive_softmax_dropout", 0)

    # embeddings
    fill("share_decoder_input_output_embed", False)
    fill("share_all_embeddings", False)
    fill("no_token_positional_embeddings", False)
    fill("adaptive_input", False)
    fill("apply_bert_init", False)

    fill("decoder_output_dim", args.decoder_embed_dim)
    fill("decoder_input_dim", args.decoder_embed_dim)

    # --- special arguments ---
    fill("sg_length_pred", False)
    fill("pred_length_offset", False)
    fill("length_loss_factor", 0.1)
    fill("src_embedding_copy", False)
def align_bpe_to_words(roberta, bpe_tokens: torch.LongTensor, other_tokens: List[str]):
    """
    Helper to align GPT-2 BPE to other tokenization formats (e.g., spaCy).

    Args:
        roberta (RobertaHubInterface): RoBERTa instance
        bpe_tokens (torch.LongTensor): GPT-2 BPE tokens of shape `(T_bpe)`
        other_tokens (List[str]): other tokens of shape `(T_words)`

    Returns:
        List[List[int]]: for every entry of *other_tokens*, the indices into
        *bpe_tokens* of the BPE tokens it overlaps. Index 0 (the leading
        `<s>`) is never used.

    Raises:
        Exception: if a token cannot be matched against the BPE stream,
            including the case where the BPE stream is exhausted while
            tokens remain to be aligned.
    """
    assert bpe_tokens.dim() == 1
    assert bpe_tokens[0] == 0

    def clean(text):
        return text.strip()

    # remove whitespaces to simplify alignment
    bpe_tokens = [roberta.task.source_dictionary.string([x]) for x in bpe_tokens]
    bpe_tokens = [
        clean(roberta.bpe.decode(x) if x not in {"<s>", ""} else x) for x in bpe_tokens
    ]
    other_tokens = [clean(str(o)) for o in other_tokens]

    # strip leading <s>
    bpe_tokens = bpe_tokens[1:]
    assert "".join(bpe_tokens) == "".join(other_tokens)

    # create alignment from every word to a list of BPE tokens
    alignment = []
    bpe_toks = filter(lambda item: item[1] != "", enumerate(bpe_tokens, start=1))
    # (None, None) marks an exhausted stream; a bare next() would leak
    # StopIteration when there is no non-empty BPE token at all.
    j, bpe_tok = next(bpe_toks, (None, None))
    for other_tok in other_tokens:
        bpe_indices = []
        while True:
            if bpe_tok is None:
                # Nothing left to align against: report it as an alignment
                # failure rather than a TypeError from startswith(None).
                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
            if other_tok.startswith(bpe_tok):
                bpe_indices.append(j)
                other_tok = other_tok[len(bpe_tok) :]
                j, bpe_tok = next(bpe_toks, (None, None))
            elif bpe_tok.startswith(other_tok):
                # this BPE token covers the rest of other_tok and continues
                # into the following word(s)
                bpe_indices.append(j)
                bpe_tok = bpe_tok[len(other_tok) :]
                other_tok = ""
            else:
                raise Exception('Cannot align "{}" and "{}"'.format(other_tok, bpe_tok))
            if other_tok == "":
                break
        assert len(bpe_indices) > 0
        alignment.append(bpe_indices)
    assert len(alignment) == len(other_tokens)

    return alignment
def spacy_nlp():
    """Return a cached ``spacy.lang.en.English`` pipeline.

    The pipeline is created on first use and stored on the function object,
    so later calls return the same instance.

    Raises:
        ImportError: if spaCy cannot be imported.
    """
    if getattr(spacy_nlp, "_nlp", None) is None:
        try:
            from spacy.lang.en import English

            spacy_nlp._nlp = English()
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError("Please install spacy with: pip install spacy") from err
    return spacy_nlp._nlp


def spacy_tokenizer():
    """Return a cached tokenizer for the pipeline from :func:`spacy_nlp`.

    Uses ``nlp.Defaults.create_tokenizer(nlp)`` when that factory exists and
    otherwise falls back to the pipeline's own ``nlp.tokenizer``, so the
    helper keeps working when ``Defaults`` has no ``create_tokenizer``.

    Raises:
        ImportError: propagated from :func:`spacy_nlp` when spaCy is missing.
    """
    if getattr(spacy_tokenizer, "_tokenizer", None) is None:
        nlp = spacy_nlp()
        create_tokenizer = getattr(nlp.Defaults, "create_tokenizer", None)
        if create_tokenizer is not None:
            spacy_tokenizer._tokenizer = create_tokenizer(nlp)
        else:
            spacy_tokenizer._tokenizer = nlp.tokenizer
    return spacy_tokenizer._tokenizer
@classmethod
def build_model(cls, args, task):
    """Build a new model instance.

    When ``args.pretrained_mlm_checkpoint`` is set, the RoBERTa model is
    loaded from that checkpoint; otherwise a fresh one is built from
    ``args``. The result is then converted by ``cls.from_roberta``.
    """
    # make sure all arguments are present
    base_enc_dec_architecture(args)

    checkpoint_path = args.pretrained_mlm_checkpoint
    if not checkpoint_path:
        # Do we need to edit untie_weights here ?
        tie_in_out = (
            args.share_decoder_input_output_embed or args.share_all_embeddings
        )
        args.untie_weights_roberta = not tie_in_out
        if args.hack_layernorm_embedding:
            args.layernorm_embedding = False
            args.encoder_normalize_before = False
        roberta_enc = roberta.RobertaModel.build_model(args, task)
    else:
        overrides = (
            {"layernorm_embedding": False} if args.hack_layernorm_embedding else None
        )
        models, _cfg, _task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
            [checkpoint_path], arg_overrides=overrides
        )
        # exactly one model is expected back from the single checkpoint path
        (roberta_enc,) = models

    return cls.from_roberta(roberta_enc, args, task.source_dictionary)
@staticmethod
def read_args_from_roberta(roberta_args: argparse.Namespace):
    """Derive decoder arguments from a RoBERTa (encoder) namespace.

    Returns a new namespace holding every attribute of ``roberta_args`` plus
    decoder-side attributes copied from their encoder counterparts;
    ``roberta_args`` itself is left untouched.

    Raises:
        AttributeError: if ``roberta_args`` lacks one of the encoder
            attributes being copied, or ``untie_weights_roberta``.
    """
    # TODO: this would become easier if encoder/decoder where using a similar
    # TransformerConfig object
    decoder_from_encoder = {
        "decoder_attention_heads": "encoder_attention_heads",
        "decoder_embed_dim": "encoder_embed_dim",
        "decoder_output_dim": "encoder_embed_dim",
        "decoder_normalize_before": "encoder_normalize_before",
        "decoder_layers_to_keep": "encoder_layers_to_keep",
        "decoder_ffn_embed_dim": "encoder_ffn_embed_dim",
        "decoder_layerdrop": "encoder_layerdrop",
        "decoder_layers": "encoder_layers",
        "decoder_learned_pos": "encoder_learned_pos",
        # should this be set from here ?
        "max_target_positions": "max_positions",
    }

    merged = dict(vars(roberta_args))
    for decoder_name, encoder_name in decoder_from_encoder.items():
        merged[decoder_name] = getattr(roberta_args, encoder_name)

    # keep values already present, otherwise fall back to the defaults
    merged.setdefault("adaptive_softmax_cutoff", None)
    merged.setdefault("adaptive_softmax_dropout", 0)
    merged["share_decoder_input_output_embed"] = (
        not roberta_args.untie_weights_roberta
    )
    return argparse.Namespace(**merged)
@register_model_architecture("roberta_enc_dec", "roberta_enc_dec")
def base_enc_dec_architecture(args):
    """Fill in missing encoder-decoder options, then the RoBERTa defaults.

    Values already present on ``args`` are preserved; the remaining RoBERTa
    hyper-parameters are delegated to ``roberta.base_architecture``.
    """
    fallbacks = (
        ("hack_layernorm_embedding", False),
        ("pretrained_mlm_checkpoint", None),
        ("pretrained_decoder", None),
        ("share_all_embeddings", False),
        ("share_decoder_input_output_embed", False),
    )
    for option, fallback in fallbacks:
        setattr(args, option, getattr(args, option, fallback))
    roberta.base_architecture(args)
def decode(self, tokens: torch.LongTensor):
    """Turn a 1-D tensor of token ids back into text.

    A leading beginning-of-sentence id is dropped. The remaining ids are
    split between every pair of adjacent end-of-sentence ids, and each
    segment is rendered with the source dictionary and BPE-decoded.

    Args:
        tokens: 1-D tensor of token ids; may live on any device and may be
            empty.

    Returns:
        A single string when there is one segment, otherwise a list of
        strings (one per segment).
    """
    assert tokens.dim() == 1
    # .cpu() is required before .numpy() for tensors that live on an
    # accelerator, and costs nothing for tensors already on the CPU.
    tokens = tokens.cpu().numpy()
    # Guard the index so an empty sequence does not raise IndexError.
    if tokens.size > 0 and tokens[0] == self.task.source_dictionary.bos():
        tokens = tokens[1:]  # remove <s>
    eos_mask = tokens == self.task.source_dictionary.eos()
    # True at position i when tokens i and i+1 are both </s> (a separator)
    doc_mask = eos_mask[1:] & eos_mask[:-1]
    sentences = np.split(tokens, doc_mask.nonzero()[0] + 1)
    sentences = [
        self.bpe.decode(self.task.source_dictionary.string(s)) for s in sentences
    ]
    if len(sentences) == 1:
        return sentences[0]
    return sentences
def predict(self, head: str, tokens: torch.LongTensor, return_logits: bool = False):
    """Run the classification head named ``head`` on ``tokens``.

    Args:
        head: key into ``self.model.classification_heads``.
        tokens: token ids; moved to ``self.device`` before feature
            extraction.
        return_logits: when True, return the head's raw output.

    Returns:
        The head output, or its log-softmax over the last dimension when
        ``return_logits`` is False.
    """
    features = self.extract_features(tokens.to(device=self.device))
    scores = self.model.classification_heads[head](features)
    return scores if return_logits else F.log_softmax(scores, dim=-1)
def fill_mask(self, masked_input: str, topk: int = 5):
    """Predict the ``topk`` most probable fillers for the single ``<mask>``.

    Args:
        masked_input: text containing exactly one ``<mask>`` token.
        topk: number of candidates to return.

    Returns:
        A list of ``(filled_text, probability, predicted_token)`` tuples in
        the order produced by ``topk``.
    """
    masked_token = "<mask>"
    assert (
        masked_token in masked_input and masked_input.count(masked_token) == 1
    ), "Please add one {0} token for the input, eg: 'He is a {0} guy'".format(
        masked_token
    )

    # BPE-encode the text on each side of the mask, then splice the mask back
    encoded_spans = [
        self.bpe.encode(span.rstrip()) for span in masked_input.split(masked_token)
    ]
    bpe_sentence = " {0} ".format(masked_token).join(encoded_spans).strip()
    tokens = self.task.source_dictionary.encode_line(
        "<s> " + bpe_sentence + " </s>",
        append_eos=False,
        add_if_not_exist=False,
    )

    masked_index = (tokens == self.task.mask_idx).nonzero(as_tuple=False)
    if tokens.dim() == 1:
        tokens = tokens.unsqueeze(0)

    with utils.model_eval(self.model):
        features, _extra = self.model(
            tokens.long().to(device=self.device),
            features_only=False,
            return_all_hiddens=False,
        )
    logits = features[0, masked_index, :].squeeze()
    prob = logits.softmax(dim=0)
    values, indices = prob.topk(k=topk, dim=0)
    predicted_bpe = self.task.source_dictionary.string(indices)

    # When the mask is preceded by a space, that space is replaced together
    # with the mask; this does not change between candidates.
    spaced_mask = " {0}".format(masked_token)
    placeholder = spaced_mask if spaced_mask in masked_input else masked_token

    filled = []
    for rank, piece in enumerate(predicted_bpe.split(" ")):
        word = self.bpe.decode(piece)
        # Quick hack to fix https://github.com/pytorch/fairseq/issues/1306
        if piece.startswith("\u2581"):
            word = " " + word
        filled.append(
            (masked_input.replace(placeholder, word), values[rank].item(), word)
        )
    return filled
@classmethod
def hub_models(cls):
    """Map each RoBERTa hub model name to the URL of its archive."""
    base_url = "http://dl.fbaipublicfiles.com/fairseq/models/"
    names = (
        "roberta.base",
        "roberta.large",
        "roberta.large.mnli",
        "roberta.large.wsc",
    )
    return {name: base_url + name + ".tar.gz" for name in names}
super().__init__(encoder) self.args = args # We follow BERT's random weight initialization self.apply(init_bert_params) self.classification_heads = nn.ModuleDict() @staticmethod def add_args(parser): """Add model-specific arguments to the parser.""" parser.add_argument( "--encoder-layers", type=int, metavar="L", help="num encoder layers" ) parser.add_argument( "--encoder-embed-dim", type=int, metavar="H", help="encoder embedding dimension", ) parser.add_argument( "--encoder-ffn-embed-dim", type=int, metavar="F", help="encoder embedding dimension for FFN", ) parser.add_argument( "--encoder-attention-heads", type=int, metavar="A", help="num encoder attention heads", ) parser.add_argument( "--activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use", ) parser.add_argument( "--pooler-activation-fn", choices=utils.get_available_activation_fns(), help="activation function to use for pooler layer", ) parser.add_argument( "--encoder-normalize-before", action="store_true", help="apply layernorm before each encoder block", ) parser.add_argument( "--layernorm-embedding", action="store_true", help="add layernorm to embedding", ) parser.add_argument( "--dropout", type=float, metavar="D", help="dropout probability" ) parser.add_argument( "--attention-dropout", type=float, metavar="D", help="dropout probability for attention weights", ) parser.add_argument( "--activation-dropout", type=float, metavar="D", help="dropout probability after activation in FFN", ) parser.add_argument( "--pooler-dropout", type=float, metavar="D", help="dropout probability in the masked_lm pooler layers", ) parser.add_argument( "--max-positions", type=int, help="number of positional embeddings to learn" ) parser.add_argument( "--load-checkpoint-heads", action="store_true", help="(re-)register and load heads when loading checkpoints", ) parser.add_argument( "--untie-weights-roberta", action="store_true", help="Untie weights between embeddings and classifiers in 
RoBERTa", ) # args for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019) parser.add_argument( "--encoder-layerdrop", type=float, metavar="D", default=0, help="LayerDrop probability for encoder", ) parser.add_argument( "--encoder-layers-to-keep", default=None, help="which layers to *keep* when pruning as a comma-separated list", ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) parser.add_argument( "--quant-noise-pq", type=float, metavar="D", default=0, help="iterative PQ quantization noise at training time", ) parser.add_argument( "--quant-noise-pq-block-size", type=int, metavar="D", default=8, help="block size of quantization noise at training time", ) parser.add_argument( "--quant-noise-scalar", type=float, metavar="D", default=0, help="scalar quantization noise and scalar quantization at training time", ) # args for "Better Fine-Tuning by Reducing Representational Collapse" (Aghajanyan et al. 2020) parser.add_argument( "--spectral-norm-classification-head", action="store_true", default=False, help="Apply spectral normalization on the classification head", ) # args for Fully Sharded Data Parallel (FSDP) training parser.add_argument( "--min-params-to-wrap", type=int, metavar="D", default=DEFAULT_MIN_PARAMS_TO_WRAP, help=( "minimum number of params for a layer to be wrapped with FSDP() when " "training with --ddp-backend=fully_sharded. Smaller values will " "improve memory efficiency, but may make torch.distributed " "communication less efficient due to smaller input sizes. This option " "is set to 0 (i.e., always wrap) when --checkpoint-activations or " "--offload-activations are passed." 
def forward(
    self,
    src_tokens,
    features_only=False,
    return_all_hiddens=False,
    classification_head_name=None,
    **kwargs,
):
    """Run the encoder and, optionally, one classification head.

    Naming a classification head forces ``features_only=True`` for the
    encoder call, and the head is then applied to the encoder output.

    Returns:
        tuple: ``(x, extra)`` where ``x`` is the encoder output (or the head
        output when a head was named) and ``extra`` is whatever the encoder
        returned as its second value.
    """
    wants_head = classification_head_name is not None
    x, extra = self.encoder(
        src_tokens,
        True if wants_head else features_only,
        return_all_hiddens,
        **kwargs,
    )
    if not wants_head:
        return x, extra
    return self.classification_heads[classification_head_name](x), extra
in self.encoder.sentence_encoder.layers: norm_loss_layer = 0 for i in range(layer.self_attn.num_heads): start_idx = i * layer.self_attn.head_dim end_idx = (i + 1) * layer.self_attn.head_dim norm_loss_layer += scaling * ( torch.sum( torch.abs( layer.self_attn.q_proj.weight[ start_idx:end_idx, ] ) ) + torch.sum( torch.abs(layer.self_attn.q_proj.bias[start_idx:end_idx]) ) ) norm_loss_layer += scaling * ( torch.sum( torch.abs( layer.self_attn.k_proj.weight[ start_idx:end_idx, ] ) ) + torch.sum( torch.abs(layer.self_attn.k_proj.bias[start_idx:end_idx]) ) ) norm_loss_layer += scaling * ( torch.sum( torch.abs( layer.self_attn.v_proj.weight[ start_idx:end_idx, ] ) ) + torch.sum( torch.abs(layer.self_attn.v_proj.bias[start_idx:end_idx]) ) ) norm_loss += norm_loss_layer return norm_loss def _get_adaptive_ffn_loss(self): ffn_scale_factor = float(self.args.ffn_reg_scale_factor) filter_loss = 0 for layer in self.encoder.sentence_encoder.layers: filter_loss += torch.sum( torch.abs(layer.fc1.weight * ffn_scale_factor) ) + torch.sum(torch.abs(layer.fc2.weight * ffn_scale_factor)) filter_loss += torch.sum( torch.abs(layer.fc1.bias * ffn_scale_factor) ) + torch.sum(torch.abs(layer.fc2.bias * ffn_scale_factor)) return filter_loss def get_normalized_probs(self, net_output, log_probs, sample=None): """Get normalized probabilities (or log probs) from a net's output.""" logits = net_output[0].float() if log_probs: return F.log_softmax(logits, dim=-1) else: return F.softmax(logits, dim=-1) def register_classification_head( self, name, num_classes=None, inner_dim=None, **kwargs ): """Register a classification head.""" if name in self.classification_heads: prev_num_classes = self.classification_heads[name].out_proj.out_features prev_inner_dim = self.classification_heads[name].dense.out_features if num_classes != prev_num_classes or inner_dim != prev_inner_dim: logger.warning( 're-registering head "{}" with num_classes {} (prev: {}) ' "and inner_dim {} (prev: {})".format( name, num_classes, 
@classmethod
def from_pretrained(
    cls,
    model_name_or_path,
    checkpoint_file="model.pt",
    data_name_or_path=".",
    bpe="gpt2",
    **kwargs,
):
    """Load a model through ``hub_utils`` and wrap it in a hub interface.

    Loading always passes ``load_checkpoint_heads=True`` and uses
    ``cls.hub_models()`` as the archive map. The loaded args are logged
    before the first loaded model is wrapped in ``RobertaHubInterface``.
    """
    from fairseq import hub_utils

    loaded = hub_utils.from_pretrained(
        model_name_or_path,
        checkpoint_file,
        data_name_or_path,
        archive_map=cls.hub_models(),
        bpe=bpe,
        load_checkpoint_heads=True,
        **kwargs,
    )
    hub_args = loaded["args"]
    logger.info(hub_args)
    first_model = loaded["models"][0]
    return RobertaHubInterface(hub_args, loaded["task"], first_model)
+ head_name + ".out_proj.weight" ].size(0) inner_dim = state_dict[ prefix + "classification_heads." + head_name + ".dense.weight" ].size(0) if getattr(self.args, "load_checkpoint_heads", False): if head_name not in current_head_names: self.register_classification_head(head_name, num_classes, inner_dim) else: if head_name not in current_head_names: logger.warning( "deleting classification head ({}) from checkpoint " "not present in current model: {}".format(head_name, k) ) keys_to_delete.append(k) elif ( num_classes != self.classification_heads[head_name].out_proj.out_features or inner_dim != self.classification_heads[head_name].dense.out_features ): logger.warning( "deleting classification head ({}) from checkpoint " "with different dimensions than current model: {}".format( head_name, k ) ) keys_to_delete.append(k) for k in keys_to_delete: del state_dict[k] # Copy any newly-added classification heads into the state dict # with their current weights. if hasattr(self, "classification_heads"): cur_state = self.classification_heads.state_dict() for k, v in cur_state.items(): if prefix + "classification_heads." + k not in state_dict: logger.info("Overwriting " + prefix + "classification_heads." + k) state_dict[prefix + "classification_heads." + k] = v # adapt data2vec models if ( "encoder._ema" in state_dict and "encoder.lm_head.weight" not in state_dict ): lm_state = self.encoder.lm_head.state_dict() for k, v in lm_state.items(): state_dict["encoder.lm_head." 
class RobertaLMHead(nn.Module):
    """Masked-language-modeling head: dense -> activation -> layer norm ->
    projection onto the vocabulary (with a learned bias)."""

    def __init__(self, embed_dim, output_dim, activation_fn, weight=None):
        """
        Args:
            embed_dim: size of the incoming features.
            output_dim: size of the output projection (length of ``bias``).
            activation_fn: name passed to ``utils.get_activation_fn``.
            weight: optional projection weight to reuse; when ``None`` a
                fresh bias-free linear weight of shape
                ``(output_dim, embed_dim)`` is created.
        """
        super().__init__()
        self.dense = nn.Linear(embed_dim, embed_dim)
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.layer_norm = LayerNorm(embed_dim)

        # Either share the caller-provided weight or own a new one.
        self.weight = (
            weight
            if weight is not None
            else nn.Linear(embed_dim, output_dim, bias=False).weight
        )
        self.bias = nn.Parameter(torch.zeros(output_dim))

    def forward(self, features, masked_tokens=None, **kwargs):
        """Project ``features`` onto the vocabulary.

        When ``masked_tokens`` is given, only the positions it selects are
        projected, which saves both memory and computation while training.
        """
        selected = features if masked_tokens is None else features[masked_tokens, :]

        hidden = self.layer_norm(self.activation_fn(self.dense(selected)))
        # project back to size of vocabulary with bias
        return F.linear(hidden, self.weight) + self.bias
class RobertaEncoder(FairseqEncoder):
    """RoBERTa encoder: a Transformer sentence encoder plus a masked-LM head."""

    def __init__(self, args, dictionary):
        super().__init__(dictionary)

        # set any missing default values
        base_architecture(args)
        self.args = args

        # When an explicit list of layers to keep is given, it determines
        # the number of encoder layers.
        if args.encoder_layers_to_keep:
            args.encoder_layers = len(args.encoder_layers_to_keep.split(","))

        token_embedding = self.build_embedding(
            len(dictionary), args.encoder_embed_dim, dictionary.pad()
        )
        self.sentence_encoder = self.build_encoder(args, dictionary, token_embedding)

        # The LM head reuses the input embedding matrix unless untied.
        if args.untie_weights_roberta:
            head_weight = None
        else:
            head_weight = self.sentence_encoder.embed_tokens.weight

        self.lm_head = self.build_lm_head(
            embed_dim=args.encoder_embed_dim,
            output_dim=len(dictionary),
            activation_fn=args.activation_fn,
            weight=head_weight,
        )

    def build_embedding(self, vocab_size, embedding_dim, padding_idx):
        """Create the token embedding table."""
        return nn.Embedding(vocab_size, embedding_dim, padding_idx)

    def build_encoder(self, args, dictionary, embed_tokens):
        """Create the Transformer encoder and apply BERT-style init."""
        sentence_encoder = TransformerEncoder(args, dictionary, embed_tokens)
        sentence_encoder.apply(init_bert_params)
        return sentence_encoder

    def build_lm_head(self, embed_dim, output_dim, activation_fn, weight):
        """Create the masked-LM head."""
        return RobertaLMHead(embed_dim, output_dim, activation_fn, weight)

    def forward(
        self,
        src_tokens,
        features_only=False,
        return_all_hiddens=False,
        masked_tokens=None,
        **unused,
    ):
        """
        Args:
            src_tokens (LongTensor): input tokens of shape `(batch, src_len)`
            features_only (bool, optional): skip LM head and just return
                features. If True, the output will be of shape
                `(batch, src_len, embed_dim)`.
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).
            masked_tokens (optional): forwarded to the LM head so that only
                the selected positions are projected.

        Returns:
            tuple:
                - the LM output of shape `(batch, src_len, vocab)`, or the
                  features when *features_only* is set
                - a dictionary of additional data, where 'inner_states' is
                  the list of hidden states reported by the sentence encoder
                  (``None`` unless *return_all_hiddens* is set).
        """
        features, extra = self.extract_features(
            src_tokens, return_all_hiddens=return_all_hiddens
        )
        if features_only:
            return features, extra
        return self.output_layer(features, masked_tokens=masked_tokens), extra

    def extract_features(self, src_tokens, return_all_hiddens=False, **kwargs):
        """Run the sentence encoder and return batch-first features.

        An optional ``token_embeddings`` keyword is passed through to the
        sentence encoder.
        """
        encoder_out = self.sentence_encoder(
            src_tokens,
            return_all_hiddens=return_all_hiddens,
            token_embeddings=kwargs.get("token_embeddings", None),
        )
        # T x B x C -> B x T x C
        features = encoder_out["encoder_out"][0].transpose(0, 1)
        if return_all_hiddens:
            inner_states = encoder_out["encoder_states"]
        else:
            inner_states = None
        return features, {"inner_states": inner_states}

    def output_layer(self, features, masked_tokens=None, **unused):
        """Apply the masked-LM head to *features*."""
        return self.lm_head(features, masked_tokens)

    def max_positions(self):
        """Maximum output length supported by the encoder."""
        return self.args.max_positions
@register_model_architecture("roberta", "roberta_prenorm")
def roberta_prenorm_architecture(args):
    """Pre-norm variant: no embedding layer norm, normalize before each
    encoder block, then fill in the remaining base defaults."""
    prenorm_defaults = (
        ("layernorm_embedding", False),
        ("encoder_normalize_before", True),
    )
    for attr, default in prenorm_defaults:
        setattr(args, attr, safe_getattr(args, attr, default))
    base_architecture(args)


@register_model_architecture("roberta", "roberta_base")
def roberta_base_architecture(args):
    """Base configuration: identical to the default architecture."""
    base_architecture(args)


@register_model_architecture("roberta", "roberta_large")
def roberta_large_architecture(args):
    """Large configuration: overrides depth/width defaults, then fills in
    the remaining base defaults."""
    large_defaults = (
        ("encoder_layers", 24),
        ("encoder_embed_dim", 1024),
        ("encoder_ffn_embed_dim", 4096),
        ("encoder_attention_heads", 16),
    )
    # Only values not already present on ``args`` are replaced by defaults.
    for attr, default in large_defaults:
        setattr(args, attr, safe_getattr(args, attr, default))
    base_architecture(args)
@register_model("camembert")
class CamembertModel(RobertaModel):
    """CamemBERT: RoBERTa model class with the CamemBERT archive map."""

    @classmethod
    def hub_models(cls):
        """Map every published model alias to its archive URL."""
        # NOTE(review): these are plain-http URLs; left untouched here
        # because the exact strings are part of runtime behavior.
        url_template = "http://dl.fbaipublicfiles.com/fairseq/models/{}.tar.gz"
        alias_to_archive = {
            "camembert": "camembert-base",
            "camembert.v0": "camembert-base",
            "camembert-base": "camembert-base",
            "camembert-large": "camembert-large",
            "camembert-base-ccnet": "camembert-base-ccnet",
            "camembert-base-ccnet-4gb": "camembert-base-ccnet-4gb",
            "camembert-base-wikipedia-4gb": "camembert-base-wikipedia-4gb",
            "camembert-base-oscar-4gb": "camembert-base-oscar-4gb",
        }
        return {
            alias: url_template.format(archive)
            for alias, archive in alias_to_archive.items()
        }

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="sentencepiece",
        **kwargs
    ):
        """Load a pre-trained checkpoint and wrap it in a
        ``RobertaHubInterface`` (sentencepiece BPE by default)."""
        from fairseq import hub_utils

        loaded = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **kwargs,
        )
        first_model = loaded["models"][0]
        return RobertaHubInterface(loaded["args"], loaded["task"], first_model)
@register_model("gottbert")
class GottbertModel(RobertaModel):
    """GottBERT: RoBERTa model class with the GottBERT archive map."""

    @classmethod
    def hub_models(cls):
        """Map every published model alias to its archive URL."""
        archive_root = "https://dl.gottbert.de/fairseq/models"
        return {"gottbert-base": archive_root + "/gottbert-base.tar.gz"}

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="hf_byte_bpe",
        bpe_vocab="vocab.json",
        bpe_merges="merges.txt",
        bpe_add_prefix_space=False,
        **kwargs
    ):
        """Load a pre-trained checkpoint and wrap it in a
        ``RobertaHubInterface``.

        The BPE settings (``bpe``, ``bpe_vocab``, ``bpe_merges``,
        ``bpe_add_prefix_space``) are forwarded to
        ``hub_utils.from_pretrained`` alongside any extra keyword arguments.
        """
        from fairseq import hub_utils

        bpe_options = dict(
            bpe_vocab=bpe_vocab,
            bpe_merges=bpe_merges,
            bpe_add_prefix_space=bpe_add_prefix_space,
        )
        loaded = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **bpe_options,
            **kwargs,
        )
        first_model = loaded["models"][0]
        return RobertaHubInterface(loaded["args"], loaded["task"], first_model)
@register_model("xlmr")
class XLMRModel(RobertaModel):
    """XLM-R: RoBERTa model class with the XLM-R archive map."""

    @classmethod
    def hub_models(cls):
        """Map every published model alias to its archive URL."""
        archive_root = "http://dl.fbaipublicfiles.com/fairseq/models"
        # The XL/XXL archives live in an extra "xlmr/" sub-directory.
        return {
            "xlmr.base": archive_root + "/xlmr.base.tar.gz",
            "xlmr.large": archive_root + "/xlmr.large.tar.gz",
            "xlmr.xl": archive_root + "/xlmr/xlmr.xl.tar.gz",
            "xlmr.xxl": archive_root + "/xlmr/xlmr.xxl.tar.gz",
        }

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="sentencepiece",
        **kwargs
    ):
        """Load a pre-trained checkpoint and wrap it in a
        ``RobertaHubInterface`` (sentencepiece BPE by default)."""
        from fairseq import hub_utils

        loaded = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **kwargs,
        )
        first_model = loaded["models"][0]
        return RobertaHubInterface(loaded["args"], loaded["task"], first_model)
class MultichannelGeneratorHubInterface(GeneratorHubInterface):
    """PyTorch Hub interface for generating sequences from a pre-trained
    multichannel language model.

    Sentences are represented as dictionaries keyed by channel name (the
    names in ``task.channels``); each channel has its own source and target
    dictionary.
    """

    def __init__(self, cfg, task, models):
        super().__init__(cfg, task, models)
        self.cfg = cfg
        self.task = task
        self.models = nn.ModuleList(models)
        self.src_dicts = task.source_dictionaries
        self.tgt_dicts = task.target_dictionaries
        self.channels = task.channels

        # optimize model for generation
        for model in self.models:
            model.prepare_for_inference_(cfg)

    def sample(
        self,
        sentences: List[Dict[str, str]],
        beam: int = 1,
        verbose: bool = False,
        **kwargs
    ) -> List[Dict[str, str]]:
        """Encode, generate and decode the best hypothesis per sentence.

        Args:
            sentences: list of per-channel sentence dictionaries. A single
                dictionary is also accepted, in which case a single decoded
                dictionary is returned instead of a list.
            beam: beam size passed to :meth:`generate`.
            verbose: log sources, hypotheses and positional scores.
            **kwargs: forwarded to :meth:`generate`.
        """
        if isinstance(sentences, dict):
            return self.sample([sentences], beam=beam, verbose=verbose, **kwargs)[0]
        tokenized_sentences = [self.encode(sentence) for sentence in sentences]
        batched_hypos = self.generate(tokenized_sentences, beam, verbose, **kwargs)
        return [self.decode(hypos[0]["tokens"]) for hypos in batched_hypos]

    def score(self, sentences: List[Dict[str, str]], **kwargs):
        """Scoring is not supported for multichannel models.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError(
            "MultichannelGeneratorHubInterface doesn't support score() method"
        )

    def generate(
        self,
        tokenized_sentences: List[Dict[str, torch.LongTensor]],
        beam: int = 5,
        verbose: bool = False,
        skip_invalid_size_inputs=False,
        inference_step_args=None,
        **kwargs
    ) -> List[List[Dict[str, torch.Tensor]]]:
        """Generate hypotheses for already-tokenized multichannel inputs.

        Args:
            tokenized_sentences: list of per-channel token dictionaries. A
                single dictionary is also accepted, in which case the
                hypotheses of that single input are returned.
            beam: beam size written into the generation config.
            verbose: log sources, hypotheses and positional scores.
            skip_invalid_size_inputs: forwarded to the batch iterator as
                ``ignore_invalid_inputs``.
            inference_step_args: extra keyword arguments for
                ``task.inference_step``.
            **kwargs: additional generation options, set as attributes on a
                copy of ``cfg.generation``.

        Returns:
            For every input (in input order) the list of hypotheses; each
            hypothesis' ``"tokens"`` entry is a dictionary over channels.
        """
        if isinstance(tokenized_sentences, dict):
            # Forward *all* options; previously skip_invalid_size_inputs and
            # inference_step_args were silently dropped on this path.
            return self.generate(
                [tokenized_sentences],
                beam=beam,
                verbose=verbose,
                skip_invalid_size_inputs=skip_invalid_size_inputs,
                inference_step_args=inference_step_args,
                **kwargs,
            )[0]

        # build generator using current args as well as any kwargs
        gen_args = copy.deepcopy(self.cfg.generation)
        with open_dict(gen_args):
            gen_args.beam = beam
            for k, v in kwargs.items():
                setattr(gen_args, k, v)
        generator = self.task.build_generator(self.models, gen_args)

        inference_step_args = inference_step_args or {}
        results = []
        for batch in tqdm(
            self._build_batches(tokenized_sentences, skip_invalid_size_inputs)
        ):
            batch = utils.apply_to_sample(lambda t: t.to(self.device), batch)
            translations = self.task.inference_step(
                generator, self.models, batch, **inference_step_args
            )
            for sample_id, hypos in zip(batch["id"].tolist(), translations):
                # The output of the generator is supposed to be a tensor of
                # size (bsz x max_len x n_channels), so we convert it to
                # dictionary form (one entry per channel).
                for hypo in hypos:
                    hypo["tokens"] = {
                        channel: hypo["tokens"][..., j]
                        for j, channel in enumerate(self.channels)
                    }
                results.append((sample_id, hypos))

        # sort output to match input order
        outputs = [hypos for _, hypos in sorted(results, key=lambda x: x[0])]

        if verbose:
            for source_tokens, target_hypotheses in zip(tokenized_sentences, outputs):
                src_str_with_unk = {
                    channel: self.string(source_tokens[channel], channel)
                    for channel in source_tokens
                }
                logger.info("S\t{}".format(src_str_with_unk))
                for hypo in target_hypotheses:
                    hypo_str = self.decode(hypo["tokens"])
                    logger.info("H\t{}\t{}".format(hypo["score"], hypo_str))
                    # hypo["positional_scores"]: T x n_channels. Columns are
                    # labelled with self.channels, the same order used above
                    # to split the generated tokens, rather than with the
                    # (arbitrary) key order of the input dictionary.
                    pos_scores = {
                        channel: " ".join(
                            "{:.4f}".format(score)
                            for score in hypo["positional_scores"][:, c].tolist()
                        )
                        for c, channel in enumerate(self.channels)
                    }
                    logger.info("P\t{}".format(pos_scores))

        return outputs

    def encode(self, sentence: Dict[str, str]) -> Dict[str, torch.LongTensor]:
        """Tokenize, BPE-encode and binarize every channel of *sentence*.

        All channels must produce tensors of the same size.
        """
        assert isinstance(
            sentence, dict
        ), "Input sentence is expected to be a dictionary over channels"
        assert set(sentence.keys()) == set(
            self.channels
        ), "Mismatch between input sentence keys and model channels ({} vs {})".format(
            set(sentence.keys()), set(self.channels)
        )
        encoded_sentence = {}
        for channel in sentence:
            sentence_channel = sentence[channel]
            sentence_channel = self.tokenize(sentence_channel)
            sentence_channel = self.apply_bpe(sentence_channel)
            sentence_channel = self.binarize(sentence_channel, channel)
            encoded_sentence[channel] = sentence_channel
        sentence_size = encoded_sentence[self.channels[0]].size()
        assert all(
            encoded_sentence[channel].size() == sentence_size
            for channel in encoded_sentence
        ), "Input tensors are expected to have the same size in all channels"
        return encoded_sentence

    def decode(self, tokens: Dict[str, torch.LongTensor]) -> Dict[str, str]:
        """Convert per-channel token tensors back to detokenized strings."""
        assert isinstance(
            tokens, dict
        ), "Input tokens are expected to be a dictionary over channels"
        assert set(tokens.keys()) == set(
            self.channels
        ), "Mismatch between input tokens keys and model channels ({} vs {})".format(
            set(tokens.keys()), set(self.channels)
        )
        decoded_sentence = {}
        for channel in tokens:
            tokens_channel = tokens[channel]
            sentence_channel = self.string(tokens_channel, channel)
            sentence_channel = self.remove_bpe(sentence_channel)
            sentence_channel = self.detokenize(sentence_channel)
            decoded_sentence[channel] = sentence_channel
        return decoded_sentence

    def binarize(self, sentence: str, channel: str) -> torch.LongTensor:
        """Encode *sentence* with the source dictionary of *channel*
        (unknown symbols are not added to the dictionary)."""
        return (
            self.src_dicts[channel].encode_line(sentence, add_if_not_exist=False).long()
        )

    def string(self, tokens: torch.LongTensor, channel: str) -> str:
        """Render *tokens* with the target dictionary of *channel*."""
        return self.tgt_dicts[channel].string(tokens)

    def _build_batches(
        self, tokens: List[Dict[str, List[int]]], skip_invalid_size_inputs: bool
    ) -> Iterator[Dict[str, Any]]:
        """Build an unshuffled batch iterator over the tokenized inputs."""
        # Length of an input is taken from its first channel.
        lengths = torch.LongTensor([next(iter(d.values())).numel() for d in tokens])
        batch_iterator = self.task.get_batch_iterator(
            dataset=self.task.build_dataset_for_inference(tokens, lengths),
            max_tokens=self.cfg.dataset.max_tokens,
            max_sentences=self.cfg.dataset.batch_size,
            max_positions=self.max_positions,
            ignore_invalid_inputs=skip_invalid_size_inputs,
            disable_iterator_cache=True,
        ).next_epoch_itr(shuffle=False)
        return batch_iterator
class CrossChannelTransformerDecoder(FairseqIncrementalDecoder):
    """
    Cross-channel Transformer Decoder Block for parallel spoken dialogue units
    as described in the paper: https://arxiv.org/pdf/2203.16502.pdf;
    consisting of *args.decoder_layers* layers. Each layer is a
    :class:`StandardTransformerDecoderLayer` or
    :class:`CrossChannelTransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        channels (list): list of channel names (string)
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
    """

    def __init__(self, args, dictionary, embed_tokens, channels, no_encoder_attn=False):
        self.args = args
        super().__init__(dictionary)
        self.register_buffer("version", torch.Tensor([3]))
        self._future_mask = torch.empty(0)

        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.decoder_layerdrop = args.decoder_layerdrop
        self.share_input_output_embed = args.share_decoder_input_output_embed
        self.channels = channels

        input_embed_dim = embed_tokens.embedding_dim
        embed_dim = args.decoder_embed_dim
        self.embed_dim = embed_dim
        self.output_embed_dim = args.decoder_output_dim

        self.padding_idx = embed_tokens.padding_idx
        self.max_target_positions = args.max_target_positions

        self.embed_tokens = embed_tokens

        self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

        if args.quant_noise_pq > 0:
            self.quant_noise = apply_quant_noise_(
                nn.Linear(embed_dim, embed_dim, bias=False),
                args.quant_noise_pq,
                args.quant_noise_pq_block_size,
            )
        else:
            self.quant_noise = None

        self.project_in_dim = (
            nn.Linear(input_embed_dim, embed_dim, bias=False)
            if embed_dim != input_embed_dim
            else None
        )
        self.embed_positions = (
            PositionalEmbedding(
                self.max_target_positions,
                embed_dim,
                self.padding_idx,
                learned=args.decoder_learned_pos,
            )
            if not args.no_token_positional_embeddings
            else None
        )

        if getattr(args, "layernorm_embedding", False):
            self.layernorm_embedding = LayerNorm(embed_dim)
        else:
            self.layernorm_embedding = None

        self.cross_self_attention = getattr(args, "cross_self_attention", False)

        assert 0 <= args.decoder_cross_layers <= args.decoder_layers, (
            "The number of cross-channel attention decoder layers must be non-negative "
            f"and must not exceed the number of decoder layers (found {args.decoder_cross_layers})"
        )

        if self.decoder_layerdrop > 0.0:
            self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
        else:
            self.layers = nn.ModuleList([])
        # The first (decoder_layers - decoder_cross_layers) layers are
        # standard layers; the remaining ones are cross-channel layers.
        self.layers.extend(
            [
                self.build_decoder_layer(args, no_encoder_attn)
                if i < args.decoder_layers - args.decoder_cross_layers
                else self.build_cross_decoder_layer(args, no_encoder_attn)
                for i in range(args.decoder_layers)
            ]
        )
        self.num_layers = len(self.layers)
        self.non_cross_layers = args.decoder_layers - args.decoder_cross_layers

        if args.decoder_normalize_before and not getattr(
            args, "no_decoder_final_norm", False
        ):
            self.layer_norm = LayerNorm(embed_dim)
        else:
            self.layer_norm = None

        self.project_out_dim = (
            nn.Linear(embed_dim, self.output_embed_dim, bias=False)
            if embed_dim != self.output_embed_dim
            else None
        )

        self.output_projection = None
        # Cross-channel prediction is enabled when the second weight of
        # "main_and_cross_weights" (a comma-separated string) is non-zero.
        self.is_cross_prediction = bool(
            float(args.main_and_cross_weights.split(",")[1]) != 0
        )
        self.n_output_projections = (
            1 if not self.is_cross_prediction else len(self.channels)
        )

        if self.share_input_output_embed:
            # Output projection is a list of projections
            # where the first proj is for the main-channel,
            # then roll in a circular way.
            # For example: if the main channel has index i
            # the second proj is for channel i+1 (mod N_channels), etc.
            self.output_projection = nn.ModuleList(
                [
                    nn.Linear(
                        embed_tokens.weight.shape[1],  # embed_dim
                        embed_tokens.weight.shape[0],  # n_dictionaries
                        bias=False,
                    )
                    for _ in range(self.n_output_projections)
                ]
            )
            # Only share the main-channel projection
            self.output_projection[0].weight = embed_tokens.weight
            for i in range(1, self.n_output_projections):
                nn.init.normal_(
                    self.output_projection[i].weight,
                    mean=0,
                    std=embed_tokens.weight.shape[1] ** -0.5,
                )
        else:
            self.output_projection = nn.ModuleList(
                [
                    nn.Linear(self.output_embed_dim, len(dictionary), bias=False)
                    for _ in range(self.n_output_projections)
                ]
            )
            for i in range(self.n_output_projections):
                nn.init.normal_(
                    self.output_projection[i].weight,
                    mean=0,
                    std=self.output_embed_dim**-0.5,
                )

        # One scalar duration regressor per output projection, unless
        # duration prediction is disabled ("false", case-insensitive).
        self.output_duration_prediction = (
            None
            if str(args.duration_prediction).lower() == "false"
            else nn.ModuleList(
                [
                    nn.Linear(self.output_embed_dim, 1)
                    for _ in range(self.n_output_projections)
                ]
            )
        )

    def build_decoder_layer(self, args, no_encoder_attn=False):
        """Build a standard decoder layer, optionally wrapped for
        activation checkpointing."""
        layer = StandardTransformerDecoderLayer(args, no_encoder_attn)
        if getattr(args, "checkpoint_activations", False):
            offload_to_cpu = getattr(args, "offload_activations", False)
            layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
        return layer

    def build_cross_decoder_layer(self, args, no_encoder_attn=False):
        """Build a cross-channel decoder layer, optionally wrapped for
        activation checkpointing."""
        layer = CrossChannelTransformerDecoderLayer(args, no_encoder_attn)
        if getattr(args, "checkpoint_activations", False):
            offload_to_cpu = getattr(args, "offload_activations", False)
            layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
        return layer

    def forward(
        self,
        prev_output_tokens: Dict[str, Tensor],
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[
            List[Dict[str, Dict[str, Optional[Tensor]]]]
        ] = None,
        features_only: bool = False,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        src_lengths: Optional[Any] = None,
        # return_all_hiddens: bool = False,
    ):
        """
        Args:
            prev_output_tokens (dict[str, LongTensor]): previous decoder outputs,
                dictionary over all channels with the values being the tensors
                of shape `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): list of dictionaries used for storing state
                during :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).

        Returns:
            tuple:
                - the decoder's output, dict over channels of tensors
                    of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            full_context_alignment=full_context_alignment,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
        )
        if not features_only:
            x = self.output_layer(x)
        return x, extra

    def extract_features(
        self,
        prev_output_tokens: Dict[str, Tensor],
        encoder_out: Optional[Dict[str, List[Tensor]]],
        incremental_state: Optional[
            List[Dict[str, Dict[str, Optional[Tensor]]]]
        ] = None,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
    ):
        """Thin wrapper around :meth:`extract_features_scriptable`."""
        return self.extract_features_scriptable(
            prev_output_tokens,
            encoder_out,
            incremental_state,
            full_context_alignment,
            alignment_layer,
            alignment_heads,
        )

    # A scriptable subclass of this class has an extract_features method and
    # calls super().extract_features, but super() is not supported in
    # torchscript. A copy of this function is made to be used in the subclass
    # instead.

    def extract_features_scriptable(
        self,
        prev_output_tokens: Dict[str, Tensor],
        encoder_out: Optional[Dict[str, List[Tensor]]],
        incremental_state: Optional[
            List[Dict[str, Dict[str, Optional[Tensor]]]]
        ] = None,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
    ):
        """
        The core function of *forward* but only return features.

        The input (prev_output_tokens) is a dictionary over all channels,
        expected to have the following form:
            {
                'channel1' : Tensor((batch x tgt_len)),
                'channel2' : Tensor((batch x tgt_len)),
            }

        The caller's dictionary is not modified.

        Args:
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).
            alignment_layer (int, optional): return mean alignment over
                heads at this layer (default: last layer).
            alignment_heads (int, optional): only average alignment over
                this many heads (default: all heads).

        Returns:
            tuple:
                - the decoder's features, dict over channels of tensors
                    of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        if alignment_layer is None:
            alignment_layer = self.num_layers - 1

        # Work on a shallow copy: during incremental decoding the entries are
        # replaced by their last time step, which must not leak back into the
        # dictionary owned by the caller.
        prev_output_tokens = dict(prev_output_tokens)

        x_list = []
        for i, channel in enumerate(self.channels):
            # embed positions
            positions = None
            if self.embed_positions is not None:
                positions = self.embed_positions(
                    prev_output_tokens[channel],
                    incremental_state=incremental_state[i]
                    if incremental_state is not None
                    else None,
                )

            if incremental_state is not None:
                # only the newest step is fed; earlier steps are cached
                prev_output_tokens[channel] = prev_output_tokens[channel][:, -1:]
                if positions is not None:
                    positions = positions[:, -1:]

            # embed tokens and positions
            x = self.embed_tokens(prev_output_tokens[channel])

            if self.project_in_dim is not None:
                x = self.project_in_dim(x)

            x = self.embed_scale * x

            if self.quant_noise is not None:
                x = self.quant_noise(x)

            if positions is not None:
                x += positions

            if self.layernorm_embedding is not None:
                x = self.layernorm_embedding(x)

            x = self.dropout_module(x)

            # B x T x C -> T x B x C
            x = x.transpose(0, 1)

            x_list.append(x)

        # The padding mask is derived from the first channel only.
        self_attn_padding_mask: Optional[Tensor] = None
        if (
            self.cross_self_attention
            or prev_output_tokens[self.channels[0]].eq(self.padding_idx).any()
        ):
            self_attn_padding_mask = prev_output_tokens[self.channels[0]].eq(
                self.padding_idx
            )

        # Encoder tensors do not change across layers; look them up once.
        layer_encoder_out = (
            encoder_out["encoder_out"][0]
            if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
            else None
        )
        layer_encoder_padding_mask = (
            encoder_out["encoder_padding_mask"][0]
            if (
                encoder_out is not None
                and len(encoder_out["encoder_padding_mask"]) > 0
            )
            else None
        )

        # decoder layers
        attn: Optional[Dict[str, Tensor]] = None
        inner_states: List[Optional[Dict[str, Tensor]]] = [
            {channel: x_list[i] for i, channel in enumerate(self.channels)}
        ]
        for idx, layer in enumerate(self.layers):
            if incremental_state is None and not full_context_alignment:
                self_attn_mask = self.buffered_future_mask(x_list[0])
            else:
                self_attn_mask = None

            # need to change to tensor for the checkpoint activation to work
            if isinstance(x_list, list):
                x_list = torch.stack(x_list)
            x_list, layer_attn_list, _ = layer(
                x_list,
                layer_encoder_out,
                layer_encoder_padding_mask,
                incremental_state,
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask,
                need_attn=bool((idx == alignment_layer)),
                need_head_weights=bool((idx == alignment_layer)),
            )

            inner_states.append(
                {channel: x_list[i] for i, channel in enumerate(self.channels)}
            )
            if idx == alignment_layer and all(
                layer_attn is not None for layer_attn in layer_attn_list
            ):
                attn = {
                    channel: layer_attn_list[i].float().to(x_list[0])
                    for i, channel in enumerate(self.channels)
                }
        # change back from tensor to list
        if not isinstance(x_list, list):
            x_list = list(torch.unbind(x_list))

        if attn is not None:
            for channel in attn:
                if alignment_heads is not None:
                    attn[channel] = attn[channel][:alignment_heads]

                # average probabilities over heads
                attn[channel] = attn[channel].mean(dim=0)

        for i, x in enumerate(x_list):
            if self.layer_norm is not None:
                x = self.layer_norm(x)

            # T x B x C -> B x T x C
            x = x.transpose(0, 1)

            if self.project_out_dim is not None:
                x = self.project_out_dim(x)

            x_list[i] = x

        x = {channel: x_list[i] for i, channel in enumerate(self.channels)}

        return x, {"attn": [attn], "inner_states": inner_states}

    def _predict(self, features, proj_index):
        """Apply output projection *proj_index* (and, when duration
        prediction is enabled, the matching duration head) to *features*."""
        pred_token = self.output_projection[proj_index](features)
        if self.output_duration_prediction is None:
            return pred_token
        return {
            "pred_token": pred_token,
            "pred_duration": self.output_duration_prediction[proj_index](features),
        }

    def output_layer(self, features):
        """Project features to the vocabulary size.
        Return a dictionary of the form:
            {
                'input-channel': {
                    'predicted-channel': token prediction tensor of shape `(batch, tgt_len, vocab)`,
                }
            }

        if duration_prediction is enabled
            {
                'input-channel': {
                    'predicted-channel': {
                        'pred_token': token prediction tensor of shape `(batch, tgt_len, vocab)`,
                        'pred_duration': duration prediction tensor
                    }
                }
            }
        """
        # project back to size of vocabulary
        if self.is_cross_prediction:
            # Projection index is the channel offset j - i; negative offsets
            # index the projection list from the end, which gives the
            # circular (mod N_channels) assignment.
            return {
                channel: {
                    pred_channel: self._predict(features[channel], j - i)
                    for j, pred_channel in enumerate(self.channels)
                }
                for i, channel in enumerate(self.channels)
            }
        # Without cross prediction every channel only predicts itself, using
        # the single (main-channel) projection.
        return {
            channel: {channel: self._predict(features[channel], 0)}
            for channel in self.channels
        }

    def max_positions(self):
        """Maximum output length supported by the decoder."""
        if self.embed_positions is None:
            return self.max_target_positions
        return min(self.max_target_positions, self.embed_positions.max_positions)

    def buffered_future_mask(self, tensor):
        """Return a cached upper-triangular ``-inf`` mask of size
        ``tensor.size(0)``, rebuilding it when it is missing, too small, or on
        another device."""
        dim = tensor.size(0)
        # self._future_mask.device != tensor.device is not working in TorchScript. This is a workaround.
        if (
            self._future_mask.size(0) == 0
            or (not self._future_mask.device == tensor.device)
            or self._future_mask.size(0) < dim
        ):
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(torch.zeros([dim, dim])), 1
            )
        self._future_mask = self._future_mask.to(tensor)
        return self._future_mask[:dim, :dim]

    def get_normalized_probs_scriptable(
        self,
        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
        log_probs: bool,
        sample: Optional[Dict[str, Tensor]] = None,
    ):
        """Get normalized probabilities (or log probs) from a net's output.

        The nested ``{channel: {pred_channel: ...}}`` layout of the logits is
        preserved; duration predictions, when present, are passed through as
        float32.
        """
        logits_dict = net_output[0]
        out_dict = {}
        for channel in logits_dict:
            out_dict[channel] = {}
            for pred_channel in logits_dict[channel]:
                prediction = logits_dict[channel][pred_channel]
                has_duration = isinstance(prediction, dict)
                pred_token_logits = (
                    prediction["pred_token"] if has_duration else prediction
                )
                if log_probs:
                    out = utils.log_softmax(
                        pred_token_logits, dim=-1, onnx_trace=self.onnx_trace
                    )
                else:
                    out = utils.softmax(
                        pred_token_logits, dim=-1, onnx_trace=self.onnx_trace
                    )
                if has_duration:
                    out_dict[channel][pred_channel] = {
                        "pred_token": out,
                        # move to float32 to avoid inf loss
                        "pred_duration": prediction["pred_duration"].float(),
                    }
                else:
                    out_dict[channel][pred_channel] = out
        return out_dict

    def reorder_incremental_state_scripting(
        self,
        incremental_state: List[Dict[str, Dict[str, Optional[Tensor]]]],
        new_order: Tensor,
    ):
        """Main entry point for reordering the incremental state.

        Due to limitations in TorchScript, we call this function in
        :class:`fairseq.sequence_generator.SequenceGenerator` instead of
        calling :func:`reorder_incremental_state` directly.
        """
        for module in self.modules():
            if hasattr(module, "reorder_incremental_state"):
                # one incremental state per channel
                for i, incremental_state_channel in enumerate(incremental_state):
                    result = module.reorder_incremental_state(
                        incremental_state_channel, new_order
                    )
                    if result is not None:
                        incremental_state[i] = result
""" def __init__( self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False ): super().__init__() self.embed_dim = args.decoder_embed_dim self.dropout_module = FairseqDropout( args.dropout, module_name=self.__class__.__name__ ) self.quant_noise = getattr(args, "quant_noise_pq", 0) self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8) # This cross_self_attention is used for encoder-decoder systems, # It's not the cross-channel attention (defined below as cross_channel_attn) self.cross_self_attention = getattr(args, "cross_self_attention", False) self.self_attn = self.build_self_attention( self.embed_dim, args, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) self.cross_channel_attn = self.build_cross_channel_attention( self.embed_dim, args, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) self.activation_fn = utils.get_activation_fn( activation=str(args.activation_fn) if getattr(args, "activation_fn", None) is not None else "relu" ) activation_dropout_p = getattr(args, "activation_dropout", 0) or 0 if activation_dropout_p == 0: # for backwards compatibility with models that use args.relu_dropout activation_dropout_p = getattr(args, "relu_dropout", 0) or 0 self.activation_dropout_module = FairseqDropout( float(activation_dropout_p), module_name=self.__class__.__name__ ) self.normalize_before = args.decoder_normalize_before # use layerNorm rather than FusedLayerNorm for exporting. # char_inputs can be used to determint this. 
# TODO remove this once we update apex with the fix export = getattr(args, "char_inputs", False) self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export) self.cross_channel_attn_layer_norm = LayerNorm(self.embed_dim, export=export) if no_encoder_attn: self.encoder_attn = None self.encoder_attn_layer_norm = None else: self.encoder_attn = self.build_encoder_attention(self.embed_dim, args) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export) self.fc1 = self.build_fc1( self.embed_dim, args.decoder_ffn_embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.fc2 = self.build_fc2( args.decoder_ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.final_layer_norm = LayerNorm(self.embed_dim, export=export) self.need_attn = True self.onnx_trace = False def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) def build_self_attention( self, embed_dim, args, add_bias_kv=False, add_zero_attn=False ): return MultiheadAttention( embed_dim, args.decoder_attention_heads, dropout=args.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=not getattr(args, "cross_self_attention", False), q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, ) def build_cross_channel_attention( self, embed_dim, args, add_bias_kv=False, add_zero_attn=False ): return MultiheadAttention( embed_dim, args.decoder_attention_heads, dropout=args.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=False, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, ) def build_encoder_attention(self, embed_dim, args): return MultiheadAttention( embed_dim, args.decoder_attention_heads, kdim=getattr(args, 
"encoder_embed_dim", None), vdim=getattr(args, "encoder_embed_dim", None), dropout=args.attention_dropout, encoder_decoder_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, ) def prepare_for_onnx_export_(self): self.onnx_trace = True def residual_connection(self, x, residual): return residual + x def forward( self, x_list_tensor: List[torch.Tensor], encoder_out: Optional[torch.Tensor] = None, encoder_padding_mask: Optional[torch.Tensor] = None, incremental_state: Optional[ List[Dict[str, Dict[str, Optional[Tensor]]]] ] = None, prev_self_attn_state: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None, prev_attn_state: Optional[List[torch.Tensor]] = None, self_attn_mask: Optional[torch.Tensor] = None, self_attn_padding_mask: Optional[torch.Tensor] = None, need_attn: bool = False, need_head_weights: bool = False, ): """ Args: x_list_tensor (List[Tensor]): list of input tensors in different channels, each tensor is of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor, optional): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. incremental_state (optional): list of incremental_state dictionaries over different channels (sequence generation mode) prev_self_attn_state (List[Tuple[Tensor, Tensor]], optional): list of tuples (self_attn_state, cross_channel_attn_state) over different channels need_attn (bool, optional): return attention weights need_head_weights (bool, optional): return attention weights for each head (default: return average over heads). 
Returns: list of encoded output of shape `(seq_len, batch, embed_dim)` """ n_channels = len(x_list_tensor) if need_head_weights: need_attn = True # incremental_state is a list of dictionaries over different channels if incremental_state is not None: assert isinstance(incremental_state, list) assert len(incremental_state) == n_channels # prev_self_attn_state is a list of tuples (self_attn_state, cross_channel_attn_state) over different channels if prev_self_attn_state is not None: assert isinstance(prev_self_attn_state, list) assert len(prev_self_attn_state) == n_channels for prev_self_attn_state_channel in prev_self_attn_state: assert isinstance(prev_self_attn_state_channel, tuple) assert len(prev_self_attn_state_channel) == 2 # Backup for other channels & cross channel attention self_attn_mask_orin = self_attn_mask self_attn_padding_mask_orin = self_attn_padding_mask x_list = [] attn_list = [] for i, x in enumerate(x_list_tensor): residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) if prev_self_attn_state is not None: prev_key, prev_value = prev_self_attn_state[i][0][:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_self_attn_state[i][0]) >= 3: saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][0][2] assert incremental_state is not None self.self_attn._set_input_buffer(incremental_state[i], saved_state) _self_attn_input_buffer = self.self_attn._get_input_buffer( incremental_state[i] if incremental_state is not None else None ) if self.cross_self_attention and not ( incremental_state is not None and _self_attn_input_buffer is not None and "prev_key" in _self_attn_input_buffer ): if self_attn_mask_orin is not None: assert encoder_out is not None self_attn_mask = torch.cat( ( x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask_orin, ), dim=1, ) if self_attn_padding_mask_orin is not None: if encoder_padding_mask is None: assert encoder_out is not None 
encoder_padding_mask = self_attn_padding_mask_orin.new_zeros( encoder_out.size(1), encoder_out.size(0) ) self_attn_padding_mask = torch.cat( (encoder_padding_mask, self_attn_padding_mask_orin), dim=1 ) assert encoder_out is not None y = torch.cat((encoder_out, x), dim=0) else: y = x x, attn = self.self_attn( query=x, key=y, value=y, key_padding_mask=self_attn_padding_mask, incremental_state=incremental_state[i] if incremental_state is not None else None, need_weights=False, attn_mask=self_attn_mask, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.self_attn_layer_norm(x) if self.encoder_attn is not None and encoder_out is not None: residual = x if self.normalize_before: x = self.encoder_attn_layer_norm(x) if prev_attn_state is not None: prev_key, prev_value = prev_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_attn_state[2] assert incremental_state is not None self.encoder_attn._set_input_buffer( incremental_state[i], saved_state ) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state[i] if incremental_state is not None else None, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.encoder_attn_layer_norm(x) x_list.append(x) attn_list.append(attn) # Store attentions & new x(s) (bc the old x(s) are used in other channels) x_list_new = [] # Here comes the cross channel attention for i, x in enumerate(x_list): residual = x if self.normalize_before: x = self.cross_channel_attn_layer_norm(x) if prev_self_attn_state is not None: prev_key, prev_value = prev_self_attn_state[i][1][:2] saved_state: Dict[str, 
Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_self_attn_state[i][1]) >= 3: saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][1][2] assert incremental_state is not None self.cross_channel_attn._set_input_buffer( incremental_state[i], saved_state ) # The cross attention is computed with the concatenation of attentions from other channels if len(x_list) > 1: x_other = torch.cat( [x_list[(i + j) % len(x_list)] for j in range(1, len(x_list))], dim=0, ) else: # Self-attention when having only one channel x_other = x_list[i] x, attn = self.cross_channel_attn( query=x, key=x_other, value=x_other, key_padding_mask=self_attn_padding_mask_orin, incremental_state=incremental_state[i] if incremental_state is not None else None, need_weights=False, attn_mask=self_attn_mask_orin, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.cross_channel_attn_layer_norm(x) x_list_new.append(x) x_list = x_list_new for i, x in enumerate(x_list): residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) x = self.fc2(x) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) x_list[i] = x # Trick for the checkpoint activation x_list_tensor = torch.stack(x_list) if self.onnx_trace and incremental_state is not None: self_and_cross_attn_state_list = [] for i in range(n_channels): self_and_cross_attn_state = [] for self_attn_module in [self.self_attn, self.cross_channel_attn]: saved_state = self_attn_module._get_input_buffer( incremental_state[i] ) assert saved_state is not None if self_attn_padding_mask is not None: self_attn_module_state = [ saved_state["prev_key"], saved_state["prev_value"], saved_state["prev_key_padding_mask"], ] else: self_attn_module_state = [ saved_state["prev_key"], saved_state["prev_value"], ] 
# Rewrite fairseq.modules.TransformerDecoderLayer
# to be compatible with checkpoint_activations
# (avoid forwarding model multiple times)
class StandardTransformerDecoderLayer(nn.Module):
    """Rewrite fairseq.modules.TransformerDecoderLayer to avoid forwarding
    model multiple times and be compatible with checkpoint_activations.

    The input is expected to be a list of tensors from different channels,
    each is forwarded to the same model (shared attention weights).

    In the original paper each operation (multi-head attention, encoder
    attention or FFN) is postprocessed with: `dropout -> add residual ->
    layernorm`. In the tensor2tensor code they suggest that learning is more
    robust when preprocessing each layer with layernorm and postprocessing with:
    `dropout -> add residual`. We default to the approach in the paper, but the
    tensor2tensor approach can be enabled by setting
    *args.decoder_normalize_before* to ``True``.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        no_encoder_attn (bool, optional): whether to attend to encoder outputs
            (default: False).
        add_bias_kv (bool, optional): forwarded to the self-attention module
            (default: False).
        add_zero_attn (bool, optional): forwarded to the self-attention module
            (default: False).
    """

    def __init__(
        self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False
    ):
        super().__init__()
        self.embed_dim = args.decoder_embed_dim
        self.dropout_module = FairseqDropout(
            args.dropout, module_name=self.__class__.__name__
        )
        self.quant_noise = getattr(args, "quant_noise_pq", 0)
        self.quant_noise_block_size = getattr(args, "quant_noise_pq_block_size", 8)

        self.cross_self_attention = getattr(args, "cross_self_attention", False)

        self.self_attn = self.build_self_attention(
            self.embed_dim,
            args,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
        )

        self.activation_fn = utils.get_activation_fn(
            activation=str(args.activation_fn)
            if getattr(args, "activation_fn", None) is not None
            else "relu"
        )
        activation_dropout_p = getattr(args, "activation_dropout", 0) or 0
        if activation_dropout_p == 0:
            # for backwards compatibility with models that use args.relu_dropout
            activation_dropout_p = getattr(args, "relu_dropout", 0) or 0
        self.activation_dropout_module = FairseqDropout(
            float(activation_dropout_p), module_name=self.__class__.__name__
        )
        self.normalize_before = args.decoder_normalize_before

        # use layerNorm rather than FusedLayerNorm for exporting.
        # char_inputs can be used to determine this.
        # TODO remove this once we update apex with the fix
        export = getattr(args, "char_inputs", False)
        self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        if no_encoder_attn:
            self.encoder_attn = None
            self.encoder_attn_layer_norm = None
        else:
            self.encoder_attn = self.build_encoder_attention(self.embed_dim, args)
            self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=export)

        self.fc1 = self.build_fc1(
            self.embed_dim,
            args.decoder_ffn_embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )
        self.fc2 = self.build_fc2(
            args.decoder_ffn_embed_dim,
            self.embed_dim,
            self.quant_noise,
            self.quant_noise_block_size,
        )

        self.final_layer_norm = LayerNorm(self.embed_dim, export=export)
        self.need_attn = True

        self.onnx_trace = False

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        """Build the first feed-forward projection, wrapped in quant-noise."""
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        """Build the second feed-forward projection, wrapped in quant-noise."""
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_self_attention(
        self, embed_dim, args, add_bias_kv=False, add_zero_attn=False
    ):
        """Build the self-attention module shared by all channels."""
        return MultiheadAttention(
            embed_dim,
            args.decoder_attention_heads,
            dropout=args.attention_dropout,
            add_bias_kv=add_bias_kv,
            add_zero_attn=add_zero_attn,
            self_attention=not getattr(args, "cross_self_attention", False),
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )

    def build_encoder_attention(self, embed_dim, args):
        """Build the encoder-decoder attention module shared by all channels."""
        return MultiheadAttention(
            embed_dim,
            args.decoder_attention_heads,
            kdim=getattr(args, "encoder_embed_dim", None),
            vdim=getattr(args, "encoder_embed_dim", None),
            dropout=args.attention_dropout,
            encoder_decoder_attention=True,
            q_noise=self.quant_noise,
            qn_block_size=self.quant_noise_block_size,
        )

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def residual_connection(self, x, residual):
        return residual + x

    def forward(
        self,
        x_list_tensor: List[torch.Tensor],
        encoder_out: Optional[torch.Tensor] = None,
        encoder_padding_mask: Optional[torch.Tensor] = None,
        incremental_state: Optional[
            List[Dict[str, Dict[str, Optional[Tensor]]]]
        ] = None,
        prev_self_attn_state: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
        prev_attn_state: Optional[List[torch.Tensor]] = None,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None,
        need_attn: bool = False,
        need_head_weights: bool = False,
    ):
        """
        Args:
            x_list_tensor (List[Tensor]): list of input tensors in different channels,
                each tensor is of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor, optional): binary
                ByteTensor of shape `(batch, src_len)` where padding
                elements are indicated by ``1``.
            incremental_state (optional): list of incremental_state dictionaries over
                different channels (sequence generation mode)
            prev_self_attn_state (optional): list of self-attention states over
                different channels; each state is a sequence
                `(prev_key, prev_value[, prev_key_padding_mask])`
            prev_attn_state (optional): encoder-attention state
                `(prev_key, prev_value[, prev_key_padding_mask])`, applied to
                every channel
            need_attn (bool, optional): return attention weights
            need_head_weights (bool, optional): return attention weights
                for each head (default: return average over heads).

        Returns:
            a tuple `(x, attn_list, self_attn_state_list)` where `x` is the
            per-channel outputs stacked along a new leading dimension,
            `attn_list` holds for each channel the attention returned by the
            last attention module applied to it, and `self_attn_state_list`
            is the list of per-channel self-attention states when ONNX
            tracing with an incremental state, otherwise ``None``.
        """
        n_channels = len(x_list_tensor)
        if need_head_weights:
            need_attn = True

        # incremental_state is a list of dictionaries over different channels
        if incremental_state is not None:
            assert isinstance(incremental_state, list)
            assert len(incremental_state) == n_channels

        # prev_self_attn_state is a list of self_attn_state over different channels
        if prev_self_attn_state is not None:
            assert isinstance(prev_self_attn_state, list)
            assert len(prev_self_attn_state) == n_channels

        x_list = []
        attn_list = []
        for i, x in enumerate(x_list_tensor):
            # Every channel owns its own incremental-state dictionary; never
            # hand the whole list to the attention modules.
            channel_state = (
                incremental_state[i] if incremental_state is not None else None
            )

            residual = x

            if self.normalize_before:
                x = self.self_attn_layer_norm(x)
            if prev_self_attn_state is not None:
                prev_key, prev_value = prev_self_attn_state[i][:2]
                saved_state: Dict[str, Optional[Tensor]] = {
                    "prev_key": prev_key,
                    "prev_value": prev_value,
                }
                if len(prev_self_attn_state[i]) >= 3:
                    # Take the padding mask of *this* channel's state.
                    saved_state["prev_key_padding_mask"] = prev_self_attn_state[i][2]
                assert incremental_state is not None
                self.self_attn._set_input_buffer(incremental_state[i], saved_state)
            _self_attn_input_buffer = self.self_attn._get_input_buffer(channel_state)

            # Work on per-channel copies of the masks so that the encoder
            # prefix is not concatenated again for every following channel.
            channel_attn_mask = self_attn_mask
            channel_padding_mask = self_attn_padding_mask
            if self.cross_self_attention and not (
                incremental_state is not None
                and _self_attn_input_buffer is not None
                and "prev_key" in _self_attn_input_buffer
            ):
                if self_attn_mask is not None:
                    assert encoder_out is not None
                    channel_attn_mask = torch.cat(
                        (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask),
                        dim=1,
                    )
                if self_attn_padding_mask is not None:
                    if encoder_padding_mask is None:
                        assert encoder_out is not None
                        encoder_padding_mask = self_attn_padding_mask.new_zeros(
                            encoder_out.size(1), encoder_out.size(0)
                        )
                    channel_padding_mask = torch.cat(
                        (encoder_padding_mask, self_attn_padding_mask), dim=1
                    )
                assert encoder_out is not None
                y = torch.cat((encoder_out, x), dim=0)
            else:
                y = x

            x, attn = self.self_attn(
                query=x,
                key=y,
                value=y,
                key_padding_mask=channel_padding_mask,
                incremental_state=channel_state,
                need_weights=False,
                attn_mask=channel_attn_mask,
            )
            x = self.dropout_module(x)
            x = self.residual_connection(x, residual)
            if not self.normalize_before:
                x = self.self_attn_layer_norm(x)

            if self.encoder_attn is not None and encoder_out is not None:
                residual = x
                if self.normalize_before:
                    x = self.encoder_attn_layer_norm(x)
                if prev_attn_state is not None:
                    prev_key, prev_value = prev_attn_state[:2]
                    saved_state: Dict[str, Optional[Tensor]] = {
                        "prev_key": prev_key,
                        "prev_value": prev_value,
                    }
                    if len(prev_attn_state) >= 3:
                        saved_state["prev_key_padding_mask"] = prev_attn_state[2]
                    assert incremental_state is not None
                    self.encoder_attn._set_input_buffer(
                        incremental_state[i], saved_state
                    )

                x, attn = self.encoder_attn(
                    query=x,
                    key=encoder_out,
                    value=encoder_out,
                    key_padding_mask=encoder_padding_mask,
                    incremental_state=channel_state,
                    static_kv=True,
                    need_weights=need_attn or (not self.training and self.need_attn),
                    need_head_weights=need_head_weights,
                )
                x = self.dropout_module(x)
                x = self.residual_connection(x, residual)
                if not self.normalize_before:
                    x = self.encoder_attn_layer_norm(x)

            residual = x
            if self.normalize_before:
                x = self.final_layer_norm(x)

            x = self.activation_fn(self.fc1(x))
            x = self.activation_dropout_module(x)
            x = self.fc2(x)
            x = self.dropout_module(x)
            x = self.residual_connection(x, residual)
            if not self.normalize_before:
                x = self.final_layer_norm(x)

            x_list.append(x)
            attn_list.append(attn)

        # Trick for the checkpoint activation
        x_list_tensor = torch.stack(x_list)
        if self.onnx_trace and incremental_state is not None:
            self_attn_state_list = []
            for i in range(n_channels):
                saved_state = self.self_attn._get_input_buffer(incremental_state[i])
                assert saved_state is not None
                if self_attn_padding_mask is not None:
                    self_attn_state = [
                        saved_state["prev_key"],
                        saved_state["prev_value"],
                        saved_state["prev_key_padding_mask"],
                    ]
                else:
                    self_attn_state = [
                        saved_state["prev_key"],
                        saved_state["prev_value"],
                    ]
                self_attn_state_list.append(self_attn_state)
            return x_list_tensor, attn_list, self_attn_state_list
        return x_list_tensor, attn_list, None

    def make_generation_fast_(self, need_attn: bool = False, **kwargs):
        self.need_attn = need_attn
Args: step: the current search step, starting at 0 lprobs: dictionary of channels {channel : (bsz x input_beam_size x vocab_size_channel)} the model's log-probabilities over the vocabulary at the current step scores: {channel : (bsz x input_beam_size x step)} the historical model scores of each hypothesis up to this point prev_output_tokens: {channel : (bsz x step)} the previously generated oputput tokens original_batch_idxs: (bsz) the tensor with the batch indices, in the range [0, bsz) this is useful in case there has been applied a re-ordering and we need to know the orignal indices Return: A tuple of (scores, indices, beams) where: scores: {channel : (bsz x output_beam_size)} the scores of the chosen elements; output_beam_size can be larger than input_beam_size, e.g., we may return 2*input_beam_size to account for EOS indices: {channel : (bsz x output_beam_size)} the indices of the chosen elements beams: (bsz x output_beam_size) the hypothesis ids of the chosen elements, in the range [0, input_beam_size) """ raise NotImplementedError @torch.jit.export def set_src_lengths(self, src_lengths): self.src_lengths = src_lengths @torch.jit.export def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int): """Initialize constraint states for constrained decoding (if supported). Args: batch_constraints: (torch.Tensor, optional) the list of constraints, in packed form beam_size: (int) the beam size Returns: *encoder_out* rearranged according to *new_order* """ pass def prune_sentences(self, batch_idxs: Tensor): """ Removes constraint states for completed sentences (if supported). This is called from sequence_generator._generate() when sentences are deleted from the batch. Args: batch_idxs: Indices of *sentences* whose constraint state should be *kept*. """ pass def update_constraints(self, active_hypos: Tensor): """ Updates the constraint states by selecting the beam items that are retained. 
def unravel_index(index, shape):
    """Convert flat indices into coordinates of a tensor of shape ``shape``.

    Args:
        index: integer tensor of flat (row-major) indices.
        shape: sizes of the dimensions the flat index runs over.

    Returns:
        Tensor with one extra trailing dimension of size ``len(shape)``
        holding the coordinate along each dimension of ``shape``.
    """
    coords = []
    # Peel off the fastest-varying (last) dimension first.
    for dim in reversed(shape):
        coords.append(index % dim)
        index = index // dim
    coords.reverse()
    return torch.stack(coords, dim=-1)


def topk_sum(lprobs_list, k):
    """Find the k best combinations of one token per channel.

    lprobs_list = [lprobs_1,...,lprobs_n], where:
        lprobs_1 : (batch_size x beam_size x vocab_1)
        ...
        lprobs_n : (batch_size x beam_size x vocab_n)

    Return a tuple (topk_values, topk_idxs, topk_beams):
        - topk_values : (batch_size x k)
            values of the topk sum of the form :
                lprobs_1[bsz, beam_idx, vocab_1_idx] + ... + lprobs_n[bsz, beam_idx, vocab_n_idx]
        - topk_idxs : (batch_size x k x n)
            each n-tensor being [vocab_1_idx, ..., vocab_n_idx]
        - topk_beams : (batch_size x k)
            the beam_idx of each selected combination
    """
    # Reduce all lprobs to k candidates first to reduce later complexity
    # We may assume that k << vocab
    lprobs_topk_list = []
    lprobs_topk_indices_list = []
    for lprobs in lprobs_list:
        k_i = min(k, lprobs.size(-1))
        topk_values, topk_indices = torch.topk(lprobs, k=k_i)
        # topk_values : (batch_size x beam_size x k_i)
        # topk_indices : (batch_size x beam_size x k_i)
        lprobs_topk_list.append(topk_values)
        lprobs_topk_indices_list.append(topk_indices)

    # Compute all possible sums
    sum_lprobs_topk = lprobs_topk_list[0]
    for i in range(1, len(lprobs_topk_list)):
        unsqueezed_lprobs = lprobs_topk_list[i]
        for _ in range(i):
            unsqueezed_lprobs = unsqueezed_lprobs.unsqueeze(-2)
        sum_lprobs_topk = sum_lprobs_topk.unsqueeze(-1) + unsqueezed_lprobs
    # sum_lprobs : (batch_size x beam_size x k_1 x ... x k_n)

    # Get the top k sums and the (transformed indices)
    topk_sum_values, flat_indices = torch.topk(
        sum_lprobs_topk.view(sum_lprobs_topk.size(0), -1), k=k
    )
    # topk_sum_values : (batch_size x k)
    # flat_indices : (batch_size x k)
    candidate_coords = unravel_index(flat_indices, tuple(sum_lprobs_topk.shape[1:]))
    # candidate_coords : (batch_size x k x n+1), each row being
    # [beam_idx, rank_in_topk_1, ..., rank_in_topk_n]

    # Convert the per-channel top-k ranks back to true vocabulary indices with
    # one vectorised lookup per channel instead of a Python loop over every
    # (batch, candidate) pair.
    batch_idx = torch.arange(
        candidate_coords.size(0), device=candidate_coords.device
    ).unsqueeze(-1)
    topk_sum_beams = candidate_coords[:, :, 0]
    topk_sum_indices = torch.stack(
        [
            vocab_indices[batch_idx, topk_sum_beams, candidate_coords[:, :, j + 1]]
            for j, vocab_indices in enumerate(lprobs_topk_indices_list)
        ],
        dim=-1,
    )
    return topk_sum_values, topk_sum_indices, topk_sum_beams
class ContiguousMultichannelBeamSearch(MultichannelSearch):
    """Joint beam search over channels stored as indexable sequences/tensors.

    ``lprobs`` is indexed by channel position (``lprobs[i]``) and ``scores``
    carries the channel on its last dimension (``scores[:, :, step, i]``).
    """

    def __init__(self, tgt_dicts):
        super().__init__(tgt_dicts)
        self.constraint_states = None

    @torch.jit.export
    def step(
        self,
        step: int,
        lprobs,
        scores: Optional[Tensor],
        prev_output_tokens: Optional[Tensor] = None,
        original_batch_idxs: Optional[Tensor] = None,
    ):
        """Take a single search step.

        Args:
            step: the current search step, starting at 0
            lprobs: per-channel log-probabilities; ``lprobs[i]`` has three
                dimensions read as (bsz, beam_size, vocab)
            scores: cumulative scores indexed as ``scores[:, :, step - 1, i]``;
                required when ``step > 0``
            prev_output_tokens: unused
            original_batch_idxs: unused

        Returns:
            A tuple (scores_buf, indices_buf, beams_buf) where ``indices_buf``
            and ``beams_buf`` are the vocabulary indices and beam ids chosen
            by ``topk_sum`` with ``k = 2 * beam_size``, and ``scores_buf``
            holds, for every candidate, the score of the selected token in
            each channel (channel on the last dimension).
        """
        n_channels = len(lprobs)
        bsz, beam_size, _ = lprobs[0].size()

        lprobs_list = []
        if step == 0:
            # at the first step all hypotheses are equally likely, so use
            # only the first beam
            for i in range(n_channels):
                lprobs_list.append(lprobs[i][:, ::beam_size, :].contiguous())
        else:
            # make probs contain cumulative scores for each hypothesis
            assert scores is not None
            for i in range(n_channels):
                lprobs_list.append(lprobs[i] + scores[:, :, step - 1, i].unsqueeze(-1))

        topk_sum_values, topk_sum_indices, topk_sum_beams = topk_sum(
            lprobs_list, k=beam_size * 2
        )

        beams_buf = topk_sum_beams
        indices_buf = topk_sum_indices

        # Look up the per-channel score of every selected (beam, token) pair
        # with tensor indexing rather than rebuilding the tensor element by
        # element in Python.
        batch_idx = torch.arange(bsz, device=beams_buf.device).unsqueeze(-1)
        scores_buf = torch.stack(
            [
                lprobs_list[i][batch_idx, beams_buf, indices_buf[:, :, i]]
                for i in range(len(lprobs_list))
            ],
            dim=-1,
        )

        # At this point, beams_buf and indices_buf are single-dim and contain relative indices
        return scores_buf, indices_buf, beams_buf
super().__init__(tgt_dicts) self.sampling_topk = sampling_topk self.sampling_topp = sampling_topp def _sample_topp(self, lprobs): """Sample among the smallest set of elements whose cumulative probability mass exceeds p. See `"The Curious Case of Neural Text Degeneration" (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_. Args: lprobs: (bsz x input_beam_size x vocab_size) the model's log-probabilities over the vocabulary at the current step Return: A tuple of (trimed_probs, truncated_indices) where: trimed_probs: (bsz x input_beam_size x ?) the model's probabilities over the elements selected to sample from. The width of the third dimension is determined by top-P. truncated_indices: (bsz x input_beam_size x ?) the indices of the chosen elements. """ probs = lprobs.exp_() # sort the last dimension (vocab dimension) in descending order sorted_probs, sorted_indices = probs.sort(descending=True) # compute a mask to indicate the words to be included in the top-P set. cumsum_probs = sorted_probs.cumsum(dim=2) mask = cumsum_probs.lt(self.sampling_topp) # note that mask was computed by 'lt'. One more word needs to be included # so that the cumulative probability mass can exceed p. cumsum_mask = mask.cumsum(dim=2) last_included = cumsum_mask[:, :, -1:] last_included.clamp_(0, mask.size()[2] - 1) mask = mask.scatter_(2, last_included, 1) # truncate unnecessary dims. max_dim = last_included.max() truncated_mask = mask[:, :, : max_dim + 1] truncated_probs = sorted_probs[:, :, : max_dim + 1] truncated_indices = sorted_indices[:, :, : max_dim + 1] # trim the words that are not in top-P by setting their probabilities # to 0, so that they would not be sampled later. 
trim_mask = ~truncated_mask trimed_probs = truncated_probs.masked_fill_(trim_mask, 0) return trimed_probs, truncated_indices @torch.jit.export def step( self, step: int, lprobs, scores, prev_output_tokens: Optional[Tensor] = None, original_batch_idxs: Optional[Tensor] = None, ): n_channels = len(lprobs) bsz, beam_size, vocab_size = lprobs[0].size() if step == 0: # at the first step all hypotheses are equally likely, so use # only the first beam for i in range(n_channels): lprobs[i] = lprobs[i][:, ::beam_size, :].contiguous() probs = [] top_indices = [] for i in range(n_channels): if self.sampling_topp > 0: # only sample from the smallest set of words whose cumulative probability mass exceeds p probs_i, top_indices_i = self._sample_topp(lprobs[i]) elif self.sampling_topk > 0: # only sample from top-k candidates lprobs[i], top_indices_i = lprobs[i].topk( min(self.sampling_topk, lprobs[i].size(-1)) ) probs_i = lprobs[i].exp_() else: probs_i = lprobs[i].exp_() # dummy data to be consistent with true branch for type check top_indices_i = torch.empty(0).to(probs_i) probs.append(probs_i) top_indices.append(top_indices_i) # sample indices_buf = [] for i in range(n_channels): if step == 0: indices_buf.append( torch.multinomial( probs[i].view(bsz, -1), beam_size, replacement=True, ).view(bsz, beam_size) ) else: indices_buf.append( torch.multinomial( probs[i].view(bsz * beam_size, -1), 1, replacement=True, ).view(bsz, beam_size) ) if step == 0: for i in range(n_channels): # expand to beam size probs[i] = probs[i].expand(bsz, beam_size, -1) # gather scores scores_buf = [] for i in range(n_channels): scores_buf.append( torch.gather(probs[i], dim=2, index=indices_buf[i].unsqueeze(-1)) ) scores_buf[i] = scores_buf[i].log_().view(bsz, -1) # remap indices if using top-k or top-P sampling if self.sampling_topk > 0 or self.sampling_topp > 0: for i in range(n_channels): indices_buf[i] = torch.gather( top_indices[i].expand(bsz, beam_size, -1), dim=2, index=indices_buf[i].unsqueeze(-1), 
def __init__(
    self,
    models,
    tgt_dicts,
    beam_size=1,
    max_len_a=0,
    max_len_b=200,
    min_len=1,
    normalize_scores=True,
    len_penalty=1.0,
    unk_penalty=0.0,
    temperature=1.0,
    match_source_len=False,
    no_repeat_ngram_size=0,
    search_strategy=None,
    eos=None,
    symbols_to_strip_from_output=None,
    lm_model=None,
    lm_weight=1.0,
    duration_temperature=1.0,
):
    """Generate multi-channel parallel units with the SpeechDLM model
    as described in the paper: https://arxiv.org/pdf/2203.16502.pdf;

    Args:
        models (List[~fairseq.models.FairseqModel] or
            MultichannelEnsembleModel): ensemble of models, or an
            already-built ensemble wrapper
        tgt_dicts (dict): maps each channel name to its target dictionary;
            pad/unk/eos are taken from the first dictionary
        beam_size (int, optional): beam width (default: 1)
        max_len_a/b (int, optional): generate sequences of maximum length
            ax + b, where x is the source length
        min_len (int, optional): the minimum length of the generated output
            (not including end-of-sentence)
        normalize_scores (bool, optional): normalize scores by the length
            of the output (default: True)
        len_penalty (float, optional): length penalty, where <1.0 favors
            shorter, >1.0 favors longer sentences (default: 1.0)
        unk_penalty (float, optional): unknown word penalty, where <0
            produces more unks, >0 produces fewer (default: 0.0)
        temperature (float, list or dict, optional): temperature, where
            values >1.0 produce more uniform samples and values <1.0
            produce sharper samples (default: 1.0). A single number is
            applied to every channel, a list is matched to the channels by
            position, a dict is used as given.
        match_source_len (bool, optional): outputs should match the source
            length (default: False)
        no_repeat_ngram_size (int, optional): when > 0, an NGramRepeatBlock
            of that size is applied to every channel (default: 0)
        search_strategy (optional): search object used at each step;
            defaults to ContiguousMultichannelBeamSearch
        eos (int, optional): overrides the dictionary's eos index
        symbols_to_strip_from_output (set, optional): extra symbols stored
            (together with eos) in ``symbols_to_strip_from_output``
        lm_model (optional): stored and put in eval mode
        lm_weight (float, optional): stored alongside ``lm_model``
        duration_temperature (float, optional): rate of the duration
            prediction, higher rate induces a faster generated wav
            (default: 1.0)
    """
    super().__init__()
    if isinstance(models, MultichannelEnsembleModel):
        self.model = models
    else:
        self.model = MultichannelEnsembleModel(models)
    self.tgt_dicts = tgt_dicts
    first_dict = list(tgt_dicts.values())[0]
    self.pad = first_dict.pad()
    self.unk = first_dict.unk()
    self.eos = first_dict.eos() if eos is None else eos
    self.symbols_to_strip_from_output = (
        symbols_to_strip_from_output.union({self.eos})
        if symbols_to_strip_from_output is not None
        else {self.eos}
    )
    self.channels = list(tgt_dicts.keys())
    self.n_channels = len(self.channels)

    self.vocab_sizes = [len(tgt_dicts[channel]) for channel in self.channels]
    # the max beam size is the dictionary size - 1, since we never select pad
    max_possible_beam_size = 1
    for vocab_size in self.vocab_sizes:
        max_possible_beam_size *= vocab_size - 1
    self.beam_size = min(beam_size, max_possible_beam_size)
    self.max_len_a = max_len_a
    self.max_len_b = max_len_b
    self.min_len = min_len

    self.normalize_scores = normalize_scores
    self.len_penalty = len_penalty
    self.unk_penalty = unk_penalty

    # Normalise temperature to a per-channel mapping.
    if isinstance(temperature, (int, float)):
        temperature = {channel: temperature for channel in self.channels}
    elif isinstance(temperature, ListConfig) or isinstance(temperature, list):
        temperature = {
            channel: temperature[i] for i, channel in enumerate(self.channels)
        }
    assert isinstance(temperature, DictConfig) or isinstance(
        temperature, dict
    ), f"temperature: expected dict, but found {type(temperature)}"
    self.temperature = temperature
    self.match_source_len = match_source_len

    if no_repeat_ngram_size > 0:
        self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
    else:
        self.repeat_ngram_blocker = None

    for channel in temperature:
        assert temperature[channel] > 0, "--temperature must be greater than 0"

    if search_strategy is None:
        self.search = ContiguousMultichannelBeamSearch(tgt_dicts)
    else:
        self.search = search_strategy
    # We only need to set src_lengths in LengthConstrainedBeamSearch.
    # As a module attribute, setting it would break in multithread
    # settings when the model is shared.
    self.should_set_src_lengths = (
        hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths
    )

    self.model.eval()

    self.lm_model = lm_model
    self.lm_weight = lm_weight
    if self.lm_model is not None:
        self.lm_model.eval()

    # Read the decoder flags through the ensemble wrapper: `models` may
    # already be a MultichannelEnsembleModel, which cannot be indexed with
    # `models[0]`. The wrapper keeps the first model as `single_model`.
    decoder_args = self.model.single_model.decoder.args
    # The flags are compared as strings, so "True"/"true"/True all enable them.
    self.duration_prediction = (
        str(getattr(decoder_args, "duration_prediction", "false")).lower() == "true"
    )
    self.delayed_duration = (
        str(getattr(decoder_args, "delayed_duration_target", "false")).lower()
        == "true"
    )
    self.duration_temperature = duration_temperature
Args: sample (dict): batch prefix_tokens (dict of torch.LongTensor, optional): force decoder to begin with these tokens bos_token (int, optional): beginning of sentence token (default: self.eos) """ return self._generate(sample, prefix_tokens, bos_token=bos_token) @torch.no_grad() def generate(self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs): """Generate translations. Match the api of other fairseq generators. Args: models (List[~fairseq.models.FairseqModel]): ensemble of models sample (dict): batch prefix_tokens (dict of torch.LongTensor, optional): force decoder to begin with these tokens constraints (torch.LongTensor, optional): force decoder to include the list of constraints bos_token (int, optional): beginning of sentence token (default: self.eos) """ return self._generate(sample, **kwargs) def _generate( self, sample: Dict[str, Dict[str, Tensor]], prefix_tokens: Optional[Dict[str, Tensor]] = None, constraints: Optional[Tensor] = None, bos_token: Optional[int] = None, ): """ Here sample is expected to have the following form { 'id': index, 'net_input': { 'src_tokens': { 'channel1' : tensor((batch x src_length)), 'channel2' : tensor((batch x src_length)), }, ... 
}, } and prefix_tokens { 'channel1' : tensor((batch x prefix_length)), 'channel2' : tensor((batch x prefix_length)), } """ if self.model.is_speech_dlm: incremental_states = torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [ torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [{} for _ in range(self.n_channels)], ) for i in range(self.model.models_size) ], ) else: incremental_states = torch.jit.annotate( List[Dict[str, Dict[str, Optional[Tensor]]]], [ torch.jit.annotate(Dict[str, Dict[str, Optional[Tensor]]], {}) for i in range(self.model.models_size) ], ) net_input = sample["net_input"] # Convert from dict to tensor form # shape of src_tokens : (bsz x src_len x n_channels) src_tokens = torch.stack( [net_input["src_tokens"][channel] for channel in self.channels], dim=-1 ) prefix_tokens = torch.stack( [prefix_tokens[channel] for channel in self.channels], dim=-1 ) # length of the source text being the character length except EndOfSentence and pad src_lengths = ( (src_tokens[..., 0].ne(self.eos) & src_tokens[..., 0].ne(self.pad)) .long() .sum(dim=1) ) # bsz: total number of sentences in beam # Note that src_tokens may have more than 2 dimensions (i.e. audio features) bsz, src_len = src_tokens.size()[:2] beam_size = self.beam_size if constraints is not None and not self.search.supports_constraints: raise NotImplementedError( "Target-side constraints were provided, but search method doesn't support them" ) # Initialize constraints, when active self.search.init_constraints(constraints, beam_size) max_len: int = -1 if self.match_source_len: max_len = src_lengths.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), # exclude the EOS marker self.model.max_decoder_positions() - 1, ) assert ( self.min_len <= max_len ), "min_len cannot be larger than max_len, please adjust these!" 
# compute the encoder output for each beam encoder_outs = self.model.forward_encoder(net_input) # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) # ensure encoder_outs is a List. assert encoder_outs is not None # initialize buffers # cumulative scores of hypotheses scores = ( torch.zeros(bsz * beam_size, max_len + 1, self.n_channels) .to(src_tokens) .float() ) # +1 for eos; pad is never chosen for scoring tokens = ( torch.zeros(bsz * beam_size, max_len + 2, self.n_channels) .to(src_tokens) .long() .fill_(self.pad) ) # +2 for eos and pad tokens[:, 0] = self.eos if bos_token is None else bos_token attn: Optional[Tensor] = None # A list that indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then cands_to_ignore would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. 
cands_to_ignore = ( torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) ) # forward and backward-compatible False mask # list of completed sentences finalized = torch.jit.annotate( List[List[Dict[str, Tensor]]], [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step finished = [ False for i in range(bsz) ] # a boolean array indicating if the sentence at the index is finished or not num_remaining_sent = bsz # number of sentences remaining # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = ( (torch.arange(0, bsz) * beam_size) .unsqueeze(1) .type_as(tokens) .to(src_tokens.device) ) cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device) reorder_state: Optional[Tensor] = None batch_idxs: Optional[Tensor] = None original_batch_idxs: Optional[Tensor] = None if "id" in sample and isinstance(sample["id"], Tensor): original_batch_idxs = sample["id"] else: original_batch_idxs = torch.arange(0, bsz).type_as(tokens) if self.duration_prediction: dur_counter = torch.ones(bsz * beam_size, self.n_channels).to(src_tokens) # save the indice where the dur_counter just copied from dur_pred dur_counter_jump_indices = None for step in range(max_len + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as( batch_idxs ) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size ) original_batch_idxs = original_batch_idxs[batch_idxs] self.model.reorder_incremental_state(incremental_states, reorder_state) encoder_outs = self.model.reorder_encoder_out( encoder_outs, reorder_state ) input_tokens 
= { channel: tokens[:, : step + 1, i] for i, channel in enumerate(self.channels) } lprobs_dict, avg_attn_scores = self.model.forward_decoder( input_tokens, encoder_outs, incremental_states, self.temperature, ) # Because the sizes of vocab is different, we cannot concat the lprobs to form a single tensor if not self.duration_prediction: lprobs_list = list(lprobs_dict.values()) else: lprobs_list = [ net_output["pred_token"] for net_output in lprobs_dict.values() ] # non-positive predicted durations dur_preds = ( torch.stack( [ net_output["pred_duration"] for net_output in lprobs_dict.values() ] ) .squeeze(-1) .T ) dur_preds = dur_preds / self.duration_temperature dur_preds = dur_preds.round().long() dur_preds[dur_preds < 1] = 1 # dur_preds & dur_counter needs to be modified when there isn't an edge if step > 0: non_edge_indices = tokens[:, step, :] == tokens[:, step - 1, :] if self.delayed_duration: dur_preds[non_edge_indices] = 1 else: if dur_counter_jump_indices is not None: dur_counter[dur_counter_jump_indices & non_edge_indices] = 2 # update dur_counter if step > 0: if self.delayed_duration: dur_counter -= ( (dur_counter == 1) | (tokens[:, step, :] == tokens[:, step - 1, :]) ).int() dur_counter[dur_counter < 0] = 0 else: dur_counter -= ( tokens[:, step, :] == tokens[:, step - 1, :] ).int() dur_counter[dur_counter < 1] = 1 # whether to copy previous token (ie. 
if the counter is still on) # and get get the new duration if self.delayed_duration: dur_counter_jump_indices = dur_counter == 0 dur_counter[dur_counter_jump_indices] = dur_preds[ dur_counter_jump_indices ] # whether to copy previous token in this step copy_prev_token = dur_counter != 1 if self.delayed_duration is False: dur_counter_jump_indices = dur_counter == 1 dur_counter[dur_counter_jump_indices] = dur_preds[ dur_counter_jump_indices ] # else: # dur_counter[dur_counter==0] = dur_preds[dur_counter==0] - 1 # copy_prev_token = (dur_counter > 0) if self.lm_model is not None: assert False, "Currently not supported in multichannelLM case" for i in range(self.n_channels): lprobs_list[i][lprobs_list[i] != lprobs_list[i]] = torch.tensor( -math.inf ).to(lprobs_list[i]) lprobs_list[i][:, self.pad] = -math.inf # never select pad lprobs_list[i][:, self.unk] -= self.unk_penalty # apply unk penalty # handle max length constraint if step >= max_len: lprobs_list[i][:, : self.eos] = -math.inf lprobs_list[i][:, self.eos + 1 :] = -math.inf else: lprobs_list[i][ :, self.eos ] = -math.inf # quick fix for short generation # handle prefix tokens (possibly with different lengths) if ( prefix_tokens is not None and step < prefix_tokens.size(1) and step < max_len ): ( lprobs_list[i], tokens[..., i], scores[..., i], ) = self._prefix_tokens( step, lprobs_list[i], scores[..., i], tokens[..., i], prefix_tokens[..., i], beam_size, ) if self.duration_prediction: # Can copy previous token if the prefix token is padding or unk (1-channel conditionned case) can_copy_mask = ( prefix_tokens[:, step, i].eq(self.pad) | prefix_tokens[:, step, i].eq(self.unk) ).repeat_interleave(beam_size) copy_prev_token[:, i] &= can_copy_mask elif step < self.min_len: # minimum length constraint (does not apply if using prefix_tokens) lprobs_list[i][:, self.eos] = -math.inf if self.duration_prediction: if step < max_len: for j in range(copy_prev_token.size(0)): if copy_prev_token[j, i]: prev_token = tokens[j, step, 
i] lprobs_list[i][j, :prev_token] = -math.inf lprobs_list[i][j, prev_token + 1 :] = -math.inf # lprobs_list[i][j, prev_token] = 0. # dur_counter[j,i] -= 1 # else: # prev_token = tokens[j, step, i] # if not (lprobs_list[i][j,:].ne(-math.inf).nonzero() == prev_token).all(): # lprobs_list[i][j, prev_token] = -math.inf # dur_counter[j,i] = 0. # Record attention scores, only support avg_attn_scores is a Tensor if avg_attn_scores is not None: if attn is None: attn = torch.empty( bsz * beam_size, avg_attn_scores.size(1), max_len + 2 ).to(scores) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs_list[0]) eos_bbsz_idx = torch.empty(0).to( tokens ) # indices of hypothesis ending with eos (finished sentences) eos_scores = torch.empty(0).to( scores ) # scores of hypothesis ending with eos (finished sentences) if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) if self.repeat_ngram_blocker is not None: for i in range(self.n_channels): lprobs_list[i] = self.repeat_ngram_blocker( tokens, lprobs_list[i], bsz, beam_size, step ) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( step, [ lprobs_list[i].view(bsz, -1, self.vocab_sizes[i]) for i in range(self.n_channels) ], scores.view(bsz, beam_size, -1, self.n_channels)[:, :, :step, :], tokens[:, : step + 1], original_batch_idxs, ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), # and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos # Shape of eos_mask: (batch size, beam size) eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) eos_mask = torch.any(eos_mask, dim=-1, keepdim=False) eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) # only consider eos when it's among the top beam_size indices # Now we know what beam item(s) to finish # Shape: 1d list of absolute-numbered eos_bbsz_idx = 
torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] ) finalized_sents: List[int] = [] if eos_bbsz_idx.numel() > 0: eos_scores = torch.stack( [ torch.masked_select( cand_scores[:, :beam_size, i], mask=eos_mask[:, :beam_size] ) for i in range(self.n_channels) ], dim=-1, ) finalized_sents = self.finalize_hypos( step, eos_bbsz_idx, eos_scores, tokens, scores, finalized, finished, beam_size, attn, src_lengths, max_len, ) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break if self.search.stop_on_max_len and step >= max_len: break assert step < max_len, f"{step} < {max_len}" # Remove finalized sentences (ones for which {beam_size} # finished hypotheses have been generated) from the batch. if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones( bsz, dtype=torch.bool, device=cand_indices.device ) batch_mask[finalized_sents] = False # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it batch_idxs = torch.arange( bsz, device=cand_indices.device ).masked_select(batch_mask) # Choose the subset of the hypothesized constraints that will continue self.search.prune_sentences(batch_idxs) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths = src_lengths[batch_idxs] cands_to_ignore = cands_to_ignore[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1, self.n_channels ) tokens = tokens.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, -1, self.n_channels ) if self.duration_prediction: dur_counter = dur_counter.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, self.n_channels ) if attn 
is not None: attn = attn.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, attn.size(1), -1 ) bsz = new_bsz else: batch_idxs = None # Set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. # After, the min values per row are the top candidate active hypos # Rewrite the operator since the element wise or is not supported in torchscript. eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[: eos_mask.size(1)], ) # get the top beam_size active hypotheses, which are just # the hypos with the smallest values in active_mask. # {active_hypos} indicates which {beam_size} hypotheses # from the list of {2 * beam_size} candidates were # selected. Shapes: (batch size, beam size) new_cands_to_ignore, active_hypos = torch.topk( active_mask, k=beam_size, dim=1, largest=False ) # update cands_to_ignore to ignore any finalized hypos. cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] # Make sure there is at least one active item for each sentence in the batch. assert (~cands_to_ignore).any(dim=1).all() # update cands_to_ignore to ignore any finalized hypos # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam # can be selected more than once). 
active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) active_bbsz_idx = active_bbsz_idx.view(-1) # active_scores = torch.stack([ # torch.gather(cand_scores[...,0], dim=1, index=active_hypos) # for i in range(self.n_channels) # ], dim = -1) # active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses # Set the tokens for each beam (can select the same row more than once) tokens[:, : step + 1] = torch.index_select( tokens[:, : step + 1], dim=0, index=active_bbsz_idx ) # Select the next token for each of them for i in range(self.n_channels): tokens.view(bsz, beam_size, -1, self.n_channels)[ :, :, step + 1, i ] = torch.gather(cand_indices[..., i], dim=1, index=active_hypos) if step > 0: scores[:, :step] = torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx ) for i in range(self.n_channels): scores.view(bsz, beam_size, -1, self.n_channels)[ :, :, step, i ] = torch.gather(cand_scores[..., i], dim=1, index=active_hypos) if self.duration_prediction: dur_counter = torch.index_select( dur_counter, dim=0, index=active_bbsz_idx ) # Update constraints based on which candidates were selected for the next beam self.search.update_constraints(active_hypos) # copy attention for active hypotheses if attn is not None: attn[:, :, : step + 2] = torch.index_select( attn[:, :, : step + 2], dim=0, index=active_bbsz_idx ) # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): scores = torch.tensor( [float(elem["score"].item()) for elem in finalized[sent]] ) _, sorted_scores_indices = torch.sort(scores, descending=True) finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] finalized[sent] = torch.jit.annotate( List[Dict[str, Tensor]], finalized[sent] ) return finalized def _prefix_tokens( self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int ): """Handle prefix tokens""" prefix_toks = prefix_tokens[:, 
def finalize_hypos(
    self,
    step: int,
    bbsz_idx,
    eos_scores,
    tokens,
    scores,
    finalized: List[List[Dict[str, Tensor]]],
    finished: List[bool],
    beam_size: int,
    attn: Optional[Tensor],
    src_lengths,
    max_len: int,
):
    """Store the hypotheses that just ended and mark completed sentences.

    Every row selected by ``bbsz_idx`` is turned into a hypothesis dict
    (``tokens``, ``score``, ``attention``, ``alignment``,
    ``positional_scores``) and appended to ``finalized`` for its sentence,
    as long as that sentence holds fewer than ``beam_size`` hypotheses.
    ``finished`` is updated in place, and ``eos_scores`` is length-normalized
    in place when ``self.normalize_scores`` is set.

    Args:
        step: current decoding step.
        bbsz_idx (Tensor): flat (sentence * beam) row indices of the
            hypotheses that ended at this step.
        eos_scores (Tensor): per-channel scores of those hypotheses, one
            row per entry of ``bbsz_idx``.

    Returns:
        The indices, in the current (possibly reduced) batch, of the
        sentences that became finished during this call.
    """
    assert bbsz_idx.numel() == eos_scores.size(0)

    # index_select copies the ended rows; keep positions 1..step+1 (the
    # first position is the initial EOS/BOS) and close them with EOS.
    ended_tokens = tokens.index_select(0, bbsz_idx)[:, 1 : step + 2]
    ended_tokens[:, step] = self.eos

    ended_attn = None
    if attn is not None:
        ended_attn = attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2]

    # cumulative scores up to this step, then differenced into per-position scores
    positional = scores.index_select(0, bbsz_idx)[:, : step + 1]
    positional[:, step, :] = eos_scores
    positional[:, 1:] = positional[:, 1:] - positional[:, :-1]

    # sentence-level length normalization
    if self.normalize_scores:
        eos_scores /= (step + 1) ** self.len_penalty

    # For each still-unfinished sentence, the number of finished sentences
    # that precede it: adding this to an index in the reduced batch gives
    # the index in the original batch.
    cum_unfin: List[int] = []
    finished_so_far = 0
    for done in finished:
        if done:
            finished_so_far += 1
            continue
        cum_unfin.append(finished_so_far)

    # Keys are "{sent}_{unfin_idx}" strings: TorchScript supports neither
    # set() nor (int, int) dictionary keys.
    sents_seen: Dict[str, Optional[Tensor]] = {}

    for i, idx in enumerate(bbsz_idx):
        score = eos_scores[i].sum()
        # sentence index in the current (possibly reduced) batch
        unfin_idx = idx // beam_size
        # sentence index in the original (unreduced) batch
        sent = unfin_idx + cum_unfin[unfin_idx]
        sents_seen.setdefault(str(sent.item()) + "_" + str(unfin_idx.item()), None)

        if self.match_source_len and step > src_lengths[unfin_idx]:
            score = torch.tensor(-math.inf).to(score)

        # a sentence keeps at most beam_size hypotheses
        if len(finalized[sent]) >= beam_size:
            continue

        hypo_attn = ended_attn[i] if ended_attn is not None else torch.empty(0)
        hypo = {
            "tokens": ended_tokens[i],
            "score": score,
            "attention": hypo_attn,  # src_len x tgt_len
            "alignment": torch.empty(0),
            "positional_scores": positional[i],
        }
        finalized[sent].append(hypo)

    newly_finished: List[int] = []
    for key in sents_seen:
        # check termination conditions for this sentence
        parts = key.split("_")
        orig_sent: int = int(float(parts[0]))
        reduced_sent: int = int(float(parts[1]))
        if finished[orig_sent]:
            continue
        if self.is_finished(
            step, reduced_sent, max_len, len(finalized[orig_sent]), beam_size
        ):
            finished[orig_sent] = True
            newly_finished.append(reduced_sent)

    return newly_finished
encoder_outs[i] # decode each model if self.has_incremental_states(): decoder_out = model.decoder.forward( tokens, encoder_out=encoder_out, incremental_state=incremental_states[i], ) else: decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out) attn: Optional[Tensor] = None decoder_len = len(decoder_out) if decoder_len > 1 and decoder_out[1] is not None: if isinstance(decoder_out[1], Tensor): attn = decoder_out[1] else: attn_holder = decoder_out[1]["attn"] if isinstance(attn_holder, Tensor): attn = attn_holder elif attn_holder is not None: attn = attn_holder[0] if attn is not None: attn = attn[:, -1, :] if self.is_speech_dlm: if self.is_duration_prediction: decoder_out_divided_by_temperature = { channel_src: { channel_pred: { "pred_token": decoder_out[0][channel_src][channel_pred][ "pred_token" ][:, -1:, :].div_(temperature[channel_pred]), "pred_duration": decoder_out[0][channel_src][ channel_pred ]["pred_duration"][:, -1:, :], } for channel_pred in decoder_out[0][channel_src] } for channel_src in decoder_out[0] } else: decoder_out_divided_by_temperature = { channel_src: { channel_pred: decoder_out[0][channel_src][channel_pred][ :, -1:, : ].div_(temperature[channel_pred]) for channel_pred in decoder_out[0][channel_src] } for channel_src in decoder_out[0] } else: decoder_out_divided_by_temperature = { channel: decoder_out[0][channel][:, -1:, :].div_( temperature[channel] ) for channel in decoder_out[0] } decoder_out_tuple = ( decoder_out_divided_by_temperature, None if decoder_len <= 1 else decoder_out[1], ) probs = model.get_normalized_probs( decoder_out_tuple, log_probs=True, sample=None ) if self.is_speech_dlm: if self.is_duration_prediction: probs = { channel: { "pred_token": probs[channel][channel]["pred_token"][ :, -1, : ], "pred_duration": probs[channel][channel]["pred_duration"][ :, -1, : ], } for channel in probs } else: probs = { channel: probs[channel][channel][:, -1, :] for channel in probs } else: probs = {channel: probs[channel][:, -1, :] 
@torch.jit.export
def reorder_encoder_out(
    self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order
):
    """
    Reorder every ensemble member's encoder output according to *new_order*.

    Args:
        encoder_outs: one encoder output per model in the ensemble, in the
            same order as ``self.models``
        new_order (LongTensor): desired order

    Returns:
        a list with each model's encoder output rearranged by that model's
        own ``encoder.reorder_encoder_out``; an empty list when the
        ensemble has no encoder
    """
    reordered: List[Dict[str, List[Tensor]]] = []
    if self.has_encoder():
        for idx, member in enumerate(self.models):
            # Narrows the Optional for TorchScript; also guards real misuse.
            assert encoder_outs is not None
            reordered.append(
                member.encoder.reorder_encoder_out(encoder_outs[idx], new_order)
            )
    return reordered
@dataclass
class SpeechDLMConfig(FairseqDataclass):
    """Hyper-parameters of the ``speech_dlm`` model.

    The fields declared with ``field(...)`` carry their own default and help
    text. The fields declared with ``II(...)`` at the bottom hold no value of
    their own: they are OmegaConf interpolations pointing at the named key of
    the task, common, or criterion config group.
    """

    # --- activation and dropout -------------------------------------------
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="relu", metadata={"help": "activation function to use"}
    )
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    attention_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability for attention weights"}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
    )
    # NOTE(review): same help text as activation_dropout; presumably a legacy
    # alias of it -- confirm against the decoder before relying on either.
    relu_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
    )

    # --- decoder geometry ---------------------------------------------------
    decoder_embed_dim: int = field(
        default=512, metadata={"help": "decoder embedding dimension"}
    )
    decoder_output_dim: int = field(
        default=512, metadata={"help": "decoder output dimension"}
    )
    decoder_input_dim: int = field(
        default=512, metadata={"help": "decoder input dimension"}
    )
    decoder_ffn_embed_dim: int = field(
        default=2048, metadata={"help": "decoder embedding dimension for FFN"}
    )
    decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"})
    # A negative value is replaced by decoder_layers in SpeechDLM.build_model.
    decoder_cross_layers: int = field(
        default=-1, metadata={"help": "num self cross attention decoder layers"}
    )
    decoder_attention_heads: int = field(
        default=8, metadata={"help": "num decoder attention heads"}
    )
    # base_lm_architecture() in this file sets this to True unconditionally,
    # so the default below does not survive model construction.
    decoder_normalize_before: bool = field(
        default=False, metadata={"help": "apply layernorm before each decoder block"}
    )
    no_decoder_final_norm: bool = field(
        default=False,
        metadata={"help": "don't add an extra layernorm after the last decoder block"},
    )
    no_token_positional_embeddings: bool = field(
        default=False,
        metadata={
            "help": "if set, disables positional embeddings (outside self attention)"
        },
    )
    share_decoder_input_output_embed: bool = field(
        default=False, metadata={"help": "share decoder input and output embeddings"}
    )
    decoder_learned_pos: bool = field(
        default=False,
        metadata={"help": "use learned positional embeddings in the decoder"},
    )

    # --- layer dropping / pruning -------------------------------------------
    decoder_layerdrop: float = field(
        default=0.0, metadata={"help": "LayerDrop probability for decoder"}
    )
    # When set, SpeechDLM.build_model overrides decoder_layers with the number
    # of comma-separated entries in this string.
    decoder_layers_to_keep: Optional[str] = field(
        default=None,
        metadata={
            "help": "which layers to *keep* when pruning as a comma-separated list"
        },
    )

    # --- embedding options --------------------------------------------------
    layernorm_embedding: bool = field(
        default=False, metadata={"help": "add layernorm to embedding"}
    )
    no_scale_embedding: bool = field(
        default=False, metadata={"help": "if True, dont scale embeddings"}
    )

    # --- activation checkpointing -------------------------------------------
    checkpoint_activations: bool = field(
        default=False, metadata={"help": "checkpoint activations at each layer"}
    )
    # base_lm_architecture() turns checkpoint_activations on whenever this is set.
    offload_activations: bool = field(
        default=False,
        metadata={"help": "move checkpointed activations to CPU after they are used."},
    )

    # --- quantization noise -------------------------------------------------
    quant_noise_pq: float = field(
        default=0.0,
        metadata={"help": "iterative PQ quantization noise at training time"},
    )
    quant_noise_pq_block_size: int = field(
        default=8,
        metadata={"help": "block size of quantization noise at training time"},
    )
    # TODO common var add to parent
    quant_noise_scalar: float = field(
        default=0.0,
        metadata={
            "help": "scalar quantization noise and scalar quantization at training time"
        },
    )

    # --- values interpolated from other config groups -----------------------
    add_bos_token: bool = II("task.add_bos_token")
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("common.tpu")
    duration_prediction: str = II("task.duration_prediction")
    delayed_duration_target: str = II("task.delayed_duration_target")
    main_and_cross_weights: str = II("criterion.main_and_cross_weights")
@classmethod
def build_model(cls, args, task):
    """Build a new model instance.

    Fills in architecture defaults on *args*, derives the layer counts and
    ``max_target_positions`` when they are not given explicitly, checks that
    every channel shares the task-level source/target dictionaries, and
    wraps a :class:`CrossChannelTransformerDecoder` in the model class.
    """

    # make sure all arguments are present in older models
    base_lm_architecture(args)

    kept_layers = args.decoder_layers_to_keep
    if kept_layers:
        # One decoder layer per entry of the comma-separated list.
        args.decoder_layers = len(kept_layers.split(","))

    if args.decoder_cross_layers < 0:
        # Negative means "same as the number of decoder layers".
        args.decoder_cross_layers = args.decoder_layers

    if getattr(args, "max_target_positions", None) is None:
        args.max_target_positions = getattr(
            args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
        )

    # All channels must use one shared dictionary on each side.
    for channel in task.channels:
        assert (
            task.source_dictionaries[channel] == task.source_dictionary
        ), "Source dictionaries of all channels are expected to be the same!!!"
    for channel in task.channels:
        assert (
            task.target_dictionaries[channel] == task.target_dictionary
        ), "Target dictionaries of all channels are expected to be the same!!!"

    # Build the unit embeddings
    unit_embedding = cls.build_embedding(
        args, task.source_dictionary, args.decoder_input_dim
    )

    cross_channel_decoder = CrossChannelTransformerDecoder(
        args,
        task.target_dictionary,
        unit_embedding,
        channels=task.channels,
        no_encoder_attn=True,
    )
    return cls(cross_channel_decoder)
def base_lm_architecture(args):
    """Fill in SpeechDLM architecture defaults on *args*, in place.

    Attributes already present on *args* are kept; missing ones receive the
    default listed below. Two settings are not plain defaults:
    ``decoder_normalize_before`` is always forced to ``True``, and
    ``checkpoint_activations`` is switched on whenever
    ``offload_activations`` is set.
    """

    def _fill(attr, fallback):
        # Keep an existing value, otherwise install the fallback.
        setattr(args, attr, getattr(args, attr, fallback))

    # backward compatibility for older model checkpoints
    if hasattr(args, "decoder_final_norm"):
        args.no_decoder_final_norm = not args.decoder_final_norm

    leading_defaults = (
        ("dropout", 0.1),
        ("attention_dropout", 0.0),
        ("decoder_embed_dim", 512),
        ("decoder_ffn_embed_dim", 2048),
        ("decoder_layers", 6),
        ("decoder_cross_layers", 6),
        ("decoder_attention_heads", 8),
        ("decoder_learned_pos", False),
        ("activation_fn", "relu"),
        ("decoder_layerdrop", 0),
        ("decoder_layers_to_keep", None),
        ("quant_noise_pq", 0),
        ("quant_noise_pq_block_size", 8),
        ("quant_noise_scalar", 0),
        ("add_bos_token", False),
        ("no_token_positional_embeddings", False),
        ("share_decoder_input_output_embed", False),
    )
    for attr, fallback in leading_defaults:
        _fill(attr, fallback)

    # Input/output widths fall back to the (now resolved) embedding width.
    _fill("decoder_output_dim", args.decoder_embed_dim)
    _fill("decoder_input_dim", args.decoder_embed_dim)

    # Model training is not stable without this
    args.decoder_normalize_before = True

    trailing_defaults = (
        ("no_decoder_final_norm", False),
        ("no_scale_embedding", False),
        ("layernorm_embedding", False),
        ("checkpoint_activations", False),
        ("offload_activations", False),
    )
    for attr, fallback in trailing_defaults:
        _fill(attr, fallback)

    # Offloading only makes sense for checkpointed activations.
    if args.offload_activations:
        args.checkpoint_activations = True
class CTCDecoder(FairseqEncoder):
    """Single linear projection from input features to dictionary logits.

    Despite the name it derives from :class:`FairseqEncoder`: it consumes
    features and returns them projected to ``len(dictionary)`` outputs under
    the ``"encoder_out"`` key.
    """

    def __init__(self, dictionary, in_dim):
        super().__init__(dictionary)
        num_symbols = len(dictionary)
        self.proj = nn.Linear(in_dim, num_symbols)

    def forward(self, src_tokens, src_lengths=None, **kwargs):
        """Project *src_tokens*; *src_lengths* and extra kwargs are ignored."""
        return {"encoder_out": self.proj(src_tokens)}
class StackedEmbedding(nn.Embedding):
    """Embedding table that can unpack one "stacked" index into several units.

    With ``num_stacked == 1`` this is a plain embedding lookup. Otherwise
    every index at or above ``offset`` is decomposed into ``num_stacked``
    base-``vocab_size`` digits, each digit is embedded, and the concatenated
    embeddings are projected back to ``embed_dim``. Indices below ``offset``
    are looked up unchanged in every slot.
    """

    def __init__(self, num_embeddings, embed_dim, padding_idx, num_stacked=1):
        super().__init__(num_embeddings, embed_dim, padding_idx)
        # follow transformer.Embedding
        nn.init.normal_(self.weight, mean=0, std=embed_dim**-0.5)
        nn.init.constant_(self.weight[padding_idx], 0)

        # skip <bos>, <pad>, <eos>, <unk>, specific to fairseq dictionary
        self.offset = 4
        self.vocab_size = num_embeddings - self.offset
        self.num_stacked = num_stacked

        if self.num_stacked > 1:
            self.project_in_dim = Linear(embed_dim * num_stacked, embed_dim, bias=False)

    def forward(self, input):
        """Embed *input*, unpacking stacked indices when ``num_stacked > 1``."""
        if self.num_stacked == 1:
            return super().forward(input)

        # expand input indices
        is_unit = input >= self.offset
        is_special = ~is_unit

        slots = []
        consumed = input.new_zeros(input.shape)
        place = 1  # vocab_size ** (digit position)
        for _ in range(self.num_stacked):
            modulus = place * self.vocab_size
            # Part of the index not yet accounted for, truncated to this digit.
            chunk = torch.remainder(input - self.offset - consumed, modulus)
            consumed += chunk
            digit = torch.floor_divide(chunk, place)
            slots.append((digit + self.offset) * is_unit + input * is_special)
            place = modulus

        # Most significant digit first.
        stacked = torch.stack(list(reversed(slots)), dim=2)
        looked_up = super().forward(stacked)
        flat = looked_up.view(input.size(0), input.size(1), -1)
        return self.project_in_dim(flat)
class AugTransformerUnitDecoder(AugTransformerDecoder):
    """Based on Transformer decoder, with support to decoding stacked units"""

    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
    ):
        super().__init__(
            args, dictionary, embed_tokens, no_encoder_attn, output_projection
        )
        frames = args.n_frames_per_step
        self.n_frames_per_step = frames
        if frames > 1:
            # Widens each decoder state so it can be split into `frames` states.
            self.out_proj_n_frames = Linear(
                self.output_embed_dim,
                self.output_embed_dim * frames,
                bias=False,
            )
        else:
            self.out_proj_n_frames = None

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        encoder_out_aug: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention, should be of size T x B x C
            encoder_out_aug (optional): second encoder output, forwarded
                unchanged to ``extract_features``
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).

        ``src_lengths`` and ``return_all_hiddens`` are accepted but not used
        by this method.

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """

        features, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            encoder_out_aug=encoder_out_aug,
            incremental_state=incremental_state,
            full_context_alignment=full_context_alignment,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
        )
        if features_only:
            return features, extra

        n_frames = self.n_frames_per_step
        batch, steps, width = features.size()
        if self.out_proj_n_frames is not None:
            features = self.out_proj_n_frames(features)
        # One output-layer call per frame within each decoding step.
        per_frame = features.view(batch, steps, n_frames, width)
        logits = self.output_layer(per_frame)
        logits = logits.view(batch, steps * n_frames, -1)

        # teacher-forcing mode in training
        if incremental_state is None and n_frames > 1:
            # remove extra frames after <eos>
            logits = logits[:, : -(n_frames - 1), :]
        return logits, extra

    def upgrade_state_dict_named(self, state_dict, name):
        """Move the stacked-input projection weight under ``embed_tokens``."""
        if self.n_frames_per_step <= 1:
            return

        renames = [
            (
                f"{name}.project_in_dim.weight",
                f"{name}.embed_tokens.project_in_dim.weight",
            )
        ]
        for old_key, new_key in renames:
            if old_key in state_dict and new_key not in state_dict:
                state_dict[new_key] = state_dict.pop(old_key)
class TransformerEncoderNoEmb(FairseqEncoder):
    """Transformer encoder without token embeddings."""

    def __init__(self, args):
        super().__init__(None)
        self.layers = nn.ModuleList(
            [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
        )
        self.layer_norm = (
            LayerNorm(args.encoder_embed_dim)
            if args.encoder_normalize_before
            else None
        )

    def forward(self, x, encoder_padding_mask, return_all_hiddens=False):
        """Run *x* through every layer and return a fairseq encoder-out dict.

        When *return_all_hiddens* is set, the output of each layer (before
        the optional final layer norm) is collected in ``encoder_states``.
        The padding mask is only reported when it masks at least one position.
        """
        per_layer_states = []
        for block in self.layers:
            x = block(x, encoder_padding_mask)
            if return_all_hiddens:
                per_layer_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        if encoder_padding_mask is not None and encoder_padding_mask.any():
            reported_mask = [encoder_padding_mask]
        else:
            reported_mask = []

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": reported_mask,  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": per_layer_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder the batch dimension of *encoder_out* by *new_order*.

        ``encoder_states`` is updated in place and the same list object is
        returned; the other entries are rebuilt as new lists.
        """
        reordered_out = [
            t.index_select(1, new_order) for t in encoder_out["encoder_out"]
        ]
        reordered_mask = [
            t.index_select(0, new_order) for t in encoder_out["encoder_padding_mask"]
        ]
        reordered_embedding = [
            t.index_select(0, new_order) for t in encoder_out["encoder_embedding"]
        ]

        states = encoder_out["encoder_states"]
        for pos, hidden in enumerate(states):
            states[pos] = hidden.index_select(1, new_order)

        return {
            "encoder_out": reordered_out,  # T x B x C
            "encoder_padding_mask": reordered_mask,  # B x T
            "encoder_embedding": reordered_embedding,  # B x T x C
            "encoder_states": states,  # List[T x B x C]
            "src_tokens": [],  # B x T
            "src_lengths": [],  # B x 1
        }
@register_model("s2ut_conformer")
class S2UTConformerModel(S2UTTransformerModel):
    """
    Direct speech-to-speech translation model with Conformer encoder + Transformer discrete unit decoder
    """

    @staticmethod
    def add_args(parser):
        """Register the parent model's options plus the Conformer-specific ones."""
        S2UTTransformerModel.add_args(parser)

        conformer_options = (
            (
                "--depthwise-conv-kernel-size",
                {
                    "type": int,
                    "metavar": "N",
                    "help": "kernel size of depthwise convolution layers",
                },
            ),
            (
                "--attn-type",
                {
                    "type": str,
                    "metavar": "STR",
                    "help": "If not specified uses fairseq MHA. Other valid option is espnet for using conformer",
                },
            ),
            (
                "--pos-enc-type",
                {
                    "type": str,
                    "metavar": "STR",
                    "help": "Must be specified in addition to attn-type=espnet for rel_pos and rope",
                },
            ),
        )
        for flag, spec in conformer_options:
            parser.add_argument(flag, **spec)

    @classmethod
    def build_encoder(cls, args):
        """Build the Conformer encoder via ``build_s2s_conformer_encoder``."""
        return build_s2s_conformer_encoder(args)
@register_model_architecture("s2ut_conformer", "s2ut_conformer")
def s2ut_conformer_architecture_base(args):
    """Fill in Conformer encoder defaults on *args*, then the S2UT base defaults.

    Values already present on *args* are left as they are.
    """
    conformer_defaults = (
        ("attn_type", None),
        ("pos_enc_type", "abs"),
        ("input_feat_per_channel", 80),
        ("input_channels", 1),
        ("max_source_positions", 6000),
        ("encoder_embed_dim", 256),
        ("encoder_ffn_embed_dim", 2048),
        ("encoder_attention_heads", 4),
        ("dropout", 0.1),
        ("encoder_layers", 16),
        ("depthwise_conv_kernel_size", 31),
    )
    for attr, fallback in conformer_defaults:
        setattr(args, attr, getattr(args, attr, fallback))

    # Everything not Conformer-specific comes from the shared S2UT base.
    s2ut_architecture_base(args)
@register_model("s2spect2_conformer")
class S2SpecT2ConformerModel(S2SpecTConformerModel):
    """
    Direct speech-to-speech translation model with Conformer encoder + MT Transformer decoder + TTS Transformer decoder

    ``forward`` runs three stages in order: the speech encoder, a first-pass
    text ("MT") decoder attending to the encoder output, and a TTS decoder
    attending to the MT decoder's last hidden states (optionally re-encoded
    by ``synthesizer_encoder``).
    """

    @staticmethod
    def add_args(parser):
        """Add the two-pass options on top of the parent model's options."""
        S2SpecTConformerModel.add_args(parser)
        parser.add_argument(
            "--translation-decoder-layers",
            type=int,
            default=4,
            metavar="N",
            help="num decoder layers in the first-pass translation module",
        )
        parser.add_argument(
            "--synthesizer",
            default="transformer",
            choices=["transformer"],
            help="",
        )
        parser.add_argument(
            "--synthesizer-encoder-layers",
            type=int,
            default=0,
            metavar="N",
            help="num encoder layers in the second-pass synthesizer module",
        )

    @classmethod
    def build_multitask_decoder(
        cls,
        args,
        tgt_dict,
        in_dim,
        is_mt_decoder,
        decoder_layers,
        decoder_embed_dim,
        decoder_attention_heads,
    ):
        """Build one auxiliary decoder from a multitask config.

        ``args.decoder_type`` selects a Transformer text decoder or a CTC
        projection; any other value raises ``NotImplementedError``. Note that
        ``args.decoder_args`` is modified in place (``encoder_embed_dim`` is
        set to *in_dim*, and the architecture helpers write to it as well).
        """
        decoder_args = args.decoder_args
        decoder_args.encoder_embed_dim = in_dim
        if args.decoder_type == "transformer":
            # The first-pass (MT) decoder takes its size from the caller; other
            # text decoders use the base multitask architecture.
            if is_mt_decoder:
                multitask_text_transformer_decoder_arch(
                    decoder_args,
                    decoder_layers,
                    decoder_embed_dim,
                    decoder_attention_heads,
                )  # 4L
            else:
                base_multitask_text_transformer_decoder_arch(decoder_args)  # 2L

            task_decoder = TransformerDecoder(
                decoder_args,
                tgt_dict,
                embed_tokens=TransformerModelBase.build_embedding(
                    decoder_args,
                    tgt_dict,
                    decoder_args.decoder_embed_dim,
                ),
            )
        elif args.decoder_type == "ctc":
            task_decoder = CTCDecoder(
                dictionary=tgt_dict,
                in_dim=in_dim,
            )
        else:
            raise NotImplementedError(
                "currently only support multitask decoder_type 'transformer', 'ctc'"
            )

        return task_decoder

    @classmethod
    def build_decoder(cls, args):
        """Build the TTS decoder on a copy of *args*.

        The copy's ``encoder_embed_dim`` is set to ``args.decoder_embed_dim``
        because this decoder attends to the MT decoder's states rather than
        to the speech encoder (see ``forward``).
        """
        _args = copy.deepcopy(args)
        _args.encoder_embed_dim = args.decoder_embed_dim

        if args.synthesizer == "transformer":
            return TTSTransformerDecoder(_args, None, padding_idx=1)
        else:
            raise NotImplementedError(args.synthesizer)

    @classmethod
    def build_model(cls, args, task):
        """Assemble encoder, TTS decoder and every multitask decoder.

        Each entry of ``task.multitask_tasks`` becomes an attribute
        ``<task_name>_decoder`` on the model and a wrapped entry in
        ``multitask_decoders``. At least one task must be flagged as the
        first-pass decoder, otherwise the final assertion fails.
        """
        encoder = cls.build_encoder(args)
        decoder = cls.build_decoder(args)
        base_model = cls(encoder, decoder)

        # set up multitask decoders
        base_model.mt_task_name = None
        base_model.multitask_decoders = {}
        has_first_pass_decoder = False
        for task_name, task_obj in task.multitask_tasks.items():
            if task_obj.is_first_pass_decoder:
                has_first_pass_decoder = True
                # forward() looks the MT decoder up under this name.
                base_model.mt_task_name = task_name

            # The auxiliary decoder reads either encoder or decoder states,
            # which fixes its input width.
            in_dim = (
                args.encoder_embed_dim
                if task_obj.args.input_from == "encoder"
                else args.decoder_embed_dim
            )
            task_decoder = cls.build_multitask_decoder(
                task_obj.args,
                task_obj.target_dictionary,
                in_dim,
                task_obj.is_first_pass_decoder,
                getattr(args, "translation_decoder_layers", 4),
                getattr(args, "decoder_embed_dim", 256),
                getattr(args, "decoder_attention_heads", 4),
            )

            # Stored as an attribute first, then wrapped for multitask_decoders.
            setattr(base_model, f"{task_name}_decoder", task_decoder)
            decoder_model_cls = (
                FairseqEncoderModel
                if task_obj.args.decoder_type == "ctc"
                else FairseqLanguageModel
            )
            base_model.multitask_decoders[task_name] = decoder_model_cls(
                getattr(base_model, f"{task_name}_decoder")
            )

        assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder"

        # set up encoder on top of the auxiliary MT decoder
        if getattr(args, "synthesizer_encoder_layers", 0) > 0:
            base_model.synthesizer_encoder = cls.build_text_encoder(args)
        else:
            base_model.synthesizer_encoder = None

        return base_model

    @classmethod
    def build_text_encoder(cls, args):
        """Build the optional encoder placed between MT decoder and TTS decoder.

        It is sized from the decoder settings of *args* (embedding width, FFN
        width, attention heads) and always normalizes before each block.
        """
        _args = copy.deepcopy(args)
        _args.encoder_layers = args.synthesizer_encoder_layers
        _args.encoder_embed_dim = args.decoder_embed_dim
        _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim
        _args.encoder_attention_heads = args.decoder_attention_heads
        _args.encoder_normalize_before = True
        return TransformerEncoderNoEmb(_args)

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        prev_output_tokens_mt,
        tgt_speaker=None,
        incremental_state=None,
        target_lengths=None,
        speaker=None,
        return_all_hiddens=False,
    ):
        """Run speech encoder, MT decoder and TTS decoder in sequence.

        Returns the TTS decoder's output. Its last element is a dict that
        additionally receives ``mt_decoder_out`` and, when
        *return_all_hiddens* is set, the speech encoder's ``encoder_states``
        and ``encoder_padding_mask``.
        """
        encoder_out = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            tgt_speaker=tgt_speaker,
            return_all_hiddens=return_all_hiddens,
        )

        # 1. MT decoder
        mt_decoder = getattr(self, f"{self.mt_task_name}_decoder")
        mt_decoder_out = mt_decoder(
            prev_output_tokens_mt,
            encoder_out=encoder_out,
        )
        # Last inner state of the MT decoder, normalized if the decoder has a
        # final layer norm.
        x = mt_decoder_out[1]["inner_states"][-1]
        if mt_decoder.layer_norm is not None:
            x = mt_decoder.layer_norm(x)

        # Only build a mask when the MT input actually contains padding.
        mt_decoder_padding_mask = None
        if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any():
            mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)

        # 2. TTS encoder
        if self.synthesizer_encoder is not None:
            tts_encoder_out = self.synthesizer_encoder(
                x,
                mt_decoder_padding_mask,
                return_all_hiddens=return_all_hiddens,
            )
        else:
            # No extra encoder: hand the MT states to the TTS decoder directly.
            # The mask entry may be None here.
            tts_encoder_out = {
                "encoder_out": [x],  # T x B x C
                "encoder_padding_mask": [mt_decoder_padding_mask],  # B x T
            }

        # 3. TTS decoder
        decoder_out = self.decoder(
            prev_output_tokens,
            encoder_out=tts_encoder_out,
            incremental_state=incremental_state,
            target_lengths=target_lengths,
            speaker=speaker,
        )
        if return_all_hiddens:
            decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"]
            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ]
        decoder_out[-1]["mt_decoder_out"] = mt_decoder_out
        return decoder_out
@register_model_architecture(
    model_name="s2spect2_conformer", arch_name="s2spect_conformer_translatotron2"
)
def s2spect2_conformer_architecture_base_legacy(args):
    """Legacy architecture name; delegates to ``s2spect2_conformer_architecture_base``."""
    s2spect2_conformer_architecture_base(args)


================================================
FILE: fairseq/models/speech_to_speech/s2s_conformer_unity.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import copy
import logging

from fairseq.models import (
    FairseqEncoder,
    FairseqEncoderModel,
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
from fairseq.models.speech_to_speech.modules.stacked_embedding import StackedEmbedding
from fairseq.models.speech_to_speech.modules.transformer_decoder_aug import (
    AugTransformerUnitDecoder,
)
from fairseq.models.speech_to_speech.modules.transformer_encoder import (
    TransformerEncoderNoEmb,
)
from fairseq.models.speech_to_speech.s2s_conformer import S2UTConformerModel
from fairseq.models.speech_to_speech.s2s_transformer import (
    TransformerUnitDecoder,
    base_multitask_text_transformer_decoder_arch,
    s2ut_architecture_base,
)
from fairseq.models.transformer import TransformerDecoder, TransformerModelBase

logger = logging.getLogger(__name__)


def multitask_text_transformer_decoder_arch(
    args, decoder_layers, decoder_embed_dim=256, decoder_attention_heads=4
):
    """Configure ``args`` for a multitask text decoder of an explicit size.

    The three size attributes are overwritten unconditionally (any value
    already on ``args`` is replaced), then every remaining attribute is
    filled in by ``base_multitask_text_transformer_decoder_arch``.
    """
    args.decoder_layers = decoder_layers
    args.decoder_embed_dim = decoder_embed_dim
    args.decoder_attention_heads = decoder_attention_heads
    base_multitask_text_transformer_decoder_arch(args)


@register_model("unity_conformer")
class UnityConformerModel(S2UTConformerModel):
    """
    Direct speech-to-speech translation model with Conformer encoder + MT Transformer decoder + Transformer discrete unit decoder
    """

    @staticmethod
    def add_args(parser):
        """Register the ``S2UTConformerModel`` options plus the options added by this model."""
        S2UTConformerModel.add_args(parser)
        parser.add_argument(
            "--translation-decoder-layers",
            type=int,
            default=4,
            metavar="N",
            help="num decoder layers in the first-pass translation module",
        )
        parser.add_argument(
            "--synthesizer",
            default="transformer",
            choices=["transformer"],
            help="",
        )
        parser.add_argument(
            "--synthesizer-encoder-layers",
            type=int,
            default=0,
            metavar="N",
            help="num encoder layers in the second-pass synthesizer module",
        )
        parser.add_argument(
            "--synthesizer-augmented-cross-attention",
            action="store_true",
            default=False,
            help="augmented cross-attention over speech encoder output",
        )

    @classmethod
    def build_multitask_decoder(
        cls,
        args,
        tgt_dict,
        in_dim,
        is_first_pass_decoder,
        decoder_layers,
        decoder_embed_dim,
        decoder_attention_heads,
    ):
        """Build one auxiliary decoder, selected by ``args.decoder_type``.

        ``"transformer"`` yields a ``TransformerDecoder``; ``"ctc"`` yields a
        ``CTCDecoder``. The three size arguments are only applied when
        ``is_first_pass_decoder`` is true; other transformer decoders take
        the defaults of ``base_multitask_text_transformer_decoder_arch``.

        Note that ``args.decoder_args`` is modified in place (no copy is made).

        Raises:
            NotImplementedError: for any other ``args.decoder_type``.
        """
        decoder_args = args.decoder_args
        decoder_args.encoder_embed_dim = in_dim
        if args.decoder_type == "transformer":
            if is_first_pass_decoder:
                multitask_text_transformer_decoder_arch(
                    decoder_args,
                    decoder_layers,
                    decoder_embed_dim,
                    decoder_attention_heads,
                )  # 4L
            else:
                base_multitask_text_transformer_decoder_arch(decoder_args)  # 2L

            task_decoder = TransformerDecoder(
                decoder_args,
                tgt_dict,
                embed_tokens=TransformerModelBase.build_embedding(
                    decoder_args,
                    tgt_dict,
                    decoder_args.decoder_embed_dim,
                ),
            )
        elif args.decoder_type == "ctc":
            task_decoder = CTCDecoder(
                dictionary=tgt_dict,
                in_dim=in_dim,
            )
        else:
            raise NotImplementedError(
                "currently only support multitask decoder_type 'transformer', 'ctc'"
            )

        return task_decoder

    @classmethod
    def build_decoder(cls, args, tgt_dict, aug_attn=False):
        """Build the unit decoder over ``tgt_dict``.

        ``aug_attn`` selects ``AugTransformerUnitDecoder`` instead of
        ``TransformerUnitDecoder``. The decoder receives a deep copy of
        ``args`` whose ``encoder_embed_dim`` is set to ``decoder_embed_dim``;
        the caller's ``args`` is left untouched.
        """
        num_embeddings = len(tgt_dict)
        padding_idx = tgt_dict.pad()
        embed_tokens = StackedEmbedding(
            num_embeddings,
            args.decoder_embed_dim,
            padding_idx,
            num_stacked=args.n_frames_per_step,
        )

        _args = copy.deepcopy(args)
        _args.encoder_embed_dim = args.decoder_embed_dim

        decoder_cls = AugTransformerUnitDecoder if aug_attn else TransformerUnitDecoder
        return decoder_cls(
            _args,
            tgt_dict,
            embed_tokens,
        )

    @classmethod
    def build_model(cls, args, task):
        """Assemble encoder, unit decoder, multitask decoders and optional synthesizer encoder.

        Each entry of ``task.multitask_tasks`` gets a decoder stored on the
        model as ``<task_name>_decoder`` and wrapped in
        ``multitask_decoders[task_name]``. The name of the task flagged
        ``is_first_pass_decoder`` is recorded in ``mt_task_name``; if several
        are flagged, the last one iterated wins. An assertion fails when no
        task is flagged.
        """
        encoder = cls.build_encoder(args)
        decoder = cls.build_decoder(
            args,
            task.target_dictionary,
            aug_attn=getattr(args, "synthesizer_augmented_cross_attention", False),
        )
        base_model = cls(encoder, decoder)

        # read again in forward() to choose how the unit decoder is called
        base_model.t2u_augmented_cross_attn = getattr(
            args, "synthesizer_augmented_cross_attention", False
        )

        # set up multitask decoders
        base_model.mt_task_name = None
        base_model.multitask_decoders = {}
        has_first_pass_decoder = False
        for task_name, task_obj in task.multitask_tasks.items():
            if task_obj.is_first_pass_decoder:
                has_first_pass_decoder = True
                base_model.mt_task_name = task_name

            in_dim = (
                args.encoder_embed_dim
                if task_obj.args.input_from == "encoder"
                else args.decoder_embed_dim
            )
            task_decoder = cls.build_multitask_decoder(
                task_obj.args,
                task_obj.target_dictionary,
                in_dim,
                task_obj.is_first_pass_decoder,
                getattr(args, "translation_decoder_layers", 4),
                getattr(args, "decoder_embed_dim", 256),
                getattr(args, "decoder_attention_heads", 4),
            )

            setattr(base_model, f"{task_name}_decoder", task_decoder)
            decoder_model_cls = (
                FairseqEncoderModel
                if task_obj.args.decoder_type == "ctc"
                else FairseqLanguageModel
            )
            base_model.multitask_decoders[task_name] = decoder_model_cls(
                getattr(base_model, f"{task_name}_decoder")
            )

        assert has_first_pass_decoder, "set at least one intermediate non-CTC decoder"

        # set up encoder on top of the auxiliary MT decoder
        if getattr(args, "synthesizer_encoder_layers", 0) > 0:
            base_model.synthesizer_encoder = cls.build_text_encoder(args)
        else:
            base_model.synthesizer_encoder = None

        return base_model

    @classmethod
    def build_text_encoder(cls, args):
        """Build the synthesizer encoder (``TransformerEncoderNoEmb``).

        Works on a deep copy of ``args`` in which the encoder settings are
        taken from ``synthesizer_encoder_layers`` and the decoder's
        embed/FFN/heads values, with ``encoder_normalize_before`` forced on.
        """
        _args = copy.deepcopy(args)
        _args.encoder_layers = args.synthesizer_encoder_layers
        _args.encoder_embed_dim = args.decoder_embed_dim
        _args.encoder_ffn_embed_dim = args.decoder_ffn_embed_dim
        _args.encoder_attention_heads = args.decoder_attention_heads
        _args.encoder_normalize_before = True
        return TransformerEncoderNoEmb(_args)

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        prev_output_tokens_mt,
        tgt_speaker=None,
        return_all_hiddens=False,
    ):
        """Run encoder -> first-pass MT decoder -> (optional) synthesizer encoder -> unit decoder.

        Returns the unit decoder's output. Its last element always gains the
        key ``"mt_decoder_out"``; with ``return_all_hiddens`` it also gains
        ``"encoder_states"`` and ``"encoder_padding_mask"`` from the speech
        encoder.
        """
        mt_decoder = getattr(self, f"{self.mt_task_name}_decoder")

        encoder_out = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            tgt_speaker=tgt_speaker,
            return_all_hiddens=return_all_hiddens,
        )

        # 1. MT decoder
        mt_decoder_out = mt_decoder(
            prev_output_tokens_mt,
            encoder_out=encoder_out,
        )
        # last entry of the MT decoder's inner states feeds the second pass
        x = mt_decoder_out[1]["inner_states"][-1]
        if mt_decoder.layer_norm is not None:
            x = mt_decoder.layer_norm(x)

        # the mask stays None when the MT input contains no padding token
        mt_decoder_padding_mask = None
        if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any():
            mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx)

        # 2. T2U encoder
        if self.synthesizer_encoder is not None:
            t2u_encoder_out = self.synthesizer_encoder(
                x,
                mt_decoder_padding_mask,
                return_all_hiddens=return_all_hiddens,
            )
        else:
            # no synthesizer encoder: wrap the MT states in an encoder-out style dict
            t2u_encoder_out = {
                "encoder_out": [x],  # T x B x C
                "encoder_padding_mask": [mt_decoder_padding_mask],  # B x T
            }

        # 3. T2U decoder
        if self.t2u_augmented_cross_attn:
            # speech encoder output is the main input, second-pass states the augmented one
            decoder_out = self.decoder(
                prev_output_tokens,
                encoder_out=encoder_out,
                encoder_out_aug=t2u_encoder_out,
            )
        else:
            decoder_out = self.decoder(
                prev_output_tokens,
                encoder_out=t2u_encoder_out,
            )
        if return_all_hiddens:
            decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"]
            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ]
        decoder_out[-1]["mt_decoder_out"] = mt_decoder_out
        return decoder_out


@register_model_architecture(model_name="unity_conformer", arch_name="unity_conformer")
def unity_conformer_architecture_base(args):
    """Fill in ``unity_conformer`` defaults missing from ``args``, then apply ``s2ut_architecture_base``."""
    args.conv_version = getattr(args, "conv_version", "convtransformer")
    args.attn_type = getattr(args, "attn_type", None)
    args.pos_enc_type = getattr(args, "pos_enc_type", "abs")
    args.max_source_positions = getattr(args, "max_source_positions", 6000)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
    args.dropout = getattr(args, "dropout", 0.1)
    args.encoder_layers = getattr(args, "encoder_layers", 16)
    args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31)
    s2ut_architecture_base(args)


# for old naming
@register_model_architecture(
    model_name="unity_conformer", arch_name="s2ut_conformer_translatotron2"
)
def unity_conformer_architecture_base_legacy(args):
    """Legacy architecture name; delegates to ``unity_conformer_architecture_base``."""
    unity_conformer_architecture_base(args)


================================================
FILE: fairseq/models/speech_to_speech/s2s_transformer.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

import torch
from torch import Tensor

from fairseq import checkpoint_utils, utils
from fairseq.models import (
    FairseqEncoderDecoderModel,
    FairseqEncoderModel,
    FairseqLanguageModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.speech_to_speech.modules.ctc_decoder import CTCDecoder
from fairseq.models.speech_to_speech.modules.stacked_embedding import StackedEmbedding
from fairseq.models.speech_to_text import S2TTransformerEncoder
from fairseq.models.text_to_speech import TTSTransformerDecoder
from fairseq.models.transformer import Linear, TransformerDecoder, TransformerModelBase

logger = logging.getLogger(__name__)


class S2STransformerEncoder(S2TTransformerEncoder):
    """Based on S2T transformer encoder, with support
    to incorporate target speaker embedding."""

    def __init__(self, args):
        super().__init__(args)

        # projection is only created when args.target_speaker_embed is truthy
        self.spk_emb_proj = None
        if args.target_speaker_embed:
            self.spk_emb_proj = Linear(
                args.encoder_embed_dim + args.speaker_embed_dim, args.encoder_embed_dim
            )

    def forward(
        self, src_tokens, src_lengths, tgt_speaker=None, return_all_hiddens=False
    ):
        """Encode the source and, if a speaker projection exists, mix in ``tgt_speaker``.

        When ``spk_emb_proj`` is set, ``tgt_speaker`` is dereferenced
        unconditionally, so it must not be ``None`` in that configuration.
        """
        out = super().forward(src_tokens, src_lengths, return_all_hiddens)

        if self.spk_emb_proj:
            x = out["encoder_out"][0]
            seq_len, bsz, _ = x.size()
            # repeat the per-utterance speaker vector along the time axis
            tgt_speaker_emb = tgt_speaker.view(1, bsz, -1).expand(seq_len, bsz, -1)
            # concatenate on the feature axis and project back with spk_emb_proj
            x = self.spk_emb_proj(torch.cat([x, tgt_speaker_emb], dim=2))
            out["encoder_out"][0] = x

        return out


class TransformerUnitDecoder(TransformerDecoder):
    """Based on Transformer decoder, with support to decoding stacked units"""

    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=None,
    ):
        super().__init__(
            args, dictionary, embed_tokens, no_encoder_attn, output_projection
        )
        self.n_frames_per_step = args.n_frames_per_step

        # widens each step's features to n_frames_per_step vectors; None when a
        # step corresponds to a single frame
        self.out_proj_n_frames = (
            Linear(
                self.output_embed_dim,
                self.output_embed_dim * self.n_frames_per_step,
                bias=False,
            )
            if self.n_frames_per_step > 1
            else None
        )

    def forward(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        features_only: bool = False,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
        src_lengths: Optional[Any] = None,
        return_all_hiddens: bool = False,
    ):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (optional): output from the encoder, used for
                encoder-side attention, should be of size T x B x C
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
            features_only (bool, optional): only return features without
                applying output layer (default: False).
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).

        Returns:
            tuple:
                - the decoder's output of shape `(batch, tgt_len, vocab)`
                - a dictionary with any model-specific outputs
        """
        # NOTE: src_lengths and return_all_hiddens are accepted but not used
        # in this method body.

        x, extra = self.extract_features(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            full_context_alignment=full_context_alignment,
            alignment_layer=alignment_layer,
            alignment_heads=alignment_heads,
        )

        if not features_only:
            bsz, seq_len, d = x.size()
            if self.out_proj_n_frames:
                x = self.out_proj_n_frames(x)
            # one d-sized vector per stacked frame goes through the output
            # layer, then the frames are flattened back onto the time axis
            x = self.output_layer(x.view(bsz, seq_len, self.n_frames_per_step, d))
            x = x.view(bsz, seq_len * self.n_frames_per_step, -1)
            if (
                incremental_state is None and self.n_frames_per_step > 1
            ):  # teacher-forcing mode in training
                x = x[
                    :, : -(self.n_frames_per_step - 1), :
                ]  # remove extra frames after <eos>

        return x, extra

    def upgrade_state_dict_named(self, state_dict, name):
        """Move ``<name>.project_in_dim.weight`` under ``<name>.embed_tokens.`` in ``state_dict``.

        Only done when ``n_frames_per_step > 1``, and only if the old key
        exists and the new key does not. ``state_dict`` is edited in place.
        """
        if self.n_frames_per_step > 1:
            move_keys = [
                (
                    f"{name}.project_in_dim.weight",
                    f"{name}.embed_tokens.project_in_dim.weight",
                )
            ]
            for from_k, to_k in move_keys:
                if from_k in state_dict and to_k not in state_dict:
                    state_dict[to_k] = state_dict[from_k]
                    del state_dict[from_k]


class S2STransformerMultitaskModelBase(FairseqEncoderDecoderModel):
    """Shared construction logic for the S2ST models below: encoder, main decoder
    and one auxiliary decoder per entry of ``task.multitask_tasks``."""

    @classmethod
    def build_encoder(cls, args):
        """Build an ``S2STransformerEncoder``, optionally initialised from a checkpoint.

        If ``args.load_pretrained_encoder_from`` is set but the path does not
        exist, a warning is logged and the freshly built encoder is returned.
        """
        encoder = S2STransformerEncoder(args)
        pretraining_path = getattr(args, "load_pretrained_encoder_from", None)
        if pretraining_path is not None:
            if not Path(pretraining_path).exists():
                logger.warning(
                    f"skipped pretraining because {pretraining_path} does not exist"
                )
            else:
                encoder = checkpoint_utils.load_pretrained_component_from_model(
                    component=encoder, checkpoint=pretraining_path
                )
                logger.info(f"loaded pretrained encoder from: {pretraining_path}")
        return encoder

    @classmethod
    def build_multitask_decoder(cls, args, tgt_dict, in_dim):
        """Build one auxiliary decoder, selected by ``args.decoder_type``.

        ``args.decoder_args`` is modified in place (its ``encoder_embed_dim``
        is set to ``in_dim``).

        Raises:
            NotImplementedError: if the type is neither ``"transformer"`` nor ``"ctc"``.
        """
        decoder_args = args.decoder_args
        decoder_args.encoder_embed_dim = in_dim
        if args.decoder_type == "transformer":
            base_multitask_text_transformer_decoder_arch(decoder_args)
            task_decoder = TransformerDecoder(
                decoder_args,
                tgt_dict,
                embed_tokens=TransformerModelBase.build_embedding(
                    decoder_args,
                    tgt_dict,
                    decoder_args.decoder_embed_dim,
                ),
            )
        elif args.decoder_type == "ctc":
            task_decoder = CTCDecoder(
                dictionary=tgt_dict,
                in_dim=in_dim,
            )
        else:
            raise NotImplementedError(
                "currently only support multitask decoder_type 'transformer', 'ctc'"
            )

        return task_decoder

    @classmethod
    def build_model(cls, args, task):
        """Build the encoder/decoder model and attach its multitask decoders.

        ``build_decoder`` is called with the target dictionary only when
        ``task.args.target_is_code`` is truthy. Each multitask decoder is set
        on the model as ``<task_name>_decoder`` and also wrapped into
        ``multitask_decoders[task_name]``.
        """
        encoder = cls.build_encoder(args)
        decoder = (
            cls.build_decoder(args, task.target_dictionary)
            if task.args.target_is_code
            else cls.build_decoder(args)
        )
        base_model = cls(encoder, decoder)

        # set up multitask decoders
        base_model.multitask_decoders = {}
        for task_name, task_obj in task.multitask_tasks.items():
            in_dim = (
                args.encoder_embed_dim
                if task_obj.args.input_from == "encoder"
                else args.decoder_embed_dim
            )
            task_decoder = cls.build_multitask_decoder(
                task_obj.args, task_obj.target_dictionary, in_dim
            )

            setattr(base_model, f"{task_name}_decoder", task_decoder)
            decoder_model_cls = (
                FairseqEncoderModel
                if task_obj.args.decoder_type == "ctc"
                else FairseqLanguageModel
            )
            base_model.multitask_decoders[task_name] = decoder_model_cls(
                getattr(base_model, f"{task_name}_decoder")
            )

        return base_model

    def forward_encoder(self, src_tokens, src_lengths, speaker=None, **kwargs):
        """Run only the encoder; ``speaker`` is passed on as its ``tgt_speaker`` argument."""
        return self.encoder(
            src_tokens, src_lengths=src_lengths, tgt_speaker=speaker, **kwargs
        )


@register_model("s2ut_transformer")
class S2UTTransformerModel(S2STransformerMultitaskModelBase):
    """
    Direct speech-to-speech translation model with Transformer encoder + Transformer discrete unit decoder
    https://arxiv.org/abs/2107.05604
    """

    @staticmethod
    def add_args(parser):
        """Add the command-line options of this model to ``parser``."""
        # input
        parser.add_argument(
            "--conv-kernel-sizes",
            type=str,
            metavar="STR",
            help="kernel sizes of Conv1d (s2t_transformer) subsampling layers",
        )
        parser.add_argument(
            "--conv-channels",
            type=int,
            metavar="N",
            help="# of channels in Conv1d (s2t_transformer) subsampling layers",
        )
        parser.add_argument(
            "--conv-out-channels",
            type=int,
            metavar="N",
            help="# of channels in Conv2d (convtransformer) subsampling layers",
        )
        parser.add_argument(
            "--conv-version",
            type=str,
            default="s2t_transformer",
            choices=["s2t_transformer", "convtransformer"],
            help="version of frontend convolutional layers",
        )
        # Transformer
        parser.add_argument(
            "--activation-fn",
            type=str,
            default="relu",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )
        parser.add_argument(
            "--dropout", type=float, metavar="D", help="dropout probability"
        )
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--activation-dropout",
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN.",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--layernorm-embedding",
            action="store_true",
            help="add layernorm to embedding",
        )
        parser.add_argument(
            "--no-scale-embedding",
            action="store_true",
            help="if True, dont scale embeddings",
        )
        parser.add_argument(
            "--load-pretrained-encoder-from",
            type=str,
            metavar="STR",
            help="model to take encoder weights from (for initialization)",
        )
        parser.add_argument(
            "--encoder-freezing-updates",
            type=int,
            metavar="N",
            help="freeze encoder for first N updates",
        )
        # speaker
        parser.add_argument(
            "--speaker-embed-dim",
            type=int,
            metavar="N",
            help="speaker embedding dimension",
        )

    @classmethod
    def build_decoder(cls, args, tgt_dict):
        """Build a ``TransformerUnitDecoder`` with a ``StackedEmbedding`` over ``tgt_dict``."""
        num_embeddings = len(tgt_dict)
        padding_idx = tgt_dict.pad()
        embed_tokens = StackedEmbedding(
            num_embeddings,
            args.decoder_embed_dim,
            padding_idx,
            num_stacked=args.n_frames_per_step,
        )

        return TransformerUnitDecoder(
            args,
            tgt_dict,
            embed_tokens,
        )

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        tgt_speaker=None,
        return_all_hiddens=False,
    ):
        """Encode the source and decode units.

        With ``return_all_hiddens`` the encoder's ``"encoder_states"`` and
        ``"encoder_padding_mask"`` are copied into the last element of the
        decoder output.
        """
        encoder_out = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            tgt_speaker=tgt_speaker,
            return_all_hiddens=return_all_hiddens,
        )
        decoder_out = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
        )
        if return_all_hiddens:
            decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"]
            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ]
        return decoder_out


@register_model("s2spect_transformer")
class S2SpecTTransformerModel(S2STransformerMultitaskModelBase):
    """
    Speech-to-spectrogram model with S2T Transformer encoder + TTS Transformer decoder
    """

    @staticmethod
    def add_args(parser):
        """Add the command-line options of this model to ``parser``."""
        # input
        parser.add_argument(
            "--conv-kernel-sizes",
            type=str,
            metavar="STR",
            help="kernel sizes of Conv1d (s2t_transformer) subsampling layers",
        )
        parser.add_argument(
            "--conv-channels",
            type=int,
            metavar="N",
            help="# of channels in Conv1d (s2t_transformer) subsampling layers",
        )
        parser.add_argument(
            "--conv-version",
            type=str,
            default="s2t_transformer",
            choices=["s2t_transformer", "convtransformer"],
            help="version of frontend convolutional layers",
        )
        # Transformer
        parser.add_argument(
            "--activation-fn",
            type=str,
            default="relu",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )
        parser.add_argument(
            "--dropout", type=float, metavar="D", help="dropout probability"
        )
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--activation-dropout",
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN.",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--no-scale-embedding",
            action="store_true",
            help="if True, dont scale embeddings",
        )
        parser.add_argument(
            "--load-pretrained-encoder-from",
            type=str,
            metavar="STR",
            help="model to take encoder weights from (for initialization)",
        )
        parser.add_argument(
            "--encoder-freezing-updates",
            type=int,
            metavar="N",
            help="freeze encoder for first N updates",
        )
        # speaker
        parser.add_argument(
            "--speaker-embed-dim",
            type=int,
            metavar="N",
            help="speaker embedding dimension",
        )
        # decoder
        parser.add_argument("--output-frame-dim", type=int)
        # decoder prenet
        parser.add_argument("--prenet-dropout", type=float)
        parser.add_argument("--prenet-layers", type=int)
        parser.add_argument("--prenet-dim", type=int)
        # decoder postnet
        parser.add_argument("--postnet-dropout", type=float)
        parser.add_argument("--postnet-layers", type=int)
        parser.add_argument("--postnet-conv-dim", type=int)
        parser.add_argument("--postnet-conv-kernel-size", type=int)
        # decoder transformer layers
        parser.add_argument("--decoder-transformer-layers", type=int)
        parser.add_argument("--decoder-embed-dim", type=int)
        parser.add_argument("--decoder-ffn-embed-dim", type=int)
        parser.add_argument("--decoder-normalize-before", action="store_true")
        parser.add_argument("--decoder-attention-heads", type=int)

    @classmethod
    def build_decoder(cls, args):
        """Build the ``TTSTransformerDecoder``; it is given no dictionary and ``padding_idx=1``."""
        return TTSTransformerDecoder(args, None, padding_idx=1)

    def forward(
        self,
        src_tokens,
        src_lengths,
        prev_output_tokens,
        tgt_speaker=None,
        incremental_state=None,
        target_lengths=None,
        speaker=None,
        return_all_hiddens=False,
    ):
        """Encode the source and run the TTS decoder.

        ``tgt_speaker`` goes to the encoder while ``speaker`` goes to the
        decoder. With ``return_all_hiddens`` the encoder's
        ``"encoder_states"`` and ``"encoder_padding_mask"`` are copied into
        the last element of the decoder output.
        """
        encoder_out = self.encoder(
            src_tokens,
            src_lengths=src_lengths,
            tgt_speaker=tgt_speaker,
            return_all_hiddens=return_all_hiddens,
        )
        decoder_out = self.decoder(
            prev_output_tokens,
            encoder_out=encoder_out,
            incremental_state=incremental_state,
            target_lengths=target_lengths,
            speaker=speaker,
        )
        if return_all_hiddens:
            decoder_out[-1]["encoder_states"] = encoder_out["encoder_states"]
            decoder_out[-1]["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ]
        return decoder_out


def base_multitask_text_transformer_decoder_arch(args):
    """Fill in defaults for a multitask text decoder.

    Every attribute keeps its value if already present on ``args``;
    otherwise the default shown here is assigned. Several defaults are
    derived from attributes set earlier in this function (``dropout``,
    ``decoder_embed_dim``), so statement order matters.
    """
    args.dropout = getattr(args, "dropout", 0.3)
    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
    args.share_decoder_input_output_embed = getattr(
        args, "share_decoder_input_output_embed", True
    )
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256)
    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
    args.max_target_positions = getattr(args, "max_target_positions", 1024)
    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)

    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
    args.no_token_positional_embeddings = getattr(
        args, "no_token_positional_embeddings", False
    )

    args.decoder_layers = getattr(args, "decoder_layers", 2)

    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)

    # decoder layer
    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
    args.activation_fn = getattr(args, "activation_fn", "relu")
    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 2048)

    args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)


def base_s2st_transformer_encoder_architecture(args):
    """Fill in encoder-side defaults (subsampler, Transformer, dropout, speaker).

    Attributes already present on ``args`` are kept. ``attention_dropout``
    and ``activation_dropout`` default to the resolved ``dropout`` value.
    """
    args.encoder_freezing_updates = getattr(args, "encoder_freezing_updates", 0)

    # Convolutional subsampler
    args.input_channels = getattr(args, "input_channels", 1)
    args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5")  # for Conv1d
    args.conv_channels = getattr(args, "conv_channels", 1024)  # for Conv1d
    args.conv_out_channels = getattr(args, "conv_out_channels", 256)  # for Conv2d
    args.conv_version = getattr(args, "conv_version", "s2t_transformer")
    # Transformer
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
    args.encoder_layers = getattr(args, "encoder_layers", 12)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)

    args.dropout = getattr(args, "dropout", 0.1)
    args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
    args.activation_fn = getattr(args, "activation_fn", "relu")

    args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 256)
@register_model_architecture( model_name="s2ut_transformer", arch_name="s2ut_transformer" ) def s2ut_architecture_base(args): base_s2st_transformer_encoder_architecture(args) # decoder args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim) args.decoder_ffn_embed_dim = getattr( args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim ) args.decoder_layers = getattr(args, "decoder_layers", 6) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) args.share_decoder_input_output_embed = getattr( args, "share_decoder_input_output_embed", False ) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) args.adaptive_input = getattr(args, "adaptive_input", False) args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) @register_model_architecture("s2ut_transformer", "s2ut_transformer_fisher") def s2ut_architecture_fisher(args): args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) args.dropout = getattr(args, "dropout", 0.1) s2ut_architecture_base(args) @register_model_architecture( model_name="s2spect_transformer", arch_name="s2spect_transformer" ) def s2spect_architecture_base(args): base_s2st_transformer_encoder_architecture(args) # decoder args.output_frame_dim = getattr(args, "output_frame_dim", 80) # decoder prenet args.prenet_dropout = getattr(args, 
"prenet_dropout", 0.5) args.prenet_layers = getattr(args, "prenet_layers", 2) args.prenet_dim = getattr(args, "prenet_dim", 256) # decoder postnet args.postnet_dropout = getattr(args, "postnet_dropout", 0.5) args.postnet_layers = getattr(args, "postnet_layers", 5) args.postnet_conv_dim = getattr(args, "postnet_conv_dim", 512) args.postnet_conv_kernel_size = getattr(args, "postnet_conv_kernel_size", 5) # decoder transformer layers args.decoder_transformer_layers = getattr(args, "decoder_transformer_layers", 6) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512) args.decoder_ffn_embed_dim = getattr( args, "decoder_ffn_embed_dim", 4 * args.decoder_embed_dim ) args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) @register_model_architecture("s2spect_transformer", "s2spect_transformer_fisher") def s2spect_architecture_fisher(args): args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) args.dropout = getattr(args, "dropout", 0.1) # decoder args.prenet_dim = getattr(args, "prenet_dim", 32) s2spect_architecture_base(args) ================================================ FILE: fairseq/models/speech_to_text/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from .berard import * # noqa from .convtransformer import * # noqa from .multi_modality_model import * # noqa from .s2t_conformer import * # noqa from .s2t_transformer import * # noqa from .s2t_wav_transformer import * # noqa from .xm_transformer import * # noqa from .xm_transformer_unity import * # noqa ================================================ FILE: fairseq/models/speech_to_text/berard.py ================================================ #!/usr/bin/env python3 from ast import literal_eval from typing import List, Tuple import torch import torch.nn as nn import torch.nn.functional as F from fairseq import checkpoint_utils, utils from fairseq.data.data_utils import lengths_to_padding_mask from fairseq.models import ( FairseqEncoder, FairseqEncoderDecoderModel, FairseqIncrementalDecoder, register_model, register_model_architecture, ) @register_model("s2t_berard") class BerardModel(FairseqEncoderDecoderModel): """Implementation of a model similar to https://arxiv.org/abs/1802.04200 Paper title: End-to-End Automatic Speech Translation of Audiobooks An implementation is available in tensorflow at https://github.com/eske/seq2seq Relevant files in this implementation are the config (https://github.com/eske/seq2seq/blob/master/config/LibriSpeech/AST.yaml) and the model code (https://github.com/eske/seq2seq/blob/master/translate/models.py). The encoder and decoder try to be close to the original implementation. The attention is an MLP as in Bahdanau et al. (https://arxiv.org/abs/1409.0473). There is no state initialization by averaging the encoder outputs. """ def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @staticmethod def add_args(parser): parser.add_argument( "--input-layers", type=str, metavar="EXPR", help="List of linear layer dimensions. 
These " "layers are applied to the input features and " "are followed by tanh and possibly dropout.", ) parser.add_argument( "--dropout", type=float, metavar="D", help="Dropout probability to use in the encoder/decoder. " "Note that this parameters control dropout in various places, " "there is no fine-grained control for dropout for embeddings " "vs LSTM layers for example.", ) parser.add_argument( "--in-channels", type=int, metavar="N", help="Number of encoder input channels. " "Typically value is 1.", ) parser.add_argument( "--conv-layers", type=str, metavar="EXPR", help="List of conv layers " "(format: (channels, kernel, stride)).", ) parser.add_argument( "--num-blstm-layers", type=int, metavar="N", help="Number of encoder bi-LSTM layers.", ) parser.add_argument( "--lstm-size", type=int, metavar="N", help="LSTM hidden size." ) parser.add_argument( "--decoder-embed-dim", type=int, metavar="N", help="Embedding dimension of the decoder target tokens.", ) parser.add_argument( "--decoder-hidden-dim", type=int, metavar="N", help="Decoder LSTM hidden dimension.", ) parser.add_argument( "--decoder-num-layers", type=int, metavar="N", help="Number of decoder LSTM layers.", ) parser.add_argument( "--attention-dim", type=int, metavar="N", help="Hidden layer dimension in MLP attention.", ) parser.add_argument( "--output-layer-dim", type=int, metavar="N", help="Hidden layer dim for linear layer prior to output projection.", ) parser.add_argument( "--load-pretrained-encoder-from", type=str, metavar="STR", help="model to take encoder weights from (for initialization)", ) parser.add_argument( "--load-pretrained-decoder-from", type=str, metavar="STR", help="model to take decoder weights from (for initialization)", ) @classmethod def build_encoder(cls, args, task): encoder = BerardEncoder( input_layers=literal_eval(args.input_layers), conv_layers=literal_eval(args.conv_layers), in_channels=args.input_channels, input_feat_per_channel=args.input_feat_per_channel, 
num_blstm_layers=args.num_blstm_layers, lstm_size=args.lstm_size, dropout=args.dropout, ) if getattr(args, "load_pretrained_encoder_from", None) is not None: encoder = checkpoint_utils.load_pretrained_component_from_model( component=encoder, checkpoint=args.load_pretrained_encoder_from ) return encoder @classmethod def build_decoder(cls, args, task): decoder = LSTMDecoder( dictionary=task.target_dictionary, embed_dim=args.decoder_embed_dim, num_layers=args.decoder_num_layers, hidden_size=args.decoder_hidden_dim, dropout=args.dropout, encoder_output_dim=2 * args.lstm_size, # bidirectional attention_dim=args.attention_dim, output_layer_dim=args.output_layer_dim, ) if getattr(args, "load_pretrained_decoder_from", None) is not None: decoder = checkpoint_utils.load_pretrained_component_from_model( component=decoder, checkpoint=args.load_pretrained_decoder_from ) return decoder @classmethod def build_model(cls, args, task): """Build a new model instance.""" encoder = cls.build_encoder(args, task) decoder = cls.build_decoder(args, task) return cls(encoder, decoder) def get_normalized_probs(self, net_output, log_probs, sample=None): # net_output['encoder_out'] is a (B, T, D) tensor lprobs = super().get_normalized_probs(net_output, log_probs, sample) # lprobs is a (B, T, D) tensor lprobs.batch_first = True return lprobs class BerardEncoder(FairseqEncoder): def __init__( self, input_layers: List[int], conv_layers: List[Tuple[int]], in_channels: int, input_feat_per_channel: int, num_blstm_layers: int, lstm_size: int, dropout: float, ): """ Args: input_layers: list of linear layer dimensions. These layers are applied to the input features and are followed by tanh and possibly dropout. conv_layers: list of conv2d layer configurations. A configuration is a tuple (out_channels, conv_kernel_size, stride). in_channels: number of input channels. input_feat_per_channel: number of input features per channel. These are speech features, typically 40 or 80. 
num_blstm_layers: number of bidirectional LSTM layers. lstm_size: size of the LSTM hidden (and cell) size. dropout: dropout probability. Dropout can be applied after the linear layers and LSTM layers but not to the convolutional layers. """ super().__init__(None) self.input_layers = nn.ModuleList() in_features = input_feat_per_channel for out_features in input_layers: if dropout > 0: self.input_layers.append( nn.Sequential( nn.Linear(in_features, out_features), nn.Dropout(p=dropout) ) ) else: self.input_layers.append(nn.Linear(in_features, out_features)) in_features = out_features self.in_channels = in_channels self.input_dim = input_feat_per_channel self.conv_kernel_sizes_and_strides = [] self.conv_layers = nn.ModuleList() lstm_input_dim = input_layers[-1] for conv_layer in conv_layers: out_channels, conv_kernel_size, conv_stride = conv_layer self.conv_layers.append( nn.Conv2d( in_channels, out_channels, conv_kernel_size, stride=conv_stride, padding=conv_kernel_size // 2, ) ) self.conv_kernel_sizes_and_strides.append((conv_kernel_size, conv_stride)) in_channels = out_channels lstm_input_dim //= conv_stride lstm_input_dim *= conv_layers[-1][0] self.lstm_size = lstm_size self.num_blstm_layers = num_blstm_layers self.lstm = nn.LSTM( input_size=lstm_input_dim, hidden_size=lstm_size, num_layers=num_blstm_layers, dropout=dropout, bidirectional=True, ) self.output_dim = 2 * lstm_size # bidirectional if dropout > 0: self.dropout = nn.Dropout(p=dropout) else: self.dropout = None def forward(self, src_tokens, src_lengths=None, **kwargs): """ Args src_tokens: padded tensor (B, T, C * feat) src_lengths: tensor of original lengths of input utterances (B,) """ bsz, max_seq_len, _ = src_tokens.size() # (B, C, T, feat) x = ( src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) .transpose(1, 2) .contiguous() ) for input_layer in self.input_layers: x = input_layer(x) x = torch.tanh(x) for conv_layer in self.conv_layers: x = conv_layer(x) bsz, _, output_seq_len, _ = 
x.size() # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> # (T, B, C * feat) x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1) input_lengths = src_lengths.clone() for k, s in self.conv_kernel_sizes_and_strides: p = k // 2 input_lengths = (input_lengths.float() + 2 * p - k) / s + 1 input_lengths = input_lengths.floor().long() packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths) h0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_() c0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_() packed_outs, _ = self.lstm(packed_x, (h0, c0)) # unpack outputs and apply dropout x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_outs) if self.dropout is not None: x = self.dropout(x) encoder_padding_mask = ( lengths_to_padding_mask(output_lengths).to(src_tokens.device).t() ) return { "encoder_out": x, # (T, B, C) "encoder_padding_mask": encoder_padding_mask, # (T, B) } def reorder_encoder_out(self, encoder_out, new_order): encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select( 1, new_order ) encoder_out["encoder_padding_mask"] = encoder_out[ "encoder_padding_mask" ].index_select(1, new_order) return encoder_out class MLPAttention(nn.Module): """The original attention from Badhanau et al. (2014) https://arxiv.org/abs/1409.0473, based on a Multi-Layer Perceptron. 
The attention score between position i in the encoder and position j in the decoder is: alpha_ij = V_a * tanh(W_ae * enc_i + W_ad * dec_j + b_a) """ def __init__(self, decoder_hidden_state_dim, context_dim, attention_dim): super().__init__() self.context_dim = context_dim self.attention_dim = attention_dim # W_ae and b_a self.encoder_proj = nn.Linear(context_dim, self.attention_dim, bias=True) # W_ad self.decoder_proj = nn.Linear( decoder_hidden_state_dim, self.attention_dim, bias=False ) # V_a self.to_scores = nn.Linear(self.attention_dim, 1, bias=False) def forward(self, decoder_state, source_hids, encoder_padding_mask): """The expected input dimensions are: decoder_state: bsz x decoder_hidden_state_dim source_hids: src_len x bsz x context_dim encoder_padding_mask: src_len x bsz """ src_len, bsz, _ = source_hids.size() # (src_len*bsz) x context_dim (to feed through linear) flat_source_hids = source_hids.view(-1, self.context_dim) # (src_len*bsz) x attention_dim encoder_component = self.encoder_proj(flat_source_hids) # src_len x bsz x attention_dim encoder_component = encoder_component.view(src_len, bsz, self.attention_dim) # 1 x bsz x attention_dim decoder_component = self.decoder_proj(decoder_state).unsqueeze(0) # Sum with broadcasting and apply the non linearity # src_len x bsz x attention_dim hidden_att = torch.tanh( (decoder_component + encoder_component).view(-1, self.attention_dim) ) # Project onto the reals to get attentions scores (src_len x bsz) attn_scores = self.to_scores(hidden_att).view(src_len, bsz) # Mask + softmax (src_len x bsz) if encoder_padding_mask is not None: attn_scores = ( attn_scores.float() .masked_fill_(encoder_padding_mask, float("-inf")) .type_as(attn_scores) ) # FP16 support: cast to float and back # srclen x bsz normalized_masked_attn_scores = F.softmax(attn_scores, dim=0) # Sum weighted sources (bsz x context_dim) attn_weighted_context = ( source_hids * normalized_masked_attn_scores.unsqueeze(2) ).sum(dim=0) return 
attn_weighted_context, normalized_masked_attn_scores class LSTMDecoder(FairseqIncrementalDecoder): def __init__( self, dictionary, embed_dim, num_layers, hidden_size, dropout, encoder_output_dim, attention_dim, output_layer_dim, ): """ Args: dictionary: target text dictionary. embed_dim: embedding dimension for target tokens. num_layers: number of LSTM layers. hidden_size: hidden size for LSTM layers. dropout: dropout probability. Dropout can be applied to the embeddings, the LSTM layers, and the context vector. encoder_output_dim: encoder output dimension (hidden size of encoder LSTM). attention_dim: attention dimension for MLP attention. output_layer_dim: size of the linear layer prior to output projection. """ super().__init__(dictionary) self.num_layers = num_layers self.hidden_size = hidden_size num_embeddings = len(dictionary) padding_idx = dictionary.pad() self.embed_tokens = nn.Embedding(num_embeddings, embed_dim, padding_idx) if dropout > 0: self.dropout = nn.Dropout(p=dropout) else: self.dropout = None self.layers = nn.ModuleList() for layer_id in range(num_layers): input_size = embed_dim if layer_id == 0 else encoder_output_dim self.layers.append( nn.LSTMCell(input_size=input_size, hidden_size=hidden_size) ) self.context_dim = encoder_output_dim self.attention = MLPAttention( decoder_hidden_state_dim=hidden_size, context_dim=encoder_output_dim, attention_dim=attention_dim, ) self.deep_output_layer = nn.Linear( hidden_size + encoder_output_dim + embed_dim, output_layer_dim ) self.output_projection = nn.Linear(output_layer_dim, num_embeddings) def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, **kwargs ): encoder_padding_mask = encoder_out["encoder_padding_mask"] encoder_outs = encoder_out["encoder_out"] if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] bsz, seqlen = prev_output_tokens.size() srclen = encoder_outs.size(0) # embed tokens embeddings = self.embed_tokens(prev_output_tokens) x = 
embeddings if self.dropout is not None: x = self.dropout(x) # B x T x C -> T x B x C x = x.transpose(0, 1) # initialize previous states (or get from cache during incremental # generation) cached_state = utils.get_incremental_state( self, incremental_state, "cached_state" ) if cached_state is not None: prev_hiddens, prev_cells = cached_state else: prev_hiddens = [encoder_out["encoder_out"].mean(dim=0)] * self.num_layers prev_cells = [x.new_zeros(bsz, self.hidden_size)] * self.num_layers attn_scores = x.new_zeros(bsz, srclen) attention_outs = [] outs = [] for j in range(seqlen): input = x[j, :, :] attention_out = None for i, layer in enumerate(self.layers): # the previous state is one layer below except for the bottom # layer where the previous state is the state emitted by the # top layer hidden, cell = layer( input, ( prev_hiddens[(i - 1) % self.num_layers], prev_cells[(i - 1) % self.num_layers], ), ) if self.dropout is not None: hidden = self.dropout(hidden) prev_hiddens[i] = hidden prev_cells[i] = cell if attention_out is None: attention_out, attn_scores = self.attention( hidden, encoder_outs, encoder_padding_mask ) if self.dropout is not None: attention_out = self.dropout(attention_out) attention_outs.append(attention_out) input = attention_out # collect the output of the top layer outs.append(hidden) # cache previous states (no-op except during incremental generation) utils.set_incremental_state( self, incremental_state, "cached_state", (prev_hiddens, prev_cells) ) # collect outputs across time steps x = torch.cat(outs, dim=0).view(seqlen, bsz, self.hidden_size) attention_outs_concat = torch.cat(attention_outs, dim=0).view( seqlen, bsz, self.context_dim ) # T x B x C -> B x T x C x = x.transpose(0, 1) attention_outs_concat = attention_outs_concat.transpose(0, 1) # concat LSTM output, attention output and embedding # before output projection x = torch.cat((x, attention_outs_concat, embeddings), dim=2) x = self.deep_output_layer(x) x = torch.tanh(x) if 
self.dropout is not None: x = self.dropout(x) # project back to size of vocabulary x = self.output_projection(x) # to return the full attn_scores tensor, we need to fix the decoder # to account for subsampling input frames # return x, attn_scores return x, None def reorder_incremental_state(self, incremental_state, new_order): super().reorder_incremental_state(incremental_state, new_order) cached_state = utils.get_incremental_state( self, incremental_state, "cached_state" ) if cached_state is None: return def reorder_state(state): if isinstance(state, list): return [reorder_state(state_i) for state_i in state] return state.index_select(0, new_order) new_state = tuple(map(reorder_state, cached_state)) utils.set_incremental_state(self, incremental_state, "cached_state", new_state) @register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard") def berard(args): """The original version: "End-to-End Automatic Speech Translation of Audiobooks" (https://arxiv.org/abs/1802.04200) """ args.input_layers = getattr(args, "input_layers", "[256, 128]") args.conv_layers = getattr(args, "conv_layers", "[(16, 3, 2), (16, 3, 2)]") args.num_blstm_layers = getattr(args, "num_blstm_layers", 3) args.lstm_size = getattr(args, "lstm_size", 256) args.dropout = getattr(args, "dropout", 0.2) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 128) args.decoder_num_layers = getattr(args, "decoder_num_layers", 2) args.decoder_hidden_dim = getattr(args, "decoder_hidden_dim", 512) args.attention_dim = getattr(args, "attention_dim", 512) args.output_layer_dim = getattr(args, "output_layer_dim", 128) args.load_pretrained_encoder_from = getattr( args, "load_pretrained_encoder_from", None ) args.load_pretrained_decoder_from = getattr( args, "load_pretrained_decoder_from", None ) @register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_256_3_3") def berard_256_3_3(args): """Used in * "Harnessing Indirect Training Data for End-to-End Automatic Speech 
Translation: Tricks of the Trade" (https://arxiv.org/abs/1909.06515) * "CoVoST: A Diverse Multilingual Speech-To-Text Translation Corpus" (https://arxiv.org/pdf/2002.01320.pdf) * "Self-Supervised Representations Improve End-to-End Speech Translation" (https://arxiv.org/abs/2006.12124) """ args.decoder_num_layers = getattr(args, "decoder_num_layers", 3) berard(args) @register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_512_3_2") def berard_512_3_2(args): args.num_blstm_layers = getattr(args, "num_blstm_layers", 3) args.lstm_size = getattr(args, "lstm_size", 512) args.dropout = getattr(args, "dropout", 0.3) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) args.decoder_num_layers = getattr(args, "decoder_num_layers", 2) args.decoder_hidden_dim = getattr(args, "decoder_hidden_dim", 1024) args.attention_dim = getattr(args, "attention_dim", 512) args.output_layer_dim = getattr(args, "output_layer_dim", 256) berard(args) @register_model_architecture(model_name="s2t_berard", arch_name="s2t_berard_512_5_3") def berard_512_5_3(args): args.num_blstm_layers = getattr(args, "num_blstm_layers", 5) args.lstm_size = getattr(args, "lstm_size", 512) args.dropout = getattr(args, "dropout", 0.3) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) args.decoder_num_layers = getattr(args, "decoder_num_layers", 3) args.decoder_hidden_dim = getattr(args, "decoder_hidden_dim", 1024) args.attention_dim = getattr(args, "attention_dim", 512) args.output_layer_dim = getattr(args, "output_layer_dim", 256) berard(args) ================================================ FILE: fairseq/models/speech_to_text/convtransformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging
import math
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

from fairseq import checkpoint_utils, utils
from fairseq.data.data_utils import lengths_to_padding_mask
from fairseq.models import (
    FairseqEncoder,
    FairseqEncoderDecoderModel,
    register_model,
    register_model_architecture,
)
from fairseq.models.speech_to_text.modules.convolution import infer_conv_output_dim
from fairseq.models.transformer import Embedding, TransformerDecoder
from fairseq.modules import LayerNorm, PositionalEmbedding, TransformerEncoderLayer

logger = logging.getLogger(__name__)


@register_model("convtransformer")
class ConvTransformerModel(FairseqEncoderDecoderModel):
    """
    Transformer-based Speech translation model from ESPNet-ST
    https://arxiv.org/abs/2004.10234
    """

    def __init__(self, encoder, decoder):
        super().__init__(encoder, decoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--input-feat-per-channel",
            type=int,
            metavar="N",
            help="encoder input dimension per input channel",
        )
        parser.add_argument(
            "--activation-fn",
            choices=utils.get_available_activation_fns(),
            help="activation function to use",
        )
        parser.add_argument(
            "--dropout", type=float, metavar="D", help="dropout probability"
        )
        parser.add_argument(
            "--attention-dropout",
            type=float,
            metavar="D",
            help="dropout probability for attention weights",
        )
        parser.add_argument(
            "--activation-dropout",
            "--relu-dropout",
            type=float,
            metavar="D",
            help="dropout probability after activation in FFN.",
        )
        parser.add_argument(
            "--encoder-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension",
        )
        parser.add_argument(
            "--encoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="encoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--encoder-layers", type=int, metavar="N", help="num encoder layers"
        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
            help="num encoder attention heads",
        )
        parser.add_argument(
            "--encoder-normalize-before",
            action="store_true",
            help="apply layernorm before each encoder block",
        )
        parser.add_argument(
            "--decoder-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension",
        )
        parser.add_argument(
            "--decoder-ffn-embed-dim",
            type=int,
            metavar="N",
            help="decoder embedding dimension for FFN",
        )
        parser.add_argument(
            "--decoder-layers", type=int, metavar="N", help="num decoder layers"
        )
        parser.add_argument(
            "--decoder-attention-heads",
            type=int,
            metavar="N",
            help="num decoder attention heads",
        )
        parser.add_argument(
            "--decoder-normalize-before",
            action="store_true",
            help="apply layernorm before each decoder block",
        )
        parser.add_argument(
            "--decoder-output-dim",
            type=int,
            metavar="N",
            help="decoder output dimension (extra linear layer if different from decoder embed dim)",
        )
        parser.add_argument(
            "--share-decoder-input-output-embed",
            action="store_true",
            help="share decoder input and output embeddings",
        )
        parser.add_argument(
            "--layernorm-embedding",
            action="store_true",
            help="add layernorm to embedding",
        )
        parser.add_argument(
            "--no-scale-embedding",
            action="store_true",
            help="if True, dont scale embeddings",
        )
        parser.add_argument(
            "--load-pretrained-encoder-from",
            type=str,
            metavar="STR",
            help="model to take encoder weights from (for initialization)",
        )
        parser.add_argument(
            "--load-pretrained-decoder-from",
            type=str,
            metavar="STR",
            help="model to take decoder weights from (for initialization)",
        )
        parser.add_argument(
            "--conv-out-channels",
            type=int,
            metavar="INT",
            help="the number of output channels of conv layer",
        )

    @classmethod
    def build_encoder(cls, args):
        """Build a ``ConvTransformerEncoder``.

        When ``args.load_pretrained_encoder_from`` is set, the encoder weights
        are loaded from that checkpoint.
        """
        encoder = ConvTransformerEncoder(args)
        if getattr(args, "load_pretrained_encoder_from", None) is not None:
            encoder = checkpoint_utils.load_pretrained_component_from_model(
                component=encoder, checkpoint=args.load_pretrained_encoder_from
            )
        return encoder

    @classmethod
    def build_decoder(cls, args, task, embed_tokens):
        """Build a ``TransformerDecoderNoExtra`` over the task's target dictionary.

        When ``args.load_pretrained_decoder_from`` is set, the decoder weights
        are loaded from that checkpoint.
        """
        decoder = TransformerDecoderNoExtra(args, task.target_dictionary, embed_tokens)
        if getattr(args, "load_pretrained_decoder_from", None) is not None:
            decoder = checkpoint_utils.load_pretrained_component_from_model(
                component=decoder, checkpoint=args.load_pretrained_decoder_from
            )
        return decoder

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""

        # make sure all arguments are present in older models
        base_architecture(args)

        def build_embedding(dictionary, embed_dim):
            num_embeddings = len(dictionary)
            padding_idx = dictionary.pad()
            return Embedding(num_embeddings, embed_dim, padding_idx)

        decoder_embed_tokens = build_embedding(
            task.target_dictionary, args.decoder_embed_dim
        )
        encoder = cls.build_encoder(args)
        decoder = cls.build_decoder(args, task, decoder_embed_tokens)
        return cls(encoder, decoder)

    @staticmethod
    @torch.jit.unused
    def set_batch_first(lprobs):
        """Tag *lprobs* as batch-first (marked ``torch.jit.unused``)."""
        lprobs.batch_first = True

    def get_normalized_probs(
        self,
        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
        log_probs: bool,
        sample: Optional[Dict[str, Tensor]] = None,
    ):
        """Return normalized (log-)probabilities; tagged batch-first only in training mode."""
        # net_output['encoder_out'] is a (B, T, D) tensor
        lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
        if self.training:
            self.set_batch_first(lprobs)
        return lprobs

    def output_layout(self):
        """Return the layout string of the model output ("BTD")."""
        return "BTD"

    """
    The forward method inherited from the base class has a **kwargs argument in
    its input, which is not supported in torchscript. This method overrites the forward
    method definition without **kwargs.
    """

    def forward(self, src_tokens, src_lengths, prev_output_tokens):
        """Run the encoder, then the decoder on its output; return the decoder output."""
        encoder_out = self.encoder(src_tokens=src_tokens, src_lengths=src_lengths)
        decoder_out = self.decoder(
            prev_output_tokens=prev_output_tokens, encoder_out=encoder_out
        )
        return decoder_out


class ConvTransformerEncoder(FairseqEncoder):
    """Conv + Transformer encoder"""

    def __init__(self, args):
        """Construct an Encoder object.

        Two stride-2 3x3 convolutions (each followed by ReLU) subsample the
        input, a linear layer maps the flattened conv output to
        ``args.encoder_embed_dim``, and ``args.encoder_layers`` Transformer
        encoder layers follow.
        """
        super().__init__(None)

        self.dropout = args.dropout
        self.embed_scale = (
            1.0 if args.no_scale_embedding else math.sqrt(args.encoder_embed_dim)
        )
        self.padding_idx = 1
        self.in_channels = 1
        self.input_dim = args.input_feat_per_channel
        self.conv = torch.nn.Sequential(
            torch.nn.Conv2d(1, args.conv_out_channels, 3, stride=2, padding=3 // 2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(
                args.conv_out_channels,
                args.conv_out_channels,
                3,
                stride=2,
                padding=3 // 2,
            ),
            torch.nn.ReLU(),
        )
        transformer_input_dim = infer_conv_output_dim(
            self.in_channels, self.input_dim, args.conv_out_channels
        )
        self.out = torch.nn.Linear(transformer_input_dim, args.encoder_embed_dim)
        self.embed_positions = PositionalEmbedding(
            args.max_source_positions,
            args.encoder_embed_dim,
            self.padding_idx,
            learned=False,
        )

        self.transformer_layers = nn.ModuleList([])
        self.transformer_layers.extend(
            [TransformerEncoderLayer(args) for i in range(args.encoder_layers)]
        )
        # NOTE(review): ``self.layer_norm`` is created here but ``forward`` in
        # this class never applies it -- confirm whether that is intentional.
        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(args.encoder_embed_dim)
        else:
            self.layer_norm = None

    def pooling_ratio(self):
        """Return the constant 4 (two stride-2 convolutions are built in ``__init__``)."""
        return 4

    def forward(self, src_tokens, src_lengths):
        """Encode input sequence.

        Args:
            src_tokens: 3-D padded input; it is viewed as
                (bsz, max_seq_len, in_channels, input_dim) before the convolutions.
            src_lengths: tensor with one length per utterance.

        Returns:
            dict with "encoder_out" (a one-element list holding the encoder
            states, time-major), "encoder_padding_mask" (a one-element list, or
            an empty list when nothing is padded) and four keys that are always
            empty lists: "encoder_embedding", "encoder_states", "src_tokens",
            "src_lengths".
        """
        bsz, max_seq_len, _ = src_tokens.size()
        x = (
            src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
            .transpose(1, 2)
            .contiguous()
        )
        x = self.conv(x)
        bsz, _, output_seq_len, _ = x.size()
        # (B, C, T, feat) -> (T, B, C * feat)
        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)
        x = self.out(x)
        x = self.embed_scale * x

        # Estimate the subsampling factor from the padded lengths (rounded to
        # the nearest integer) and derive per-utterance output lengths from it,
        # capped at the actual output length.
        subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
        input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long()
        input_len_1 = x.size(0) * torch.ones([src_lengths.size(0)]).long().to(
            input_len_0.device
        )
        input_lengths = torch.min(input_len_0, input_len_1)

        encoder_padding_mask = lengths_to_padding_mask(input_lengths)

        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
        x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        for layer in self.transformer_layers:
            x = layer(x, encoder_padding_mask)

        # Report the mask only when at least one position is padded.
        if not encoder_padding_mask.any():
            maybe_encoder_padding_mask = None
        else:
            maybe_encoder_padding_mask = encoder_padding_mask

        return {
            "encoder_out": [x],
            "encoder_padding_mask": [maybe_encoder_padding_mask]
            if maybe_encoder_padding_mask is not None
            else [],
            "encoder_embedding": [],
            "encoder_states": [],
            "src_tokens": [],
            "src_lengths": [],
        }

    @torch.jit.export
    def reorder_encoder_out(self, encoder_out: Dict[str, List[Tensor]], new_order):
        """
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        """
        # "encoder_out" and "encoder_states" are reordered along dim 1; the
        # padding mask and the embedding along dim 0.
        new_encoder_out = [encoder_out["encoder_out"][0].index_select(1, new_order)]
        if len(encoder_out["encoder_padding_mask"]) == 0:
            new_encoder_padding_mask = []
        else:
            new_encoder_padding_mask = [
                (encoder_out["encoder_padding_mask"][0]).index_select(0, new_order)
            ]
        if len(encoder_out["encoder_embedding"]) == 0:
            new_encoder_embedding = []
        else:
            new_encoder_embedding = [
                (encoder_out["encoder_embedding"][0]).index_select(0, new_order)
            ]
        # The "encoder_states" list is updated in place.
        encoder_states = encoder_out["encoder_states"]
        if len(encoder_states) > 0:
            for idx, state in enumerate(encoder_states):
                encoder_states[idx] = state.index_select(1, new_order)

        return {
            "encoder_out": new_encoder_out,
            "encoder_padding_mask": new_encoder_padding_mask,
            "encoder_embedding": new_encoder_embedding,
            "encoder_states": encoder_states,
            "src_tokens": [],
            "src_lengths": [],
        }


class TransformerDecoderNoExtra(TransformerDecoder):
    """``TransformerDecoder`` whose ``extract_features`` returns ``None`` as its second value."""

    def extract_features(
        self,
        prev_output_tokens,
        encoder_out: Optional[Dict[str, List[Tensor]]],
        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
    ):
        """Return ``(features, None)``; the parent's second return value is discarded."""
        # call scriptable method from parent class
        x, _ = self.extract_features_scriptable(
            prev_output_tokens,
            encoder_out,
            incremental_state,
            full_context_alignment,
            alignment_layer,
            alignment_heads,
        )
        return x, None


@register_model_architecture(model_name="convtransformer", arch_name="convtransformer")
def base_architecture(args):
    """Set every ``convtransformer`` hyper-parameter that is missing on *args* to its default."""
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
    args.encoder_layers = getattr(args, "encoder_layers", 6)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
    args.decoder_ffn_embed_dim = getattr(
        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
    )
    args.decoder_layers = getattr(args, "decoder_layers", 6)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
    args.activation_fn = getattr(args, "activation_fn", "relu")
    args.dropout = getattr(args, "dropout", 0.1)
    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
    args.share_decoder_input_output_embed = getattr(
        args, "share_decoder_input_output_embed", False
    )
    args.no_token_positional_embeddings = getattr(
        args, "no_token_positional_embeddings", False
    )
    args.adaptive_input = getattr(args, "adaptive_input", False)
    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)

    args.decoder_output_dim = getattr(
        args, "decoder_output_dim", args.decoder_embed_dim
    )
    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
    args.max_source_positions = getattr(args, "max_source_positions", 3000)
    args.max_target_positions = getattr(args, "max_target_positions", 1024)
    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
    args.conv_out_channels = getattr(args, "conv_out_channels", args.encoder_embed_dim)


@register_model_architecture("convtransformer", "convtransformer_espnet")
def convtransformer_espnet(args):
    """Defaults for the ``convtransformer_espnet`` architecture.

    Only four options are set here. This function does not call
    ``base_architecture`` itself; the remaining defaults are filled in when
    ``ConvTransformerModel.build_model`` calls ``base_architecture(args)``.
    """
    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256)
    args.encoder_layers = getattr(args, "encoder_layers", 12)
    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)


================================================
FILE: fairseq/models/speech_to_text/hub_interface.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
from argparse import Namespace
from typing import Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.nn.functional as F

import fairseq.data.audio.feature_transforms.utterance_cmvn as utt_cmvn
from fairseq.data import encoders
from fairseq.data.audio.audio_utils import convert_waveform as convert_wav
from fairseq.data.audio.audio_utils import get_fbank
from fairseq.data.audio.audio_utils import get_waveform as get_wav
from fairseq.data.audio.speech_to_text_dataset import SpeechToTextDataset

logger = logging.getLogger(__name__)


class S2THubInterface(nn.Module):
    """Wrapper that bundles a speech-to-text model with its task and generator."""

    def __init__(self, cfg, task, model):
        """Store *cfg*, *task* and *model*, put the model in eval mode and build the generator."""
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.model = model
        self.model.eval()
        self.generator = self.task.build_generator([self.model], self.cfg.generation)

    @classmethod
    def get_model_input(cls, task, audio: Union[str, torch.Tensor]):
        """Turn *audio* (a file path or a tensor) into a generator sample.

        The feature type is read from ``task.data_cfg.hub["input_type"]``.

        Raises:
            ValueError: for any input type other than "fbank80_w_utt_cmvn",
                "waveform" or "standardized_waveform".
        """
        # NOTE(review): the fallback value "fbank80" is not handled by any
        # branch below, so a config without "input_type" raises ValueError --
        # confirm whether that is intended.
        input_type = task.data_cfg.hub.get("input_type", "fbank80")
        if input_type == "fbank80_w_utt_cmvn":
            if isinstance(audio, str):
                feat = utt_cmvn.UtteranceCMVN()(get_fbank(audio))
                feat = feat.unsqueeze(0)  # T x D -> 1 x T x D
            else:
                import torchaudio.compliance.kaldi as kaldi

                feat = kaldi.fbank(audio, num_mel_bins=80).numpy()  # 1 x T x D
        elif input_type in {"waveform", "standardized_waveform"}:
            if isinstance(audio, str):
                feat, sr = get_wav(audio)  # C x T
                feat, _ = convert_wav(
                    feat, sr, to_sample_rate=16_000, to_mono=True
                )  # C x T -> 1 x T
            else:
                feat = audio.numpy()
        else:
            raise ValueError(f"Unknown value: input_type = {input_type}")

        # The sequence length is taken from axis 1 of the features.
        src_lengths = torch.Tensor([feat.shape[1]]).long()
        src_tokens = torch.from_numpy(feat)  # 1 x T (x D)
        if input_type == "standardized_waveform":
            with torch.no_grad():
                src_tokens = F.layer_norm(src_tokens, src_tokens.shape)

        return {
            "net_input": {
                "src_tokens": src_tokens,
                "src_lengths": src_lengths,
                "prev_output_tokens": None,
            },
            "target_lengths": None,
            "speaker": None,
        }

    @classmethod
    def detokenize(cls, task, tokens):
        """Convert *tokens* to text with the target dictionary, then BPE-decode it if a tokenizer is built."""
        text = task.tgt_dict.string(tokens)
        tkn_cfg = task.data_cfg.bpe_tokenizer
        tokenizer = encoders.build_bpe(Namespace(**tkn_cfg))
        return text if tokenizer is None else tokenizer.decode(text)

    @classmethod
    def get_prefix_token(cls, task, lang):
        """Return a 1 x 1 tensor with the language-tag index, or ``None``.

        A prefix is produced only when ``task.data_cfg.prepend_tgt_lang_tag``
        is truthy; *lang* must not be ``None`` in that case.
        """
        prefix_size = int(task.data_cfg.prepend_tgt_lang_tag)
        prefix_tokens = None
        if prefix_size > 0:
            assert lang is not None
            lang_tag = SpeechToTextDataset.get_lang_tag_idx(lang, task.tgt_dict)
            prefix_tokens = torch.Tensor([lang_tag]).long().unsqueeze(0)
        return prefix_tokens

    @classmethod
    def get_prediction(
        cls, task, model, generator, sample, tgt_lang=None, synthesize_speech=False
    ) -> Union[str, Tuple[str, Tuple[torch.Tensor, int]]]:
        """Generate a prediction for *sample*.

        Returns the detokenized text. When *synthesize_speech* is true and a
        TTS model id is configured in ``task.data_cfg.hub``, returns
        ``(text, tts_model.predict(text, speaker=...))`` instead.
        """
        _tgt_lang = tgt_lang or task.data_cfg.hub.get("tgt_lang", None)
        prefix = cls.get_prefix_token(task, _tgt_lang)
        pred_tokens = generator.generate([model], sample, prefix_tokens=prefix)
        pred = cls.detokenize(task, pred_tokens[0][0]["tokens"])
        # When an "eos_token" is configured, drop the last space-separated
        # piece of the prediction.
        eos_token = task.data_cfg.config.get("eos_token", None)
        if eos_token:
            pred = " ".join(pred.split(" ")[:-1])

        if synthesize_speech:
            pfx = f"{_tgt_lang}_" if task.data_cfg.prepend_tgt_lang_tag else ""
            tts_model_id = task.data_cfg.hub.get(f"{pfx}tts_model_id", None)
            speaker = task.data_cfg.hub.get(f"{pfx}speaker", None)
            if tts_model_id is None:
                logger.warning("TTS model configuration not found")
            else:
                # The id has the form "<repo>:<model id>".
                _repo, _id = tts_model_id.split(":")
                tts_model = torch.hub.load(_repo, _id, verbose=False)
                pred = (pred, tts_model.predict(pred, speaker=speaker))
        return pred

    def predict(
        self,
        audio: Union[str, torch.Tensor],
        tgt_lang: Optional[str] = None,
        synthesize_speech: bool = False,
    ) -> Union[str, Tuple[str, Tuple[torch.Tensor, int]]]:
        """Build the model input for *audio* and return ``get_prediction`` for it."""
        # `audio` is either a file path or a 1xT Tensor
        # return either text or (text, synthetic speech)
        sample = self.get_model_input(self.task, audio)
        return self.get_prediction(
            self.task,
            self.model,
            self.generator,
            sample,
            tgt_lang=tgt_lang,
            synthesize_speech=synthesize_speech,
        )


================================================
FILE: fairseq/models/speech_to_text/modules/__init__.py
================================================


================================================
FILE: fairseq/models/speech_to_text/modules/augmented_memory_attention.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import List, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from fairseq.models import FairseqEncoder
from fairseq.models.speech_to_text import ConvTransformerEncoder
from fairseq.models.speech_to_text.utils import (
    attention_suppression,
    lengths_to_encoder_padding_mask,
    segments_to_sequence,
    sequence_to_segments,
)
from fairseq.modules import MultiheadAttention, TransformerEncoderLayer

# ------------------------------------------------------------------------------
#   AugmentedMemoryConvTransformerEncoder
# ------------------------------------------------------------------------------


class AugmentedMemoryConvTransformerEncoder(ConvTransformerEncoder):
    def __init__(self, args):
        super().__init__(args)

        args.encoder_stride = self.stride()

        self.left_context = args.left_context // args.encoder_stride

        self.right_context = args.right_context // args.encoder_stride

        self.left_context_after_stride = args.left_context // args.encoder_stride
        self.right_context_after_stride = args.right_context //
def forward(self, src_tokens, src_lengths, states=None):
    """Encode one segment (with its left/right context) through all layers.

    Args:
        src_tokens: 3-D input; it is viewed as
            (bsz, max_seq_len, in_channels, input_dim) before the conv stack.
        src_lengths: per-item lengths of ``src_tokens``.
        states: optional list with one dict per transformer layer holding
            ``memory_banks`` and ``encoder_states``; created when ``None``.

    Returns:
        Tuple of (encoder states of the last layer with the context frames
        trimmed, lengths of the trimmed segment as a B x 1 tensor, states).
    """
    bsz, max_seq_len, _ = src_tokens.size()
    x = (
        src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
        .transpose(1, 2)
        .contiguous()
    )
    x = self.conv(x)
    bsz, _, output_seq_len, _ = x.size()
    x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)
    x = self.out(x)
    x = self.embed_scale * x

    subsampling_factor = 1.0 * max_seq_len / output_seq_len
    # NOTE(review): torch.max makes every length at least x.size(0), so the
    # padding mask never marks padding; torch.min may have been intended.
    # Left unchanged here - confirm before altering.
    input_lengths = torch.max(
        (src_lengths.float() / subsampling_factor).ceil().long(),
        x.size(0) * src_lengths.new_ones([src_lengths.size(0)]).long(),
    )

    encoder_padding_mask, _ = lengths_to_encoder_padding_mask(
        input_lengths, batch_first=True
    )

    # TODO: fix positional embedding
    positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)

    x += positions
    x = F.dropout(x, p=self.dropout, training=self.training)

    # State to store memory banks etc.
    if states is None:
        states = [
            {"memory_banks": None, "encoder_states": None}
            for _ in range(len(self.transformer_layers))
        ]

    left = self.left_context_after_stride
    right = self.right_context_after_stride

    for i, layer in enumerate(self.transformer_layers):
        # x size:
        # (self.left_size + self.segment_size + self.right_size)
        # / self.stride, num_heads, dim
        # TODO: Consider mask here
        x = layer(x, states[i])
        # Use an explicit end index: `x[left:-right]` is empty when
        # right == 0 because -0 == 0.  Clamping at 0 keeps the original
        # result when the right context exceeds the sequence length.
        end = max(x.size(0) - right, 0)
        states[i]["encoder_states"] = x[left:end]

    mask_end = max(encoder_padding_mask.size(1) - right, 0)
    lengths = (
        (~encoder_padding_mask[:, left:mask_end]).sum(dim=1, keepdim=True).long()
    )

    return states[-1]["encoder_states"], lengths, states
def forward(self, input_and_summary, state):
    """Attend from [segment, summary query] to [memory banks, segment].

    Args:
        input_and_summary: encoder states of the current segment with left
            or right context, plus one summarization query as the last row
            (rows x batch x embed_dim).
        state: dict whose ``"memory_banks"`` entry is a list of memory
            tensors; the new memory produced here is appended to it.

    Returns:
        The attended segment output (all rows except the summary row).
    """
    length, batch_size, _ = input_and_summary.shape
    length = length - 1  # not include sum_query, last index

    memory = state["memory_banks"]
    # TODO: positional embedding on memory

    if self.max_memory_size > -1 and len(memory) > self.max_memory_size:
        if self.max_memory_size == 0:
            # `memory` is a list, so it has no `new_zeros`/`size`; build the
            # single all-zero placeholder from the input tensor and keep it
            # wrapped in a list for the concatenation below.
            memory = [
                input_and_summary.new_zeros(1, batch_size, self.memory_dim)
            ]
        else:
            memory = memory[-self.max_memory_size :]

    memory_and_input = torch.cat(memory + [input_and_summary[:-1]], dim=0)
    input_and_sum_query = input_and_summary

    q = self.q_proj(self.v2e(input_and_sum_query))
    k = self.k_proj(self.v2e(memory_and_input))
    v = self.v_proj(self.v2e(memory_and_input))

    q = (
        q.contiguous()
        .view(-1, batch_size * self.num_heads, self.head_dim)
        .transpose(0, 1)
        * self.scaling
    )
    k = (
        k.contiguous()
        .view(-1, batch_size * self.num_heads, self.head_dim)
        .transpose(0, 1)
    )
    v = (
        v.contiguous()
        .view(-1, batch_size * self.num_heads, self.head_dim)
        .transpose(0, 1)
    )

    attention_weights = torch.bmm(q, k.transpose(1, 2))

    if self.disable_mem_on_mem_attn:
        # The summary query (last row) must not attend to the memory slots.
        attention_weights = self.suppress_mem_on_mem_attention(
            batch_size, self.num_heads, len(memory), attention_weights
        )

    if self.std_scale is not None:
        attention_weights = attention_suppression(attention_weights, self.std_scale)

    assert list(attention_weights.shape) == [
        batch_size * self.num_heads,
        length + 1,
        length + len(memory),
    ]

    attention_weights = torch.nn.functional.softmax(
        attention_weights.float(), dim=-1
    ).type_as(attention_weights)

    attention_probs = self.dropout_module(attention_weights)

    # [T, T, B, n_head] + [T, B, n_head, d_head] -> [T, B, n_head, d_head]
    attention = torch.bmm(attention_probs, v)

    assert list(attention.shape) == [
        batch_size * self.num_heads,
        length + 1,
        self.head_dim,
    ]

    attention = (
        attention.transpose(0, 1)
        .contiguous()
        .view(length + 1, batch_size, self.embed_dim)
    )

    output_and_memory = self.out_proj(attention)

    # The last row belongs to the summary query and becomes the next memory.
    next_m = output_and_memory[-1:]
    next_m = self.squash_mem(next_m)
    output = output_and_memory[:-1]

    state["memory_banks"].append(next_m)

    return output
class SequenceEncoder(FairseqEncoder):
    """
    Encoder wrapper that consumes whole sequences and runs the wrapped
    module segment by segment.

    `src_tokens` and `src_lengths` passed to `forward()` describe a batch of
    complete sequences, not segments. How the sequence is cut is governed by
    `segment_size`:

    1) `segment_size` is None: the whole input is handled as a single
       segment.
    2) `segment_size` is an int: the input is
        1. split into segments,
        2. each segment is run through the wrapped module,
        3. the per-segment outputs are concatenated back into one sequence.

    `segment_size` does not include the extra left/right context. For
    example, for LC-BLSTM with a middle chunk of 100 frames and a right
    context of 20, `segment_size` is 100.
    """

    def __init__(self, args, module):
        super().__init__(None)

        self.module = module
        # Time is axis 1 on the way in and axis 0 on the way out.
        self.input_time_axis = 1
        self.output_time_axis = 0
        self.segment_size = args.segment_size
        self.left_context = args.left_context
        self.right_context = args.right_context

    def forward(
        self,
        src_tokens: Tensor,
        src_lengths: Tensor,
        states=None,
    ):
        """Segment the input, encode every segment and stitch the results."""
        segments = sequence_to_segments(
            sequence=src_tokens,
            time_axis=self.input_time_axis,
            lengths=src_lengths,
            segment_size=self.segment_size,
            extra_left_context=self.left_context,
            extra_right_context=self.right_context,
        )

        encoded_segments: List[Tuple[Tensor, Tensor]] = []
        for tokens, token_lengths in segments:
            # `states` is threaded through so each segment sees what the
            # previous one left behind.
            (seg_out, seg_out_lengths, states) = self.module(
                tokens,
                token_lengths,
                states=states,
            )
            encoded_segments.append((seg_out, seg_out_lengths))

        encoder_out, enc_lengths = segments_to_sequence(
            segments=encoded_segments, time_axis=self.output_time_axis
        )

        padding_mask, _ = lengths_to_encoder_padding_mask(
            enc_lengths, batch_first=True
        )
        # A mask without any padded position is reported as None.
        encoder_padding_mask = padding_mask if padding_mask.any() else None

        return {
            "encoder_out": [encoder_out],
            "encoder_padding_mask": [encoder_padding_mask],
            "encoder_embedding": [],
            "encoder_states": [states],
            "src_tokens": [],
            "src_lengths": [],
        }

    def incremental_encode(
        self,
        seg_src_tokens: Tensor,
        seg_src_lengths: Tensor,
        states=None,
    ):
        """
        Unlike forward(), this takes an already segmented piece of speech,
        runs the wrapped module once and returns its states together with
        the updated `states`.
        """
        (encoded, encoded_lengths, states) = self.module(
            seg_src_tokens,
            seg_src_lengths,
            states=states,
        )
        return encoded, encoded_lengths, states
class Conv1dSubsampler(nn.Module):
    """Convolutional subsampler: a stack of 1D convolution (along temporal
    dimension) followed by non-linear activation via gated linear units
    (https://arxiv.org/abs/1911.08460)

    Args:
        in_channels (int): the number of input channels
        mid_channels (int): the number of intermediate channels
        out_channels (int): the number of output channels
        kernel_sizes (List[int]): the kernel size for each convolutional layer
    """

    def __init__(
        self,
        in_channels: int,
        mid_channels: int,
        out_channels: int,
        kernel_sizes: List[int] = (3, 3),
    ):
        super(Conv1dSubsampler, self).__init__()
        self.n_layers = len(kernel_sizes)
        # Every conv doubles its nominal output channels because the GLU in
        # forward() halves the channel dimension again.
        self.conv_layers = nn.ModuleList(
            nn.Conv1d(
                in_channels if i == 0 else mid_channels // 2,
                mid_channels if i < self.n_layers - 1 else out_channels * 2,
                k,
                stride=2,
                padding=k // 2,
            )
            for i, k in enumerate(kernel_sizes)
        )

    def get_out_seq_lens_tensor(self, in_seq_lens_tensor):
        """Return the sequence lengths after all convolution layers.

        The length is derived from each layer's own kernel size, padding,
        stride and dilation (the ``nn.Conv1d`` output-length formula), so it
        also matches the real output for even kernel sizes.  For odd kernels
        this reduces to ``floor((L - 1) / 2 + 1)`` per layer.
        """
        out = in_seq_lens_tensor.clone()
        for conv in self.conv_layers:
            kernel = conv.kernel_size[0]
            padding = conv.padding[0]
            stride = conv.stride[0]
            dilation = conv.dilation[0]
            shrink = 2 * padding - dilation * (kernel - 1) - 1
            out = ((out.float() + shrink) / stride + 1).floor().long()
        return out

    def forward(self, src_tokens, src_lengths):
        """Subsample ``src_tokens`` (B x T x (C x D)) along time.

        Returns:
            Tuple of the subsampled features as T x B x (C x D) and the
            matching output lengths.
        """
        x = src_tokens.transpose(1, 2).contiguous()  # -> B x (C x D) x T
        for conv in self.conv_layers:
            x = conv(x)
            x = nn.functional.glu(x, dim=1)
        x = x.transpose(1, 2).transpose(0, 1).contiguous()  # -> T x B x (C x D)
        return x, self.get_out_seq_lens_tensor(src_lengths)
class RelativePositionEmbedding(nn.Module):
    """
    Learned relative position table, following
    https://arxiv.org/abs/1803.02155

    The table has ``2 * max_position + 1`` rows of size ``head_dim`` and is
    Xavier-initialised (normal when ``norm_init`` is set, uniform otherwise).
    """

    def __init__(self, head_dim, max_position, norm_init=True):
        super().__init__()
        self.head_dim = head_dim
        self.max_position = max_position

        num_rows = max_position * 2 + 1
        self.embeddings = nn.Parameter(torch.Tensor(num_rows, head_dim))

        initializer = nn.init.xavier_normal_ if norm_init else nn.init.xavier_uniform_
        initializer(self.embeddings)

    def forward(self, input: Tensor):
        """Look up the rows of the table addressed by ``input`` (cast to long)."""
        return nn.functional.embedding(input.long(), self.embeddings)
""" def __init__( self, input_dim, ffn_dim, dropout_on_fc1, dropout_on_fc2, activation_fn ): super(PositionwiseFF, self).__init__() self.input_dim = input_dim self.ffn_dim = ffn_dim if activation_fn == "relu": ac = nn.ReLU() elif activation_fn == "gelu": ac = nn.GELU() else: raise ValueError("Unsupported activation_fn = ({})".format(activation_fn)) # fc1 -> ac -> dropout -> fc2 -> dropout self.module = nn.Sequential( nn.Linear(input_dim, ffn_dim), ac, nn.Dropout(dropout_on_fc1), nn.Linear(ffn_dim, input_dim), nn.Dropout(dropout_on_fc2), ) self.layer_norm = Fp32LayerNorm(input_dim) def forward(self, input): module_out = self.module(self.layer_norm(input)) output = module_out + input return output def quantize_(self, params=None): if params and "per_channel" in params and params["per_channel"]: qconfig = per_channel_dynamic_qconfig else: qconfig = default_dynamic_qconfig quantization.quantize_dynamic( self, {torch.nn.Linear: qconfig}, dtype=torch.qint8, inplace=True ) return self # ------------------------------------------------------------------------------ # SummarizationLayer # ------------------------------------------------------------------------------ class SummarizationLayer(nn.Module): def __init__(self, method, segment_size, embedding_dim): super(SummarizationLayer, self).__init__() self.segment_size = segment_size self.embedding_dim = embedding_dim nonlin_match = re.match(r"nonlinear\((?P<act>[a-z]+),(?P<dim>[0-9]+)\)", method) self.method = method if method == "mean": self.module = nn.AvgPool1d( kernel_size=segment_size, stride=segment_size, ceil_mode=True, ) elif method == "max": self.module = nn.MaxPool1d( kernel_size=segment_size, stride=segment_size, ceil_mode=True, ) elif method == "linear": self.module = nn.Linear(segment_size, 1) elif nonlin_match: nonlin_args = nonlin_match.groupdict() act_type = nonlin_args["act"] hid_dim = int(nonlin_args["dim"]) if act_type == "relu": act = nn.ReLU() elif act_type == "gelu": act = nn.GELU() else: raise 
def forward(self, input):
    """Summarize a (T, B, D) input into one vector per segment.

    For the "mean" and "max" methods the pooling module is applied over the
    time axis directly. For the other methods only complete segments are
    passed through the module; a trailing incomplete segment contributes an
    all-zero summary. The result has the segment axis first.
    """
    # T, B, D -> B, D, T
    feats = input.permute(1, 2, 0)

    if self.method in ("mean", "max"):
        return self.module(feats).permute(2, 0, 1)

    batch, dim, total = feats.size(0), feats.size(1), feats.size(2)
    covered = total // self.segment_size * self.segment_size

    if covered > 0:
        # at least one seg is full
        chunks = feats[:, :, :covered].contiguous().view(batch, -1, self.segment_size)
        summary = self.module(chunks).view(batch, dim, -1)
    else:
        summary = feats.new_zeros(batch, dim, 0)

    if total - covered > 0:
        # when last seg is not full, use zeros as last memory placeholder
        placeholder = feats.new_zeros(batch, dim, 1)
        summary = torch.cat([summary, placeholder], dim=2)

    return summary.permute(2, 0, 1)
Then the right context blocks = [v8, v9, v10, v11, v16, v17, v18, v19, 0, 0, 0, 0], where v8, v9, v10, and v11 are the right context for first segment. v16, v17, v18 and v19 are the right context for second segment. 0, 0, 0 and 0 are right context for the last segment. utterance is corresponding to input embedding sequence summary is concatenation of average of each segments. [summary_0, summary_1, ..., ]. In augmented memory multihead attention, the query is [right_context, utterance, summary], key is [memory, right_context, utterance]. Different with AugmentedMemoryMultiheadAttentionBmm, memory here is passed from previous attention layer. For the first attention layer, memory is average of each segment. Memory is a concatenation of memory from each segments in previous attention layer. For example, current layer is i, then memory is [m_0, m_1, ..., m_n]. Each m_k is the output from seg_k in layer i-1. args: input_dim: input embedding dimension num_heads: number of heads in multihead self-attention dropout: attention dropout std_scale: if std_scale is not None. The weak attention suppression is turned on. For std_scale = 0.5, all the attention smaller than mean + 0.5 * std will be suppressed. scaled_init: whether to use scaled init for linear weight tanh_on_mem: whether to use tanh on memory output use_mem: whether to use memory or not. When max_memory_size is 0, then we don't have memory anymore. layer_index: current self-attention layer index that is used in depth initialization max_relative_position: max relative position used in relative position embedding rpe_old_option: To be compatible with previous model. The previous model was trained with attention += attention + rpe. 
The correct equation should be attention = attention + rpe """ def __init__( self, input_dim, num_heads, dropout=0.0, std_scale=None, scaled_init=False, tanh_on_mem=False, use_mem=True, mini_batches=False, negative_inf="-inf", layer_index=-1, max_relative_position=0, rpe_old_option=True, ): if input_dim % num_heads: raise ValueError( "input_dim ({}) must be divisible by num_heads ({})".format( input_dim, num_heads ) ) super().__init__() embed_dim = input_dim self.e2h_kv = torch.nn.Linear(input_dim, 2 * input_dim, bias=True) self.e2h_q = torch.nn.Linear(input_dim, input_dim, bias=True) self.rpe_old_option = rpe_old_option if max_relative_position > 0: self.use_rpe = True self.rpe_k = RelativePositionEmbedding( head_dim=input_dim // num_heads, max_position=max_relative_position, ) self.rpe_v = RelativePositionEmbedding( head_dim=input_dim // num_heads, max_position=max_relative_position, ) else: self.use_rpe = False self.rpe_k = None self.rpe_v = None if scaled_init: if layer_index == -1: gain = 1.0 / math.sqrt(2) else: # https://arxiv.org/abs/2005.09684 depthwise initialization # stablize the training greatly. Use depthwise initialization to # replace incremental loss. 
def prepare_qkv(
    self,
    input: Tensor,
    mems: Tensor,
    lengths: Tensor,
    summary_length: int,
    lc_length: int,
):
    """Project the query and the key/value inputs and collect length info.

    The query is the whole ``input``; keys and values come from
    ``[mems, right-context block, utterance block]`` (the summary rows at
    the end of ``input`` are excluded).

    Returns:
        ``((q, k, v), (T, B, D), (mem_length, utterance_length,
        right_context_blocks_length, key_length), padding_mask)`` where
        ``padding_mask`` is ``None`` for a batch of one.
    """
    # T: right_context length + utterance_length + summary_length
    T, B, D = input.shape
    mem_length = mems.size(0)
    utterance_length = torch.max(lengths)
    right_context_blocks_length = T - utterance_length - summary_length

    # Key/value source: memory, then right context, then the utterance.
    mem_rc_input = torch.cat(
        [
            mems,
            input[:right_context_blocks_length],
            input[right_context_blocks_length : T - summary_length],
        ],
        dim=0,
    )

    # In training lc_length = 0
    key_length = mem_rc_input.size(0) + lc_length

    padding_mask = None
    if B != 1:
        key_lengths = lengths + mem_length + right_context_blocks_length + lc_length
        padding_mask = lengths_to_padding_mask(lengths=key_lengths)

    q = self.e2h_q(input)
    k, v = self.e2h_kv(mem_rc_input).chunk(chunks=2, dim=2)

    if padding_mask is not None:
        assert padding_mask.size(0) == B
        assert padding_mask.size(1) == key_length

    lengths_info = (
        mem_length,
        utterance_length,
        right_context_blocks_length,
        key_length,
    )
    return (q, k, v), (T, B, D), lengths_info, padding_mask
Tensor]: T, B, D = input_shape q = ( q.contiguous().view(-1, B * self.num_heads, self.head_dim).transpose(0, 1) * self.scaling ) k = ( new_k.contiguous() .view(-1, B * self.num_heads, self.head_dim) .transpose(0, 1) ) v = ( new_v.contiguous() .view(-1, B * self.num_heads, self.head_dim) .transpose(0, 1) ) attention_weights = torch.bmm(q, k.transpose(1, 2)) if self.use_rpe and rpe is not None and self.rpe_v is not None: r_k = self.rpe_k(rpe) # [q, B*h, d] * [q, k, d] -> [B*h, q, k] attention_weights_rpe = torch.matmul( q.transpose(0, 1), r_k.transpose(1, 2) ).transpose(0, 1) attention_weights = attention_weights + attention_weights_rpe attention_weights_float = attention_weights.float() return attention_weights, attention_weights_float, v def prepare_attention_output( self, attention_weights: Tensor, attention_weights_float: Tensor, v: Tensor, input_shape: Tuple[int, int, int], key_length: int, padding_mask: Optional[Tensor], rpe: Optional[Tensor], ) -> Tensor: T, B, D = input_shape if padding_mask is not None: attention_weights_float = attention_weights_float.view( B, self.num_heads, T, key_length ) attention_weights_float = attention_weights_float.masked_fill( padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf") ) attention_weights_float = attention_weights_float.view( B * self.num_heads, T, key_length ) if self.std_scale is not None: attention_weights_float = attention_suppression( attention_weights_float, self.std_scale ) attention_weights_float = torch.nn.functional.softmax( attention_weights_float, dim=-1 ) attention_weights = attention_weights_float.type_as(attention_weights) attention_probs = torch.nn.functional.dropout( attention_weights, p=self.dropout, training=self.training ) # [T, key_length, B, n_head]+ [key_length, B, n_head, d_head] # -> [T, B, n_head, d_head] attention = torch.bmm(attention_probs, v) if self.use_rpe and rpe is not None and self.rpe_v is not None: r_v = self.rpe_v(rpe) attention_rpe = torch.matmul( 
@torch.jit.unused
def forward(
    self,
    input: Tensor,
    lengths: Tensor,
    mems: Tensor,
    attention_mask: Tensor,
    pre_mems: Optional[Tensor] = None,
    left_context_key: Optional[Tensor] = None,
    left_context_val: Optional[Tensor] = None,
    rpe: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    """
    Forward function for NoSegAugmentedMemoryMultiheadAttentionBmm in training.

    args:
        input: formed in the following way
            [right_context_0, right_context_1, ..., seg_0, seg_1,
            ..., summary_0, summary_1, ...]
        lengths: the length of query which is [seg_0, seg_1, ...]
        mems: [mem_0, mem_1, ...].
        attention_mask: attention mask for
            query = [right_context, query, summary],
            key = [mem, right_context, query].
            This is only used for training. Positions where the mask is True
            are filled with ``self.negative_inf`` before the softmax.
        pre_mems: optional memory carried over from earlier input; when given
            (and memory is enabled) it is concatenated in front of ``mems``.
        left_context_key: optional cached keys, inserted between the
            [mem, right_context] keys and the utterance keys.
        left_context_val: optional cached values, inserted at the same place
            as ``left_context_key``.
        rpe: optional relative position input passed to the attention
            weight/output helpers.

    returns:
        (rc_output, next_m, next_k, next_v).
        rc_output is the attention output without the summary part.
        next_m is the new memory taken from the summary part of the output
        (or ``mems`` unchanged when memory is disabled).
        next_k / next_v are the keys/values after the [mem, right_context]
        prefix; they are None when no left context cache is provided, even
        though the return annotation says Tensor.
    """
    if self.use_mem:
        mem_length = mems.size(0)
        # one summary query per memory entry plus one for the last segment
        summary_length = mem_length + 1
        if pre_mems is not None:
            mems = torch.cat([pre_mems, mems], dim=0)
    else:
        mem_length = 0
        summary_length = 0

    # In training, lc_length = 0
    if left_context_key is not None:
        lc_length = left_context_key.size(0)
    else:
        lc_length = 0
    results = self.prepare_qkv(
        input=input,
        mems=mems,
        lengths=lengths,
        summary_length=summary_length,
        lc_length=lc_length,
    )
    result_qkv, input_shape, result_lengths_info, padding_mask = results
    q, k, v = result_qkv
    # NOTE: mem_length is overwritten here with the length of ``mems`` after
    # ``pre_mems`` was prepended; summary_length keeps the value computed above.
    (
        mem_length,
        utterance_length,
        right_context_blocks_length,
        key_length,
    ) = result_lengths_info

    if left_context_key is not None:
        # add the cache key and value
        new_k = torch.cat(
            [
                k[: mem_length + right_context_blocks_length, :, :],
                left_context_key,
                k[-utterance_length:, :, :],
            ],
            dim=0,
        )
        new_v = torch.cat(
            [
                v[: mem_length + right_context_blocks_length, :, :],
                left_context_val,
                v[-utterance_length:, :, :],
            ],
            dim=0,
        )
        # everything after [mem, right_context]: left context + utterance
        next_k = new_k[mem_length + right_context_blocks_length :, :, :]
        next_v = new_v[mem_length + right_context_blocks_length :, :, :]
    else:
        new_k = k
        new_v = v
        next_k = None
        next_v = None

    attention_weights, attention_weights_float, v = self.prepare_attention_weights(
        q=q,
        new_k=new_k,
        new_v=new_v,
        input_shape=input_shape,
        rpe=rpe,
    )

    # mask attention
    # add a leading dim so the mask broadcasts over the first dim of the weights
    attention_mask = attention_mask.unsqueeze(0)
    attention_weights_float = attention_weights_float.masked_fill(
        attention_mask, float(self.negative_inf)
    )

    rc_output_memory = self.prepare_attention_output(
        attention_weights=attention_weights,
        attention_weights_float=attention_weights_float,
        v=v,
        input_shape=input_shape,
        key_length=key_length,
        padding_mask=padding_mask,
        rpe=rpe,
    )

    if self.use_mem:
        # next_m length equals to summary length - 1
        # last memory is ignored
        if self.mini_batches:
            # in mini-batch mode the last summary output is kept as well
            next_m = rc_output_memory[-summary_length:]
        else:
            next_m = rc_output_memory[-summary_length:-1]

        next_m = self.squash_mem(next_m)
        # rc and output
        rc_output = rc_output_memory[:-summary_length]
        if not self.nonlinear_squash_mem:
            # no tanh squashing configured: bound the memory by clamping
            next_m = torch.clamp(next_m, min=-10, max=10)
    else:
        next_m = mems
        rc_output = rc_output_memory

    return rc_output, next_m, next_k, next_v
class NoSegAugmentedMemoryTransformer(nn.Module):
    """
    Whole utterance augmented memory transformer.

    This is not a pyspeech nn layer. It is used as a module in a master layer
    where multiple transformers are used.

    One instance is: pre layer norm -> augmented memory attention ->
    dropout + residual -> position-wise feed forward -> layer norm.
    """

    def __init__(
        self,
        input_dim,
        num_heads,
        ffn_dim,
        dropout_in_attn=0.0,
        dropout_on_attn=None,
        dropout_on_fc1=None,
        dropout_on_fc2=None,
        activation_fn="relu",
        tanh_on_mem=False,
        std_scale=None,
        scaled_init=False,
        segment_size=128,
        use_mem=True,
        mini_batches=False,
        negative_inf="-inf",
        layer_index=-1,
        summarization_method="mean",
        max_relative_position=0,
        rpe_old_option=True,
    ):
        super().__init__()

        # NOTE(review): rpe_old_option is accepted here but is not forwarded to
        # the attention module below — confirm whether that is intentional.
        self.attention = NoSegAugmentedMemoryMultiheadAttentionBmm(
            input_dim=input_dim,
            num_heads=num_heads,
            dropout=dropout_in_attn,
            scaled_init=scaled_init,
            tanh_on_mem=tanh_on_mem,
            std_scale=std_scale,
            use_mem=use_mem,
            mini_batches=mini_batches,
            negative_inf=negative_inf,
            layer_index=layer_index,
            max_relative_position=max_relative_position,
        )
        # nn.Dropout compares p against 0 and 1, so the default
        # dropout_on_attn=None would raise a TypeError; None means "no dropout".
        self.dropout = nn.Dropout(0.0 if dropout_on_attn is None else dropout_on_attn)
        self.pos_ff = PositionwiseFF(
            input_dim=input_dim,
            ffn_dim=ffn_dim,
            dropout_on_fc1=dropout_on_fc1,
            dropout_on_fc2=dropout_on_fc2,
            activation_fn=activation_fn,
        )
        self.layer_norm_pre = Fp32LayerNorm(input_dim)
        self.layer_norm = Fp32LayerNorm(input_dim)
        self.segment_size = segment_size
        self.use_mem = use_mem

        self.memory_op = SummarizationLayer(
            summarization_method, segment_size, input_dim
        )

    def set_mini_batches(self, mini_batches):
        """Switch the attention module's mini-batch flag."""
        self.attention.mini_batches = mini_batches

    def gen_summary_queries(self, input):
        """Run the summarization layer on ``input`` to get summary queries."""
        sum_input = self.memory_op(input)
        return sum_input

    def pre_attention_ops(self, input, right_context_blocks):
        """
        Prepend the right context blocks to ``input`` and apply the pre layer norm.

        returns:
            (rc_length, input_length, residual_input, query_input, rc_and_input)
            where residual_input is the un-normalized concatenation,
            rc_and_input is the normalized concatenation and query_input is
            the normalized part that corresponds to ``input``.
        """
        rc_length = right_context_blocks.size(0)
        input_length = input.size(0)

        rc_and_input = torch.cat([right_context_blocks, input], dim=0)
        residual_input = rc_and_input
        rc_and_input = self.layer_norm_pre(rc_and_input)

        query_input = rc_and_input[-input_length:, :, :]
        return rc_length, input_length, residual_input, query_input, rc_and_input

    def after_attention_ops(self, attention_output, residual_input):
        """Apply dropout, the residual connection, the FFN and the final layer norm."""
        output = self.dropout(attention_output)
        output = output + residual_input
        output = self.pos_ff(output)
        output = self.layer_norm(output)
        return output

    @torch.jit.export
    def forward_jit(
        self,
        input: Tensor,
        lengths: Tensor,
        mems: Tensor,
        left_context_key: Tensor,
        left_context_val: Tensor,
        right_context_blocks: Tensor,
        rpe: Optional[Tensor],
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
        """
        Decoding-time forward for one segment.

        returns:
            (output for the segment, next memory, output for the right context
            blocks, next left-context keys, next left-context values)
        """
        results = self.pre_attention_ops(input, right_context_blocks)
        rc_length, input_length, residual_input, query_input, rc_and_input = results

        # In online decoding, the summary query size is always 1 or 0
        if self.use_mem:
            summary_query = self.gen_summary_queries(query_input)
            summary_query = summary_query[0:1, :, :]
            rc_qu_su = torch.cat([rc_and_input, summary_query], dim=0)
        else:
            rc_qu_su = rc_and_input

        rc_output, next_m, next_k, next_v = self.attention.forward_jit(
            input=rc_qu_su,
            lengths=lengths,
            mems=mems,
            left_context_key=left_context_key,
            left_context_val=left_context_val,
            rpe=rpe,
        )
        rc_output = self.after_attention_ops(rc_output, residual_input)
        results = (
            rc_output[-input_length:, :, :],
            next_m,
            rc_output[0:rc_length, :, :],
            next_k,
            next_v,
        )
        return results

    @torch.jit.unused
    def forward(
        self,
        input,
        lengths,
        mems,
        right_context_blocks,
        attention_mask,
        pre_mems,
        left_context_key,
        left_context_val,
        rpe,
    ):
        """
        Training-time forward over the whole (mini-batch of an) utterance.

        returns:
            (output for the utterance, next memory, output for the right
            context blocks, next left-context keys, next left-context values)
        """
        results = self.pre_attention_ops(input, right_context_blocks)
        rc_length, input_length, residual_input, query_input, rc_and_input = results
        if self.use_mem:
            # unlike decoding, every summary query is kept
            summary_query = self.gen_summary_queries(query_input)
            rc_qu_su = torch.cat([rc_and_input, summary_query], dim=0)
        else:
            rc_qu_su = rc_and_input

        rc_output, next_m, next_k, next_v = self.attention(
            input=rc_qu_su,
            lengths=lengths,
            mems=mems,
            attention_mask=attention_mask,
            pre_mems=pre_mems,
            left_context_key=left_context_key,
            left_context_val=left_context_val,
            rpe=rpe,
        )

        # [TODO] Note memory did not go through pos_ff. What happen if we pass
        # memory through the pos_ff as well?
        rc_output = self.after_attention_ops(rc_output, residual_input)
        results = (
            rc_output[-input_length:, :, :],
            next_m,
            rc_output[0:rc_length, :, :],
            next_k,
            next_v,
        )

        return results
def __init__(
    self,
    input_dim,
    num_heads,
    ffn_dim,
    num_layers=1,
    dropout_in_attn=0.0,
    dropout_on_attn=0.0,
    dropout_on_fc1=0.0,
    dropout_on_fc2=0.0,
    segment_size=128,
    context_config=(0, 0),
    max_memory_size=0,
    scaled_init=True,
    std_scale=None,
    activation_fn="relu",
    tanh_on_mem=False,
    mini_batches=False,
    negative_inf="-inf",
    deep_init=True,
    summarization_method="mean",
    max_relative_position=0,
    rpe_old_option=True,
):
    """
    Build the master layer: validate the configuration, record the
    segment/context/memory settings and stack ``num_layers`` augmented
    memory transformers that share them.

    Raises:
        ValueError: if ``input_dim`` is not divisible by ``num_heads`` or
            ``max_memory_size`` is negative.
    """
    super().__init__(None)

    if input_dim % num_heads != 0:
        raise ValueError(
            "input_dim ({}) must be divisible by num_heads ({})".format(
                input_dim, num_heads
            )
        )

    # Growing memory used to be supported, but it breaks cross stream
    # batching, so an exact maximum memory size is required now.
    if max_memory_size < 0:
        raise ValueError("max_memory_size must be >= 0")

    # Only assign right_context. In decoding, left context will be cached.
    # No need to let the online decoder to re-assign the left context
    self.left_context, self.right_context = context_config
    self.segment_size = segment_size
    self.memory_dim = input_dim
    self.max_memory_size = max_memory_size
    self.mini_batches = mini_batches
    # memory (and therefore summary queries) is only used with a non-zero bank
    self.use_mem = self.max_memory_size != 0

    self.memory_op = SummarizationLayer(
        summarization_method, segment_size, input_dim
    )

    self.layers = torch.nn.ModuleList()
    self.num_layers = num_layers
    self.max_relative_position = max_relative_position
    # relative position embedding is enabled by a positive max distance
    self.use_rpe = self.max_relative_position > 0

    for depth in range(self.num_layers):
        # with deep_init each layer is told its own depth; otherwise -1
        transformer = NoSegAugmentedMemoryTransformer(
            num_heads=num_heads,
            input_dim=input_dim,
            ffn_dim=ffn_dim,
            dropout_in_attn=dropout_in_attn,
            dropout_on_attn=dropout_on_attn,
            dropout_on_fc1=dropout_on_fc1,
            dropout_on_fc2=dropout_on_fc2,
            segment_size=segment_size,
            std_scale=std_scale,
            activation_fn=activation_fn,
            tanh_on_mem=tanh_on_mem,
            scaled_init=scaled_init,
            use_mem=self.use_mem,
            mini_batches=mini_batches,
            negative_inf=negative_inf,
            layer_index=depth if deep_init else -1,
            summarization_method=summarization_method,
            max_relative_position=max_relative_position,
            rpe_old_option=rpe_old_option,
        )
        self.layers.append(transformer)
For memory bank in key, # the distance with all other positions is 0. T, B, D = input.shape num_segs = math.ceil((T - self.right_context) / self.segment_size) # utterance u_st = past_length * self.segment_size u_ed = u_st + T utterance_ranges = torch.arange(u_st, u_ed - self.right_context) # left context. Only in minibatch or decoding left_context_ranges = torch.arange(u_st - left_context_length, u_st) # Right context block # right context + utterance right_context_blocks = [] for i in range(0, num_segs - 1): st = (i + 1) * self.segment_size + u_st ed = st + self.right_context assert ed < u_ed temp = torch.arange(st, ed) right_context_blocks.append(temp) right_context_blocks.append(torch.arange(u_ed - self.right_context, u_ed)) right_context_ranges = torch.cat(right_context_blocks) if self.use_mem: # Memory bank # The position for memory -n, .., -1 if is_decoding: memory_size = min(past_length, self.max_memory_size) else: memory_size = num_segs + past_length - 1 memory_bank_ranges = torch.arange( -max_relative_position - 1, -max_relative_position - 1 - memory_size, -1 ) # summary vector # The position for summary vector as the T+max_relative_position+1. 
def _get_attention_mask(self, input, past_length=0, left_context_cache=0):
    """
    Build the boolean attention mask used in training.

    Rows (queries) are ordered [right_context_blocks, utterance, summary];
    columns (keys) are ordered [memory, right_context_blocks, utterance].
    For every segment the key axis is described by nine consecutive parts:
    [before memory, memory, after memory,
     before right context, right context, after right context,
     before segment, segment (with left context), after segment].
    The memory parts and the summary rows only exist when ``self.use_mem``.

    Putting memory and right context before the segment is convenient for
    the padding mask: key lengths = m_length + right_context_block_length
    + lengths.

    The returned mask is True where attention is NOT allowed.
    """
    utterance_length, _, _ = input.shape
    num_segs = math.ceil(utterance_length / self.segment_size)
    rc = self.right_context
    lc = self.left_context
    rc_length = rc * num_segs
    # with mini-batches a left context cache precedes the current sequence
    lcc = left_context_cache

    # past_length is the memory carried over from the previous sequence;
    # without a memory bank there is neither memory nor summary
    mem_length = num_segs - 1 + past_length if self.use_mem else 0

    def _fill(rows, cols, visible):
        # zeros follow input's dtype/device, ones use the default dtype
        if visible:
            return torch.ones(rows, cols, device=input.device)
        return input.new_zeros(rows, cols)

    rc_mask = []
    query_mask = []
    summary_mask = []
    for seg in range(num_segs):
        seg_rows = min(
            self.segment_size, utterance_length - seg * self.segment_size
        )

        # (width, visible to right-context/utterance rows, visible to summary row)
        columns = []
        if self.use_mem:
            mem_end = seg + past_length
            mem_start = max(mem_end - self.max_memory_size, 0)
            columns.append((mem_start, False, False))
            # based on D22875746, disable summary query attention
            # on memory is better for long form utterance
            columns.append((mem_end - mem_start, True, False))
            columns.append((mem_length - mem_end, False, False))

        rc_start = seg * rc
        rc_end = rc_start + rc
        columns.append((rc_start, False, False))
        columns.append((rc, True, True))
        columns.append((rc_length - rc_end, False, False))

        # right context lives in its own block, so only the left context
        # has to be added to the segment span here
        seg_start = max(seg * self.segment_size + lcc - lc, 0)
        seg_end = min((seg + 1) * self.segment_size + lcc, utterance_length + lcc)
        columns.append((seg_start, False, False))
        columns.append((seg_end - seg_start, True, True))
        columns.append((utterance_length + lcc - seg_end, False, False))

        rc_mask.append(
            torch.cat([_fill(rc, width, on) for width, on, _ in columns], dim=1)
        )
        query_mask.append(
            torch.cat(
                [_fill(seg_rows, width, on) for width, on, _ in columns], dim=1
            )
        )
        summary_mask.append(
            torch.cat([_fill(1, width, on) for width, _, on in columns], dim=1)
        )

    row_groups = [torch.cat(rc_mask, dim=0), torch.cat(query_mask, dim=0)]
    # no memory, then we don't need summary either
    if self.use_mem:
        row_groups.append(torch.cat(summary_mask, dim=0))

    allowed = torch.cat(row_groups, dim=0)
    return (1 - allowed).to(torch.bool)
@torch.jit.export
def reset_state(self, state: List[Tensor], indices: Tensor) -> List[Tensor]:
    """
    Zero out the state entries of the given batch positions.

    The state is [memory, left_context_key, left_context_val, past_length];
    the first three carry the batch on dim 2 and past_length on dim 1.
    Non-selected batch positions are returned unchanged (new tensors are
    produced; the inputs are not modified in place).

    args:
        state: the four state tensors, or an empty list
        indices: batch positions to reset

    returns:
        the reset state, or an empty list when ``state`` is empty
        (same convention as ``reorder_state``).
    """
    if len(state) == 0:
        return []
    m, lc_key, lc_val, past_length = state
    # index_fill needs the index on the same device as the tensor it fills
    indices = indices.to(device=m.device)
    m = m.index_fill(dim=2, index=indices, value=0.0)
    lc_key = lc_key.index_fill(dim=2, index=indices, value=0.0)
    lc_val = lc_val.index_fill(dim=2, index=indices, value=0.0)
    past_length = past_length.index_fill(dim=1, index=indices, value=0)

    return [m, lc_key, lc_val, past_length]
def _gen_segs_right_context(self, input, lengths):
    """
    Split a right-context-padded utterance into per-segment inputs.

    ``input`` is assumed to be [utterance, right_context] along dim 0 and
    ``lengths`` the utterance lengths without the right context. Every
    returned segment carries ``self.right_context`` extra frames after its
    ``self.segment_size`` frames; the last segment takes whatever is left.

    returns:
        list of (segment_tensor, segment_lengths) tuples where
        segment_lengths already includes the right context.
    """
    segments = []
    T, B, D = input.size()
    nT = T - self.right_context

    # assume input is right context padded
    num_segs = math.ceil(nT / self.segment_size)
    remaining = lengths
    for i in range(num_segs - 1):
        st = i * self.segment_size
        ed = min(T, st + self.segment_size + self.right_context)
        # what is still left for the following segments, per sample
        rest_lengths = torch.clamp(
            remaining - self.segment_size,
            min=0,
            max=nT - (i + 1) * self.segment_size,
        )
        segments.append(
            (input[st:ed, :, :], remaining - rest_lengths + self.right_context)
        )
        remaining = rest_lengths

    # The last segment starts where the previous one ended. When the whole
    # utterance fits into one segment the loop above never runs, so the start
    # and the remaining lengths must not depend on loop variables.
    last_start = max(num_segs - 1, 0) * self.segment_size
    last_seg = input[last_start:, :, :]
    segments.append((last_seg, remaining + self.right_context))
    return segments
def forward_jit_mini_batch_init(
    self,
    seg: Tensor,
    state: Optional[List[Tensor]] = None,
    is_decoding: bool = False,
):
    """
    Prepare state and first-layer memory for mini-batch training or decoding.

    In whole sequence training the state is ignored; here a fresh state is
    created when none is given (cast to half for half-precision input).

    returns:
        (state, mems, state_mems, past_length, past_left_context) where
        mems is the memory fed to the first layer, state_mems is the stored
        first-layer memory extended with the new summaries, past_length is
        the number of usable memory entries (capped at max_memory_size) and
        past_left_context the number of usable cached left-context frames
        (capped at left_context).
    """
    if state is None:
        state = self.init_state(batch_size=seg.size(1), device=seg.device)
        if seg.dtype == torch.half:
            state = [state[0].half(), state[1].half(), state[2].half(), state[3]]

    stored_mems = state[0][0]
    if not self.use_mem:
        mems = stored_mems
        state_mems = stored_mems
    else:
        # note input average only on seg, not on right context
        # first layer uses each segment mean as memory
        summaries = self.gen_summary_queries(seg)
        if is_decoding:
            mems = summaries[0:1, :, :]
            state_mems = torch.cat([stored_mems, mems], dim=0)
        else:
            # the last segment average is only kept in the state
            mems = summaries[:-1, :, :]
            state_mems = torch.cat([stored_mems, summaries], dim=0)

    # track processed segment number or memory number
    # samples in the same batch share the same past length
    processed = state[3][0][0].item()
    past_left_context = min(processed * self.segment_size, self.left_context)
    past_length = min(self.max_memory_size, processed)

    return state, mems, state_mems, past_length, past_left_context
def state_update_after_loop(
    self,
    state: List[Tensor],
    mems_list: List[Tensor],
    lc_key_list: List[Tensor],
    lc_val_list: List[Tensor],
    update_length: int,
):
    """
    Write the per-layer results collected during the layer loop back into
    ``state`` (in place) and advance the processed-segment counter.

    Each list is stacked along a new leading dimension to form state[0]
    (memory), state[1] (left context keys) and state[2] (left context
    values); state[3] is increased by ``update_length``.

    returns:
        the same ``state`` list that was passed in.
    """
    stacked_mems = torch.stack(mems_list, dim=0)
    stacked_keys = torch.stack(lc_key_list, dim=0)
    stacked_vals = torch.stack(lc_val_list, dim=0)
    advanced_length = state[3] + update_length

    state[0] = stacked_mems
    state[1] = stacked_keys
    state[2] = stacked_vals
    state[3] = advanced_length
    return state
def forward_jit_test(
    self, input: Tensor, lengths: Tensor, state: Optional[List[Tensor]] = None
) -> Tuple[Tensor, Tensor, List[Tensor]]:
    """
    Simulate the sequence encoder's ``forward_jit`` over a whole utterance.

    This is for unit test purposes only; it is not used in training or
    decoding. The utterance is cut into segments with
    ``_gen_segs_right_context``, each segment is run through ``forward_jit``
    while the state is threaded from one segment to the next, and the
    per-segment outputs are joined back into one sequence.

    Note, extra_right_context is set in the model. In unit test,
    input = [utterance, right_context], lengths = [utterance_length].

    args:
        input: input utterance
        lengths: utterance input length
        state: ignored; decoding always starts from an empty (None) state

    returns:
        (encoded tokens, encoded lengths, []) — the returned state is
        always an empty list.
    """

    # [TODO] sequence_to_segment has bug in lengths.
    seg_src_tokens_lengths = self._gen_segs_right_context(input, lengths)

    seg_enc_tokens_lengths: List[Tuple[Tensor, Tensor]] = []
    # the caller-provided state is discarded on purpose: the whole
    # utterance is decoded from scratch
    state: Optional[List[Tensor]] = None
    for seg_src_tokens, seg_src_lengths in seg_src_tokens_lengths:
        seg_enc_tokens, seg_enc_lengths, state = self.forward_jit(
            input=seg_src_tokens, lengths=seg_src_lengths, state=state
        )
        seg_enc_tokens_lengths.append((seg_enc_tokens, seg_enc_lengths))

    # stitch the per-segment outputs back together along the time axis
    enc_tokens, enc_lengths = segments_to_sequence(
        segments=seg_enc_tokens_lengths, time_axis=0
    )

    state = []  # returns trivial state

    return enc_tokens, enc_lengths, state
To improve throughput, in addition to memory, we also cache key and value for left_context in multihead self-attention """ # In online decoding, input = [segment, right_context] # Lengths = [segment_length, right_context_length] # so we need strip right context in output T, B, D = input.size() rc_str = T - self.right_context rc_end = T right_context_blocks = input[rc_str:rc_end, :, :] seg = input[:rc_str, :, :] lengths = torch.clamp(lengths - self.right_context, min=0) mems_list = [] lc_key_list = [] lc_val_list = [] results = self.forward_jit_mini_batch_init(seg, state, True) state, mems, state_mems, past_length, past_left_context = results # relative position embedding if self.use_rpe: rpe = self._get_relative_position( input=input, max_relative_position=self.max_relative_position, left_context_length=past_left_context, past_length=past_length, is_decoding=True, ) else: rpe = None # memory for first layer. mems_list.append(state_mems[-self.max_memory_size :, :, :]) output = seg i = 0 for layer in self.layers: # In order to make cross stream batching work, mem, left context key # and left context value in the state should always be the same shape. # We use the past length to track the processed segment number. In this # way, we take out the essential memory, left context key and left # context val from the state. After finish the forward for current segment # we add the new memory, left context key and left context value into the # staate and trim out the oldest part to keep the shape consistent. 
def quantize_(self, params=None):
    """Dynamically quantize every ``torch.nn.Linear`` of this module in place.

    args:
        params: optional mapping. When it contains a truthy ``"per_channel"``
            entry the per-channel dynamic qconfig is used, otherwise the
            default dynamic qconfig.

    returns:
        ``self``, after in-place quantization to ``torch.qint8``
    """
    wants_per_channel = (
        bool(params) and "per_channel" in params and bool(params["per_channel"])
    )
    qconfig = (
        per_channel_dynamic_qconfig if wants_per_channel else default_dynamic_qconfig
    )
    linear_spec = {torch.nn.Linear: qconfig}
    quantization.quantize_dynamic(self, linear_spec, dtype=torch.qint8, inplace=True)
    return self
dropout_on_fc2=args.dropout, activation_fn=args.activation_fn, context_config=context_config, segment_size=args.segment_length, max_memory_size=args.max_memory_size, scaled_init=True, # TODO: use constant for now. tanh_on_mem=args.amtrf_tanh_on_mem, ) ] ) def forward(self, src_tokens, src_lengths): encoder_out = super().forward(src_tokens, src_lengths) output = encoder_out["encoder_out"][0] encoder_padding_masks = encoder_out["encoder_padding_mask"][0] # This is because that in the original implementation # the output didn't consider the last segment as right context. encoder_padding_masks = encoder_padding_masks[:, : output.size(0)] return { "encoder_out": [output], "encoder_padding_mask": [encoder_padding_masks], "encoder_embedding": [], "encoder_states": [], "src_tokens": [], "src_lengths": [], } @staticmethod def conv_layer_stride(args): # TODO: make it configurable from the args return 4 SpeechEncoder.__name__ = klass.__name__ return SpeechEncoder ================================================ FILE: fairseq/models/speech_to_text/multi_modality_model.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class MultiModalityEncoder(FairseqEncoder):
    """Container for several encoders fed by samples of different modalities.

    For each forward call exactly one encoder is selected through
    ``select_encoder`` and the call is delegated to it.
    """

    def __init__(self, dictionary):
        super().__init__(dictionary)

    def select_encoder(self, mode, **kwargs):
        """Return ``(encoder, kwargs)`` for the given ``mode``.

        Subclasses must override this; ``forward`` unpacks the result into the
        encoder to call and the keyword arguments to pass to it.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Model must implement the select_encoder method")

    # get sample data from JointSpeechTextDataset
    def forward(self, src_tokens, src_lengths=None, mode="", **kwargs):
        """Run the encoder selected for ``mode`` on the given source batch."""
        encoder, kwargs = self.select_encoder(mode, **kwargs)
        return encoder(src_tokens, src_lengths, **kwargs)


class MultiInputDecoder(FairseqDecoder):
    """Container for several decoders fed by samples of different modalities.

    For each forward call exactly one decoder is selected through
    ``select_decoder`` and the call is delegated to it.
    """

    def __init__(self, dictionary):
        super().__init__(dictionary)

    def select_decoder(self, mode, **kwargs):
        """Return ``(decoder, kwargs)`` for the given ``mode``.

        Subclasses must override this; ``forward`` unpacks the result into the
        decoder to call and the keyword arguments to pass to it.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Model must implement the select_decoder method")

    def forward(
        self, prev_output_tokens, encoder_out, incremental_state=None, mode="", **kwargs
    ):
        """Run the decoder selected for ``mode``, forwarding ``incremental_state``."""
        decoder, kwargs = self.select_decoder(mode, **kwargs)
        return decoder(
            prev_output_tokens,
            encoder_out,
            incremental_state=incremental_state,
            **kwargs
        )
class S2TConformerEncoder(FairseqEncoder):
    """Conformer Encoder for speech translation based on https://arxiv.org/abs/2005.08100"""

    def __init__(self, args):
        """Build the subsampler, positional encoding and conformer layers.

        Args:
            args: model arguments. The attributes read here are
                ``encoder_freezing_updates``, ``encoder_embed_dim``,
                ``no_scale_embedding``, ``conv_version`` (plus the matching
                ``conv_*`` / ``input_*`` settings), ``pos_enc_type``,
                ``max_source_positions``, ``dropout``,
                ``encoder_ffn_embed_dim``, ``encoder_attention_heads``,
                ``depthwise_conv_kernel_size``, ``attn_type``, ``fp16`` and
                ``encoder_layers``.

        Raises:
            ValueError: if ``args.conv_version`` is neither
                ``"s2t_transformer"`` nor ``"convtransformer"``.
        """
        super().__init__(None)

        self.encoder_freezing_updates = args.encoder_freezing_updates
        self.num_updates = 0

        self.embed_scale = math.sqrt(args.encoder_embed_dim)
        if args.no_scale_embedding:
            self.embed_scale = 1.0
        self.padding_idx = 1
        self.conv_version = args.conv_version
        if self.conv_version == "s2t_transformer":
            self.subsample = Conv1dSubsampler(
                args.input_feat_per_channel * args.input_channels,
                args.conv_channels,
                args.encoder_embed_dim,
                [int(k) for k in args.conv_kernel_sizes.split(",")],
            )
        elif self.conv_version == "convtransformer":
            self.subsample = Conv2dSubsampler(
                args.input_channels,
                args.input_feat_per_channel,
                args.conv_out_channels,
                args.encoder_embed_dim,
            )
        else:
            # Fail at construction time instead of leaving ``self.subsample``
            # undefined and crashing with an AttributeError in ``_forward``.
            raise ValueError(
                f"unsupported conv_version: {self.conv_version!r} "
                "(expected 's2t_transformer' or 'convtransformer')"
            )
        self.pos_enc_type = args.pos_enc_type
        if self.pos_enc_type == "rel_pos":
            self.embed_positions = RelPositionalEncoding(
                args.max_source_positions, args.encoder_embed_dim
            )
        elif self.pos_enc_type == "rope":
            self.embed_positions = None
        else:  # Use absolute positional embedding
            # Any other value is normalized to "abs".
            self.pos_enc_type = "abs"
            self.embed_positions = PositionalEmbedding(
                args.max_source_positions, args.encoder_embed_dim, self.padding_idx
            )

        self.linear = torch.nn.Linear(args.encoder_embed_dim, args.encoder_embed_dim)
        self.dropout = torch.nn.Dropout(args.dropout)
        self.conformer_layers = torch.nn.ModuleList(
            [
                ConformerEncoderLayer(
                    embed_dim=args.encoder_embed_dim,
                    ffn_embed_dim=args.encoder_ffn_embed_dim,
                    attention_heads=args.encoder_attention_heads,
                    dropout=args.dropout,
                    depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
                    attn_type=args.attn_type,
                    pos_enc_type=self.pos_enc_type,
                    use_fp16=args.fp16,
                )
                for _ in range(args.encoder_layers)
            ]
        )

    def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
        """
        Args:
            src_tokens: Input source tokens Tensor, passed unchanged to the
                subsampler (documented upstream as B X T X C)
            src_lengths: Lengths Tensor corresponding to input source tokens
            return_all_hiddens: If true, the output of every conformer layer
                is appended to ``encoder_states``
        Returns:
            A dict whose values are lists:
            encoder_out: one Tensor of shape T X B X C
            encoder_padding_mask: one B X T mask, or an empty list when no
                position is padded
            encoder_embedding: always empty here
            encoder_states: per-layer outputs (T X B X C each); empty unless
                ``return_all_hiddens`` is set
            src_tokens: always empty here
            src_lengths: always empty here
        """
        x, input_lengths = self.subsample(src_tokens, src_lengths)  # returns T X B X C
        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        x = self.embed_scale * x
        if self.pos_enc_type == "rel_pos":
            # Relative positions are handed to every layer instead of being
            # added to the input.
            positions = self.embed_positions(x)

        elif self.pos_enc_type == "rope":
            positions = None

        else:
            # Absolute positions are added to the input once; the layers then
            # receive no positional tensor.
            positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
            x += positions
            positions = None

        x = self.linear(x)
        x = self.dropout(x)
        encoder_states = []

        # x is T X B X C
        for layer in self.conformer_layers:
            x, _ = layer(x, encoder_padding_mask, positions)
            if return_all_hiddens:
                encoder_states.append(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask]
            if encoder_padding_mask.any()
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }

    def forward(self, src_tokens, src_lengths, return_all_hiddens=False):
        """Run ``_forward``, without gradients while the encoder is frozen.

        The encoder counts as frozen as long as ``num_updates`` is below
        ``encoder_freezing_updates``.
        """
        if self.num_updates < self.encoder_freezing_updates:
            with torch.no_grad():
                x = self._forward(
                    src_tokens,
                    src_lengths,
                    return_all_hiddens=return_all_hiddens,
                )
        else:
            x = self._forward(
                src_tokens,
                src_lengths,
                return_all_hiddens=return_all_hiddens,
            )
        return x

    def reorder_encoder_out(self, encoder_out, new_order):
        """Required method for a FairseqEncoder. Delegates to
        ``S2TTransformerEncoder.reorder_encoder_out``."""
        return S2TTransformerEncoder.reorder_encoder_out(self, encoder_out, new_order)

    def set_num_updates(self, num_updates):
        """Record the update count that ``forward`` uses for the freezing check."""
        super().set_num_updates(num_updates)
        self.num_updates = num_updates
"max_source_positions", 6000) args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) args.dropout = getattr(args, "dropout", 0.1) args.encoder_layers = getattr(args, "encoder_layers", 16) args.depthwise_conv_kernel_size = getattr(args, "depthwise_conv_kernel_size", 31) transformer_base_architecture(args) ================================================ FILE: fairseq/models/speech_to_text/s2t_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import math from pathlib import Path from typing import Dict, List, Optional, Tuple import torch import torch.nn as nn from torch import Tensor from fairseq import checkpoint_utils, utils from fairseq.data.data_utils import lengths_to_padding_mask from fairseq.models import ( FairseqEncoder, FairseqEncoderDecoderModel, register_model, register_model_architecture, ) from fairseq.models.speech_to_text.hub_interface import S2THubInterface from fairseq.models.speech_to_text.modules.convolution import ( Conv1dSubsampler, Conv2dSubsampler, ) from fairseq.models.transformer import Embedding, TransformerDecoder from fairseq.modules import ( FairseqDropout, LayerNorm, PositionalEmbedding, TransformerEncoderLayer, ) logger = logging.getLogger(__name__) @register_model("s2t_transformer") class S2TTransformerModel(FairseqEncoderDecoderModel): """Adapted Transformer model (https://arxiv.org/abs/1706.03762) for speech-to-text tasks. The Transformer encoder/decoder remains the same. 
@classmethod
def hub_models(cls):
    """Return a dict mapping each released model id to its archive URL."""
    base_url = "http://dl.fbaipublicfiles.com/fairseq/s2t"
    released_ids = (
        "s2t_transformer_s-en-asr-librispeech",
        "s2t_transformer_m-en-asr-librispeech",
        "s2t_transformer_l-en-asr-librispeech",
    )
    archive_map = {}
    for model_id in released_ids:
        # Every archive lives directly under the base URL as <id>.tar.gz.
        archive_map[model_id] = f"{base_url}/{model_id}.tar.gz"
    return archive_map
@classmethod
def build_encoder(cls, args):
    """Create the speech encoder, optionally initialized from a checkpoint.

    When ``args.load_pretrained_encoder_from`` is set and the path exists,
    the encoder weights are loaded from it; a missing path only logs a
    warning and the freshly built encoder is returned.
    """
    encoder = S2TTransformerEncoder(args)

    checkpoint_path = getattr(args, "load_pretrained_encoder_from", None)
    if checkpoint_path is None:
        return encoder

    if not Path(checkpoint_path).exists():
        logger.warning(
            f"skipped pretraining because {checkpoint_path} does not exist"
        )
        return encoder

    encoder = checkpoint_utils.load_pretrained_component_from_model(
        component=encoder, checkpoint=checkpoint_path
    )
    logger.info(f"loaded pretrained encoder from: {checkpoint_path}")
    return encoder
class S2TTransformerEncoder(FairseqEncoder):
    """Speech-to-text Transformer encoder that consists of input subsampler and
    Transformer encoder."""

    def __init__(self, args):
        """Build the subsampler, positional embedding and Transformer layers.

        Args:
            args: model arguments. An optional ``ctc_weight`` greater than 0
                adds a linear CTC projection of size ``args.tgt_dict_size``.

        Raises:
            ValueError: if ``args.conv_version`` is neither
                ``"s2t_transformer"`` nor ``"convtransformer"``.
        """
        super().__init__(None)

        self.encoder_freezing_updates = args.encoder_freezing_updates
        self.num_updates = 0

        self.dropout_module = FairseqDropout(
            p=args.dropout, module_name=self.__class__.__name__
        )
        self.embed_scale = math.sqrt(args.encoder_embed_dim)
        if args.no_scale_embedding:
            self.embed_scale = 1.0
        self.padding_idx = 1

        self.conv_version = args.conv_version
        if self.conv_version == "s2t_transformer":
            self.subsample = Conv1dSubsampler(
                args.input_feat_per_channel * args.input_channels,
                args.conv_channels,
                args.encoder_embed_dim,
                [int(k) for k in args.conv_kernel_sizes.split(",")],
            )
        elif self.conv_version == "convtransformer":
            self.subsample = Conv2dSubsampler(
                args.input_channels,
                args.input_feat_per_channel,
                args.conv_out_channels,
                args.encoder_embed_dim,
            )
        else:
            # Fail at construction time instead of leaving ``self.subsample``
            # undefined and crashing with an AttributeError in ``_forward``.
            raise ValueError(
                f"unsupported conv_version: {self.conv_version!r} "
                "(expected 's2t_transformer' or 'convtransformer')"
            )

        self.embed_positions = PositionalEmbedding(
            args.max_source_positions, args.encoder_embed_dim, self.padding_idx
        )

        self.transformer_layers = nn.ModuleList(
            [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)]
        )
        if args.encoder_normalize_before:
            self.layer_norm = LayerNorm(args.encoder_embed_dim)
        else:
            self.layer_norm = None

        # The CTC head exists only when a positive CTC weight is configured.
        self.ctc_proj = None
        if getattr(args, "ctc_weight", 0.0) > 0.0:
            self.ctc_proj = nn.Linear(args.encoder_embed_dim, args.tgt_dict_size)

    def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
        """Subsample, add positions, and run the Transformer layers.

        Returns:
            A dict of lists in the usual encoder-output layout.
            ``encoder_padding_mask`` is empty when no position is padded, and
            ``encoder_states`` holds one entry per layer only when
            ``return_all_hiddens`` is set.
        """
        x, input_lengths = self.subsample(src_tokens, src_lengths)
        x = self.embed_scale * x

        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
        x += positions
        x = self.dropout_module(x)

        encoder_states = []

        for layer in self.transformer_layers:
            x = layer(x, encoder_padding_mask)
            if return_all_hiddens:
                # Collected before the optional final layer norm is applied.
                encoder_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask]
            if encoder_padding_mask.any()
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }

    def forward(self, src_tokens, src_lengths, return_all_hiddens=False):
        """Run ``_forward``, without gradients while the encoder is frozen.

        The encoder counts as frozen as long as ``num_updates`` is below
        ``encoder_freezing_updates``.
        """
        if self.num_updates < self.encoder_freezing_updates:
            with torch.no_grad():
                x = self._forward(
                    src_tokens, src_lengths, return_all_hiddens=return_all_hiddens
                )
        else:
            x = self._forward(
                src_tokens, src_lengths, return_all_hiddens=return_all_hiddens
            )
        return x

    def reorder_encoder_out(self, encoder_out, new_order):
        """Reorder the batch dimension of an encoder output by ``new_order``.

        ``encoder_out`` and ``encoder_states`` entries are indexed along
        dim 1, padding masks and embeddings along dim 0. The
        ``encoder_states`` list of the input dict is updated in place and
        reused in the returned dict.
        """
        new_encoder_out = (
            []
            if len(encoder_out["encoder_out"]) == 0
            else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
        )

        new_encoder_padding_mask = (
            []
            if len(encoder_out["encoder_padding_mask"]) == 0
            else [
                x.index_select(0, new_order)
                for x in encoder_out["encoder_padding_mask"]
            ]
        )

        new_encoder_embedding = (
            []
            if len(encoder_out["encoder_embedding"]) == 0
            else [
                x.index_select(0, new_order) for x in encoder_out["encoder_embedding"]
            ]
        )

        encoder_states = encoder_out["encoder_states"]
        if len(encoder_states) > 0:
            for idx, state in enumerate(encoder_states):
                encoder_states[idx] = state.index_select(1, new_order)

        return {
            "encoder_out": new_encoder_out,  # T x B x C
            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
            "encoder_embedding": new_encoder_embedding,  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],  # B x T
            "src_lengths": [],  # B x 1
        }

    def set_num_updates(self, num_updates):
        """Record the update count that ``forward`` uses for the freezing check."""
        super().set_num_updates(num_updates)
        self.num_updates = num_updates
"decoder_attention_heads", 8) args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True) args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False) args.dropout = getattr(args, "dropout", 0.1) args.attention_dropout = getattr(args, "attention_dropout", args.dropout) args.activation_dropout = getattr(args, "activation_dropout", args.dropout) args.activation_fn = getattr(args, "activation_fn", "relu") args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None) args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0) args.share_decoder_input_output_embed = getattr( args, "share_decoder_input_output_embed", False ) args.no_token_positional_embeddings = getattr( args, "no_token_positional_embeddings", False ) args.adaptive_input = getattr(args, "adaptive_input", False) args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0) args.decoder_output_dim = getattr( args, "decoder_output_dim", args.decoder_embed_dim ) args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim) args.no_scale_embedding = getattr(args, "no_scale_embedding", False) args.quant_noise_pq = getattr(args, "quant_noise_pq", 0) @register_model_architecture("s2t_transformer", "s2t_transformer_s") def s2t_transformer_s(args): args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 8) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4) args.dropout = getattr(args, "dropout", 0.1) base_architecture(args) @register_model_architecture("s2t_transformer", "s2t_transformer_xs") def s2t_transformer_xs(args): args.encoder_layers = getattr(args, "encoder_layers", 6) args.decoder_layers = getattr(args, "decoder_layers", 3) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 256 * 4) args.dropout = getattr(args, "dropout", 
0.3) s2t_transformer_s(args) @register_model_architecture("s2t_transformer", "s2t_transformer_sp") def s2t_transformer_sp(args): args.encoder_layers = getattr(args, "encoder_layers", 16) s2t_transformer_s(args) @register_model_architecture("s2t_transformer", "s2t_transformer_m") def s2t_transformer_m(args): args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 512 * 4) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8) args.dropout = getattr(args, "dropout", 0.15) base_architecture(args) @register_model_architecture("s2t_transformer", "s2t_transformer_mp") def s2t_transformer_mp(args): args.encoder_layers = getattr(args, "encoder_layers", 16) s2t_transformer_m(args) @register_model_architecture("s2t_transformer", "s2t_transformer_l") def s2t_transformer_l(args): args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024) args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024 * 4) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16) args.dropout = getattr(args, "dropout", 0.2) base_architecture(args) @register_model_architecture("s2t_transformer", "s2t_transformer_lp") def s2t_transformer_lp(args): args.encoder_layers = getattr(args, "encoder_layers", 16) s2t_transformer_l(args) ================================================ FILE: fairseq/models/speech_to_text/s2t_wav_transformer.py ================================================ #!/usr/bin/env python3 import math import torch import torch.nn as nn from fairseq.data.data_utils import compute_mask_indices from fairseq.models import FairseqEncoder from fairseq.models.wav2vec import ConvFeatureExtractionModel from fairseq.modules import GradMultiply, LayerNorm, SamePad, TransformerEncoderLayer # Transformer encoder with 
wave input, it is adopted from wav2vec 2.0 Encoder. # use wav input # use trained position embedding so it is easier to match with text input class SpeechWavTransformerEncoder(FairseqEncoder): # extra parameters for speech encoder besides those defined in transformermodel @staticmethod def add_args(parser): parser.add_argument( "--dropout-input", type=float, metavar="D", help="dropout to apply to the input (after feat extr)", ) parser.add_argument( "--dropout-features", type=float, metavar="D", help="dropout to apply to the unmasked features (after feat extr)", ) parser.add_argument( "--speech-extractor-mode", type=str, default="layer_norm", choices=["default", "layer_norm"], help="feature extractor norm", ) parser.add_argument( "--speech-conv-bias", action="store_true", help="include bias in speech conv encoder", ) parser.add_argument( "--conv-feature-layers", default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]", help="string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]", ) parser.add_argument( "--speech-mask-length", type=int, help="repeat the mask indices multiple times", ) parser.add_argument( "--speech-mask-prob", type=float, help="probability of replacing a token with mask", ) parser.add_argument( "--speech-mask-selection", type=str, choices=["static", "uniform", "normal", "poisson"], help="how to choose masks", ) parser.add_argument( "--speech-mask-other", type=float, help="stdev of the mask length in case of 'normal' selection strategy", ) parser.add_argument( "--speech-no-mask-overlap", action="store_true", help="whether to allow masks to overlap", ) parser.add_argument( "--speech-mask-min-space", type=int, help="min space between spans (if no overlap is enabled)", ) parser.add_argument( "--speech-mask-channel-length", type=int, help="repeat the mask indices multiple times", ) parser.add_argument( "--speech-mask-channel-prob", type=float, help="probability of 
replacing a token with mask", ) parser.add_argument( "--speech-mask-channel-selection", type=str, choices=["static", "uniform", "normal", "poisson"], help="how to choose masks", ) parser.add_argument( "--speech-mask-channel-other", type=float, help="stdev of the mask length in case of 'normal' selection strategy", ) parser.add_argument( "--speech-no-mask-channel-overlap", action="store_true", help="whether to allow masks to overlap", ) parser.add_argument( "--no-scale-feature", action="store_true", help="no scale for the calculated features", ) parser.add_argument( "--speech-mask-channel-min-space", type=int, help="min space between spans (if no overlap is enabled)", ) parser.add_argument( "--feature-grad-mult", type=float, help="reset feature grad mult in wav2vec 2.0 to this", ) # positional embeddings parser.add_argument( "--conv-pos", type=int, default=128, help="number of filters for convolutional positional embeddings", ) parser.add_argument( "--conv-pos-groups", type=int, default=16, help="number of groups for convolutional positional embedding", ) # model configures parser.add_argument( "--speech-encoder-layers", type=int, help="number of speech encoder layers", ) parser.add_argument( "--text-encoder-layers", type=int, help="number of text encoder layers", ) def __init__(self, args, alway_mask=False): super().__init__(args) self.args = args self.dropout = args.dropout self.embedding_dim = args.encoder_embed_dim self.feat_scale = math.sqrt(args.encoder_embed_dim) if args.no_scale_feature: self.feat_scale = 1.0 subsample = ConvFeatureExtractionModel( conv_layers=eval(args.conv_feature_layers), dropout=0.0, mode=args.speech_extractor_mode, # default, layer_norm conv_bias=args.speech_conv_bias, ) self.feature_enc_layers = eval(args.conv_feature_layers) self.subsample = subsample self.feat_proj = ( nn.Linear(self.feature_enc_layers[-1][0], self.embedding_dim) if self.feature_enc_layers[-1][0] != self.embedding_dim else None ) self.feat_layer_norm = 
LayerNorm(self.feature_enc_layers[-1][0]) self.embed_positions = nn.Conv1d( self.embedding_dim, self.embedding_dim, kernel_size=args.conv_pos, padding=args.conv_pos // 2, groups=args.conv_pos_groups, ) std = math.sqrt(4 / (args.conv_pos * self.embedding_dim)) nn.init.normal_(self.embed_positions.weight, mean=0, std=std) nn.init.constant_(self.embed_positions.bias, 0) self.embed_positions = nn.utils.weight_norm( self.embed_positions, name="weight", dim=2 ) self.embed_positions = nn.Sequential( self.embed_positions, SamePad(args.conv_pos), nn.GELU() ) self.mask_prob = args.speech_mask_prob self.mask_selection = args.speech_mask_selection self.mask_other = args.speech_mask_other self.mask_length = args.speech_mask_length self.no_mask_overlap = args.speech_no_mask_overlap self.mask_min_space = args.speech_mask_min_space self.mask_channel_prob = args.speech_mask_channel_prob self.mask_channel_selection = args.speech_mask_channel_selection self.mask_channel_other = args.speech_mask_channel_other self.mask_channel_length = args.speech_mask_channel_length self.no_mask_channel_overlap = args.speech_no_mask_channel_overlap self.mask_channel_min_space = args.speech_mask_channel_min_space self.dropout_input = nn.Dropout(args.dropout_input) self.dropout_features = nn.Dropout(args.dropout_features) self.feature_grad_mult = args.feature_grad_mult self.mask_emb = nn.Parameter( torch.FloatTensor(args.encoder_embed_dim).uniform_() ) self.layers = nn.ModuleList( [TransformerEncoderLayer(args) for _ in range(args.encoder_layers)] ) self.layer_norm = LayerNorm(args.encoder_embed_dim) self.normalize_before = args.encoder_normalize_before self.alway_mask = alway_mask def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor): """ Computes the output length of the convolutional layers """ def _conv_out_length(input_length, kernel_size, stride): return torch.floor((input_length - kernel_size) / stride + 1) for i in range(len(self.feature_enc_layers)): input_lengths = 
_conv_out_length( input_lengths, self.feature_enc_layers[i][1], self.feature_enc_layers[i][2], ) return input_lengths.to(torch.long) def apply_mask(self, x, padding_mask): B, T, C = x.shape if self.mask_prob > 0: mask_indices = compute_mask_indices( (B, T), padding_mask, self.mask_prob, self.mask_length, self.mask_selection, self.mask_other, min_masks=2, no_overlap=self.no_mask_overlap, min_space=self.mask_min_space, ) mask_indices = torch.from_numpy(mask_indices).to(x.device) x[mask_indices] = self.mask_emb else: mask_indices = None if self.mask_channel_prob > 0: mask_channel_indices = compute_mask_indices( (B, C), None, self.mask_channel_prob, self.mask_channel_length, self.mask_channel_selection, self.mask_channel_other, no_overlap=self.no_mask_channel_overlap, min_space=self.mask_channel_min_space, ) mask_channel_indices = ( torch.from_numpy(mask_channel_indices) .to(x.device) .unsqueeze(1) .expand(-1, T, -1) ) x[mask_channel_indices] = 0 return x, mask_indices def forward( self, src_tokens, src_lengths, return_all_hiddens=False, padding_mask=None, features_only=True, ): mask = self.training or self.alway_mask if self.feature_grad_mult > 0 and self.training: features = self.subsample(src_tokens) if self.feature_grad_mult != 1.0: features = GradMultiply.apply(features, self.feature_grad_mult) else: with torch.no_grad(): features = self.subsample(src_tokens) features = features.transpose(1, 2) features = self.feat_layer_norm(features) if self.feat_proj is not None: features = self.feat_proj(features) if padding_mask is not None: input_lengths = (1 - padding_mask.long()).sum(-1) else: input_lengths = src_lengths # apply conv formula to get real output_lengths output_lengths = self._get_feat_extract_output_lengths(input_lengths) padding_mask = torch.zeros( features.shape[:2], dtype=features.dtype, device=features.device ) # these two operations makes sure that all values # before the output lengths indices are attended to padding_mask[ ( 
torch.arange(padding_mask.shape[0], device=padding_mask.device), output_lengths - 1, ) ] = 1 padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() features = self.feat_scale * features if self.feat_scale != 1.0 else features unmasked_features = features.clone() features = self.dropout_input(features) unmasked_features = self.dropout_features(unmasked_features) if mask: x, mask_indices = self.apply_mask(features, padding_mask) else: x = features mask_indices = None def cal_transformer_layers(x, encoder_padding_mask, return_all_hiddens=False): # x: B x T x C positions = self.embed_positions(x.transpose(1, 2)).transpose(1, 2) x = x + positions if not self.normalize_before: x = self.layer_norm(x) # B x T x C -> T x B x C x = x.transpose(0, 1) encoder_states = [] for layer in self.layers: x = layer(x, encoder_padding_mask) if return_all_hiddens: encoder_states.append(x) if self.normalize_before: x = self.layer_norm(x) return x, encoder_states x, encoder_states = cal_transformer_layers(x, padding_mask, return_all_hiddens) if features_only: return { "encoder_out": [x], # [T x B x C] "encoder_padding_mask": [padding_mask] if padding_mask is not None else [], # B x T "encoder_embedding": [], # "encoder_states": encoder_states, # List[T x B x C] "src_tokens": [], "src_lengths": [], "mask_indices": [mask_indices], } x_unmasked = x if self.mask_prob > 0 or self.mask_channel_prob > 0: x_unmasked, _ = cal_transformer_layers(unmasked_features, padding_mask) return { "encoder_out": [x], # [T x B x C] "encoder_unmasked_out": [x_unmasked], # [T x B x C] "encoder_padding_mask": [padding_mask] if padding_mask is not None else [], # B x T "encoder_embedding": [], # "encoder_states": encoder_states, # List[T x B x C] "src_tokens": [], "src_lengths": [], "mask_indices": [mask_indices] if mask_indices is not None else [], # B X T } def reorder_encoder_out(self, encoder_out, new_order): new_encoder_out = ( [] if len(encoder_out["encoder_out"]) == 0 else 
[x.index_select(1, new_order) for x in encoder_out["encoder_out"]] ) new_encoder_padding_mask = ( [] if len(encoder_out["encoder_padding_mask"]) == 0 else [ x.index_select(0, new_order) for x in encoder_out["encoder_padding_mask"] ] ) new_encoder_embedding = ( [] if len(encoder_out["encoder_embedding"]) == 0 else [ x.index_select(0, new_order) for x in encoder_out["encoder_embedding"] ] ) encoder_states = encoder_out["encoder_states"] if len(encoder_states) > 0: for idx, state in enumerate(encoder_states): encoder_states[idx] = state.index_select(1, new_order) return { "encoder_out": new_encoder_out, # T x B x C "encoder_padding_mask": new_encoder_padding_mask, # B x T "encoder_embedding": new_encoder_embedding, # B x T x C "encoder_states": encoder_states, # List[T x B x C] "src_tokens": [], # B x T "src_lengths": [], # B x 1 } class StackedSpeechWavTransformerEncoder(FairseqEncoder): def __init__(self, speech_enc, text_enc_layers, text_layer_norm): super().__init__(None) self.speech_encoder = speech_enc self.text_encoder_layers = text_enc_layers self.final_layer_norm = text_layer_norm def forward( self, src_tokens, src_lengths=None, return_all_hiddens=False, padding_mask=None, features_only=True, ): out = self.speech_encoder.forward( src_tokens, src_lengths, return_all_hiddens, padding_mask=padding_mask, features_only=features_only, ) x = out["encoder_out"][0] encoder_padding_mask = None if len(out["encoder_padding_mask"]) > 0: encoder_padding_mask = out["encoder_padding_mask"][0] def cal_text_layers(x, padding_mask, return_all_hiddens=False): encoder_states = [] for layer in self.text_encoder_layers: x = layer(x, padding_mask) if return_all_hiddens: encoder_states.append(x) if self.final_layer_norm is not None: x = self.final_layer_norm(x) return x, encoder_states x, encoder_states = cal_text_layers(x, encoder_padding_mask, return_all_hiddens) if features_only: return { "encoder_out": [x], # T x B x C "encoder_padding_mask": [encoder_padding_mask] if 
encoder_padding_mask is not None else [], # B x T "encoder_embedding": [], # B x T x C "encoder_states": encoder_states, # List[T x B x C] "src_tokens": [], "src_lengths": [], } x_u = out["encoder_unmasked_out"][0] x_u, _ = cal_text_layers(x_u, encoder_padding_mask) return { "encoder_out": [x], # [T x B x C] "encoder_unmasked_out": [x_u], # [T x B x C] "encoder_padding_mask": [encoder_padding_mask] if encoder_padding_mask is not None else [], # B x T "encoder_embedding": [], # "encoder_states": encoder_states, # List[T x B x C] "src_tokens": [], "src_lengths": [], "mask_indices": out["mask_indices"], # B X T } def reorder_encoder_out(self, encoder_out, new_order): return self.speech_encoder.reorder_encoder_out(encoder_out, new_order) ================================================ FILE: fairseq/models/speech_to_text/utils.py ================================================ # Copyright (c) 2017-present, Facebook, Inc. # All rights reserved. # # This source code is licensed under the license found in the LICENSE file in # the root directory of this source tree. An additional grant of patent rights # can be found in the PATENTS file in the same directory. 
import logging
from collections.abc import Iterable
from itertools import repeat
from typing import List, Optional, Tuple

import torch
from torch import Tensor


def _describe_value(value, name):
    """Render ``value`` for an error message as ``value`` or ``name (value)``."""
    if name is None:
        return "{}".format(value)
    return "{} ({})".format(name, value)


# ------------------------------------------------------------------------------
#   assert_equal()
# ------------------------------------------------------------------------------


def assert_equal(value1, value2, name1=None, name2=None):
    """Raise ``ValueError`` unless ``value1 == value2``.

    Args:
        value1, value2: values compared with ``!=``.
        name1, name2: optional labels; when given, the corresponding value is
            reported as ``name (value)`` in the error message.

    Raises:
        ValueError: if the two values differ.
    """
    if value1 != value2:
        raise ValueError(
            "Expected {} == {}".format(
                _describe_value(value1, name1), _describe_value(value2, name2)
            )
        )


def fill_config(config, key, value):
    """Store ``value`` under ``config[key]`` if unset, else check consistency.

    Nothing happens when ``value`` is None. Otherwise a missing or None entry
    is filled in, and an existing entry must equal ``value``.

    Raises:
        ValueError: if ``config[key]`` already holds a different value.
    """
    if value is not None:
        if key not in config or config[key] is None:
            config[key] = value
        assert_equal(value, config[key], "value", f'config["{key}"]')


# ------------------------------------------------------------------------------
#   check_and_return_expected()
# ------------------------------------------------------------------------------


def check_and_return_expected(value, undefined_value, expected_value, name=None):
    """
    Return the expected value while checking if the given value is undefined or
    equal to the expected value.

    Raises:
        ValueError: if ``value`` is defined (differs from ``undefined_value``)
            and is not equal to ``expected_value``.
    """
    if (undefined_value is None and value is None) or (undefined_value == value):
        return expected_value
    if value != expected_value:
        raise ValueError(
            "Expected {} == {}".format(_describe_value(value, name), expected_value)
        )
    return expected_value


# ------------------------------------------------------------------------------
#   get_time_axis()
# ------------------------------------------------------------------------------


def get_time_axis(layout):
    """
    Extract the time axis from the layout, for example for breaking sequence into
    segments.

    Raises:
        ValueError: for layouts other than TB, TBD, BT, BTD and BCTD.
    """
    if layout in ["TB", "TBD"]:
        return 0
    if layout in ["BT", "BTD"]:
        return 1
    if layout in ["BCTD"]:
        return 2
    raise ValueError("Unsupported layout = {}".format(layout))


# ------------------------------------------------------------------------------
#   get_batch_axis()
# ------------------------------------------------------------------------------


def get_batch_axis(layout):
    """
    Extract the batch axis from the layout

    Raises:
        ValueError: for layouts other than TB, TBD, BT, BTD and BCTD.
    """
    if layout in ["TB", "TBD"]:
        return 1
    if layout in ["BT", "BTD", "BCTD"]:
        return 0
    raise ValueError("Unsupported layout = {}".format(layout))


# ------------------------------------------------------------------------------
#   monotonically_increasing_and_bounded()
# ------------------------------------------------------------------------------


def monotonically_increasing_and_bounded(iterable, min=None, max=None):
    """
    Check if the elements in the given iterable are monotonically increasing and
    bounded by upper/lower bounds.

    The check is done in a single pass, so any iterable is accepted (including
    generators and iterators, which have no ``len()`` and cannot be indexed).

    Args:
        iterable: the elements to check.
        min: optional inclusive lower bound.
        max: optional inclusive upper bound.

    Returns:
        True if every element lies within the bounds and is strictly greater
        than its predecessor; False otherwise. An empty iterable yields True.

    Raises:
        TypeError: if ``iterable`` is not an ``Iterable``.
    """
    if not isinstance(iterable, Iterable):
        raise TypeError(
            "Expected iterable to be of type Iterable, got ({})".format(
                iterable.__class__.__name__
            )
        )
    has_previous = False
    previous = None
    for item in iterable:
        if min is not None and item < min:
            return False
        if max is not None and item > max:
            return False
        if has_previous and item <= previous:
            return False
        previous = item
        has_previous = True
    return True


# ------------------------------------------------------------------------------
#   to_pair()
# ------------------------------------------------------------------------------


def to_pair(value, name):
    """Make a pair (of type tuple) of given value.

    An iterable ``value`` is returned unchanged (its own type is kept) after
    checking that it has exactly two elements; any other value is repeated
    twice in a new tuple.

    Raises:
        ValueError: if ``value`` is iterable but does not have 2 elements.
    """
    if isinstance(value, Iterable):
        if len(value) != 2:
            raise ValueError(
                "Expected `{}` to have exactly 2 elements, got: ({})".format(
                    name, value
                )
            )
        return value
    return tuple(repeat(value, 2))


# ------------------------------------------------------------------------------
#   infer_conv_output_attrs()
# ------------------------------------------------------------------------------


# TODO(cfyeh): figure out if we can get `output_dim` without calling the module.
def infer_conv_output_attrs(
    module, input_channels, input_dim, batch_size=1, max_length=8
):
    """Get output attributes of a module with input.

    Runs ``module`` once on a random tensor of shape
    ``(batch_size, input_channels, max_length, input_dim)`` and reads the
    result's shape.

    Returns:
        ``(output_channels, output_dim)``: sizes of dimension 1 and of the
        last dimension of the module's output.
    """
    probe = torch.randn(batch_size, input_channels, max_length, input_dim)
    # Only the output shape is needed, so skip building an autograd graph.
    with torch.no_grad():
        output = module(probe)
    output_channels = output.shape[1]
    output_dim = output.shape[-1]
    return output_channels, output_dim


# ------------------------------------------------------------------------------
#   NoOp
# ------------------------------------------------------------------------------


class NoOp(torch.nn.Module):
    """
    NoOp simply passes the input as the output.
    """

    def __init__(self):
        super().__init__()

    def forward(self, input: Tensor) -> Tensor:
        return input


# ------------------------------------------------------------------------------
#   Permute: a torch.nn.Module applies permutation on the input tensor.
# ------------------------------------------------------------------------------ class Permute(torch.nn.Module): def __init__(self, dims): super().__init__() self.dims = dims def forward(self, input: Tensor) -> Tensor: return input.permute(self.dims).contiguous() # ------------------------------------------------------------------------------ # lengths_to_padding_mask() # ------------------------------------------------------------------------------ def lengths_to_padding_mask(lengths: Tensor) -> Tensor: """Convert lengths of shape (B, ) to padding mask.""" batch_size = lengths.shape[0] max_length = int(torch.max(lengths).item()) padding_mask = torch.arange( # [0, ..., T-1] max_length, device=lengths.device, dtype=lengths.dtype ).expand(batch_size, max_length) >= lengths.unsqueeze(1) return padding_mask # ------------------------------------------------------------------------------ # lengths_to_attention_mask() # ------------------------------------------------------------------------------ def lengths_to_attention_mask( lengths: Tensor, left_context: Optional[int] = None, right_context: Optional[int] = None, ) -> Optional[Tensor]: """ Generate attention mask based on (lengths, left_context, right_context). left_context is None means unlimited left context. right_context is None means unlimited right context. """ if left_context is None and right_context is None: return None max_length = int(torch.max(lengths).item()) # For example, with `max_length` == 5, # indices = tensor([ # [ 0, 1, 2, 3, 4, 5], # [-1, 0, 1, 2, 3, 4], # [-2, -1, 0, 1, 2, 3], # [-3, -2, -1, 0, 1, 2], # [-4, -3, -2, -1, 0, 1], # [-5, -4, -3, -2, -1, 0], # ]) # In some cases the second torch.arange is created on cpu which causes a # failure. Adding the device option to guard against it. 
indices = torch.arange( max_length, device=lengths.device, dtype=lengths.dtype ).expand(max_length, max_length) - torch.arange( max_length, device=lengths.device ).view( max_length, -1 ) # For example, with `max_length` == 5, # bool_mask = tensor([ # [True, True, True, True, True], # [True, True, True, True, True], # [True, True, True, True, True], # [True, True, True, True, True], # [True, True, True, True, True], # ]) bool_mask = ( torch.tensor([True]).to(device=lengths.device).expand(max_length, max_length) ) # For example, with `max_length` == 5, left_context == 2 # left_mask = tensor([ # [ True, True, True, True, True], # [ True, True, True, True, True], # [ True, True, True, True, True], # [False, True, True, True, True], # [False, False, True, True, True], # ]) if left_context is not None: left_mask = indices >= -left_context bool_mask = bool_mask & left_mask # For example, with `max_length` == 5, right_context == 1 # right_mask = tensor([ # [True, True, False, False, False], # [True, True, True, False, False], # [True, True, True, True, False], # [True, True, True, True, True], # [True, True, True, True, True], # ]) if right_context is not None: right_mask = indices <= right_context bool_mask = bool_mask & right_mask bool_mask = (~bool_mask).to(device=lengths.device) return bool_mask # ------------------------------------------------------------------------------ # infer_output_norm() # ------------------------------------------------------------------------------ def infer_output_norm(module, output_norm=None): """ Infer the output norm (string and module) needed on the module gvien desired output normalization. """ if output_norm == module.output_norm(): # output_norm already matches module.output_norm(). 
return (None, NoOp()) if output_norm is None and module.output_norm() is not None: logger = logging.getLogger("infer_output_norm()") logger.warning( "trying to set output_norm ({}) ".format(output_norm) + "but got module.output_norm() ({}), ".format(module.output_norm()) + "the combined output_norm() will be ({})".format(module.output_norm()) ) return (None, NoOp()) if output_norm == "log_softmax": if module.output_norm() is not None: raise ValueError( "incompatible output_norm ({}) ".format(output_norm) + "and module.output_norm() ({})".format(module.output_norm()) ) else: return ("log_softmax", torch.nn.LogSoftmax(dim=-1)) if output_norm == "softmax": if module.output_norm() is not None: raise ValueError( "incompatible output_norm ({}) ".format(output_norm) + "and module.output_norm() ({})".format(module.output_norm()) ) else: return ("softmax", torch.nn.Softmax(dim=-1)) raise ValueError( "output_norm ({}) not in ".format(output_norm) + "supported list = [None, softmax, log_softmax]" ) # ------------------------------------------------------------------------------ # infer_channels_from_layout() # ------------------------------------------------------------------------------ def infer_channels_from_layout(layout, channels): """Extract the number of channels from the layout.""" if layout in ("TBD", "BTD"): if channels is not None and channels != 1: raise ValueError( "Expected channels ({}) to be 1 for layout = {}".format( channels, layout ) ) if channels is None: return 1 return channels # ------------------------------------------------------------------------------ # pad_sequence() # ------------------------------------------------------------------------------ @torch.jit.export def pad_sequence( sequence: Tensor, time_axis: int, extra_left_context: int = 0, extra_right_context: int = 0, ) -> Tensor: """Pad extra left/right contexts to the sequence.""" if extra_left_context == 0 and extra_right_context == 0: return sequence tensors_to_concat = [] if 
extra_left_context: size = (extra_left_context,) fill_value = 0 indices = torch.full( size=size, fill_value=fill_value, dtype=torch.long, device=sequence.device, ) left_padding = torch.index_select(sequence, time_axis, indices) tensors_to_concat.append(left_padding) tensors_to_concat.append(sequence) # NOTE(cfyeh): for efficiency reason we pad 0 instead of the last frame for # extra right contexts. if extra_right_context: size = list(sequence.shape) size[time_axis] = extra_right_context right_padding = torch.zeros(size, dtype=sequence.dtype, device=sequence.device) tensors_to_concat.append(right_padding) padded_sequence = torch.cat(tensors_to_concat, dim=time_axis) return padded_sequence # ------------------------------------------------------------------------------ # sequence_to_segments() # ------------------------------------------------------------------------------ @torch.jit.export def sequence_to_segments( sequence: Tensor, time_axis: int, lengths: Tensor, segment_size: Optional[int] = None, extra_left_context: int = 0, extra_right_context: int = 0, ) -> List[Tuple[Tensor, Tensor]]: """Breaks sequence into segments.""" sequence = pad_sequence( sequence=sequence, time_axis=time_axis, extra_left_context=extra_left_context, extra_right_context=extra_right_context, ) lengths = lengths + extra_left_context + extra_right_context segments: List[Tuple[Tensor, Tensor]] = [] if segment_size is None: segments.append((sequence, lengths)) return segments offset = 0 end = sequence.shape[time_axis] step = segment_size size = extra_left_context + segment_size + extra_right_context while offset + extra_left_context + extra_right_context < end: clamped_size = min(size, end - offset) segment_lengths = torch.clamp(lengths - offset, min=0, max=clamped_size) indices = torch.arange( start=offset, end=(offset + clamped_size), step=1, dtype=torch.long, device=sequence.device, ) segment_tensor = torch.index_select(sequence, time_axis, indices) segments.append((segment_tensor, 
segment_lengths)) offset = offset + step return segments # ------------------------------------------------------------------------------ # segments_to_sequence() # ------------------------------------------------------------------------------ @torch.jit.export def segments_to_sequence( segments: List[Tuple[Tensor, Tensor]], time_axis: int ) -> Tuple[Tensor, Tensor]: """Concatenate segments into a full sequence.""" if len(segments) == 1: return segments[0] tensors_to_concat: List[Tensor] = [] lengths_to_stack: List[Tensor] = [] for tensor, lengths in segments: tensors_to_concat.append(tensor) lengths_to_stack.append(lengths) sequence = torch.cat(tensors_to_concat, dim=time_axis) lengths = torch.stack(lengths_to_stack, dim=0) lengths = torch.sum(lengths, dim=0) return sequence, lengths def lengths_to_encoder_padding_mask(lengths, batch_first: bool = False): """ convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor Args: lengths: a (B, )-shaped tensor batch_first: whether to return a (B, T) tensor Return: max_length: maximum length of B sequences encoder_padding_mask: a (max_length, B) binary mask, where [t, b] = False for t < lengths[b] and True otherwise TODO: kernelize this function if benchmarking shows this function is slow """ max_lengths = torch.max(lengths).item() bsz = lengths.size(0) encoder_padding_mask = torch.arange( max_lengths ).to( # a (T, ) tensor with [0, ..., T-1] lengths.device ).view( # move to the right device 1, max_lengths ).expand( # reshape to (1, T)-shaped tensor bsz, -1 ) > lengths.view( # expand to (B, T)-shaped tensor bsz, 1 ).expand( -1, max_lengths ) if not batch_first: return encoder_padding_mask.t(), max_lengths else: return encoder_padding_mask, max_lengths # ------------------------------------------------------------------------------ # attention suppression # ------------------------------------------------------------------------------ def attention_suppression(attention_weights: Tensor, scale: float): # B, H, qlen, klen 
-> B, H, qlen, 1 attention_prob = torch.nn.functional.softmax(attention_weights.float(), dim=-1) attention_nozeros = attention_prob.to(torch.bool) nozeros_sum = torch.sum(attention_nozeros.to(torch.float), dim=-1, keepdim=True) # For very sparse situation, we need get round about 0s key_sum = torch.sum(attention_prob, dim=-1, keepdim=True) # nozeros_sum should > 1 key_mean = key_sum / (nozeros_sum + 1e-8) # std calculation dis = (attention_prob - key_mean) * (attention_prob - key_mean) # if attention_prob[i] < threshold, then dis_masked[i] = 0; for all i dis_masked = torch.where( attention_nozeros, dis, attention_prob.new_zeros(attention_prob.size()) ) key_var = torch.sum(dis_masked, dim=-1, keepdim=True) key_var = key_var / (nozeros_sum - 1.0 + 1e-8) key_std = torch.sqrt(key_var) key_thread = key_mean - scale * key_std # if attention_prob[i] >= key_thread, then attention_prob[i] # , otherwise "-inf" inf_tensor = attention_prob.new_zeros(attention_prob.size()).detach() inf_tensor[:] = float("-inf") attention_weights_float = torch.where( attention_prob < key_thread, inf_tensor, attention_weights.float(), ) return attention_weights_float.type_as(attention_weights) def layer_norm_backward_hook(module, grad_input, grad_output, clamp_value): return tuple(torch.clamp(v, min=-clamp_value, max=clamp_value) for v in grad_input) ================================================ FILE: fairseq/models/speech_to_text/xm_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class Conv1dAdaptor(nn.Module):
    """Stack of strided ``Conv1d`` + GLU layers applied along the time axis.

    Each convolution emits ``2 * out_dim`` channels which the GLU halves back
    to ``out_dim``. With ``proj`` enabled, a residual feed-forward block
    followed by a LayerNorm is applied both before and after the convolutions.
    """

    def __init__(
        self,
        in_dim,
        out_dim,
        n_layers=3,
        kernel_size=3,
        stride=2,
        layerdrop=0.0,
        layernorm=False,
        proj=False,
    ):
        super().__init__()

        if proj:
            self.proj = self._bottleneck_ffn(in_dim)
            self.proj_ln = LayerNorm(in_dim)
            self.post_proj = self._bottleneck_ffn(out_dim)
            self.post_proj_ln = LayerNorm(out_dim)
        else:
            self.proj = self.proj_ln = None
            self.post_proj = self.post_proj_ln = None

        convs = []
        for layer_idx in range(n_layers):
            # Only the first layer sees the input width.
            in_channels = out_dim if layer_idx > 0 else in_dim
            convs.append(
                nn.Conv1d(
                    in_channels,
                    out_dim * 2,
                    kernel_size,
                    stride=stride,
                    padding=kernel_size // 2,
                )
            )
        self.layers = nn.ModuleList(convs)

        self.stride = stride
        self.layerdrop = layerdrop
        self.layernorm = LayerNorm(in_dim) if layernorm else None

    @staticmethod
    def _bottleneck_ffn(dim):
        """Return a Linear -> ReLU -> Linear block that expands ``dim`` by 4x."""
        return nn.Sequential(
            nn.Linear(dim, dim * 4), nn.ReLU(), nn.Linear(dim * 4, dim)
        )

    @classmethod
    def add_args(cls, parser):
        """Register the ``--adaptor-*`` command-line options on ``parser``."""
        typed_options = (
            ("--adaptor-n-layers", int),
            ("--adaptor-kernel-size", int),
            ("--adaptor-stride", int),
            ("--adaptor-layerdrop", float),
        )
        for flag, value_type in typed_options:
            parser.add_argument(flag, type=value_type)
        for flag in ("--adaptor-layernorm", "--adaptor-proj"):
            parser.add_argument(flag, action="store_true")

    def forward(self, x, padding_mask: Optional[torch.Tensor]):
        """Run the adaptor.

        Args:
            x: input features, laid out as T x B x C.
            padding_mask: optional mask whose transpose indexes the positions
                of ``x`` to zero out; summed over dim 1 to get input lengths.

        Returns:
            ``(x, out_padding_mask)``; the mask is ``None`` when no input mask
            was given.
        """
        if self.layernorm is not None:
            x = self.layernorm(x)

        if self.proj is not None:
            x = self.proj_ln(x + 0.5 * self.proj(x))

        out_lens = None
        if padding_mask is not None:
            x = utils.index_put(x, padding_mask.T, 0)
            out_lens = (~padding_mask).sum(1).float()

        # T x B x C -> B x C x T
        x = x.permute(1, 2, 0)

        for conv in self.layers:
            # A draw is made for every layer, including in eval mode where it
            # is ignored.
            keep_score = np.random.random()
            if self.training and keep_score <= self.layerdrop:
                continue
            x = nn.functional.glu(conv(x), dim=1)
            if out_lens is not None:
                # Lengths are only updated for layers that actually ran.
                out_lens = ((out_lens - 1) / self.stride + 1).floor()

        # B x C x T -> T x B x C
        x = x.permute(2, 0, 1)

        if self.post_proj is not None:
            x = self.post_proj_ln(x + 0.5 * self.post_proj(x))

        if padding_mask is None:
            return x, None

        out_padding_mask = lengths_to_padding_mask(out_lens.long())
        x = utils.index_put(x, out_padding_mask.T, 0)
        return x, out_padding_mask
parser.add_argument( "--activation-dropout", "--relu-dropout", type=float, metavar="D", help="dropout probability after activation in FFN inside wav2vec 2.0 model", ) parser.add_argument( "--mask-length", type=int, help="repeat the mask indices multiple times" ) parser.add_argument( "--mask-prob", type=float, help="probability of replacing a token with mask" ) parser.add_argument( "--mask-selection", type=str, choices=["static", "uniform", "normal", "poisson"], help="how to choose masks", ) parser.add_argument( "--mask-other", type=float, help="stdev of the mask length in case of 'normal' selection strategy", ) parser.add_argument( "--no-mask-overlap", action="store_true", help="whether to allow masks to overlap", ) parser.add_argument( "--mask-channel-length", type=int, help="repeat the mask indices multiple times" ) parser.add_argument( "--mask-channel-prob", type=float, help="probability of replacing a token with mask", ) parser.add_argument( "--mask-channel-selection", type=str, choices=["static", "uniform", "normal", "poisson"], help="how to choose masks", ) parser.add_argument( "--mask-channel-other", type=float, help="stdev of the mask length in case of 'normal' selection strategy", ) parser.add_argument( "--no-mask-channel-overlap", action="store_true", help="whether to allow masks to overlap", ) parser.add_argument( "--freeze-finetune-updates", type=int, metavar="N", help="dont finetune wav2vec for this many updates", ) parser.add_argument( "--feature-grad-mult", type=float, metavar="D", help="reset feature grad mult in wav2vec 2.0 to this", ) parser.add_argument( "--layerdrop", type=float, metavar="D", help="probability of dropping a layer in wav2vec 2.0", ) parser.add_argument( "--max-positions", type=int, metavar="N", help="Max input positions to be used in the conformer encoder in wav2vec 2.0", ) parser.add_argument("--encoder-proj", action="store_true") parser.add_argument("--w2v-args", default=None) parser.add_argument( "--remove-weight-norm", 
def need_finetuning(ft_params, param_name):
    """Tell whether ``param_name`` is selected for fine-tuning.

    Args:
        ft_params: either the literal ``"all"`` or a comma-separated list of
            substrings.
        param_name: name of the parameter to check.

    Returns:
        True when ``ft_params`` is ``"all"`` or when any of its comma-separated
        items occurs as a substring of ``param_name`` (an empty item matches
        every name); False otherwise.
    """
    if ft_params == "all":
        return True
    return any(pattern in param_name for pattern in ft_params.split(","))
def reorder_encoder_out(self, encoder_out, new_order):
    """Reorder the batch dimension of an encoder output dict by ``new_order``.

    ``encoder_out`` and ``encoder_states`` entries are indexed along dim 1,
    ``encoder_padding_mask`` and ``encoder_embedding`` entries along dim 0.
    The ``encoder_states`` list is updated in place and returned as is;
    ``src_tokens`` and ``src_lengths`` are always returned empty.
    """
    reordered_out = [
        tensor.index_select(1, new_order) for tensor in encoder_out["encoder_out"]
    ]
    reordered_mask = [
        mask.index_select(0, new_order)
        for mask in encoder_out["encoder_padding_mask"]
    ]
    reordered_embedding = [
        emb.index_select(0, new_order) for emb in encoder_out["encoder_embedding"]
    ]

    states = encoder_out["encoder_states"]
    for position, hidden in enumerate(states):
        states[position] = hidden.index_select(1, new_order)

    return {
        "encoder_out": reordered_out,  # T x B x C
        "encoder_padding_mask": reordered_mask,  # B x T
        "encoder_embedding": reordered_embedding,  # B x T x C
        "encoder_states": states,  # List[T x B x C]
        "src_tokens": [],  # B x T
        "src_lengths": [],  # B x 1
    }
def remove_weight_norm_from_model(model):
    """Strip weight normalization from every submodule of ``model`` that has it.

    A submodule is selected when one of its parameters has a name ending in
    ``"_g"``. All such modules are collected first and only then modified, so
    parameters are not changed while they are being enumerated.
    """
    from functools import reduce

    def _owning_module(qualified_name):
        # Drop the trailing parameter name and walk the remaining attribute path.
        *module_path, _param = qualified_name.split(".")
        return reduce(getattr, module_path, model)

    weight_normed = [
        _owning_module(name)
        for name, _ in model.named_parameters()
        if name.endswith("_g")
    ]

    for wn_module in weight_normed:
        torch.nn.utils.remove_weight_norm(wn_module)
        logger.warning(f"Weight norm removed from module with {wn_module}\n")
@classmethod
def maybe_load_pretrained(cls, component, checkpoint: Optional[str] = None):
    """Load pretrained weights into ``component`` if a checkpoint is given.

    With ``checkpoint=None`` the component is returned untouched. Otherwise a
    load is attempted first; if that raises ``RuntimeError`` the error is
    logged as a warning and the load is retried with ``strict=False``.
    """
    if checkpoint is not None:
        load_component = checkpoint_utils.load_pretrained_component_from_model
        try:
            component = load_component(component, checkpoint)
        except RuntimeError as err:
            logger.warning(err)
            component = load_component(component, checkpoint, strict=False)
    return component
@classmethod
def build_model(cls, args, task):
    """Build a new model instance.

    Args:
        args: model arguments; missing entries are filled in by
            ``base_architecture``. When ``load_pretrained_decoder_from`` is
            set, decoder-related arguments stored in that checkpoint's
            ``cfg`` override the ones in ``args``.
        task: task object; ``target_dictionary`` and ``multitask_tasks`` are
            read from it.

    Returns:
        The encoder-decoder model. One auxiliary decoder is attached per
        multitask task whose loss weight at update 0 is non-zero, both as a
        ``<task_name>_decoder`` attribute and, wrapped, in
        ``multitask_decoders``.
    """
    # make sure all arguments are present in older models
    base_architecture(args)

    pretrained_decoder = getattr(args, "load_pretrained_decoder_from", None)
    if pretrained_decoder is not None:
        # Only the stored config is read here, so keep every tensor on CPU:
        # without map_location, tensors are restored to the device they were
        # saved from, which fails on hosts lacking that device and needlessly
        # occupies accelerator memory.
        ckpt = torch.load(pretrained_decoder, map_location="cpu")
        decoder_args_dict = cls.get_decoder_args_from_checkpoint(ckpt["cfg"])
        del ckpt  # release the checkpoint weights, they are not needed here
        args = cls.override_decoder_args(args, decoder_args_dict)

    decoder_embed_tokens = build_embedding(
        task.target_dictionary, args.decoder_embed_dim
    )

    encoder = cls.build_encoder(args)
    decoder = cls.build_decoder(args, task, decoder_embed_tokens)
    base_model = cls(encoder, decoder)

    # set up multitask decoders
    base_model.multitask_decoders = {}
    for task_name, task_obj in task.multitask_tasks.items():
        # dummy auxiliary decoder
        if task_obj.args.get_loss_weight(0) == 0:
            continue

        task_decoder = cls.build_multitask_decoder(
            args, task_obj.args, task_obj.target_dictionary, args.decoder_embed_dim
        )

        setattr(base_model, f"{task_name}_decoder", task_decoder)
        decoder_model_cls = (
            FairseqEncoderModel
            if task_obj.args.decoder_type == "ctc"
            else FairseqLanguageModel
        )
        base_model.multitask_decoders[task_name] = decoder_model_cls(
            getattr(base_model, f"{task_name}_decoder")
        )
    return base_model
def set_default_w2v_encoder_args(args):
    """Fill in defaults for the wav2vec encoder options missing from ``args``.

    Every option keeps the value already present on ``args`` and only falls
    back to the default listed below when the attribute is absent. This
    includes ``feature_grad_mult``, so a value supplied via
    ``--feature-grad-mult`` is honoured rather than being reset to 0.1.

    Args:
        args: namespace-like object, updated in place.
    """
    defaults = (
        ("no_pretrained_weights", False),
        ("dropout_input", 0),
        ("final_dropout", 0),
        ("apply_mask", False),
        ("dropout", 0),
        ("attention_dropout", 0),
        ("activation_dropout", 0),
        ("encoder_proj", False),
        ("remove_weight_norm", False),
        ("mask_length", 10),
        ("mask_prob", 0.5),
        ("mask_selection", "static"),
        ("mask_other", 0),
        ("no_mask_overlap", False),
        ("mask_channel_length", 10),
        ("mask_channel_prob", 0.5),
        ("mask_channel_before", False),
        ("mask_channel_selection", "static"),
        ("mask_channel_other", 0),
        ("no_mask_channel_overlap", False),
        ("freeze_finetune_updates", 0),
        ("feature_grad_mult", 0.1),
        ("layerdrop", 0.0),
        ("normalize", False),
        ("finetune_w2v_params", "all"),
        ("w2v_freezing_updates", None),
        ("encoder_embed_dim", 1024),
    )
    for name, default in defaults:
        setattr(args, name, getattr(args, name, default))
def set_default_general_args(args):
    """Fill in the general training options that are missing from ``args``.

    Values already present on ``args`` are kept; the object is updated in
    place.
    """
    fallbacks = {
        "checkpoint_activations": False,
        "offload_activations": False,
        "min_params_to_wrap": int(1e8),
        "max_positions": 3000,
    }
    for option, fallback in fallbacks.items():
        setattr(args, option, getattr(args, option, fallback))
def unit_transformer_decoder_arch_base(
    args, decoder_layers=6, decoder_embed_dim=768, decoder_attention_heads=12
):
    """Overwrite the layer/width settings on ``args`` with the given sizes.

    Encoder and decoder get the same depth; every embedding-sized dimension is
    set to ``decoder_embed_dim`` and the FFN width to four times that value.
    ``args`` is updated in place.
    """
    # Depth is shared by encoder and decoder.
    args.encoder_layers = args.decoder_layers = decoder_layers

    # Model width and the 4x wider feed-forward dimension.
    args.decoder_embed_dim = decoder_embed_dim
    args.decoder_ffn_embed_dim = 4 * decoder_embed_dim
    args.decoder_attention_heads = decoder_attention_heads

    # Remaining dimensions follow the decoder embedding size.
    args.encoder_embed_dim = args.decoder_embed_dim
    args.decoder_output_dim = args.decoder_input_dim = decoder_embed_dim
@classmethod
def build_decoder(cls, args, task, aug_attn=False):
    """Build the decoder for this model, optionally from pretrained weights.

    Args:
        args: model arguments. They are deep-copied; this method only sets
            attributes on the copy.
        task: task object; only ``task.target_dictionary`` is read.
        aug_attn: when True an ``AugTransformerDecoder`` is built, otherwise a
            ``TransformerDecoder``.

    Returns:
        A ``(decoder, proj, _args)`` tuple: the decoder, an optional ``Linear``
        projection (``None`` unless the dimension check below succeeds) and
        the modified copy of ``args`` used to build the decoder.
    """
    _args = copy.deepcopy(args)
    _args.layerdrop = 0.0  # turn off layerdrop for shallow layers
    _args.encoder_embed_dim = args.decoder_embed_dim

    proj = None
    # NOTE(review): ``_args`` is a deep copy of ``args`` and ``decoder_embed_dim``
    # is not changed on either object above, so this comparison looks
    # always-false and ``proj`` stays None — confirm whether a different
    # decoder width was meant to be configured here.
    if args.decoder_embed_dim != _args.decoder_embed_dim:
        proj = Linear(args.decoder_embed_dim, _args.decoder_embed_dim)

    embed_tokens = build_embedding(task.target_dictionary, _args.decoder_embed_dim)
    decoder_cls = AugTransformerDecoder if aug_attn else TransformerDecoder
    decoder = decoder_cls(_args, task.target_dictionary, embed_tokens)

    if getattr(args, "load_pretrained_decoder_from", None) is not None:
        # load all layers first and then discard the bottom layers
        # A second, temporary decoder receives the checkpoint weights; they are
        # then copied parameter by parameter into ``decoder``.
        embed_tokens = build_embedding(
            task.target_dictionary, _args.decoder_embed_dim
        )
        decoder_tmp = decoder_cls(_args, task.target_dictionary, embed_tokens)
        decoder_tmp = cls.maybe_load_pretrained(
            decoder_tmp, getattr(_args, "load_pretrained_decoder_from", None)
        )
        state_dict = decoder_tmp.state_dict()
        for k, p in decoder.named_parameters():
            p.data = state_dict[k].data
            # The fine-tuning filter is only applied on this pretrained path.
            p.requires_grad = need_finetuning(_args.finetune_decoder_params, k)
        # Keep only the last ``decoder_layers`` layers.
        # NOTE(review): both decoders are built from the same ``_args``, so this
        # slice presumably keeps every layer — confirm against TransformerDecoder.
        decoder.layers = decoder.layers[-_args.decoder_layers :]

    return decoder, proj, _args
@classmethod
def build_t2u_encoder(cls, args):
    """Build the synthesizer encoder from a copy of ``args``.

    The copy takes its depth from ``synthesizer_encoder_layers``, its embedding
    size, FFN size and head count from the corresponding ``decoder_*`` values,
    and always enables ``encoder_normalize_before``. ``args`` itself is left
    unchanged.

    Returns:
        A ``TransformerEncoderNoEmb`` built from the adjusted copy.
    """
    encoder_args = copy.deepcopy(args)
    encoder_args.encoder_layers = encoder_args.synthesizer_encoder_layers
    # Encoder dimensions mirror the decoder's.
    for suffix in ("embed_dim", "ffn_embed_dim", "attention_heads"):
        setattr(encoder_args, f"encoder_{suffix}", getattr(args, f"decoder_{suffix}"))
    encoder_args.encoder_normalize_before = True
    return TransformerEncoderNoEmb(encoder_args)
@register_model_architecture(
    model_name="unity_xm_transformer", arch_name="unity_xm_transformer"
)
def base_architecture_unity(args):
    """Apply the default arguments for the ``unity_xm_transformer`` architecture.

    Runs the general, wav2vec-encoder, adaptor and transformer-decoder default
    setters in that order, then unconditionally turns off
    ``layernorm_embedding`` and ``decoder_learned_pos``, overriding any value
    already on ``args``.
    """
    default_setters = (
        set_default_general_args,
        set_default_w2v_encoder_args,
        set_default_adaptor_args,
        set_default_transformer_decoder_args,
    )
    for apply_defaults in default_setters:
        apply_defaults(args)

    args.layernorm_embedding = args.decoder_learned_pos = False
class CodeGenerator(Generator):
    """HiFi-GAN generator conditioned on discrete codes.

    On top of the base ``Generator`` this embeds the input codes and, depending
    on ``cfg``, concatenates F0, speaker and any additional keyword features
    along the channel axis before running the base generator.
    """

    def __init__(self, cfg):
        """Build the code embedding and the optional conditioning modules.

        Args:
            cfg: dict-like config. Reads ``num_embeddings``, ``embedding_dim``
                and the optional keys ``multispkr``, ``embedder_params``,
                ``num_speakers``, ``embedder_dim``, ``dur_predictor_params``,
                ``f0`` and ``f0_quant_num_bin``.
        """
        super().__init__(cfg)
        self.dict = nn.Embedding(cfg["num_embeddings"], cfg["embedding_dim"])
        self.multispkr = cfg.get("multispkr", None)
        self.embedder = cfg.get("embedder_params", None)

        # ``self.spkr`` only exists when one of the two options is configured.
        if self.multispkr and not self.embedder:
            self.spkr = nn.Embedding(cfg.get("num_speakers", 200), cfg["embedding_dim"])
        elif self.embedder:
            self.spkr = nn.Linear(cfg.get("embedder_dim", 256), cfg["embedding_dim"])

        self.dur_predictor = None
        if cfg.get("dur_predictor_params", None):
            self.dur_predictor = VariancePredictor(
                Namespace(**cfg["dur_predictor_params"])
            )

        self.f0 = cfg.get("f0", None)
        n_f0_bin = cfg.get("f0_quant_num_bin", 0)
        self.f0_quant_embed = (
            None if n_f0_bin <= 0 else nn.Embedding(n_f0_bin, cfg["embedding_dim"])
        )

    @staticmethod
    def _upsample(signal, max_frames):
        """Repeat ``signal`` along its last axis so it spans ``max_frames``.

        1-D/other inputs are viewed as ``(N, 1, 1)``, 2-D inputs get a trailing
        axis, 3-D inputs are used as is. Every frame is repeated
        ``max_frames // cond_length`` times.

        Raises:
            NotImplementedError: if ``max_frames`` is not an exact multiple of
                the signal length (padding is not implemented).
        """
        if signal.dim() == 2:
            signal = signal.unsqueeze(2)
        elif signal.dim() != 3:
            signal = signal.view(-1, 1, 1)
        bsz, channels, cond_length = signal.size()

        # Any leftover means the condition cannot be tiled exactly. Check this
        # up front so every misalignment gives the same explicit error instead
        # of a ZeroDivisionError or a shape error from ``view`` below.
        if cond_length == 0 or max_frames % cond_length != 0:
            raise NotImplementedError(
                "Padding condition signal - misalignment between condition features."
            )

        signal = signal.unsqueeze(3).repeat(1, 1, 1, max_frames // cond_length)
        return signal.view(bsz, channels, max_frames)

    def forward(self, **kwargs):
        """Synthesize from codes plus optional conditioning features.

        Keyword Args:
            code: code indices, looked up in ``self.dict``.
            dur_prediction: if truthy and a duration predictor is configured,
                each code is repeated by its predicted duration (batch size 1
                only).
            f0: required when ``cfg["f0"]`` is set.
            spkr: required when ``cfg["multispkr"]`` is set.
            Any other keyword is upsampled to the current length and
            concatenated on the channel axis.

        Returns:
            The output of the base ``Generator.forward``.
        """
        x = self.dict(kwargs["code"]).transpose(1, 2)

        if self.dur_predictor is not None and kwargs.get("dur_prediction", False):
            assert x.size(0) == 1, "only support single sample"
            log_dur_pred = self.dur_predictor(x.transpose(1, 2))
            # durations are predicted in log(1 + d); every code lasts >= 1 frame
            dur_out = torch.clamp(
                torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1
            )
            # B x C x T
            x = torch.repeat_interleave(x, dur_out.view(-1), dim=2)

        if self.f0:
            if self.f0_quant_embed is not None:
                kwargs["f0"] = self.f0_quant_embed(kwargs["f0"].long()).transpose(1, 2)
            else:
                kwargs["f0"] = kwargs["f0"].unsqueeze(1)

            # stretch whichever stream is shorter so both share one length
            if x.shape[-1] < kwargs["f0"].shape[-1]:
                x = self._upsample(x, kwargs["f0"].shape[-1])
            elif x.shape[-1] > kwargs["f0"].shape[-1]:
                kwargs["f0"] = self._upsample(kwargs["f0"], x.shape[-1])
            x = torch.cat([x, kwargs["f0"]], dim=1)

        if self.multispkr:
            assert (
                "spkr" in kwargs
            ), 'require "spkr" input for multispeaker CodeHiFiGAN vocoder'
            spkr = self.spkr(kwargs["spkr"]).transpose(1, 2)
            spkr = self._upsample(spkr, x.shape[-1])
            x = torch.cat([x, spkr], dim=1)

        for k, feat in kwargs.items():
            if k in ["spkr", "code", "f0", "dur_prediction"]:
                continue

            feat = self._upsample(feat, x.shape[-1])
            x = torch.cat([x, feat], dim=1)

        return super().forward(x)
def model_init(m):
    """Xavier-initialize (ReLU gain) the weight of ``nn.Conv1d`` modules only."""
    if not isinstance(m, nn.Conv1d):
        return
    relu_gain = torch.nn.init.calculate_gain("relu")
    nn.init.xavier_uniform_(m.weight, relu_gain)


def Embedding(num_embeddings, embedding_dim, padding_idx=None):
    """Create an ``nn.Embedding`` with weights drawn from N(0, embedding_dim^-0.5)."""
    table = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(table.weight, mean=0, std=embedding_dim**-0.5)
    return table


class PositionwiseFeedForward(nn.Module):
    """Two 1-D convolutions with a ReLU in between, followed by dropout,
    a residual connection and layer normalization."""

    def __init__(self, in_dim, hidden_dim, kernel_size, dropout):
        super().__init__()
        same_pad = (kernel_size - 1) // 2
        widen = nn.Conv1d(
            in_dim, hidden_dim, kernel_size=kernel_size, padding=same_pad
        )
        narrow = nn.Conv1d(
            hidden_dim, in_dim, kernel_size=kernel_size, padding=same_pad
        )
        self.ffn = nn.Sequential(widen, nn.ReLU(), narrow)
        self.layer_norm = LayerNorm(in_dim)
        # the same dropout module is reachable under both attribute names
        self.dropout = self.dropout_module = FairseqDropout(
            p=dropout, module_name=self.__class__.__name__
        )

    def forward(self, x):
        # x: B x T x C; the convolutions run over B x C x T
        conv_out = self.ffn(x.transpose(1, 2)).transpose(1, 2)
        return self.layer_norm(self.dropout(conv_out) + x)
class LengthRegulator(nn.Module):
    """Expand each input frame by its integer duration."""

    def forward(self, x, durations):
        """Repeat frame ``t`` of sample ``b`` ``durations[b, t]`` times.

        Args:
            x: B x T x C features.
            durations: B x T integer repeat counts (must be non-negative).

        Returns:
            ``(out, out_lens)``: ``out`` is B x max(out_lens) x C, zero-padded
            on the right; ``out_lens`` is ``durations.sum(dim=1)``.
        """
        # x: B x T x C
        out_lens = durations.sum(dim=1)
        max_len = int(out_lens.max())
        bsz, _, dim = x.size()
        out = x.new_zeros((bsz, max_len, dim))
        for b in range(bsz):
            # one vectorized call per sample instead of a Python loop (and a
            # host sync) per frame
            frames = torch.repeat_interleave(x[b], durations[b], dim=0)
            out[b, : frames.size(0)] = frames
        return out, out_lens


class VariancePredictor(nn.Module):
    """Two conv + ReLU + LayerNorm + dropout stages and a linear projection
    that predicts one scalar per frame."""

    def __init__(self, args):
        """
        Args:
            args: namespace providing ``encoder_embed_dim``,
                ``var_pred_hidden_dim``, ``var_pred_kernel_size`` and
                ``var_pred_dropout``.
        """
        super().__init__()
        # Both convolutions must keep the sequence length, so both use the
        # kernel-dependent "same" padding (equal to 1 for the default kernel
        # size 3, which is what the second convolution used to hard-code).
        same_padding = (args.var_pred_kernel_size - 1) // 2
        self.conv1 = nn.Sequential(
            nn.Conv1d(
                args.encoder_embed_dim,
                args.var_pred_hidden_dim,
                kernel_size=args.var_pred_kernel_size,
                padding=same_padding,
            ),
            nn.ReLU(),
        )
        self.ln1 = nn.LayerNorm(args.var_pred_hidden_dim)
        self.dropout_module = FairseqDropout(
            p=args.var_pred_dropout, module_name=self.__class__.__name__
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(
                args.var_pred_hidden_dim,
                args.var_pred_hidden_dim,
                kernel_size=args.var_pred_kernel_size,
                padding=same_padding,
            ),
            nn.ReLU(),
        )
        self.ln2 = nn.LayerNorm(args.var_pred_hidden_dim)
        self.proj = nn.Linear(args.var_pred_hidden_dim, 1)

    def forward(self, x):
        # Input: B x T x C; Output: B x T
        x = self.conv1(x.transpose(1, 2)).transpose(1, 2)
        x = self.dropout_module(self.ln1(x))
        x = self.conv2(x.transpose(1, 2)).transpose(1, 2)
        x = self.dropout_module(self.ln2(x))
        return self.proj(x).squeeze(dim=2)


class VarianceAdaptor(nn.Module):
    """Predicts duration, pitch and energy, adds the pitch/energy embeddings
    to the input and expands it with the length regulator."""

    def __init__(self, args):
        """
        Args:
            args: namespace providing the ``VariancePredictor`` options plus
                ``var_pred_n_bins``, ``pitch_min``/``pitch_max`` and
                ``energy_min``/``energy_max``.
        """
        super().__init__()
        self.args = args
        self.length_regulator = LengthRegulator()
        self.duration_predictor = VariancePredictor(args)
        self.pitch_predictor = VariancePredictor(args)
        self.energy_predictor = VariancePredictor(args)

        # n_bins buckets are delimited by n_bins - 1 boundaries
        n_bins, steps = self.args.var_pred_n_bins, self.args.var_pred_n_bins - 1
        self.pitch_bins = torch.linspace(args.pitch_min, args.pitch_max, steps)
        self.embed_pitch = Embedding(n_bins, args.encoder_embed_dim)
        self.energy_bins = torch.linspace(args.energy_min, args.energy_max, steps)
        self.embed_energy = Embedding(n_bins, args.encoder_embed_dim)

    def get_pitch_emb(self, x, tgt=None, factor=1.0):
        """Return ``(prediction, embedding)`` for pitch.

        The embedding is looked up from ``tgt`` when given; otherwise from the
        prediction scaled by ``factor`` (the scaled prediction is returned).
        """
        out = self.pitch_predictor(x)
        # the boundaries are a plain tensor attribute, so move them per call
        bins = self.pitch_bins.to(x.device)
        if tgt is None:
            out = out * factor
            emb = self.embed_pitch(torch.bucketize(out, bins))
        else:
            emb = self.embed_pitch(torch.bucketize(tgt, bins))
        return out, emb

    def get_energy_emb(self, x, tgt=None, factor=1.0):
        """Return ``(prediction, embedding)`` for energy.

        Same contract as ``get_pitch_emb`` with the energy modules.
        """
        out = self.energy_predictor(x)
        bins = self.energy_bins.to(x.device)
        if tgt is None:
            out = out * factor
            emb = self.embed_energy(torch.bucketize(out, bins))
        else:
            emb = self.embed_energy(torch.bucketize(tgt, bins))
        return out, emb

    def forward(
        self,
        x,
        padding_mask,
        durations=None,
        pitches=None,
        energies=None,
        d_factor=1.0,
        p_factor=1.0,
        e_factor=1.0,
    ):
        """
        Args:
            x: B x T x C features.
            padding_mask: mask of padded positions; predicted durations are
                zeroed there.
            durations, pitches, energies: optional targets used in place of
                the predictions.
            d_factor, p_factor, e_factor: scales applied to the predictions.

        Returns:
            ``(x, out_lens, log_dur_out, pitch_out, energy_out)`` where ``x``
            is the length-regulated sequence.
        """
        # x: B x T x C
        log_dur_out = self.duration_predictor(x)
        # durations are predicted as log(1 + d)
        dur_out = torch.clamp(
            torch.round((torch.exp(log_dur_out) - 1) * d_factor).long(), min=0
        )
        dur_out.masked_fill_(padding_mask, 0)

        pitch_out, pitch_emb = self.get_pitch_emb(x, pitches, p_factor)
        x = x + pitch_emb
        # energy is predicted from the pitch-augmented features
        energy_out, energy_emb = self.get_energy_emb(x, energies, e_factor)
        x = x + energy_emb

        x, out_lens = self.length_regulator(
            x, dur_out if durations is None else durations
        )

        return x, out_lens, log_dur_out, pitch_out, energy_out
    def forward(
        self,
        src_tokens,
        src_lengths=None,
        speaker=None,
        durations=None,
        pitches=None,
        energies=None,
        **kwargs,
    ):
        """Encode tokens, apply the variance adaptor and decode to frames.

        Args:
            src_tokens: token indices; positions equal to ``self.padding_idx``
                are treated as padding.
            src_lengths: accepted but not used by this method.
            speaker: speaker input for ``self.embed_speaker``; only read when
                a speaker embedding is configured.
            durations, pitches, energies: optional targets handed to the
                variance adaptor.
            **kwargs: accepted but not used by this method.

        Returns:
            ``(x, x_post, out_lens, log_dur_out, pitch_out, energy_out)``;
            ``x_post`` is ``None`` when no postnet is configured.
        """
        x = self.embed_tokens(src_tokens)

        enc_padding_mask = src_tokens.eq(self.padding_idx)
        # positional embeddings are scaled by a learned scalar
        x += self.pos_emb_alpha * self.embed_positions(enc_padding_mask)
        x = self.dropout_module(x)

        for layer in self.encoder_fft_layers:
            x = layer(x, enc_padding_mask)

        if self.embed_speaker is not None:
            bsz, seq_len, _ = x.size()
            # broadcast the speaker embedding over all time steps, then
            # project the concatenation back to the encoder width
            emb = self.embed_speaker(speaker).expand(bsz, seq_len, -1)
            x = self.spk_emb_proj(torch.cat([x, emb], dim=2))

        x, out_lens, log_dur_out, pitch_out, energy_out = self.var_adaptor(
            x, enc_padding_mask, durations, pitches, energies
        )

        # the sequence now has the expanded lengths, so a new mask is needed
        dec_padding_mask = lengths_to_padding_mask(out_lens)
        x += self.dec_pos_emb_alpha * self.embed_positions(dec_padding_mask)
        for layer in self.decoder_fft_layers:
            x = layer(x, dec_padding_mask)

        x = self.out_proj(x)
        x_post = None
        if self.postnet is not None:
            # the postnet output is added as a residual to the projection
            x_post = x + self.postnet(x)
        return x, x_post, out_lens, log_dur_out, pitch_out, energy_out
type=int) parser.add_argument("--postnet-conv-dim", type=int) parser.add_argument("--postnet-conv-kernel-size", type=int) def __init__(self, encoder, args, src_dict): super().__init__(encoder) self._num_updates = 0 out_dim = args.output_frame_dim * args.n_frames_per_step self.ctc_proj = None if getattr(args, "ctc_weight", 0.0) > 0.0: self.ctc_proj = nn.Linear(out_dim, len(src_dict)) @classmethod def build_model(cls, args, task): embed_speaker = task.get_speaker_embeddings(args) encoder = FastSpeech2Encoder(args, task.src_dict, embed_speaker) return cls(encoder, args, task.src_dict) def set_num_updates(self, num_updates): super().set_num_updates(num_updates) self._num_updates = num_updates def get_normalized_probs(self, net_output, log_probs, sample=None): logits = self.ctc_proj(net_output[0]) if log_probs: return utils.log_softmax(logits.float(), dim=-1) else: return utils.softmax(logits.float(), dim=-1) @register_model_architecture("fastspeech2", "fastspeech2") def base_architecture(args): args.dropout = getattr(args, "dropout", 0.2) args.output_frame_dim = getattr(args, "output_frame_dim", 80) args.speaker_embed_dim = getattr(args, "speaker_embed_dim", 64) # FFT blocks args.fft_hidden_dim = getattr(args, "fft_hidden_dim", 1024) args.fft_kernel_size = getattr(args, "fft_kernel_size", 9) args.attention_dropout = getattr(args, "attention_dropout", 0.0) args.encoder_layers = getattr(args, "encoder_layers", 4) args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 256) args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 2) args.decoder_layers = getattr(args, "decoder_layers", 4) args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 256) args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 2) # variance predictor args.var_pred_n_bins = getattr(args, "var_pred_n_bins", 256) args.var_pred_hidden_dim = getattr(args, "var_pred_hidden_dim", 256) args.var_pred_kernel_size = getattr(args, "var_pred_kernel_size", 3) 
LRELU_SLOPE = 0.1


def init_weights(m, mean=0.0, std=0.01):
    """Draw ``m.weight`` from N(mean, std) if the class name contains "Conv"."""
    if "Conv" in m.__class__.__name__:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    """Padding that keeps the length unchanged for a stride-1 dilated conv."""
    return (kernel_size * dilation - dilation) // 2


class ResBlock(torch.nn.Module):
    """Three residual stages, each a dilated conv followed by a plain conv."""

    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
        super().__init__()

        def make_conv(rate):
            # weight-normalized, length-preserving convolution
            return weight_norm(
                Conv1d(
                    channels,
                    channels,
                    kernel_size,
                    1,
                    dilation=rate,
                    padding=get_padding(kernel_size, rate),
                )
            )

        # only the first three dilation rates are used
        self.convs1 = nn.ModuleList([make_conv(dilation[idx]) for idx in range(3)])
        self.convs1.apply(init_weights)

        self.convs2 = nn.ModuleList([make_conv(1) for _ in range(3)])
        self.convs2.apply(init_weights)

    def forward(self, x):
        for dilated, plain in zip(self.convs1, self.convs2):
            branch = dilated(F.leaky_relu(x, LRELU_SLOPE))
            branch = plain(F.leaky_relu(branch, LRELU_SLOPE))
            x = branch + x
        return x

    def remove_weight_norm(self):
        """Strip weight normalization from every convolution of the block."""
        for layer in list(self.convs1) + list(self.convs2):
            remove_weight_norm(layer)


class Generator(torch.nn.Module):
    """HiFi-GAN style generator: input conv, a stack of transposed-conv
    upsamplers each followed by averaged residual blocks, output conv, tanh."""

    def __init__(self, cfg):
        """
        Args:
            cfg: dict-like config. Reads ``resblock_kernel_sizes``,
                ``resblock_dilation_sizes``, ``upsample_rates``,
                ``upsample_kernel_sizes``, ``upsample_initial_channel`` and
                the optional ``model_in_dim`` (default 80).
        """
        super().__init__()
        base_channels = cfg["upsample_initial_channel"]
        self.num_kernels = len(cfg["resblock_kernel_sizes"])
        self.num_upsamples = len(cfg["upsample_rates"])
        self.conv_pre = weight_norm(
            Conv1d(cfg.get("model_in_dim", 80), base_channels, 7, 1, padding=3)
        )

        # every upsampling stage halves the channel count
        self.ups = nn.ModuleList()
        stage_specs = zip(cfg["upsample_rates"], cfg["upsample_kernel_sizes"])
        for stage, (rate, kernel) in enumerate(stage_specs):
            upsampler = ConvTranspose1d(
                base_channels // (2**stage),
                base_channels // (2 ** (stage + 1)),
                kernel,
                rate,
                padding=(kernel - rate) // 2,
            )
            self.ups.append(weight_norm(upsampler))

        self.resblocks = nn.ModuleList()
        for stage in range(len(self.ups)):
            ch = base_channels // (2 ** (stage + 1))
            for kernel, rates in zip(
                cfg["resblock_kernel_sizes"], cfg["resblock_dilation_sizes"]
            ):
                self.resblocks.append(ResBlock(ch, kernel, rates))

        # ``ch`` is the channel count left by the last upsampling stage
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for stage in range(self.num_upsamples):
            x = self.ups[stage](F.leaky_relu(x, LRELU_SLOPE))
            # average the outputs of this stage's residual blocks
            first = stage * self.num_kernels
            xs = None
            for offset in range(self.num_kernels):
                block_out = self.resblocks[first + offset](x)
                if xs is None:
                    xs = block_out
                else:
                    xs += block_out
            x = xs / self.num_kernels
        # NOTE: this activation uses leaky_relu's default slope, not LRELU_SLOPE
        x = self.conv_post(F.leaky_relu(x))
        return torch.tanh(x)

    def remove_weight_norm(self):
        """Strip weight normalization from all layers."""
        print("Removing weight norm...")
        for layer in self.ups:
            remove_weight_norm(layer)
        for layer in self.resblocks:
            layer.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)
logger = logging.getLogger(__name__)


class TTSHubInterface(nn.Module):
    """Hub wrapper that turns raw text into a waveform with a TTS model."""

    def __init__(self, cfg, task, model):
        super().__init__()
        self.cfg = cfg
        self.task = task
        self.model = model
        self.model.eval()

        self.update_cfg_with_data_cfg(self.cfg, self.task.data_cfg)
        self.generator = self.task.build_generator([self.model], self.cfg)

    @classmethod
    def phonemize(
        cls,
        text: str,
        lang: Optional[str],
        phonemizer: Optional[str] = None,
        preserve_punct: bool = False,
        to_simplified_zh: bool = False,
    ):
        """Convert ``text`` to a phoneme string with the selected backend.

        Args:
            text: input text.
            lang: language code; required by the ``"ipa"`` backend.
            phonemizer: ``"g2p"``, ``"g2pc"``, ``"ipa"`` or anything else to
                return the text unchanged.
            preserve_punct: ``"g2p"`` only; keep punctuation and mark word
                boundaries with ``"|"``.
            to_simplified_zh: convert to simplified Chinese first.

        Returns:
            The phonemized text (or the input text when no backend matches).
        """
        if to_simplified_zh:
            import hanziconv

            text = hanziconv.HanziConv.toSimplified(text)

        if phonemizer == "g2p":
            import g2p_en

            g2p = g2p_en.G2p()
            if preserve_punct:
                return " ".join("|" if p == " " else p for p in g2p(text))
            # map pause punctuation to "sp", then drop non-alphanumerics
            res = [{",": "sp", ";": "sp"}.get(p, p) for p in g2p(text)]
            return " ".join(p for p in res if p.isalnum())
        if phonemizer == "g2pc":
            import g2pc

            g2p = g2pc.G2pC()
            return " ".join([w[3] for w in g2p(text)])
        if phonemizer == "ipa":
            assert lang is not None
            # aliased so the module does not shadow the ``phonemizer`` argument
            import phonemizer as phonemizer_lib
            from phonemizer.separator import Separator

            lang_map = {"en": "en-us", "fr": "fr-fr"}
            return phonemizer_lib.phonemize(
                text,
                backend="espeak",
                language=lang_map.get(lang, lang),
                separator=Separator(word="| ", phone=" "),
            )
        return text

    @classmethod
    def tokenize(cls, text: str, tkn_cfg: Dict[str, str]):
        """Apply the SentencePiece model named in ``tkn_cfg``, if any.

        Returns the space-joined pieces, or ``text`` unchanged when
        ``tkn_cfg`` has no ``"sentencepiece_model"`` entry.
        """
        sentencepiece_model = tkn_cfg.get("sentencepiece_model", None)
        if sentencepiece_model is not None:
            assert Path(sentencepiece_model).exists()
            import sentencepiece as sp

            spm = sp.SentencePieceProcessor()
            spm.Load(sentencepiece_model)
            return " ".join(spm.Encode(text, out_type=str))
        else:
            return text

    @classmethod
    def update_cfg_with_data_cfg(cls, cfg, data_cfg):
        """Copy the vocoder type from the data config into ``cfg["task"]``."""
        cfg["task"].vocoder = data_cfg.vocoder.get("type", "griffin_lim")

    @classmethod
    def get_model_input(
        cls, task, text: str, speaker: Optional[int] = None, verbose: bool = False
    ):
        """Build the generator sample for ``text``.

        The speaker comes from the hub config if set there, else from
        ``speaker``, else it is drawn at random when the task has speakers.
        A given speaker id is clamped into ``[0, n_speakers - 1]``.
        """
        phonemized = cls.phonemize(
            text,
            task.data_cfg.hub.get("lang", None),
            task.data_cfg.hub.get("phonemizer", None),
            task.data_cfg.hub.get("preserve_punct", False),
            task.data_cfg.hub.get("to_simplified_zh", False),
        )
        tkn_cfg = task.data_cfg.bpe_tokenizer
        tokenized = cls.tokenize(phonemized, tkn_cfg)
        if verbose:
            logger.info(f"text: {text}")
            logger.info(f"phonemized: {phonemized}")
            logger.info(f"tokenized: {tokenized}")

        spk = task.data_cfg.hub.get("speaker", speaker)
        n_speakers = len(task.speaker_to_id or {})
        if spk is None and n_speakers > 0:
            spk = random.randint(0, n_speakers - 1)
        if spk is not None:
            spk = max(0, min(spk, n_speakers - 1))
        if verbose:
            logger.info(f"speaker: {spk}")
        spk = None if spk is None else torch.Tensor([[spk]]).long()

        src_tokens = task.src_dict.encode_line(tokenized, add_if_not_exist=False).view(
            1, -1
        )
        src_lengths = torch.Tensor([len(tokenized.split())]).long()
        return {
            "net_input": {
                "src_tokens": src_tokens,
                "src_lengths": src_lengths,
                "prev_output_tokens": None,
            },
            "target_lengths": None,
            "speaker": spk,
        }

    @classmethod
    def get_prediction(cls, task, model, generator, sample) -> Tuple[torch.Tensor, int]:
        """Run the generator and return ``(waveform, sample_rate)``."""
        prediction = generator.generate(model, sample)
        return prediction[0]["waveform"], task.sr

    def predict(
        self, text: str, speaker: Optional[int] = None, verbose: bool = False
    ) -> Tuple[torch.Tensor, int]:
        """Synthesize ``text`` and return ``(waveform, sample_rate)``."""
        sample = self.get_model_input(self.task, text, speaker, verbose=verbose)
        return self.get_prediction(self.task, self.model, self.generator, sample)


class VocoderHubInterface(nn.Module):
    """Vocoder interface to run vocoder models through hub. Currently we only support unit vocoder"""

    def __init__(self, cfg, model):
        super().__init__()
        self.vocoder = model
        self.vocoder.eval()
        self.sr = 16000
        self.multispkr = self.vocoder.model.multispkr
        if self.multispkr:
            logger.info("multi-speaker vocoder")
            self.num_speakers = cfg.get(
                "num_speakers",
                200,
            )  # following the default in codehifigan to set to 200

    def get_model_input(
        self,
        text: str,
        speaker: Optional[int] = -1,
    ):
        """Parse a space-separated unit string into the vocoder input dict.

        Args:
            text: whitespace-separated integer units.
            speaker: speaker id; ``None`` or ``-1`` selects a random speaker.
                Only used by multi-speaker vocoders.

        Returns:
            Dict with ``"code"`` (1 x T LongTensor) and, for multi-speaker
            vocoders, ``"spkr"`` (1 x 1 LongTensor).
        """
        units = list(map(int, text.strip().split()))
        x = {
            "code": torch.LongTensor(units).view(1, -1),
        }
        # Only a missing id means "pick at random". The former truthiness
        # test also discarded the valid speaker id 0.
        if speaker is None:
            speaker = -1
        if self.multispkr:
            assert (
                -1 <= speaker < self.num_speakers
            ), f"invalid --speaker-id ({speaker}) with total #speakers = {self.num_speakers}"
            spk = random.randint(0, self.num_speakers - 1) if speaker == -1 else speaker
            x["spkr"] = torch.LongTensor([spk]).view(1, 1)
        return x

    def get_prediction(self, sample, dur_prediction: Optional[bool] = True):
        """Run the vocoder and return ``(waveform, sample_rate)``."""
        wav = self.vocoder(sample, dur_prediction)
        return wav, self.sr

    def predict(
        self,
        text: str,
        speaker: Optional[int] = None,
        dur_prediction: Optional[bool] = True,
    ):
        """Vocode the unit string ``text`` and return ``(waveform, sample_rate)``."""
        sample = self.get_model_input(text, speaker)
        return self.get_prediction(sample, dur_prediction)
def encoder_init(m):
    """Xavier-initialize (ReLU gain) the weight of ``nn.Conv1d`` modules only."""
    if not isinstance(m, nn.Conv1d):
        return
    relu_gain = torch.nn.init.calculate_gain("relu")
    nn.init.xavier_uniform_(m.weight, relu_gain)
class Prenet(nn.Module):
    """Stack of Linear + ReLU layers with dropout after every layer."""

    def __init__(self, in_dim, n_layers, n_units, dropout):
        super().__init__()
        # layer widths: in_dim -> n_units -> ... -> n_units
        widths = [in_dim] + [n_units] * n_layers
        self.layers = nn.ModuleList(
            nn.Sequential(nn.Linear(fan_in, fan_out), nn.ReLU())
            for fan_in, fan_out in zip(widths[:-1], widths[1:])
        )
        self.dropout = dropout

    def forward(self, x):
        for layer in self.layers:
            hidden = layer(x)
            # functional dropout with its default training flag: always applies dropout
            x = F.dropout(hidden, p=self.dropout)
        return x


class Postnet(nn.Module):
    """Stack of Conv1d + BatchNorm (+ Tanh on all but the last layer) + Dropout
    that maps ``in_dim`` channels back to ``in_dim`` channels."""

    def __init__(self, in_dim, n_channels, kernel_size, n_layers, dropout):
        super().__init__()
        self.convolutions = nn.ModuleList()
        assert kernel_size % 2 == 1
        same_pad = (kernel_size - 1) // 2
        for idx in range(n_layers):
            is_last = idx == n_layers - 1
            conv_in = in_dim if idx == 0 else n_channels
            conv_out = in_dim if is_last else n_channels
            conv = nn.Conv1d(
                conv_in, conv_out, kernel_size=kernel_size, padding=same_pad
            )
            stack = [conv, nn.BatchNorm1d(conv_out)]
            if not is_last:
                stack.append(nn.Tanh())
            stack.append(nn.Dropout(dropout))
            # the gain matches the activation that follows the convolution
            nn.init.xavier_uniform_(
                conv.weight,
                torch.nn.init.calculate_gain("linear" if is_last else "tanh"),
            )
            self.convolutions.append(nn.Sequential(*stack))

    def forward(self, x):
        hidden = x.transpose(1, 2)  # B x T x C -> B x C x T
        for conv in self.convolutions:
            hidden = conv(hidden)
        return hidden.transpose(1, 2)


def decoder_init(m):
    """Xavier-initialize (tanh gain) the weight of ``nn.Conv1d`` modules only."""
    if not isinstance(m, torch.nn.Conv1d):
        return
    tanh_gain = torch.nn.init.calculate_gain("tanh")
    nn.init.xavier_uniform_(m.weight, tanh_gain)
self.attention_lstm = LSTMCellWithZoneOut( args.zoneout, args.prenet_dim + args.encoder_embed_dim, args.decoder_lstm_dim, ) # take attention_lstm output, attention_state, encoder_out as input self.attention = LocationAttention( args.attention_dim, args.encoder_embed_dim, args.decoder_lstm_dim, (1 + int(args.attention_use_cumprob)), args.attention_conv_dim, args.attention_conv_kernel_size, ) # take attention_lstm output, context, (gated_latent) as input self.lstm = nn.ModuleList( LSTMCellWithZoneOut( args.zoneout, args.encoder_embed_dim + args.decoder_lstm_dim, args.decoder_lstm_dim, ) for i in range(args.decoder_lstm_layers) ) proj_in_dim = args.encoder_embed_dim + args.decoder_lstm_dim self.feat_proj = nn.Linear(proj_in_dim, self.out_dim) self.eos_proj = nn.Linear(proj_in_dim, 1) self.postnet = Postnet( self.out_dim, args.postnet_conv_dim, args.postnet_conv_kernel_size, args.postnet_layers, args.postnet_dropout, ) self.ctc_proj = None if getattr(args, "ctc_weight", 0.0) > 0.0: self.ctc_proj = nn.Linear(self.out_dim, len(src_dict)) self.apply(decoder_init) def _get_states(self, incremental_state, enc_out): bsz, in_len, _ = enc_out.size() alstm_h = self.get_incremental_state(incremental_state, "alstm_h") if alstm_h is None: alstm_h = enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) alstm_c = self.get_incremental_state(incremental_state, "alstm_c") if alstm_c is None: alstm_c = enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) lstm_h = self.get_incremental_state(incremental_state, "lstm_h") if lstm_h is None: lstm_h = [ enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) for _ in range(self.args.decoder_lstm_layers) ] lstm_c = self.get_incremental_state(incremental_state, "lstm_c") if lstm_c is None: lstm_c = [ enc_out.new_zeros(bsz, self.args.decoder_lstm_dim) for _ in range(self.args.decoder_lstm_layers) ] attn_w = self.get_incremental_state(incremental_state, "attn_w") if attn_w is None: attn_w = enc_out.new_zeros(bsz, in_len) attn_w_cum = 
def forward(
    self,
    prev_output_tokens,
    encoder_out=None,
    incremental_state=None,
    target_lengths=None,
    **kwargs,
):
    """Run the attention decoder over the previous output frames.

    Args:
        prev_output_tokens: previous frames, unpacked below as
            ``(bsz, out_len, dim)``. With ``incremental_state`` set, only
            the last frame is consumed.
        encoder_out: dict providing ``"encoder_out"`` (a list whose first
            item is indexed as ``(bsz, in_len, dim)``) and
            ``"encoder_padding_mask"``.
        incremental_state: recurrent-state cache for step-wise decoding,
            or ``None`` for a full teacher-forced pass.
        target_lengths: accepted for interface compatibility; unused here.

    Returns:
        ``(post_feat_out, eos_out, extra)`` where ``post_feat_out`` is
        ``feat_out`` plus the postnet residual, ``eos_out`` has shape
        ``(bsz, out_len, 1)`` and ``extra`` holds ``"attn"``
        (``bsz x in_len x out_len``) and ``"feature_out"``.
    """
    enc_mask = encoder_out["encoder_padding_mask"]
    enc_out = encoder_out["encoder_out"][0]
    in_len = enc_out.size(1)

    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:, :]
    bsz, out_len, _ = prev_output_tokens.size()

    prenet_out = self.prenet(prev_output_tokens)
    (alstm_h, alstm_c, lstm_h, lstm_c, attn_w, attn_w_cum) = self._get_states(
        incremental_state, enc_out
    )
    # The attention context is recurrent state: inside the loop below it is
    # carried from frame t to t + 1. Restore it from the cache so that
    # step-wise decoding sees the same input as the full pass; previously it
    # was reset to its initial value on every incremental call.
    attn_ctx = self.get_incremental_state(incremental_state, "attn_ctx")
    if attn_ctx is None:
        attn_ctx = self._get_init_attn_c(enc_out, enc_mask)

    attn_out = enc_out.new_zeros(bsz, in_len, out_len)
    feat_out = enc_out.new_zeros(bsz, out_len, self.out_dim)
    eos_out = enc_out.new_zeros(bsz, out_len)
    for t in range(out_len):
        # attention LSTM consumes the previous context and the prenet frame
        alstm_in = torch.cat((attn_ctx, prenet_out[:, t, :]), dim=1)
        alstm_h, alstm_c = self.attention_lstm(alstm_in, (alstm_h, alstm_c))

        attn_state = attn_w.unsqueeze(1)
        if self.args.attention_use_cumprob:
            attn_state = torch.stack((attn_w, attn_w_cum), dim=1)
        attn_ctx, attn_w = self.attention(enc_out, enc_mask, alstm_h, attn_state)
        attn_w_cum = attn_w_cum + attn_w
        attn_out[:, :, t] = attn_w

        # decoder LSTM stack: layer 0 reads the attention LSTM output,
        # deeper layers read the layer below
        for i, cur_lstm in enumerate(self.lstm):
            below = alstm_h if i == 0 else lstm_h[i - 1]
            lstm_in = torch.cat((attn_ctx, below), dim=1)
            lstm_h[i], lstm_c[i] = cur_lstm(lstm_in, (lstm_h[i], lstm_c[i]))

        proj_in = torch.cat((attn_ctx, lstm_h[-1]), dim=1)
        feat_out[:, t, :] = self.feat_proj(proj_in)
        eos_out[:, t] = self.eos_proj(proj_in).squeeze(1)
    self.attention.clear_cache()

    self.set_incremental_state(incremental_state, "alstm_h", alstm_h)
    self.set_incremental_state(incremental_state, "alstm_c", alstm_c)
    self.set_incremental_state(incremental_state, "lstm_h", lstm_h)
    self.set_incremental_state(incremental_state, "lstm_c", lstm_c)
    self.set_incremental_state(incremental_state, "attn_w", attn_w)
    self.set_incremental_state(incremental_state, "attn_w_cum", attn_w_cum)
    self.set_incremental_state(incremental_state, "attn_ctx", attn_ctx)

    post_feat_out = feat_out + self.postnet(feat_out)
    eos_out = eos_out.view(bsz, out_len, 1)
    return post_feat_out, eos_out, {"attn": attn_out, "feature_out": feat_out}
@register_model_architecture("tacotron_2", "tacotron_2")
def base_architecture(args):
    """Populate ``args`` with the base Tacotron 2 hyper-parameters.

    Attributes already present on ``args`` are left untouched; every missing
    one receives the default listed below. ``args`` is modified in place.
    """
    fallbacks = (
        # encoder
        ("encoder_dropout", 0.5),
        ("encoder_embed_dim", 512),
        ("encoder_conv_layers", 3),
        ("encoder_conv_kernel_size", 5),
        ("encoder_lstm_layers", 1),
        # decoder
        ("attention_dim", 128),
        ("attention_conv_dim", 32),
        ("attention_conv_kernel_size", 15),
        ("prenet_dropout", 0.5),
        ("prenet_layers", 2),
        ("prenet_dim", 256),
        ("postnet_dropout", 0.5),
        ("postnet_layers", 5),
        ("postnet_conv_dim", 512),
        ("postnet_conv_kernel_size", 5),
        ("init_attn_c", "zero"),
        ("attention_use_cumprob", True),
        ("zoneout", 0.1),
        ("decoder_lstm_layers", 2),
        ("decoder_lstm_dim", 1024),
        ("output_frame_dim", 80),
    )
    for attr, value in fallbacks:
        setattr(args, attr, getattr(args, attr, value))
def forward(self, src_tokens, src_lengths=None, speaker=None, **kwargs):
    """Encode source tokens into a time-major sequence of hidden states.

    The tokens are embedded, passed through the convolutional prenet and its
    projection, combined with scaled positional embeddings, and then run
    through the transformer layers. When a speaker embedding module is
    configured, the speaker vector is concatenated to every time step and
    projected back through ``spk_emb_proj``.

    Returns:
        dict with ``"encoder_out"`` (one ``T x B x C`` tensor),
        ``"encoder_padding_mask"`` (one ``B x T`` mask, or an empty list when
        nothing is padded) and empty placeholder lists for the other keys.
    """
    hidden = self.embed_tokens(src_tokens)

    # the conv prenet works channel-first: B x T x C -> B x C x T
    hidden = hidden.transpose(1, 2).contiguous()
    for conv_block in self.prenet:
        hidden = conv_block(hidden)
    # back to B x T x C
    hidden = hidden.transpose(1, 2).contiguous()
    hidden = self.prenet_proj(hidden)

    pad_mask = src_tokens.eq(self.padding_idx)
    pos_emb = self.embed_positions(pad_mask)
    hidden += self.pos_emb_alpha * pos_emb
    hidden = self.dropout_module(hidden)

    # B x T x C -> T x B x C
    hidden = hidden.transpose(0, 1)
    for encoder_layer in self.transformer_layers:
        hidden = encoder_layer(hidden, pad_mask)
    if self.layer_norm is not None:
        hidden = self.layer_norm(hidden)

    if self.embed_speaker is not None:
        n_steps, n_batch, _ = hidden.size()
        spk = self.embed_speaker(speaker).transpose(0, 1)
        spk = spk.expand(n_steps, n_batch, -1)
        hidden = self.spk_emb_proj(torch.cat([hidden, spk], dim=2))

    mask_list = [pad_mask] if pad_mask.any() else []
    return {
        "encoder_out": [hidden],  # T x B x C
        "encoder_padding_mask": mask_list,  # B x T
        "encoder_embedding": [],  # B x T x C
        "encoder_states": [],  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [],
    }
def extract_features(
    self,
    prev_outputs,
    encoder_out=None,
    incremental_state=None,
    target_lengths=None,
    speaker=None,
    **kwargs,
):
    """Run the decoder prenet and transformer stack, without output heads.

    Args:
        prev_outputs: previous output frames (``B x T x C``); in incremental
            mode only the last frame is used.
        encoder_out: optional encoder output dict; its first
            ``"encoder_out"`` / ``"encoder_padding_mask"`` entries are fed to
            every decoder layer when present.
        incremental_state: cache for step-wise decoding, or ``None``.
        target_lengths: lengths used to build the self-attention padding mask.
        speaker: accepted for interface compatibility; unused here.

    Returns:
        ``(x, extra)`` where ``x`` is ``B x T x C`` and ``extra`` holds
        ``"attn"`` (head-averaged attention of the last layer, or ``None``)
        and ``"inner_states"`` (input plus every layer output, ``T x B x C``).
    """
    last_layer = self.n_transformer_layers - 1
    incremental = incremental_state is not None

    pad_mask = lengths_to_padding_mask(target_lengths)
    pos_emb = self.embed_positions(pad_mask, incremental_state=incremental_state)

    if incremental:
        # only the newest step is processed; earlier ones live in the cache
        prev_outputs = prev_outputs[:, -1:, :]
        pad_mask = pad_mask[:, -1:]
        if pos_emb is not None:
            pos_emb = pos_emb[:, -1:]

    x = self.prenet(prev_outputs)
    x += self.pos_emb_alpha * pos_emb
    x = self.dropout_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    if not pad_mask.any():
        pad_mask = None

    attn: Optional[torch.Tensor] = None
    inner_states: List[Optional[torch.Tensor]] = [x]
    for idx, transformer_layer in enumerate(self.transformer_layers):
        # causal mask is only needed when the whole sequence is processed
        causal_mask = None if incremental else self.buffered_future_mask(x)

        memory = None
        if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:
            memory = encoder_out["encoder_out"][0]
        memory_pad_mask = None
        if (
            encoder_out is not None
            and len(encoder_out["encoder_padding_mask"]) > 0
        ):
            memory_pad_mask = encoder_out["encoder_padding_mask"][0]

        wants_attn = bool((idx == last_layer))
        x, layer_attn, _ = transformer_layer(
            x,
            memory,
            memory_pad_mask,
            incremental_state,
            self_attn_mask=causal_mask,
            self_attn_padding_mask=pad_mask,
            need_attn=wants_attn,
            need_head_weights=wants_attn,
        )
        inner_states.append(x)
        if layer_attn is not None and idx == last_layer:
            attn = layer_attn.float().to(x)

    if attn is not None:
        # average probabilities over heads, transpose to
        # (B, src_len, tgt_len)
        attn = attn.mean(dim=0).transpose(2, 1)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    return x, {"attn": attn, "inner_states": inner_states}
def buffered_future_mask(self, tensor):
    """Return a ``dim x dim`` causal mask, where ``dim = tensor.size(0)``.

    The mask is cached on ``self._future_mask`` and rebuilt only when the
    cache is empty, lives on another device, or is too small. It is then
    converted to match ``tensor`` and sliced to the requested size.
    """
    size = tensor.size(0)
    cached = self._future_mask
    # `cached.device != tensor.device` is not working in TorchScript; the
    # `not ... == ...` form is the workaround.
    stale = (
        cached.size(0) == 0
        or (not cached.device == tensor.device)
        or cached.size(0) < size
    )
    if stale:
        blank = torch.zeros([size, size])
        self._future_mask = torch.triu(utils.fill_with_neg_inf(blank), 1)
    self._future_mask = self._future_mask.to(tensor)
    return self._future_mask[:size, :size]
@register_model_architecture("tts_transformer", "tts_transformer")
def base_architecture(args):
    """Populate ``args`` with the base TTS-Transformer hyper-parameters.

    Attributes already set on ``args`` are preserved; missing ones receive
    the defaults below. The FFN widths default to four times the embedding
    width of their side, resolved after that width is in place.
    """

    def fill(name, fallback):
        # keep an existing value, otherwise install the fallback
        setattr(args, name, getattr(args, name, fallback))

    fill("dropout", 0.1)
    fill("output_frame_dim", 80)
    fill("speaker_embed_dim", 64)
    # encoder prenet
    fill("encoder_dropout", 0.5)
    fill("encoder_conv_layers", 3)
    fill("encoder_conv_kernel_size", 5)
    # encoder transformer layers
    fill("encoder_transformer_layers", 6)
    fill("encoder_embed_dim", 512)
    fill("encoder_ffn_embed_dim", 4 * args.encoder_embed_dim)
    fill("encoder_normalize_before", False)
    fill("encoder_attention_heads", 4)
    fill("attention_dropout", 0.0)
    fill("activation_dropout", 0.0)
    fill("activation_fn", "relu")
    # decoder prenet
    fill("prenet_dropout", 0.5)
    fill("prenet_layers", 2)
    fill("prenet_dim", 256)
    # decoder postnet
    fill("postnet_dropout", 0.5)
    fill("postnet_layers", 5)
    fill("postnet_conv_dim", 512)
    fill("postnet_conv_kernel_size", 5)
    # decoder transformer layers
    fill("decoder_transformer_layers", 6)
    fill("decoder_embed_dim", 512)
    fill("decoder_ffn_embed_dim", 4 * args.decoder_embed_dim)
    fill("decoder_normalize_before", False)
    fill("decoder_attention_heads", 4)
class PseudoInverseMelScale(torch.nn.Module):
    """Map a mel spectrogram back to linear frequency bins.

    The transform multiplies by the pseudo-inverse of the mel filter bank
    (stored in the ``basis`` buffer, ``F x F_mel``) and clamps the result at
    zero.
    """

    def __init__(self, n_stft, n_mels, sample_rate, f_min, f_max) -> None:
        super().__init__()
        self.n_mels = n_mels
        # (n_stft - 1) * 2 is passed to the filter-bank helper as the FFT size
        basis = get_mel_filters(sample_rate, (n_stft - 1) * 2, n_mels, f_min, f_max)
        basis = torch.pinverse(basis)  # F x F_mel
        self.register_buffer("basis", basis)

    def forward(self, melspec: torch.Tensor) -> torch.Tensor:
        """Invert ``melspec`` (``B_1 x ... x B_K x F_mel x T``).

        Returns a tensor of shape ``B_1 x ... x B_K x F x T``. Raises
        ``AssertionError`` when the mel dimension differs from ``n_mels``.
        """
        shape = melspec.shape  # B_1 x ... x B_K x F_mel x T
        n_mels, time = shape[-2], shape[-1]
        assert self.n_mels == n_mels, (self.n_mels, n_mels)
        freq, _ = self.basis.size()  # F x F_mel

        # pack batch; reshape (unlike view) also accepts inputs whose leading
        # dims are not contiguous, e.g. after a transpose of batch axes
        packed = melspec.reshape(-1, n_mels, time)
        specgram = self.basis.matmul(packed).clamp(min=0)

        # unpack batch
        return specgram.reshape(shape[:-2] + (freq, time))
def __init__(
    self,
    sample_rate,
    win_size,
    hop_size,
    n_fft,
    n_mels,
    f_min,
    f_max,
    window_fn,
    spec_bwd_max_iter=32,
    fp16=False,
):
    """Build the mel-inversion and Griffin-Lim stages of the vocoder.

    ``spec_bwd_max_iter`` is forwarded as the Griffin-Lim iteration count.
    With ``fp16`` the module and both stages are cast to half precision,
    otherwise to single precision.
    """
    super().__init__()
    # the mel inverter is sized with n_fft // 2 + 1 linear-frequency bins
    n_stft = n_fft // 2 + 1
    self.inv_mel_transform = PseudoInverseMelScale(
        n_stft=n_stft,
        n_mels=n_mels,
        sample_rate=sample_rate,
        f_min=f_min,
        f_max=f_max,
    )
    self.gl_transform = GriffinLim(
        n_fft=n_fft,
        win_length=win_size,
        hop_length=hop_size,
        window_fn=window_fn,
        n_iter=spec_bwd_max_iter,
    )
    for module in (self, self.inv_mel_transform, self.gl_transform):
        if fp16:
            module.half()
        else:
            module.float()
class HiFiGANVocoder(nn.Module):
    """Wrap a HiFi-GAN generator that turns spectrogram frames into audio."""

    def __init__(
        self, checkpoint_path: str, model_cfg: Dict[str, str], fp16: bool = False
    ) -> None:
        """Build the generator from ``model_cfg`` and load its weights.

        Args:
            checkpoint_path: file whose ``"generator"`` entry holds the
                generator state dict.
            model_cfg: configuration passed to the generator constructor.
            fp16: cast the generator to half precision after loading.
        """
        super().__init__()
        self.model = HiFiGANModel(model_cfg)
        # A checkpoint saved from a GPU cannot be deserialized on a CPU-only
        # host unless its tensors are remapped; same policy as
        # CodeHiFiGANVocoder.
        map_location = None if torch.cuda.is_available() else torch.device("cpu")
        state_dict = torch.load(checkpoint_path, map_location=map_location)
        self.model.load_state_dict(state_dict["generator"])
        if fp16:
            self.model.half()
        logger.info(f"loaded HiFiGAN checkpoint from {checkpoint_path}")

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Synthesize audio: ``(B x) T x D -> (B x) 1 x T``.

        A 2-D input is treated as one utterance: a batch axis is added for
        the generator and removed from the result. The output is detached.
        """
        # (B x) T x D -> (B x) 1 x T
        model = self.model.eval()
        if len(x.shape) == 2:
            return model(x.unsqueeze(0).transpose(1, 2)).detach().squeeze(0)
        else:
            return model(x.transpose(-1, -2)).detach()

    @classmethod
    def from_data_cfg(cls, args, data_cfg: S2TDataConfig):
        """Create the vocoder from the ``vocoder`` section of ``data_cfg``.

        Reads the generator configuration from the JSON file at
        ``vocoder["config"]`` and the weights from ``vocoder["checkpoint"]``;
        ``args.fp16`` selects half precision.
        """
        vocoder_cfg = data_cfg.vocoder
        assert vocoder_cfg.get("type", "griffin_lim") == "hifigan"
        with open(vocoder_cfg["config"]) as f:
            model_cfg = json.load(f)
        return cls(vocoder_cfg["checkpoint"], model_cfg, fp16=args.fp16)
def forward(self, x: Dict[str, torch.Tensor], dur_prediction=False) -> torch.Tensor:
    """Synthesize a waveform from discrete unit codes.

    Args:
        x: model inputs. Must contain ``"code"``; negative entries are
            treated as invalid and removed. An optional ``"f0"`` track is
            filtered with the same mask, expanded to the f0 frame rate.
            The dict is updated in place (``"code"``, ``"f0"`` and
            ``"dur_prediction"`` are overwritten).
        dur_prediction: forwarded to the generator as ``dur_prediction``.

    Returns:
        The generator output, detached and squeezed.
    """
    assert "code" in x
    x["dur_prediction"] = dur_prediction

    # remove invalid code
    mask = x["code"] >= 0
    x["code"] = x["code"][mask].unsqueeze(dim=0)
    if "f0" in x:
        # The up-sampling ratio relates f0 frames to the ORIGINAL code
        # positions (mask.size(1)). Using the filtered code length inflates
        # the ratio as soon as an invalid code was dropped, which breaks the
        # mask expansion below.
        f0_up_ratio = x["f0"].size(1) // mask.size(1)
        mask = mask.unsqueeze(2).repeat(1, 1, f0_up_ratio).view(-1, x["f0"].size(1))
        x["f0"] = x["f0"][mask].unsqueeze(dim=0)

    return self.model(**x).detach().squeeze()
args.vocoder == "griffin_lim": return GriffinLimVocoder.from_data_cfg(args, data_cfg) elif args.vocoder == "hifigan": return HiFiGANVocoder.from_data_cfg(args, data_cfg) elif args.vocoder == "code_hifigan": return CodeHiFiGANVocoder.from_data_cfg(args, data_cfg) else: raise ValueError("Unknown vocoder") ================================================ FILE: fairseq/models/transformer/__init__.py ================================================ # Copyright (c) Facebook Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """isort:skip_file""" from .transformer_config import ( TransformerConfig, DEFAULT_MAX_SOURCE_POSITIONS, DEFAULT_MAX_TARGET_POSITIONS, DEFAULT_MIN_PARAMS_TO_WRAP, ) from .transformer_decoder import TransformerDecoder, TransformerDecoderBase, Linear from .transformer_encoder import TransformerEncoder, TransformerEncoderBase from .transformer_legacy import ( TransformerModel, base_architecture, tiny_architecture, transformer_iwslt_de_en, transformer_wmt_en_de, transformer_vaswani_wmt_en_de_big, transformer_vaswani_wmt_en_fr_big, transformer_wmt_en_de_big, transformer_wmt_en_de_big_t2t, ) from .transformer_base import TransformerModelBase, Embedding __all__ = [ "TransformerModelBase", "TransformerConfig", "TransformerDecoder", "TransformerDecoderBase", "TransformerEncoder", "TransformerEncoderBase", "TransformerModel", "Embedding", "Linear", "base_architecture", "tiny_architecture", "transformer_iwslt_de_en", "transformer_wmt_en_de", "transformer_vaswani_wmt_en_de_big", "transformer_vaswani_wmt_en_fr_big", "transformer_wmt_en_de_big", "transformer_wmt_en_de_big_t2t", "DEFAULT_MAX_SOURCE_POSITIONS", "DEFAULT_MAX_TARGET_POSITIONS", "DEFAULT_MIN_PARAMS_TO_WRAP", ] ================================================ FILE: fairseq/models/transformer/transformer_base.py ================================================ # Copyright (c) Facebook, Inc. 
@classmethod
def add_args(cls, parser):
    """Add model-specific arguments to the parser."""
    # The options are generated from a default-constructed config; we want to
    # build the args recursively in this case.
    default_cfg = TransformerConfig()
    gen_parser_from_dataclass(
        parser,
        default_cfg,
        delete_default=False,
        with_prefix="",
    )
# TorchScript doesn't support optional arguments with variable length (**kwargs).
# Current workaround is to add union of all arguments in child classes.
def forward(
    self,
    src_tokens,
    src_lengths,
    prev_output_tokens,
    return_all_hiddens: bool = True,
    features_only: bool = False,
    alignment_layer: Optional[int] = None,
    alignment_heads: Optional[int] = None,
):
    """
    Encode the source and decode against it, returning the decoder output.

    Same flow as the base-class forward pass, spelled out with explicit
    keyword arguments because ``**kwargs`` is not supported by TorchScript.
    """
    memory = self.encoder(
        src_tokens,
        src_lengths=src_lengths,
        return_all_hiddens=return_all_hiddens,
    )
    return self.decoder(
        prev_output_tokens,
        encoder_out=memory,
        features_only=features_only,
        alignment_layer=alignment_layer,
        alignment_heads=alignment_heads,
        src_lengths=src_lengths,
        return_all_hiddens=return_all_hiddens,
    )
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` initialised for the transformer.

    Weights are drawn from ``N(0, embedding_dim ** -0.5)``. When a padding
    index is given, that row is reset to zero afterwards, because the normal
    initialisation overwrites it.

    Args:
        num_embeddings: number of rows in the embedding table.
        embedding_dim: width of each embedding vector.
        padding_idx: index of the padding row, or ``None`` for no padding row.

    Returns:
        The initialised ``nn.Embedding`` module.
    """
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
    # Indexing with ``None`` yields a view of the *whole* table, so zeroing
    # ``m.weight[None]`` would wipe every embedding; only touch a real row.
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m
@dataclass
class DecoderConfig(EncDecBaseConfig):
    # Decoder-specific settings on top of the shared encoder/decoder options.
    # Both dims default to an interpolation on the decoder embedding dim.
    input_dim: int = II("model.decoder.embed_dim")
    output_dim: int = field(
        default=II("model.decoder.embed_dim"),
        metadata={
            "help": "decoder output dimension (extra linear layer if different from decoder embed dim)"
        },
    )

    def __post_init__(self):
        # The II interpolation is not resolved when the object is created
        # outside of hydra, so replace any leftover placeholder with the
        # actual embedding dimension.
        placeholder = II("model.decoder.embed_dim")
        for dim_name in ("input_dim", "output_dim"):
            if getattr(self, dim_name) == placeholder:
                setattr(self, dim_name, self.embed_dim)
encoder: EncDecBaseConfig = EncDecBaseConfig() # TODO should really be in the encoder config max_source_positions: int = field( default=DEFAULT_MAX_SOURCE_POSITIONS, metadata={"help": "Maximum input length supported by the encoder"}, ) decoder: DecoderConfig = DecoderConfig() # TODO should really be in the decoder config max_target_positions: int = field( default=DEFAULT_MAX_TARGET_POSITIONS, metadata={"help": "Maximum output length supported by the decoder"}, ) share_decoder_input_output_embed: bool = field( default=False, metadata={"help": "share decoder input and output embeddings"} ) share_all_embeddings: bool = field( default=False, metadata={ "help": "share encoder, decoder and output embeddings (requires shared dictionary and embed dim)" }, ) merge_src_tgt_embed: bool = field( default=False, metadata={ "help": "if true then the source and target embedding table is " "merged into one table. This is going to make the model smaller but " "it might hurt performance." }, ) no_token_positional_embeddings: bool = field( default=False, metadata={ "help": "if True, disables positional embeddings (outside self attention)" }, ) adaptive_softmax_cutoff: Optional[List[int]] = field( default=None, metadata={ "help": "list of adaptive softmax cutoff points. 
Must be used with adaptive_loss criterion" }, ) adaptive_softmax_dropout: float = field( default=0.0, metadata={"help": "sets adaptive softmax dropout for the tail projections"}, ) adaptive_softmax_factor: float = field( default=4, metadata={"help": "adaptive input factor"} ) layernorm_embedding: bool = field( default=False, metadata={"help": "add layernorm to embedding"} ) tie_adaptive_weights: bool = field( default=False, metadata={ "help": "if set, ties the weights of adaptive softmax and adaptive input" }, ) tie_adaptive_proj: bool = field( default=False, metadata={ "help": "if set, ties the projection weights of adaptive softmax and adaptive input" }, ) no_scale_embedding: bool = field( default=False, metadata={"help": "if True, dont scale embeddings"} ) checkpoint_activations: bool = field( default=False, metadata={ "help": "checkpoint activations at each layer, which saves GPU memory usage at the cost of some additional compute" }, ) offload_activations: bool = field( default=False, metadata={ "help": "checkpoint activations at each layer, then save to gpu. Sets --checkpoint-activations." }, ) # args for "Cross+Self-Attention for Transformer Models" (Peitz et al., 2019) no_cross_attention: bool = field( default=False, metadata={"help": "do not perform cross-attention"} ) cross_self_attention: bool = field( default=False, metadata={"help": "perform cross+self-attention"} ) # args for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020) quant_noise: QuantNoiseConfig = field(default=QuantNoiseConfig()) min_params_to_wrap: int = field( default=DEFAULT_MIN_PARAMS_TO_WRAP, metadata={ "help": "minimum number of params for a layer to be wrapped with FSDP() when " "training with --ddp-backend=fully_sharded. Smaller values will " "improve memory efficiency, but may make torch.distributed " "communication less efficient due to smaller input sizes. 
@staticmethod
def _copy_keys(args, cls, prefix, seen):
    """
    Build a ``cls`` instance from a flat namespace.

    For every field of ``cls`` (e.g. ``embed_dim``) the prefixed key
    (e.g. ``decoder_embed_dim``) is copied first, then the bare key
    (``embed_dim``) if ``args`` has it, so the bare key wins when both are
    present. Every key that was consumed is recorded in ``seen``.
    """
    sub_cfg = cls()
    for dc_field in fields(cls):
        target = dc_field.name
        # Order matters: prefixed lookup first, un-prefixed lookup second.
        for source_key in (f"{prefix}_{target}", target):
            if not safe_hasattr(args, source_key):
                continue
            seen.add(source_key)
            setattr(sub_cfg, target, safe_getattr(args, source_key))
    return sub_cfg
def module_name_fordropout(module_name: str) -> str:
    """Map the class name to the name used for dropout bookkeeping.

    ``TransformerDecoderBase`` is reported as ``TransformerDecoder`` for
    backward compatibility in `make_generation_fast_`; every other name is
    returned unchanged.
    """
    legacy_name = "TransformerDecoder"
    return legacy_name if module_name == "TransformerDecoderBase" else module_name
def __init__(
    self,
    cfg,
    dictionary,
    embed_tokens,
    no_encoder_attn=False,
    output_projection=None,
):
    """Assemble the decoder from ``cfg``.

    Args:
        cfg: transformer configuration; read for dropout, decoder sizes,
            positional-embedding, quant-noise, layer-norm and output options.
        dictionary: decoding dictionary, passed to the parent constructor
            and to :meth:`build_output_projection`.
        embed_tokens: token embedding module; its ``embedding_dim`` and
            ``padding_idx`` are read here.
        no_encoder_attn: forwarded to :meth:`build_decoder_layer` for every
            layer.
        output_projection: optional pre-built output projection; when
            ``None`` one is created by :meth:`build_output_projection`.
    """
    self.cfg = cfg
    super().__init__(dictionary)
    # Version marker stored as a buffer; upgrade_state_dict_named reads the
    # matching key from loaded checkpoints.
    self.register_buffer("version", torch.Tensor([3]))
    # Empty placeholder; buffered_future_mask (re)builds it when needed.
    self._future_mask = torch.empty(0)

    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
    )
    self.decoder_layerdrop = cfg.decoder.layerdrop
    self.share_input_output_embed = cfg.share_decoder_input_output_embed

    input_embed_dim = embed_tokens.embedding_dim
    embed_dim = cfg.decoder.embed_dim
    self.embed_dim = embed_dim
    self.output_embed_dim = cfg.decoder.output_dim

    self.padding_idx = embed_tokens.padding_idx
    self.max_target_positions = cfg.max_target_positions

    self.embed_tokens = embed_tokens

    # Embeddings are multiplied by sqrt(embed_dim) unless scaling is disabled.
    self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)

    # Quantization noise is applied through an extra bias-free linear layer,
    # and only when adaptive input is off and a positive pq rate is set.
    if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            cfg.quant_noise.pq,
            cfg.quant_noise.pq_block_size,
        )
    else:
        self.quant_noise = None

    # Project the embeddings to the decoder width only when they differ.
    self.project_in_dim = (
        Linear(input_embed_dim, embed_dim, bias=False)
        if embed_dim != input_embed_dim
        else None
    )
    self.embed_positions = (
        PositionalEmbedding(
            self.max_target_positions,
            embed_dim,
            self.padding_idx,
            learned=cfg.decoder.learned_pos,
        )
        if not cfg.no_token_positional_embeddings
        else None
    )
    if cfg.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layernorm_embedding = None

    self.cross_self_attention = cfg.cross_self_attention

    # A LayerDrop-aware container is used only for a positive drop rate.
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend(
        [
            self.build_decoder_layer(cfg, no_encoder_attn)
            for _ in range(cfg.decoder.layers)
        ]
    )
    # Recorded before build_output_projection runs, which may insert
    # additional BaseLayer modules into self.layers.
    self.num_layers = len(self.layers)

    # Final layer norm only for pre-norm decoders that did not opt out.
    if cfg.decoder.normalize_before and not cfg.no_decoder_final_norm:
        self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layer_norm = None

    # Project features to the output width when it differs from the decoder
    # width, unless adaptive weights are tied.
    self.project_out_dim = (
        Linear(embed_dim, self.output_embed_dim, bias=False)
        if embed_dim != self.output_embed_dim and not cfg.tie_adaptive_weights
        else None
    )

    self.adaptive_softmax = None
    self.output_projection = output_projection
    if self.output_projection is None:
        self.build_output_projection(cfg, dictionary, embed_tokens)
def forward(
    self,
    prev_output_tokens,
    encoder_out: Optional[Dict[str, List[Tensor]]] = None,
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    features_only: bool = False,
    full_context_alignment: bool = False,
    alignment_layer: Optional[int] = None,
    alignment_heads: Optional[int] = None,
    src_lengths: Optional[Any] = None,
    return_all_hiddens: bool = False,
):
    """
    Run the decoder and, unless *features_only* is set, the output layer.

    Args:
        prev_output_tokens (LongTensor): previous decoder outputs of shape
            `(batch, tgt_len)`, for teacher forcing
        encoder_out (optional): output from the encoder, used for
            encoder-side attention, should be of size T x B x C
        incremental_state (dict): dictionary used for storing state during
            :ref:`Incremental decoding`
        features_only (bool, optional): only return features without
            applying output layer (default: False).
        full_context_alignment (bool, optional): don't apply
            auto-regressive mask to self-attention (default: False).
        alignment_layer, alignment_heads (int, optional): forwarded to
            :meth:`extract_features`.
        src_lengths, return_all_hiddens: accepted but not used here.

    Returns:
        tuple:
            - the decoder's output of shape `(batch, tgt_len, vocab)`
            - a dictionary with any model-specific outputs
    """
    features, extra = self.extract_features(
        prev_output_tokens,
        encoder_out=encoder_out,
        incremental_state=incremental_state,
        full_context_alignment=full_context_alignment,
        alignment_layer=alignment_layer,
        alignment_heads=alignment_heads,
    )
    if features_only:
        return features, extra
    return self.output_layer(features), extra
Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ bs, slen = prev_output_tokens.size() if alignment_layer is None: alignment_layer = self.num_layers - 1 enc: Optional[Tensor] = None padding_mask: Optional[Tensor] = None if encoder_out is not None and len(encoder_out["encoder_out"]) > 0: enc = encoder_out["encoder_out"][0] if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0: padding_mask = encoder_out["encoder_padding_mask"][0] # embed positions positions = None if self.embed_positions is not None: positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state ) if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # Prevent torchscript exporting issue for dynamic quant embedding prev_output_tokens = prev_output_tokens.contiguous() # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.quant_noise is not None: x = self.quant_noise(x) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions if self.layernorm_embedding is not None: x = self.layernorm_embedding(x) x = self.dropout_module(x) # B x T x C -> T x B x C x = x.transpose(0, 1) self_attn_padding_mask: Optional[Tensor] = None if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) # decoder layers attn: Optional[Tensor] = None inner_states: List[Optional[Tensor]] = [x] for idx, layer in enumerate(self.layers): if incremental_state is None and not full_context_alignment: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None x, layer_attn, _ = layer( x, enc, padding_mask, incremental_state, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, need_attn=bool((idx == 
def max_positions(self):
    """Maximum output length supported by the decoder.

    This is ``max_target_positions``, further capped by the positional
    embedding's own ``max_positions`` when positional embeddings exist.
    """
    pos_emb = self.embed_positions
    limit = self.max_target_positions
    if pos_emb is not None:
        limit = min(limit, pos_emb.max_positions)
    return limit
def Linear(in_features, out_features, bias=True):
    """Create an ``nn.Linear`` with Xavier-uniform weights and a zero bias.

    Args:
        in_features: input width.
        out_features: output width.
        bias: whether the layer has a bias term (zero-initialised if so).

    Returns:
        The initialised ``nn.Linear`` module.
    """
    layer = nn.Linear(in_features, out_features, bias=bias)
    nn.init.xavier_uniform_(layer.weight)
    if not bias:
        return layer
    nn.init.constant_(layer.bias, 0.0)
    return layer
def build_decoder_layer(self, args, no_encoder_attn=False):
    """Convert legacy ``args`` to a ``TransformerConfig`` and delegate to the base class."""
    cfg = TransformerConfig.from_namespace(args)
    return super().build_decoder_layer(cfg, no_encoder_attn=no_encoder_attn)
def __init__(
    self,
    cfg,
    dictionary,
    embed_tokens,
    output_projection=None,
    encoder_attn_merge_type="sequential",
    dropnet_ratio=0.0,
):
    """Build the base decoder, then replace its layers with augmented ones.

    Args:
        cfg: transformer configuration.
        dictionary: decoding dictionary.
        embed_tokens: token embedding module.
        output_projection: optional pre-built output projection, forwarded
            to the parent constructor.
        encoder_attn_merge_type: forwarded to :meth:`build_decoder_layer`
            for every layer.
        dropnet_ratio: forwarded to :meth:`build_decoder_layer` for every
            layer.
    """
    # NOTE(review): the parent constructor already fills ``self.layers`` by
    # calling ``self.build_decoder_layer(cfg, no_encoder_attn)`` positionally.
    # With the override in this class, that second positional argument binds
    # to ``encoder_attn_merge_type`` (i.e. ``False``). Those layers are then
    # discarded and rebuilt below -- confirm the double construction is
    # acceptable.
    super().__init__(
        cfg,
        dictionary,
        embed_tokens,
        no_encoder_attn=False,
        output_projection=output_projection,
    )
    # assert cfg.cross_self_attention
    self.cross_self_attention = cfg.cross_self_attention

    # Rebuild the layer stack with the augmented cross-attention settings.
    # NOTE(review): ``self.num_layers`` (set by the parent) is not recomputed,
    # and any BaseLayer modules the parent inserted into the previous list
    # are not carried over -- verify for configs with ``base_layers > 0``.
    if self.decoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.decoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend(
        [
            self.build_decoder_layer(cfg, encoder_attn_merge_type, dropnet_ratio)
            for _ in range(cfg.decoder.layers)
        ]
    )
def extract_features_scriptable(
    self,
    prev_output_tokens,
    encoder_out: Optional[Dict[str, List[Tensor]]],
    encoder_out_aug: Optional[Dict[str, List[Tensor]]],
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
    full_context_alignment: bool = False,
    alignment_layer: Optional[int] = None,
    alignment_heads: Optional[int] = None,
):
    """
    Similar to *forward* but only return features.

    Includes several features from "Jointly Learning to Align and
    Translate with Transformer Models" (Garg et al., EMNLP 2019).

    Args:
        prev_output_tokens: previous decoder outputs; must be 2-D (the
            size is unpacked into two values).
        encoder_out (optional): primary encoder output; the first entries
            of its ``"encoder_out"`` and ``"encoder_padding_mask"`` lists
            are used when present.
        encoder_out_aug (optional): second encoder output with the same
            keys, consumed by the additional cross-attention.
        incremental_state (optional): when given, only the last position
            of ``prev_output_tokens`` is processed and no future mask is
            applied.
        full_context_alignment (bool, optional): don't apply
            auto-regressive mask to self-attention (default: False).
        alignment_layer (int, optional): return mean alignment over
            heads at this layer (default: last layer).
        alignment_heads (int, optional): only average alignment over
            this many heads (default: all heads).

    Returns:
        tuple:
            - the decoder's features of shape `(batch, tgt_len, embed_dim)`
            - a dictionary with keys ``"attn"``, ``"attn_aug"`` (each a
              one-element list that may hold ``None``) and
              ``"inner_states"`` (the input to the first layer followed by
              every layer's output)
    """
    # bs / slen are not used below; the unpacking only succeeds for 2-D input.
    bs, slen = prev_output_tokens.size()
    if alignment_layer is None:
        alignment_layer = self.num_layers - 1

    # Primary encoder stream: take the first tensor of each list, if any.
    enc: Optional[Tensor] = None
    padding_mask: Optional[Tensor] = None
    if encoder_out is not None and len(encoder_out["encoder_out"]) > 0:
        enc = encoder_out["encoder_out"][0]
    if encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0:
        padding_mask = encoder_out["encoder_padding_mask"][0]

    # Augmented encoder stream, extracted the same way.
    enc_aug: Optional[Tensor] = None
    padding_mask_aug: Optional[Tensor] = None
    if encoder_out_aug is not None and len(encoder_out_aug["encoder_out"]) > 0:
        enc_aug = encoder_out_aug["encoder_out"][0]
    if (
        encoder_out_aug is not None
        and len(encoder_out_aug["encoder_padding_mask"]) > 0
    ):
        padding_mask_aug = encoder_out_aug["encoder_padding_mask"][0]

    # embed positions
    # (computed on the full prefix, before it is truncated to the last step)
    positions = None
    if self.embed_positions is not None:
        positions = self.embed_positions(
            prev_output_tokens, incremental_state=incremental_state
        )

    # Incremental decoding: only the newest token (and its position) is fed.
    if incremental_state is not None:
        prev_output_tokens = prev_output_tokens[:, -1:]
        if positions is not None:
            positions = positions[:, -1:]

    # Prevent torchscript exporting issue for dynamic quant embedding
    prev_output_tokens = prev_output_tokens.contiguous()
    # embed tokens and positions
    x = self.embed_scale * self.embed_tokens(prev_output_tokens)

    if self.quant_noise is not None:
        x = self.quant_noise(x)

    if self.project_in_dim is not None:
        x = self.project_in_dim(x)

    if positions is not None:
        x += positions

    if self.layernorm_embedding is not None:
        x = self.layernorm_embedding(x)

    x = self.dropout_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    # A padding mask is built only when cross+self attention is enabled or
    # the (possibly truncated) input actually contains padding tokens.
    self_attn_padding_mask: Optional[Tensor] = None
    if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
        self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

    # decoder layers
    attn: Optional[Tensor] = None
    attn_aug: Optional[Tensor] = None
    inner_states: List[Optional[Tensor]] = [x]
    for idx, layer in enumerate(self.layers):
        # The future mask is skipped during incremental decoding and when
        # full-context alignment is requested.
        if incremental_state is None and not full_context_alignment:
            self_attn_mask = self.buffered_future_mask(x)
        else:
            self_attn_mask = None

        # Each layer returns four values; the last one is ignored here.
        # Attention weights are requested only from the alignment layer.
        x, layer_attn, layer_attn_aug, _ = layer(
            x,
            enc,
            padding_mask,
            enc_aug,
            padding_mask_aug,
            incremental_state,
            self_attn_mask=self_attn_mask,
            self_attn_padding_mask=self_attn_padding_mask,
            need_attn=bool((idx == alignment_layer)),
            need_head_weights=bool((idx == alignment_layer)),
        )
        inner_states.append(x)
        # Keep the alignment layer's weights, cast to float and then moved
        # to x's dtype/device.
        if layer_attn is not None and idx == alignment_layer:
            attn = layer_attn.float().to(x)
        if layer_attn_aug is not None and idx == alignment_layer:
            attn_aug = layer_attn_aug.float().to(x)

    if attn is not None:
        # Optionally keep only the leading ``alignment_heads`` entries of
        # dim 0 (presumably the head dimension -- TODO confirm).
        if alignment_heads is not None:
            attn = attn[:alignment_heads]

        # average probabilities over heads
        attn = attn.mean(dim=0)

    if attn_aug is not None:
        if alignment_heads is not None:
            attn_aug = attn_aug[:alignment_heads]

        # average probabilities over heads
        attn_aug = attn_aug.mean(dim=0)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    if self.project_out_dim is not None:
        x = self.project_out_dim(x)

    return x, {"attn": [attn], "attn_aug": [attn_aug], "inner_states": inner_states}
class AugTransformerDecoder(AugTransformerDecoderBase):
    """Namespace-based front end for :class:`AugTransformerDecoderBase`.

    Every entry point accepts a legacy ``args`` namespace, converts it with
    ``TransformerConfig.from_namespace`` and delegates to the base class.
    Encoder attention is always enabled (``no_encoder_attn=False``).
    """

    def __init__(
        self,
        args,
        dictionary,
        embed_tokens,
        output_projection=None,
    ):
        # Keep the raw namespace around; it is stored before the base
        # constructor runs, exactly as the base class is then initialised
        # from the converted config.
        self.args = args
        cfg = TransformerConfig.from_namespace(args)
        # Optional knobs fall back to "sequential" / 0 when the namespace
        # does not define them.
        merge_type = getattr(
            args, "synthesizer_augmented_cross_attention_merge_type", "sequential"
        )
        ratio = getattr(args, "dropnet_ratio", 0)
        super().__init__(
            cfg,
            dictionary,
            embed_tokens,
            no_encoder_attn=False,
            output_projection=output_projection,
            encoder_attn_merge_type=merge_type,
            dropnet_ratio=ratio,
        )

    def build_output_projection(self, args, dictionary, embed_tokens):
        """Convert *args* to a config and let the base class build the projection."""
        cfg = TransformerConfig.from_namespace(args)
        super().build_output_projection(cfg, dictionary, embed_tokens)

    def build_decoder_layer(
        self,
        args,
        encoder_attn_merge_type="sequential",
        dropnet_ratio=0,
    ):
        """Convert *args* to a config and return the base class's decoder layer."""
        cfg = TransformerConfig.from_namespace(args)
        return super().build_decoder_layer(
            cfg,
            no_encoder_attn=False,
            encoder_attn_merge_type=encoder_attn_merge_type,
            dropnet_ratio=dropnet_ratio,
        )
# rewrite name for backward compatibility in `make_generation_fast_`
def module_name_fordropout(module_name: str) -> str:
    """Return the module name to hand to the dropout module.

    ``"TransformerEncoderBase"`` is reported as ``"TransformerEncoder"``;
    every other name is returned unchanged.
    """
    return (
        "TransformerEncoder"
        if module_name == "TransformerEncoderBase"
        else module_name
    )
def __init__(self, cfg, dictionary, embed_tokens, return_fc=False):
    """Build the embedding pipeline and the stack of encoder layers.

    Args:
        cfg: configuration object; the fields read here are ``dropout``,
            ``encoder.layerdrop``, ``max_source_positions``,
            ``no_scale_embedding``, ``encoder.learned_pos``,
            ``no_token_positional_embeddings``, ``layernorm_embedding``,
            ``export``, ``adaptive_input``, ``quant_noise.pq``,
            ``quant_noise.pq_block_size``, ``encoder.layers`` and
            ``encoder.normalize_before``.
        dictionary: passed straight to the parent constructor.
        embed_tokens: token embedding module; its ``embedding_dim`` and
            ``padding_idx`` attributes are read.
        return_fc (bool, optional): stored on the instance and forwarded to
            every layer built by ``build_encoder_layer`` (default: False).
    """
    self.cfg = cfg
    super().__init__(dictionary)
    # Version marker stored with the module's buffers;
    # ``upgrade_state_dict_named`` compares the stored value against 2.
    self.register_buffer("version", torch.Tensor([3]))

    self.dropout_module = FairseqDropout(
        cfg.dropout, module_name=module_name_fordropout(self.__class__.__name__)
    )
    self.encoder_layerdrop = cfg.encoder.layerdrop
    self.return_fc = return_fc

    embed_dim = embed_tokens.embedding_dim
    self.padding_idx = embed_tokens.padding_idx
    self.max_source_positions = cfg.max_source_positions

    self.embed_tokens = embed_tokens

    # Token embeddings are multiplied by sqrt(embed_dim) unless disabled.
    self.embed_scale = 1.0 if cfg.no_scale_embedding else math.sqrt(embed_dim)

    # Positional embeddings are optional; ``None`` means "tokens only".
    self.embed_positions = (
        PositionalEmbedding(
            cfg.max_source_positions,
            embed_dim,
            self.padding_idx,
            learned=cfg.encoder.learned_pos,
        )
        if not cfg.no_token_positional_embeddings
        else None
    )
    if cfg.layernorm_embedding:
        self.layernorm_embedding = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layernorm_embedding = None

    # Extra bias-free linear map wrapped in quant-noise; only created when
    # adaptive input is off and a positive pq value is configured.
    if not cfg.adaptive_input and cfg.quant_noise.pq > 0:
        self.quant_noise = apply_quant_noise_(
            nn.Linear(embed_dim, embed_dim, bias=False),
            cfg.quant_noise.pq,
            cfg.quant_noise.pq_block_size,
        )
    else:
        self.quant_noise = None

    # A positive layerdrop value selects the LayerDrop-aware container.
    if self.encoder_layerdrop > 0.0:
        self.layers = LayerDropModuleList(p=self.encoder_layerdrop)
    else:
        self.layers = nn.ModuleList([])
    self.layers.extend(
        [self.build_encoder_layer(cfg) for i in range(cfg.encoder.layers)]
    )
    self.num_layers = len(self.layers)

    # A final LayerNorm after the stack exists only in the
    # ``normalize_before`` configuration.
    if cfg.encoder.normalize_before:
        self.layer_norm = LayerNorm(embed_dim, export=cfg.export)
    else:
        self.layer_norm = None
def forward_embedding(
    self, src_tokens, token_embedding: Optional[torch.Tensor] = None
):
    """Embed *src_tokens* and run the embedding post-processing pipeline.

    Args:
        src_tokens: token ids handed to ``self.embed_tokens`` and, when
            present, to ``self.embed_positions``.
        token_embedding (torch.Tensor, optional): precomputed token
            embeddings; when ``None`` they are looked up from
            ``self.embed_tokens``.

    Returns:
        tuple:
            - the fully processed embedding (positions, optional LayerNorm,
              dropout and optional quant-noise applied, in that order)
            - the scaled token embedding before any of those steps
    """
    if token_embedding is None:
        token_embedding = self.embed_tokens(src_tokens)

    # Scaled lookup; returned untouched as the second element.
    embed = self.embed_scale * token_embedding
    hidden = embed

    if self.embed_positions is not None:
        hidden = embed + self.embed_positions(src_tokens)
    if self.layernorm_embedding is not None:
        hidden = self.layernorm_embedding(hidden)

    hidden = self.dropout_module(hidden)

    if self.quant_noise is not None:
        hidden = self.quant_noise(hidden)
    return hidden, embed
# Current workaround is to add a helper function with different name and
# call the helper function from scriptable Subclass.
def forward_scriptable(
    self,
    src_tokens,
    src_lengths: Optional[torch.Tensor] = None,
    return_all_hiddens: bool = False,
    token_embeddings: Optional[torch.Tensor] = None,
):
    """
    Run the encoder; this is the implementation that ``forward`` delegates to.

    Args:
        src_tokens (LongTensor): tokens in the source language of shape
            `(batch, src_len)`
        src_lengths (torch.LongTensor): lengths of each source sentence of
            shape `(batch)`. The value passed in is not read: the lengths
            are recomputed from *src_tokens* before returning.
        return_all_hiddens (bool, optional): also return all of the
            intermediate hidden states (default: False).
        token_embeddings (torch.Tensor, optional): precomputed embeddings
            default `None` will recompute embeddings

    Returns:
        dict:
            - **encoder_out** (Tensor): the last encoder layer's
              output of shape `(src_len, batch, embed_dim)`
            - **encoder_padding_mask** (ByteTensor): the positions of
              padding elements of shape `(batch, src_len)`
            - **encoder_embedding** (Tensor): the (scaled) embedding lookup
              of shape `(batch, src_len, embed_dim)`
            - **encoder_states** (List[Tensor]): all intermediate
              hidden states of shape `(src_len, batch, embed_dim)`.
              Only populated if *return_all_hiddens* is True.
            - **fc_results** (List): the second element of each layer's
              output when a layer returns a 2-tuple, otherwise ``None``
              per layer. Only appended to when *return_all_hiddens* is
              True and the code is not being scripted.
            - **src_tokens** (List): always the empty list.
            - **src_lengths** (List[Tensor]): one tensor of shape
              `(batch, 1)` counting the non-padding tokens per sentence.
    """
    # compute padding mask
    encoder_padding_mask = src_tokens.eq(self.padding_idx)
    # On XLA devices the flag is forced on; otherwise it reflects whether
    # any padding is actually present in this batch.
    has_pads = (
        torch.tensor(src_tokens.device.type == "xla") or encoder_padding_mask.any()
    )
    # Torchscript doesn't handle bool Tensor correctly, so we need to work around.
    if torch.jit.is_scripting():
        has_pads = torch.tensor(1) if has_pads else torch.tensor(0)

    x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)

    # account for padding while computing the representation
    # (padded positions are zeroed; with has_pads == 0 the factor is 1 and
    # x passes through unchanged)
    x = x * (
        1 - encoder_padding_mask.unsqueeze(-1).type_as(x) * has_pads.type_as(x)
    )

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    encoder_states = []
    fc_results = []

    # The embedding output is recorded as the first hidden state.
    if return_all_hiddens:
        encoder_states.append(x)

    # encoder layers
    for layer in self.layers:
        # The mask is only handed to the layer when padding is present.
        lr = layer(
            x, encoder_padding_mask=encoder_padding_mask if has_pads else None
        )

        # A layer may return either the hidden state alone or a
        # (hidden state, fc result) pair.
        if isinstance(lr, tuple) and len(lr) == 2:
            x, fc_result = lr
        else:
            x = lr
            fc_result = None

        if return_all_hiddens and not torch.jit.is_scripting():
            assert encoder_states is not None
            encoder_states.append(x)
            fc_results.append(fc_result)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in
    # `forward` so we use a dictionary instead.
    # TorchScript does not support mixed values so the values are all lists.
    # The empty list is equivalent to None.
    # NOTE: this rebinds the ``src_lengths`` parameter with a freshly
    # computed count of non-padding tokens.
    src_lengths = (
        src_tokens.ne(self.padding_idx)
        .sum(dim=1, dtype=torch.int32)
        .reshape(-1, 1)
        .contiguous()
    )
    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask],  # B x T
        "encoder_embedding": [encoder_embedding],  # B x T x C
        "encoder_states": encoder_states,  # List[T x B x C]
        "fc_results": fc_results,  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [src_lengths],
    }
def max_positions(self):
    """Return the longest input the encoder accepts.

    This is ``self.max_source_positions``, additionally capped by the
    positional embedding's own ``max_positions`` when positional
    embeddings are in use.
    """
    limit = self.max_source_positions
    if self.embed_positions is not None:
        limit = min(limit, self.embed_positions.max_positions)
    return limit
class TransformerEncoder(TransformerEncoderBase):
    """Namespace-based front end for :class:`TransformerEncoderBase`.

    Accepts a legacy ``args`` namespace, converts it with
    ``TransformerConfig.from_namespace`` and delegates to the base class.
    """

    def __init__(self, args, dictionary, embed_tokens, return_fc=False):
        # The raw namespace is kept on the instance, set before the base
        # constructor runs.
        self.args = args
        cfg = TransformerConfig.from_namespace(args)
        super().__init__(cfg, dictionary, embed_tokens, return_fc=return_fc)

    def build_encoder_layer(self, args):
        """Convert *args* to a config and build one layer via the base class."""
        cfg = TransformerConfig.from_namespace(args)
        return super().build_encoder_layer(cfg)
@classmethod
def hub_models(cls):
    """Return the mapping from hub model name to its archive description.

    Each value is either a bare archive URL or a dict holding the URL under
    ``'path'`` together with the ``'tokenizer'`` and ``'bpe'`` settings.
    """
    # fmt: off

    def moses(path, bpe):
        # Moses tokenizer combined with the given BPE implementation.
        return {
            'path': path,
            'tokenizer': 'moses',
            'bpe': bpe,
        }

    def spm(path):
        # SentencePiece BPE with plain whitespace tokenization.
        return {
            'path': path,
            'bpe': 'sentencepiece',
            'tokenizer': 'space',
        }

    return {
        'transformer.wmt14.en-fr': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2', 'subword_nmt'),
        'transformer.wmt16.en-de': 'https://dl.fbaipublicfiles.com/fairseq/models/wmt16.en-de.joined-dict.transformer.tar.bz2',
        'transformer.wmt18.en-de': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt18.en-de.ensemble.tar.gz', 'subword_nmt'),
        'transformer.wmt19.en-de': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz', 'fastbpe'),
        'transformer.wmt19.en-ru': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz', 'fastbpe'),
        'transformer.wmt19.de-en': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz', 'fastbpe'),
        'transformer.wmt19.ru-en': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz', 'fastbpe'),
        'transformer.wmt19.en-de.single_model': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.single_model.tar.gz', 'fastbpe'),
        'transformer.wmt19.en-ru.single_model': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.single_model.tar.gz', 'fastbpe'),
        'transformer.wmt19.de-en.single_model': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.single_model.tar.gz', 'fastbpe'),
        'transformer.wmt19.ru-en.single_model': moses('https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.single_model.tar.gz', 'fastbpe'),
        'transformer.wmt20.en-ta': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-ta.single.tar.gz'),
        'transformer.wmt20.en-iu.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.news.single.tar.gz'),
        'transformer.wmt20.en-iu.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.en-iu.nh.single.tar.gz'),
        'transformer.wmt20.ta-en': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.ta-en.single.tar.gz'),
        'transformer.wmt20.iu-en.news': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.news.single.tar.gz'),
        'transformer.wmt20.iu-en.nh': spm('https://dl.fbaipublicfiles.com/fairseq/models/wmt20.iu-en.nh.single.tar.gz'),
        'transformer.flores101.mm100.615M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_615M.tar.gz'),
        'transformer.flores101.mm100.175M': spm('https://dl.fbaipublicfiles.com/flores101/pretrained_models/flores101_mm100_175M.tar.gz'),
    }
    # fmt: on
@classmethod
def build_model(cls, args, task):
    """Build a new model instance.

    Normalizes the legacy ``args`` namespace in place, converts it to a
    ``TransformerConfig`` and delegates to the parent ``build_model``.

    Raises:
        ValueError: if ``--share-all-embeddings`` is set but the source and
            target dictionaries differ, the encoder and decoder embedding
            dimensions differ, or a distinct decoder embedding path is given.
    """
    # make sure all arguments are present in older models
    base_architecture(args)

    # An explicit "layers to keep" list determines the layer count.
    for side in ("encoder", "decoder"):
        keep = getattr(args, f"{side}_layers_to_keep")
        if keep:
            setattr(args, f"{side}_layers", len(keep.split(",")))

    position_defaults = (
        ("max_source_positions", DEFAULT_MAX_SOURCE_POSITIONS),
        ("max_target_positions", DEFAULT_MAX_TARGET_POSITIONS),
    )
    for attr, fallback in position_defaults:
        if getattr(args, attr, None) is None:
            setattr(args, attr, fallback)

    src_dict, tgt_dict = task.source_dictionary, task.target_dictionary

    share_all = args.share_all_embeddings
    if share_all:
        if src_dict != tgt_dict:
            raise ValueError("--share-all-embeddings requires a joined dictionary")
        if args.encoder_embed_dim != args.decoder_embed_dim:
            raise ValueError(
                "--share-all-embeddings requires --encoder-embed-dim to match --decoder-embed-dim"
            )
        decoder_path = args.decoder_embed_path
        if decoder_path and decoder_path != args.encoder_embed_path:
            raise ValueError(
                "--share-all-embeddings not compatible with --decoder-embed-path"
            )
        args.share_decoder_input_output_embed = True

    if getattr(args, "offload_activations", False):
        args.checkpoint_activations = True  # offloading implies checkpointing

    # The wrap threshold is only defaulted when embeddings are not shared.
    if not share_all:
        args.min_params_to_wrap = getattr(
            args, "min_params_to_wrap", DEFAULT_MIN_PARAMS_TO_WRAP
        )

    return super().build_model(TransformerConfig.from_namespace(args), task)
@register_model_architecture("transformer", "transformer")
def base_architecture(args):
    """Fill in every base transformer hyper-parameter missing from *args*.

    Attributes already present on *args* are left as they are; absent ones
    receive the defaults below. Some decoder defaults mirror the (possibly
    just defaulted) encoder values, so the order of assignment matters.
    If ``offload_activations`` ends up truthy, ``checkpoint_activations``
    is forced to ``True``.
    """

    def fill(name, fallback):
        # Keep an existing value, otherwise install the fallback.
        setattr(args, name, getattr(args, name, fallback))

    def fill_all(pairs):
        for name, fallback in pairs:
            fill(name, fallback)

    fill_all(
        (
            ("encoder_embed_path", None),
            ("encoder_embed_dim", 512),
            ("encoder_ffn_embed_dim", 2048),
            ("encoder_layers", 6),
            ("encoder_attention_heads", 8),
            ("encoder_normalize_before", False),
            ("encoder_learned_pos", False),
            ("decoder_embed_path", None),
        )
    )

    # Decoder sizes default to whatever the encoder uses.
    fill("decoder_embed_dim", args.encoder_embed_dim)
    fill("decoder_ffn_embed_dim", args.encoder_ffn_embed_dim)

    fill_all(
        (
            ("decoder_layers", 6),
            ("decoder_attention_heads", 8),
            ("decoder_normalize_before", False),
            ("decoder_learned_pos", False),
            ("attention_dropout", 0.0),
            ("activation_dropout", 0.0),
            ("activation_fn", "relu"),
            ("dropout", 0.1),
            ("adaptive_softmax_cutoff", None),
            ("adaptive_softmax_dropout", 0),
            ("share_decoder_input_output_embed", False),
            ("share_all_embeddings", False),
            ("merge_src_tgt_embed", False),
            ("no_token_positional_embeddings", False),
            ("adaptive_input", False),
            ("no_cross_attention", False),
            ("cross_self_attention", False),
        )
    )

    # Decoder input/output widths default to the decoder embedding width.
    fill("decoder_output_dim", args.decoder_embed_dim)
    fill("decoder_input_dim", args.decoder_embed_dim)

    fill_all(
        (
            ("no_scale_embedding", False),
            ("layernorm_embedding", False),
            ("tie_adaptive_weights", False),
            ("checkpoint_activations", False),
            ("offload_activations", False),
        )
    )
    if args.offload_activations:
        args.checkpoint_activations = True

    fill_all(
        (
            ("encoder_layers_to_keep", None),
            ("decoder_layers_to_keep", None),
            ("encoder_layerdrop", 0),
            ("decoder_layerdrop", 0),
            ("quant_noise_pq", 0),
            ("quant_noise_pq_block_size", 8),
            ("quant_noise_scalar", 0),
        )
    )
# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
@register_model_architecture("transformer", "transformer_vaswani_wmt_en_de_big")
def transformer_vaswani_wmt_en_de_big(args):
    """Apply the "big" defaults to *args*, then the base architecture defaults.

    Only attributes missing from *args* are set; the remaining
    hyper-parameters are filled in by ``base_architecture``.
    """
    big_defaults = (
        ("encoder_embed_dim", 1024),
        ("encoder_ffn_embed_dim", 4096),
        ("encoder_attention_heads", 16),
        ("encoder_normalize_before", False),
        ("decoder_embed_dim", 1024),
        ("decoder_ffn_embed_dim", 4096),
        ("decoder_attention_heads", 16),
        ("dropout", 0.3),
    )
    for name, fallback in big_defaults:
        setattr(args, name, getattr(args, name, fallback))
    base_architecture(args)
@register_model("transformer_align")
class TransformerAlignModel(TransformerModel):
    """
    See "Jointly Learning to Align and Translate with Transformer
    Models" (Garg et al., EMNLP 2019).
    """

    def __init__(self, encoder, decoder, args):
        # Note the argument order: the parent takes (args, encoder, decoder).
        super().__init__(args, encoder, decoder)
        self.alignment_heads = args.alignment_heads
        self.alignment_layer = args.alignment_layer
        self.full_context_alignment = args.full_context_alignment

    @staticmethod
    def add_args(parser):
        """Register the parent's options plus the alignment options."""
        # fmt: off
        super(TransformerAlignModel, TransformerAlignModel).add_args(parser)
        parser.add_argument(
            '--alignment-heads',
            type=int,
            metavar='D',
            help='Number of cross attention heads per layer to supervised with alignments',
        )
        parser.add_argument(
            '--alignment-layer',
            type=int,
            metavar='D',
            help='Layer number which has to be supervised. 0 corresponding to the bottommost layer.',
        )
        parser.add_argument(
            '--full-context-alignment',
            action='store_true',
            help='Whether or not alignment is supervised conditioned on the full target context.',
        )
        # fmt: on

    @classmethod
    def build_model(cls, args, task):
        """Build a plain transformer, then re-wrap its encoder and decoder."""
        # set any default arguments
        transformer_align(args)

        base_model = TransformerModel.build_model(args, task)
        return TransformerAlignModel(base_model.encoder, base_model.decoder, args)

    def forward(self, src_tokens, src_lengths, prev_output_tokens):
        """Encode the source, then decode through ``forward_decoder``."""
        return self.forward_decoder(
            prev_output_tokens, self.encoder(src_tokens, src_lengths)
        )

    def forward_decoder(
        self,
        prev_output_tokens,
        encoder_out=None,
        incremental_state=None,
        features_only=False,
        **extra_args,
    ):
        """Run the decoder and, optionally, a second full-context pass.

        The first decoder call receives only the alignment layer/head
        settings; ``incremental_state`` and ``features_only`` are accepted
        but not forwarded. When ``self.full_context_alignment`` is set, the
        decoder runs a second time with ``features_only=True`` and
        *extra_args*, and its ``"attn"`` entry replaces the one from the
        first pass.
        """
        alignment_kwargs = {
            "alignment_layer": self.alignment_layer,
            "alignment_heads": self.alignment_heads,
        }
        decoder_out = self.decoder(prev_output_tokens, encoder_out, **alignment_kwargs)

        if not self.full_context_alignment:
            return decoder_out

        _, full_context_extra = self.decoder(
            prev_output_tokens,
            encoder_out,
            features_only=True,
            full_context_alignment=self.full_context_alignment,
            **alignment_kwargs,
            **extra_args,
        )
        decoder_out[1]["attn"] = full_context_extra["attn"]
        return decoder_out
@register_model("transformer_from_pretrained_xlm")
class TransformerFromPretrainedXLMModel(TransformerModel):
    """Transformer whose encoder and/or decoder is initialized from an XLM checkpoint."""

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        TransformerModel.add_args(parser)
        parser.add_argument(
            "--pretrained-xlm-checkpoint",
            type=str,
            metavar="STR",
            help="XLM model to use for initializing transformer encoder and/or decoder",
        )
        parser.add_argument(
            "--init-encoder-only",
            action="store_true",
            help="if set, don't load the XLM weights and embeddings into decoder",
        )
        parser.add_argument(
            "--init-decoder-only",
            action="store_true",
            help="if set, don't load the XLM weights and embeddings into encoder",
        )

    @classmethod
    def build_model(cls, args, task, cls_dictionary=MaskedLMDictionary):
        """Validate the XLM-specific settings, then build via the parent class.

        Args:
            args: parsed arguments; ``pretrained_xlm_checkpoint``,
                ``init_encoder_only`` and ``init_decoder_only`` are checked.
            task: task whose source and target dictionaries must both be
                instances of *cls_dictionary*.
            cls_dictionary: required dictionary class
                (default: ``MaskedLMDictionary``).

        Raises:
            AssertionError: if no checkpoint path was given, if a dictionary
                has the wrong type, or if both ``--init-encoder-only`` and
                ``--init-decoder-only`` are set.
        """
        # ``add_args`` always registers --pretrained-xlm-checkpoint, so the
        # attribute exists (as None) even when the user omitted the flag.
        # Check the value rather than mere attribute presence.
        assert getattr(args, "pretrained_xlm_checkpoint", None) is not None, (
            "You must specify a path for --pretrained-xlm-checkpoint to use "
            "--arch transformer_from_pretrained_xlm"
        )
        assert isinstance(task.source_dictionary, cls_dictionary) and isinstance(
            task.target_dictionary, cls_dictionary
        ), (
            "You should use a MaskedLMDictionary when using --arch "
            "transformer_from_pretrained_xlm because the pretrained XLM model "
            "was trained using data binarized with MaskedLMDictionary. "
            "For translation, you may want to use --task "
            "translation_from_pretrained_xlm"
        )
        assert not (
            getattr(args, "init_encoder_only", False)
            and getattr(args, "init_decoder_only", False)
        ), "Only one of --init-encoder-only and --init-decoder-only can be set."
        return super().build_model(args, task)

    @classmethod
    def build_encoder(cls, args, src_dict, embed_tokens):
        """Return an encoder that loads XLM weights in its constructor."""
        return TransformerEncoderFromPretrainedXLM(args, src_dict, embed_tokens)

    @classmethod
    def build_decoder(cls, args, tgt_dict, embed_tokens):
        """Return a decoder that loads XLM weights in its constructor."""
        return TransformerDecoderFromPretrainedXLM(args, tgt_dict, embed_tokens)
class TransformerEncoderFromPretrainedXLM(TransformerEncoder):
    """Transformer encoder that overwrites its freshly initialised weights
    with matching weights from a pretrained XLM checkpoint."""

    def __init__(self, args, dictionary, embed_tokens):
        """Build the encoder and, unless ``--init-decoder-only`` is set,
        load the XLM weights into it.

        Raises:
            AssertionError: if ``args.pretrained_xlm_checkpoint`` is missing
                or None while XLM weights are required.
        """
        super().__init__(args, dictionary, embed_tokens)
        if getattr(args, "init_decoder_only", False):
            # Don't load XLM weights for encoder if --init-decoder-only
            return

        # Check the value, not just the attribute: an argparse namespace
        # carries the option as None when the flag was omitted, and a None
        # path would otherwise only fail later inside the path check.
        assert getattr(args, "pretrained_xlm_checkpoint", None) is not None, (
            "--pretrained-xlm-checkpoint must be specified to load Transformer "
            "encoder from pretrained XLM"
        )
        xlm_loaded_state_dict = upgrade_state_dict_with_xlm_weights(
            state_dict=self.state_dict(),
            pretrained_xlm_checkpoint=args.pretrained_xlm_checkpoint,
        )
        # strict=True: every key of the merged state dict must match the module.
        self.load_state_dict(xlm_loaded_state_dict, strict=True)
@dataclass
class TransformerLanguageModelConfig(FairseqDataclass):
    """Options for the ``transformer_lm`` model.

    Each field's ``help`` metadata is its user-facing description. The last
    four fields are interpolated from other config groups via ``II``.
    """

    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="relu", metadata={"help": "activation function to use"}
    )
    dropout: float = field(default=0.1, metadata={"help": "dropout probability"})
    attention_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability for attention weights"}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
    )
    relu_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN."}
    )
    decoder_embed_dim: int = field(
        default=512, metadata={"help": "decoder embedding dimension"}
    )
    decoder_output_dim: int = field(
        default=512, metadata={"help": "decoder output dimension"}
    )
    decoder_input_dim: int = field(
        default=512, metadata={"help": "decoder input dimension"}
    )
    decoder_ffn_embed_dim: int = field(
        default=2048, metadata={"help": "decoder embedding dimension for FFN"}
    )
    decoder_layers: int = field(default=6, metadata={"help": "num decoder layers"})
    decoder_attention_heads: int = field(
        default=8, metadata={"help": "num decoder attention heads"}
    )
    decoder_normalize_before: bool = field(
        default=False, metadata={"help": "apply layernorm before each decoder block"}
    )
    no_decoder_final_norm: bool = field(
        default=False,
        metadata={"help": "don't add an extra layernorm after the last decoder block"},
    )
    adaptive_softmax_cutoff: Optional[str] = field(
        default=None,
        metadata={
            "help": "comma separated list of adaptive softmax cutoff points. "
            "Must be used with adaptive_loss criterion"
        },
    )
    adaptive_softmax_dropout: float = field(
        default=0,
        metadata={"help": "sets adaptive softmax dropout for the tail projections"},
    )
    # Help text previously read "adaptive input factor" (copied from
    # adaptive_input_factor below).
    adaptive_softmax_factor: float = field(
        default=4, metadata={"help": "adaptive softmax factor"}
    )
    no_token_positional_embeddings: bool = field(
        default=False,
        metadata={
            "help": "if set, disables positional embeddings (outside self attention)"
        },
    )
    share_decoder_input_output_embed: bool = field(
        default=False, metadata={"help": "share decoder input and output embeddings"}
    )
    character_embeddings: bool = field(
        default=False,
        metadata={
            "help": "if set, uses character embedding convolutions to produce token embeddings"
        },
    )
    # Help text previously duplicated the one of character_embedding_dim.
    character_filters: str = field(
        default="[(1, 64), (2, 128), (3, 192), (4, 256), (5, 256), (6, 256), (7, 256)]",
        metadata={
            "help": "filters for the character embedding convolutions, "
            "given as a Python list literal of tuples"
        },
    )
    character_embedding_dim: int = field(
        default=4, metadata={"help": "size of character embeddings"}
    )
    char_embedder_highway_layers: int = field(
        default=2,
        metadata={"help": "number of highway layers for character token embedder"},
    )
    adaptive_input: bool = field(
        default=False, metadata={"help": "if set, uses adaptive input"}
    )
    adaptive_input_factor: float = field(
        default=4, metadata={"help": "adaptive input factor"}
    )
    adaptive_input_cutoff: Optional[str] = field(
        default=None,
        metadata={"help": "comma separated list of adaptive input cutoff points."},
    )
    tie_adaptive_weights: bool = field(
        default=False,
        metadata={
            "help": "if set, ties the weights of adaptive softmax and adaptive input"
        },
    )
    tie_adaptive_proj: bool = field(
        default=False,
        metadata={
            "help": "if set, ties the projection weights of adaptive softmax and adaptive input"
        },
    )
    decoder_learned_pos: bool = field(
        default=False,
        metadata={"help": "use learned positional embeddings in the decoder"},
    )
    layernorm_embedding: bool = field(
        default=False, metadata={"help": "add layernorm to embedding"}
    )
    no_scale_embedding: bool = field(
        default=False, metadata={"help": "if True, dont scale embeddings"}
    )
    checkpoint_activations: bool = field(
        default=False, metadata={"help": "checkpoint activations at each layer"}
    )
    offload_activations: bool = field(
        default=False,
        metadata={"help": "move checkpointed activations to CPU after they are used."},
    )
    # config for "Reducing Transformer Depth on Demand with Structured Dropout" (Fan et al., 2019)
    decoder_layerdrop: float = field(
        default=0.0, metadata={"help": "LayerDrop probability for decoder"}
    )
    decoder_layers_to_keep: Optional[str] = field(
        default=None,
        metadata={
            "help": "which layers to *keep* when pruning as a comma-separated list"
        },
    )
    # config for Training with Quantization Noise for Extreme Model Compression ({Fan*, Stock*} et al., 2020)
    quant_noise_pq: float = field(
        default=0.0,
        metadata={"help": "iterative PQ quantization noise at training time"},
    )
    quant_noise_pq_block_size: int = field(
        default=8,
        metadata={"help": "block size of quantization noise at training time"},
    )
    quant_noise_scalar: float = field(
        default=0.0,
        metadata={
            "help": "scalar quantization noise and scalar quantization at training time"
        },
    )
    # config for Fully Sharded Data Parallel (FSDP) training
    min_params_to_wrap: int = field(
        default=DEFAULT_MIN_PARAMS_TO_WRAP,
        metadata={
            "help": (
                "minimum number of params for a layer to be wrapped with FSDP() when "
                "training with --ddp-backend=fully_sharded. Smaller values will "
                "improve memory efficiency, but may make torch.distributed "
                "communication less efficient due to smaller input sizes. This option "
                "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
                "--offload-activations are passed."
            )
        },
    )
    # config for "BASE Layers: Simplifying Training of Large, Sparse Models"
    base_layers: Optional[int] = field(
        default=0, metadata={"help": "number of BASE layers in total"}
    )
    base_sublayers: Optional[int] = field(
        default=1, metadata={"help": "number of sublayers in each BASE layer"}
    )
    base_shuffle: Optional[int] = field(
        default=1,
        metadata={"help": "shuffle tokens between workers before computing assignment"},
    )
    # NormFormer
    scale_fc: Optional[bool] = field(
        default=False,
        metadata={"help": "Insert LayerNorm between fully connected layers"},
    )
    scale_attn: Optional[bool] = field(
        default=False, metadata={"help": "Insert LayerNorm after attention"}
    )
    scale_heads: Optional[bool] = field(
        default=False,
        metadata={"help": "Learn a scale coefficient for each attention head"},
    )
    scale_resids: Optional[bool] = field(
        default=False,
        metadata={"help": "Learn a scale coefficient for each residual connection"},
    )
    # xFormers arguments
    decoder_xformers_att_config: Optional[str] = field(
        default=None,
        metadata={
            "help": "config for xFormers library attention, defined in xformers.components.attention.AttentionConfig",
        },
    )
    # options from other parts of the config
    add_bos_token: bool = II("task.add_bos_token")
    tokens_per_sample: int = II("task.tokens_per_sample")
    max_target_positions: Optional[int] = II("task.max_target_positions")
    tpu: bool = II("common.tpu")
@classmethod
def build_model(cls, args, task):
    """Build a new model instance.

    Picks the token embedder from ``args`` (character CNN, adaptive input,
    or a plain embedding via ``cls.build_embedding``), checks the
    tied-adaptive-weights constraints, and wraps a ``TransformerDecoder``
    built without encoder attention.

    Args:
        args: model options; ``decoder_layers`` and ``max_target_positions``
            may be updated in place.
        task: task providing ``source_dictionary`` and ``target_dictionary``.

    Raises:
        AssertionError: if ``tie_adaptive_weights`` is set but the adaptive
            input / adaptive softmax settings are inconsistent.
        ValueError, SyntaxError: if ``args.character_filters`` is not a
            valid Python literal.
    """
    # Local import keeps this method self-contained.
    import ast

    if args.decoder_layers_to_keep:
        # The layer count follows the number of comma-separated entries kept.
        args.decoder_layers = len(args.decoder_layers_to_keep.split(","))

    if safe_getattr(args, "max_target_positions", None) is None:
        args.max_target_positions = safe_getattr(
            args, "tokens_per_sample", DEFAULT_MAX_TARGET_POSITIONS
        )

    if args.character_embeddings:
        embed_tokens = CharacterTokenEmbedder(
            task.source_dictionary,
            # The filter spec is a string taken from the config; parse it as
            # a literal instead of eval() so that it cannot execute
            # arbitrary code.
            ast.literal_eval(args.character_filters),
            args.character_embedding_dim,
            args.decoder_embed_dim,
            args.char_embedder_highway_layers,
        )
    elif args.adaptive_input:
        embed_tokens = AdaptiveInput(
            len(task.source_dictionary),
            task.source_dictionary.pad(),
            args.decoder_input_dim,
            args.adaptive_input_factor,
            args.decoder_embed_dim,
            options.eval_str_list(args.adaptive_input_cutoff, type=int),
            args.quant_noise_pq,
            args.quant_noise_pq_block_size,
        )
    else:
        embed_tokens = cls.build_embedding(
            args, task.source_dictionary, args.decoder_input_dim
        )

    if args.tie_adaptive_weights:
        # Tying requires the input and output sides to be configured alike.
        assert args.adaptive_input
        assert args.adaptive_input_factor == args.adaptive_softmax_factor
        assert (
            args.adaptive_softmax_cutoff == args.adaptive_input_cutoff
        ), "{} != {}".format(
            args.adaptive_softmax_cutoff, args.adaptive_input_cutoff
        )
        assert args.decoder_input_dim == args.decoder_output_dim

    decoder = TransformerDecoder(
        args, task.target_dictionary, embed_tokens, no_encoder_attn=True
    )
    return cls(decoder)
@register_model_architecture("transformer_lm", "transformer_lm_big")
def transformer_lm_big(args):
    """Apply the "big" LM preset defaults, then the base LM defaults.

    Each option is resolved through ``safe_getattr`` with the preset value
    as fallback and written back onto ``args``.
    """
    big_defaults = {
        "decoder_layers": 12,
        "decoder_embed_dim": 1024,
        "decoder_ffn_embed_dim": 4096,
        "decoder_attention_heads": 16,
    }
    for option, fallback in big_defaults.items():
        setattr(args, option, safe_getattr(args, option, fallback))
    base_lm_architecture(args)
@register_model_architecture("transformer_lm", "transformer_lm_gpt2_small")
def transformer_lm_gpt2_small(args):
    """Apply the ``transformer_lm_gpt2_small`` preset, then the base LM defaults.

    Each option is resolved through ``safe_getattr`` with the preset value
    as fallback and written back onto ``args``.
    """
    preset = (
        ("decoder_embed_dim", 1024),
        ("decoder_ffn_embed_dim", 4096),
        ("decoder_layers", 24),
        ("decoder_attention_heads", 16),
        ("dropout", 0.1),
        ("attention_dropout", 0.1),
        ("activation_fn", "gelu"),
    )
    for option, fallback in preset:
        setattr(args, option, safe_getattr(args, option, fallback))
    base_lm_architecture(args)
@register_model_architecture("transformer_lm", "transformer_lm_gpt2_big_wide")
def transformer_lm_gpt2_big_wide(args):
    """Apply the ``transformer_lm_gpt2_big_wide`` preset, then the base LM defaults.

    Options are resolved with ``safe_getattr`` instead of the builtin
    ``getattr`` so that this preset matches the other ``transformer_lm``
    architecture functions in this file.
    """
    # NOTE(review): safe_getattr is imported from fairseq.utils; it is
    # presumably the config-aware variant of getattr -- confirm against its
    # definition.
    args.decoder_embed_dim = safe_getattr(args, "decoder_embed_dim", 2048)
    args.decoder_ffn_embed_dim = safe_getattr(args, "decoder_ffn_embed_dim", 8192)
    args.decoder_layers = safe_getattr(args, "decoder_layers", 24)
    args.decoder_attention_heads = safe_getattr(args, "decoder_attention_heads", 32)
    args.dropout = safe_getattr(args, "dropout", 0.1)
    args.attention_dropout = safe_getattr(args, "attention_dropout", 0.1)
    args.activation_fn = safe_getattr(args, "activation_fn", "gelu")
    base_lm_architecture(args)
def base_gpt3_architecture(args):
    """Apply the settings shared by the GPT-3 presets, then the base LM defaults.

    ``args.decoder_embed_dim`` must already be set. Input/output dimensions
    and embedding sharing are forced; the remaining options only receive a
    fallback through ``safe_getattr``.
    """
    embed_dim = args.decoder_embed_dim

    # Forced settings: always overwritten.
    args.decoder_input_dim = embed_dim
    args.decoder_output_dim = embed_dim
    args.share_decoder_input_output_embed = True

    soft_defaults = {
        "decoder_ffn_embed_dim": embed_dim * 4,
        # GPT-3 used learned positional embeddings, rather than sinusoidal
        "decoder_learned_pos": True,
        "dropout": 0.0,
        "attention_dropout": 0.0,
        "activation_fn": "gelu",
    }
    for option, fallback in soft_defaults.items():
        setattr(args, option, safe_getattr(args, option, fallback))
    base_lm_architecture(args)
@register_model_architecture("transformer_lm", "transformer_lm_gpt3_175")
def transformer_lm_gpt3_175(args):
    """Apply the 175B-parameter GPT-3 preset, then the shared GPT-3 settings."""
    # 175B params
    for option, fallback in (
        ("decoder_layers", 96),
        ("decoder_embed_dim", 12288),
        ("decoder_attention_heads", 96),
    ):
        setattr(args, option, safe_getattr(args, option, fallback))
    base_gpt3_architecture(args)
@dataclass
class SpeechUnitLanguageModelConfig(TransformerLanguageModelConfig):
    """Masking options for the unit, duration and f0 input streams.

    Extends ``TransformerLanguageModelConfig``. Each stream has a segment
    mask (probability, length, length distribution); duration and f0 also
    have a whole-sequence mask probability.
    """

    # --- unit stream ---
    mask_unit_seg_prob: float = field(
        default=0.0, metadata={"help": "probability to mask a segment of unit sequence"}
    )
    mask_unit_seg_leng: int = field(
        default=5, metadata={"help": "length of unit segment mask"}
    )
    mask_unit_seg_type: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose unit mask length"}
    )

    # --- duration stream ---
    mask_dur_prob: float = field(
        default=0.0, metadata={"help": "probability to mask entire duration sequence"}
    )
    mask_dur_seg_prob: float = field(
        default=0.0,
        metadata={"help": "probability to mask a segment of duration sequence"},
    )
    mask_dur_seg_leng: int = field(
        default=5, metadata={"help": "length of duration segment mask"}
    )
    mask_dur_seg_type: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose duration mask length"}
    )

    # --- f0 stream ---
    # Help text previously said "duration" (copied from mask_dur_prob); this
    # option is the whole-sequence mask probability for the f0 stream.
    mask_f0_prob: float = field(
        default=0.0, metadata={"help": "probability to mask entire f0 sequence"}
    )
    mask_f0_seg_prob: float = field(
        default=0.0, metadata={"help": "probability to mask a segment of f0 sequence"}
    )
    mask_f0_seg_leng: int = field(
        default=5, metadata={"help": "length of f0 segment mask"}
    )
    mask_f0_seg_type: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose f0 mask length"}
    )
def apply_seq_dropout(self, inp, mask_prob, mask_val):
    """Mask whole rows of ``inp`` with ``mask_val``.

    One uniform sample is drawn per row; rows whose sample is below
    ``mask_prob`` are overwritten in place. When ``mask_prob`` is not
    positive nothing is sampled and nothing is masked.

    Args:
        inp: 2-D tensor (unpacked as ``B, T = inp.size()``); modified in place.
        mask_prob: per-row probability of masking.
        mask_val: value written into masked positions.

    Returns:
        Tuple ``(inp, mask_indices)`` where ``mask_indices`` is a boolean
        tensor of the same shape as ``inp``.
    """
    num_rows, num_cols = inp.size()

    if not mask_prob > 0:
        # Nothing to mask: return an all-False mask of matching shape.
        return inp, torch.zeros_like(inp).bool()

    dropped_rows = np.random.uniform(0, 1, (num_rows,)) < mask_prob
    row_mask = torch.from_numpy(dropped_rows).to(inp.device)
    # Broadcast the per-row decision across every position of the row.
    mask_indices = row_mask.unsqueeze(1).expand(-1, num_cols)
    inp[mask_indices] = mask_val
    return inp, mask_indices
prev_output_tokens=(src_tokens, dur_src, f0_src), incremental_state=incremental_state, src_lengths=src_lengths, features_only=True, ) result = dict(zip(self.channel_names, prediction)) return result def base_ulm_architecture(args): from .transformer_lm import base_lm_architecture base_lm_architecture(args) @register_model_architecture("transformer_ulm", "transformer_ulm_big") def transformer_ulm_big(args): from .transformer_lm import transformer_lm_big transformer_lm_big(args) base_ulm_architecture(args) @register_model_architecture("transformer_ulm", "transformer_ulm_tiny") def transformer_ulm_tiny(args): from .transformer_lm import transformer_lm_gpt2_tiny transformer_lm_gpt2_tiny(args) base_ulm_architecture(args) class MultiStreamTransformerDecoder(TransformerDecoder): def __init__( self, args, dictionary, embed_tokens, embed_other_list, no_encoder_attn, channel_sizes, ): super().__init__( args, dictionary, embed_tokens, no_encoder_attn=no_encoder_attn ) # embed each channel and project if dimensions do not match self.embed_other_list = torch.nn.ModuleList(embed_other_list) self.proj_other_list = torch.nn.ModuleList() dim = embed_tokens.embedding_dim for embed_other in embed_other_list: other_dim = 1 if embed_other is None else embed_other.embedding_dim self.proj_other_list.append( nn.Linear(other_dim, dim) if other_dim != dim else None ) # tranformer output to prediction self.channel_sizes = channel_sizes self.project_out_dim = Linear( embed_tokens.embedding_dim, sum(channel_sizes), bias=False ) def extract_features_scriptable( self, prev_output_tokens, encoder_out: Optional[Dict[str, List[Tensor]]], incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, full_context_alignment: bool = False, alignment_layer: Optional[int] = None, alignment_heads: Optional[int] = None, ): if alignment_layer is None: alignment_layer = self.num_layers - 1 # XXX: first multi-channel change start prev_output_tokens, *other_channels = prev_output_tokens # XXX: 
first multi-channel change end # embed positions positions = None if self.embed_positions is not None: positions = self.embed_positions( prev_output_tokens, incremental_state=incremental_state ) if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] other_channels = [o[:, -1:] for o in other_channels] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) # XXX: second multi-channel change start other_channels = [ o.unsqueeze(-1).to(dtype=x.dtype) if emb is None else emb(o) for o, emb in zip(other_channels, self.embed_other_list) ] other_channels = [ o if proj_other is None else proj_other(o) for o, proj_other in zip(other_channels, self.proj_other_list) ] for o in other_channels: x = x + o # XXX: second multi-channel change end if self.quant_noise is not None: x = self.quant_noise(x) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions if self.layernorm_embedding is not None: x = self.layernorm_embedding(x) x = self.dropout_module(x) # B x T x C -> T x B x C x = x.transpose(0, 1) self_attn_padding_mask: Optional[Tensor] = None if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) # decoder layers attn: Optional[Tensor] = None inner_states: List[Optional[Tensor]] = [x] for idx, layer in enumerate(self.layers): if incremental_state is None and not full_context_alignment: self_attn_mask = self.buffered_future_mask(x) else: self_attn_mask = None x, layer_attn, _ = layer( x, encoder_out["encoder_out"][0] if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0) else None, encoder_out["encoder_padding_mask"][0] if ( encoder_out is not None and len(encoder_out["encoder_padding_mask"]) > 0 ) else None, incremental_state, self_attn_mask=self_attn_mask, self_attn_padding_mask=self_attn_padding_mask, 
need_attn=bool((idx == alignment_layer)), need_head_weights=bool((idx == alignment_layer)), ) inner_states.append(x) if layer_attn is not None and idx == alignment_layer: attn = layer_attn.float().to(x) if attn is not None: if alignment_heads is not None: attn = attn[:alignment_heads] # average probabilities over heads attn = attn.mean(dim=0) if self.layer_norm is not None: x = self.layer_norm(x) # T x B x C -> B x T x C x = x.transpose(0, 1) if self.project_out_dim is not None: x = self.project_out_dim(x) else: assert False # XXX: the last change start result = [] start = 0 for channel_size in self.channel_sizes: end = start + channel_size result.append(x[:, :, start:end]) start = end assert end == x.size(-1) # XXX: the last change end return result, {"attn": [attn], "inner_states": inner_states} ================================================ FILE: fairseq/models/wav2vec/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .wav2vec import * # noqa from .wav2vec2 import * # noqa from .wav2vec2_asr import * # noqa from .wav2vec2_laser import * # noqa from .wav2vec2_classification import * # noqa ================================================ FILE: fairseq/models/wav2vec/utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def pad_to_multiple(x, multiple, dim=-1, value=0):
    """Right-pad ``x`` along ``dim`` so that its size is a multiple of ``multiple``.

    Args:
        x: tensor to pad, or ``None``.
        multiple: the size along ``dim`` is rounded up to a multiple of this.
        dim: dimension to pad; negative and non-negative indices are accepted.
        value: fill value for the padded positions.

    Returns:
        ``(padded, remainder)`` where ``remainder`` is the number of positions
        appended. ``x`` itself is returned (no copy) when no padding is
        needed, and ``(None, 0)`` when ``x`` is ``None``.
    """
    # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
    if x is None:
        return None, 0

    # F.pad takes its pad pairs starting from the LAST dimension, so the
    # number of leading (0, 0) pairs is derived from a negative index. A
    # non-negative ``dim`` previously yielded an empty prefix and silently
    # padded the last dimension instead.
    if dim >= 0:
        dim -= x.dim()

    tsz = x.size(dim)
    remainder = -tsz % multiple
    if remainder == 0:
        return x, 0

    pad_offset = (0,) * (-1 - dim) * 2
    return F.pad(x, (*pad_offset, 0, remainder), value=value), remainder
does not work properly with cross-sampling" }, ) cross_sample_negatives: int = field( default=0, metadata={"help": "num of cross sampled negatives"} ) num_negatives: int = field( default=10, metadata={"help": "num of sampled negatives"} ) conv_feature_layers: str = field( default="[(512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1), (512, 1, 1)]", metadata={ "help": "convolutional feature extraction layers [(dim, kernel_size, stride), ...]" }, ) conv_aggregator_layers: str = field( default="[(512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)]", metadata={ "help": "convolutional aggregator layers [(dim, kernel_size, stride), ...]" }, ) dropout: float = field( default=0.0, metadata={"help": "dropout to apply within the model"} ) dropout_features: float = field( default=0.0, metadata={"help": "dropout to apply to the features"} ) dropout_agg: float = field( default=0.0, metadata={"help": "dropout to apply after aggregation step"} ) aggregator: AGGREGATOR_CHOICES = field( default="cnn", metadata={"help": "type of aggregator to use"} ) gru_dim: int = field(default=512, metadata={"help": "GRU dimensionality"}) no_conv_bias: bool = field( default=False, metadata={"help": "if set, does not learn bias for conv layers"} ) agg_zero_pad: bool = field( default=False, metadata={"help": "if set, zero pads in aggregator instead of repl pad"}, ) skip_connections_feat: bool = field( default=False, metadata={"help": "if set, adds skip connections to the feature extractor"}, ) skip_connections_agg: bool = field( default=True, metadata={"help": "if set, adds skip connections to the aggregator"}, ) residual_scale: float = field( default=0.5, metadata={"help": "scales residual by sqrt(value)"} ) log_compression: bool = field( default=True, metadata={"help": "if set, adds a log compression to feature extractor"}, ) balanced_classes: bool = 
field( default=False, metadata={"help": "if set, loss is scaled to balance for number of negatives"}, ) project_features: PROJECT_FEATURES_CHOICES = field( default="none", metadata={ "help": "if not none, features are projected using the (same or new) aggregator" }, ) non_affine_group_norm: bool = field( default=False, metadata={"help": "if set, group norm is not affine"} ) offset: str = field( default="auto", metadata={ "help": "if set to 'auto', it is computed automatically from the receptive field, else set to int value" }, ) activation: ACTIVATION_CHOICES = field( default="relu", metadata={ "help": "if set to 'auto', it is computed automatically from the receptive field, else set to int value" }, ) vq_type: VQ_TYPE_CHOICES = field( default="none", metadata={"help": "which type of quantizer to use"} ) vq_vars: int = field( default=320, metadata={"help": "project to this many vector quantized variables per group"}, ) vq_groups: int = field( default=2, metadata={"help": "number of groups of latent variables"} ) vq_dim: int = field( default=0, metadata={ "help": "uses this dimensionality for quantized vectors. 0 to use model dim // groups" }, ) vq_depth: int = field( default=1, metadata={"help": "number of layers for vq weight projection"} ) combine_groups: bool = field( default=False, metadata={"help": "if set, variables are shared among groups"} ) vq_temp: Tuple[float, float, float] = field( default=(2.0, 0.5, 0.999995), metadata={ "help": "temperature for latent variable sampling with gumbel softmax. 
@register_model("wav2vec", dataclass=Wav2VecConfig)
class Wav2VecModel(BaseFairseqModel):
    """wav2vec model: conv feature extractor, optional vector quantizer,
    feature aggregator and a multi-step prediction head."""

    @classmethod
    def build_model(cls, cfg: Wav2VecConfig, task: FairseqTask):
        """Build a new model instance."""

        # Instantiate through ``cls`` so that subclasses get their own type.
        model = cls(cfg)
        logger.info(model)
        return model

    def __init__(self, cfg: Wav2VecConfig):
        """Assemble all sub-modules from ``cfg``.

        Raises:
            Exception: on an unknown activation or aggregator type.
        """
        super().__init__()

        self.prediction_steps = cfg.prediction_steps
        offset = cfg.offset

        if cfg.activation == "relu":
            activation = nn.ReLU()
        elif cfg.activation == "gelu":
            activation = nn.GELU()
        else:
            raise Exception("unknown activation " + cfg.activation)

        # NOTE(security): eval() executes the config string as Python code;
        # only load configurations from trusted sources.
        feature_enc_layers = eval(cfg.conv_feature_layers)
        self.feature_extractor = ConvFeatureExtractionModel(
            conv_layers=feature_enc_layers,
            dropout=0.0,
            log_compression=cfg.log_compression,
            skip_connections=cfg.skip_connections_feat,
            residual_scale=cfg.residual_scale,
            non_affine_group_norm=cfg.non_affine_group_norm,
            activation=activation,
        )
        # Output channels of the last feature-extractor layer.
        embed = feature_enc_layers[-1][0]

        self.vector_quantizer = None
        if cfg.vq_type == "gumbel":
            self.vector_quantizer = GumbelVectorQuantizer(
                dim=embed,
                num_vars=cfg.vq_vars,
                temp=cfg.vq_temp,
                groups=cfg.vq_groups,
                combine_groups=cfg.combine_groups,
                vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed,
                time_first=False,
                activation=activation,
                weight_proj_depth=cfg.vq_depth,
                weight_proj_factor=2,
            )
        elif cfg.vq_type == "kmeans":
            self.vector_quantizer = KmeansVectorQuantizer(
                dim=embed,
                num_vars=cfg.vq_vars,
                groups=cfg.vq_groups,
                combine_groups=cfg.combine_groups,
                vq_dim=cfg.vq_dim if cfg.vq_dim > 0 else embed,
                time_first=False,
                gamma=cfg.vq_gamma,
            )
        else:
            assert (
                cfg.vq_type == "none" or cfg.vq_type is None
            ), "Unknown quantizer type"

        if cfg.offset == "auto":
            # Accumulate ``rin`` from the kernel sizes and ``jin`` from the
            # product of strides; the offset is their ratio, rounded up.
            jin = 0
            rin = 0
            for _, k, stride in feature_enc_layers:
                if rin == 0:
                    rin = k
                rin = rin + (k - 1) * jin
                if jin == 0:
                    jin = stride
                else:
                    jin *= stride
            offset = math.ceil(rin / jin)

        offset = int(offset)

        def make_aggregator():
            """Return ``(aggregator_module, output_dim)`` for ``cfg.aggregator``."""
            if cfg.aggregator == "cnn":
                agg_layers = eval(cfg.conv_aggregator_layers)  # see eval note above
                agg_dim = agg_layers[-1][0]
                feature_aggregator = ConvAggegator(
                    conv_layers=agg_layers,
                    embed=embed,
                    dropout=cfg.dropout,
                    skip_connections=cfg.skip_connections_agg,
                    residual_scale=cfg.residual_scale,
                    non_affine_group_norm=cfg.non_affine_group_norm,
                    conv_bias=not cfg.no_conv_bias,
                    zero_pad=cfg.agg_zero_pad,
                    activation=activation,
                )
            elif cfg.aggregator == "gru":
                agg_dim = cfg.gru_dim
                feature_aggregator = nn.Sequential(
                    TransposeLast(),
                    nn.GRU(
                        input_size=embed,
                        hidden_size=agg_dim,
                        num_layers=1,
                        dropout=cfg.dropout,
                    ),
                    TransposeLast(deconstruct_idx=0),
                )
            else:
                raise Exception("unknown aggregator type " + cfg.aggregator)

            return feature_aggregator, agg_dim

        self.feature_aggregator, agg_dim = make_aggregator()

        self.wav2vec_predictions = Wav2VecPredictionsModel(
            in_dim=agg_dim,
            out_dim=embed,
            prediction_steps=cfg.prediction_steps,
            n_negatives=cfg.num_negatives,
            cross_sample_negatives=cfg.cross_sample_negatives,
            sample_distance=cfg.sample_distance,
            dropout=cfg.dropout,
            offset=offset,
            balanced_classes=cfg.balanced_classes,
            infonce=cfg.infonce,
        )

        self.dropout_feats = nn.Dropout(p=cfg.dropout_features)
        self.dropout_agg = nn.Dropout(p=cfg.dropout_agg)

        # "same" shares the aggregator instance; "new" builds a second one.
        if cfg.project_features == "none":
            self.project_features = None
        elif cfg.project_features == "same":
            self.project_features = self.feature_aggregator
        elif cfg.project_features == "new":
            self.project_features, _ = make_aggregator()

    def forward(self, source):
        """Compute prediction logits and targets for ``source``.

        Returns:
            dict with ``"cpc_logits"`` and ``"cpc_targets"``, plus every
            quantizer output other than ``"x"`` when a quantizer is used.
        """
        result = {}

        features = self.feature_extractor(source)
        if self.vector_quantizer is not None:
            q_res = self.vector_quantizer(features)
            features = q_res["x"]
            for k in q_res.keys():
                if k != "x":
                    result[k] = q_res[k]

        x = self.dropout_feats(features)
        x = self.feature_aggregator(x)
        x = self.dropout_agg(x)

        if self.project_features is not None:
            features = self.project_features(features)
        x, targets = self.wav2vec_predictions(x, features)
        result["cpc_logits"] = x
        result["cpc_targets"] = targets

        return result

    def upgrade_state_dict_named(self, state_dict, name):
        """Delegate state-dict upgrading to the base class."""
        super().upgrade_state_dict_named(state_dict, name)

    def max_positions(self):
        """Maximum length supported by the model."""
        return sys.maxsize

    def get_logits(self, net_output):
        """Return the ``"cpc_logits"`` entry of ``net_output``."""
        logits = net_output["cpc_logits"]
        return logits

    def get_targets(self, sample, net_output):
        """Return the targets (first element when stored as a tuple), contiguous."""
        t = net_output["cpc_targets"]
        if isinstance(t, tuple):
            t = t[0]
        return t.contiguous()

    def get_target_weights(self, targets, net_output):
        """Return the weights stored as the last element of a targets tuple.

        The ``targets`` argument is ignored; the value is always read from
        ``net_output["cpc_targets"]``. Returns ``None`` when no weights exist.
        """
        targets = net_output["cpc_targets"]
        if isinstance(targets, tuple) and targets[-1] is not None:
            return targets[-1]
        return None

    def get_extra_losses(self, net_output):
        """Return the quantizer auxiliary loss, or ``None`` if there is none."""
        loss = None
        if "prob_perplexity" in net_output:
            loss = net_output["num_vars"] - net_output["prob_perplexity"]
        elif "kmeans_loss" in net_output:
            loss = net_output["kmeans_loss"]

        return loss
class Wav2VecPredictionsModel(nn.Module):
    """Prediction head: projects context vectors to several future steps and
    scores them against the true features and sampled negatives."""

    def __init__(
        self,
        in_dim,
        out_dim,
        prediction_steps,
        n_negatives,
        cross_sample_negatives,
        sample_distance,
        dropout,
        offset,
        balanced_classes,
        infonce,
    ):
        super().__init__()

        self.n_negatives = n_negatives
        self.cross_sample_negatives = cross_sample_negatives
        self.sample_distance = sample_distance
        # One transposed-conv output column per prediction step.
        self.project_to_steps = nn.ConvTranspose2d(
            in_dim, out_dim, (1, prediction_steps)
        )
        self.dropout = nn.Dropout(p=dropout)
        self.offset = offset
        self.balanced_classes = balanced_classes
        self.infonce = infonce

    def sample_negatives(self, y):
        """Sample negative examples for every time step of ``y`` (B x C x T).

        Returns:
            tensor of shape ``N x B x C x T`` with
            ``N = n_negatives + cross_sample_negatives`` (``N`` may be 0).
        """
        bsz, fsz, tsz = y.shape

        y = y.transpose(0, 1)  # BCT -> CBT
        y = y.contiguous().view(fsz, -1)  # CBT => C(BxT)

        cross_high = tsz * bsz
        high = tsz if self.sample_distance is None else min(tsz, self.sample_distance)
        assert high > 1

        # Kept so the RNG stream is consumed exactly as before; it is also the
        # (empty) index tensor used when no negatives at all are requested.
        neg_idxs = torch.randint(low=0, high=high, size=(bsz, self.n_negatives * tsz))

        with torch.no_grad():
            if self.n_negatives > 0:
                tszs = (
                    buffered_arange(tsz)
                    .unsqueeze(-1)
                    .expand(-1, self.n_negatives)
                    .flatten()
                )

                # Draw from [0, high - 2] and shift indices >= own position by
                # one so that a frame is never its own negative.
                neg_idxs = torch.randint(
                    low=0, high=high - 1, size=(bsz, self.n_negatives * tsz)
                )
                neg_idxs[neg_idxs >= tszs] += 1

            if self.cross_sample_negatives > 0:
                tszs = (
                    buffered_arange(tsz)
                    .unsqueeze(-1)
                    .expand(-1, self.cross_sample_negatives)
                    .flatten()
                )

                cross_neg_idxs = torch.randint(
                    low=0,
                    high=cross_high - 1,
                    size=(bsz, self.cross_sample_negatives * tsz),
                )
                cross_neg_idxs[cross_neg_idxs >= tszs] += 1

        if self.n_negatives > 0:
            # NOTE(review): rows of the flattened ``y`` are ``tsz`` apart, so
            # this offset matches only when ``high == tsz`` -- confirm the
            # intended behaviour when ``sample_distance < tsz``.
            for i in range(1, bsz):
                neg_idxs[i] += i * high
        elif self.cross_sample_negatives > 0:
            # Was a bare ``else``: with both counts at 0 ``cross_neg_idxs``
            # does not exist and the lookup raised UnboundLocalError.
            neg_idxs = cross_neg_idxs

        if self.cross_sample_negatives > 0 and self.n_negatives > 0:
            neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)

        negs = y[..., neg_idxs.view(-1)]
        negs = negs.view(
            fsz, bsz, self.n_negatives + self.cross_sample_negatives, tsz
        ).permute(
            2, 1, 0, 3
        )  # to NxBxCxT

        return negs

    def forward(self, x, y):
        """Score projected contexts ``x`` against targets ``y`` and negatives.

        Args:
            x: aggregated context, B x C_in x T.
            y: features to predict, B x C_out x T.

        Returns:
            ``(predictions, labels)``. With ``infonce`` the predictions are
            reshaped to ``(-1, copies)`` and the labels are all-zero long
            indices; otherwise predictions are flat, labels are 0/1 floats and
            become ``(labels, weights)`` when class balancing is enabled.
        """
        x = x.unsqueeze(-1)
        x = self.project_to_steps(x)  # BxCxTxS
        x = self.dropout(x)

        negatives = self.sample_negatives(y)
        y = y.unsqueeze(0)
        targets = torch.cat([y, negatives], dim=0)  # Copies x B x C x T

        copies = targets.size(0)
        bsz, dim, tsz, steps = x.shape
        steps = min(steps, tsz - self.offset)

        # Total = sum over steps of (tsz - offset - i) * bsz * copies.
        predictions = x.new(
            bsz * copies * (tsz - self.offset + 1) * steps
            - ((steps + 1) * steps // 2) * copies * bsz
        )
        if self.infonce:
            labels = predictions.new_full(
                (predictions.shape[0] // copies,), 0, dtype=torch.long
            )
        else:
            labels = torch.zeros_like(predictions)
        weights = (
            torch.full_like(labels, 1 / self.n_negatives)
            if self.balanced_classes and not self.infonce
            else None
        )

        start = end = 0
        for i in range(steps):
            offset = i + self.offset
            end = start + (tsz - offset) * bsz * copies
            # ``: tsz - offset`` rather than ``:-offset``: the latter is an
            # empty slice when offset == 0.
            context = x[..., : tsz - offset, i]
            if self.infonce:
                predictions[start:end] = torch.einsum(
                    "bct,nbct->tbn", context, targets[..., offset:]
                ).flatten()
            else:
                pos_num = (end - start) // copies
                predictions[start:end] = torch.einsum(
                    "bct,nbct->nbt", context, targets[..., offset:]
                ).flatten()
                # The positives (copy 0) come first in the flattened "nbt" layout.
                labels[start : start + pos_num] = 1.0
                if weights is not None:
                    weights[start : start + pos_num] = 1.0
            start = end
        assert end == predictions.numel(), "{} != {}".format(end, predictions.numel())

        if self.infonce:
            predictions = predictions.view(-1, copies)
        else:
            if weights is not None:
                labels = (labels, weights)

        return predictions, labels
@dataclass
class Wav2Vec2Config(FairseqDataclass):
    """Configuration for the wav2vec 2.0 model.

    Field names, types and defaults are part of the checkpoint / CLI contract;
    only the help texts are documentation.
    """

    extractor_mode: EXTRACTOR_MODE_CHOICES = field(
        default="default",
        metadata={
            "help": "mode for feature extractor. default has a single group norm with d "
            "groups in the first conv block, whereas layer_norm has layer norms in "
            "every block (meant to use with normalize=True)"
        },
    )
    encoder_layers: int = field(
        default=12, metadata={"help": "num encoder layers in the transformer"}
    )
    encoder_embed_dim: int = field(
        default=768, metadata={"help": "encoder embedding dimension"}
    )
    encoder_ffn_embed_dim: int = field(
        default=3072, metadata={"help": "encoder embedding dimension for FFN"}
    )
    encoder_attention_heads: int = field(
        default=12, metadata={"help": "num encoder attention heads"}
    )
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="gelu", metadata={"help": "activation function to use"}
    )
    layer_type: LAYER_TYPE_CHOICES = field(
        default="transformer", metadata={"help": "layer type in encoder"}
    )

    # dropouts
    dropout: float = field(
        default=0.1, metadata={"help": "dropout probability for the transformer"}
    )
    attention_dropout: float = field(
        default=0.1, metadata={"help": "dropout probability for attention weights"}
    )
    activation_dropout: float = field(
        default=0.0, metadata={"help": "dropout probability after activation in FFN"}
    )
    encoder_layerdrop: float = field(
        default=0.0, metadata={"help": "probability of dropping a transformer layer"}
    )
    dropout_input: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the input (after feat extr)"},
    )
    dropout_features: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the features (after feat extr)"},
    )

    final_dim: int = field(
        default=0,
        metadata={
            "help": "project final representations and targets to this many dimensions. "
            "set to encoder_embed_dim if <= 0"
        },
    )
    layer_norm_first: bool = field(
        default=False, metadata={"help": "apply layernorm first in the transformer"}
    )
    conv_feature_layers: str = field(
        default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
        metadata={
            "help": "string describing convolutional feature extraction layers in form of a python list that contains "
            "[(dim, kernel_size, stride), ...]"
        },
    )
    conv_bias: bool = field(
        default=False, metadata={"help": "include bias in conv encoder"}
    )
    logit_temp: float = field(
        default=0.1, metadata={"help": "temperature to divide logits by"}
    )
    quantize_targets: bool = field(
        default=False, metadata={"help": "use quantized targets"}
    )
    quantize_input: bool = field(
        default=False, metadata={"help": "use quantized inputs"}
    )
    same_quantizer: bool = field(
        default=False, metadata={"help": "use same quantizer for inputs and targets"}
    )
    target_glu: bool = field(
        default=False, metadata={"help": "adds projection + glu to targets"}
    )
    feature_grad_mult: float = field(
        default=1.0, metadata={"help": "multiply feature extractor var grads by this"}
    )
    quantizer_depth: int = field(
        default=1,
        metadata={"help": "number of quantizer layers"},
    )
    quantizer_factor: int = field(
        default=3,
        metadata={
            "help": "dimensionality increase for inner quantizer layers (if depth > 1)"
        },
    )
    latent_vars: int = field(
        default=320,
        metadata={"help": "number of latent variables V in each group of the codebook"},
    )
    latent_groups: int = field(
        default=2,
        metadata={"help": "number of groups G of latent variables in the codebook"},
    )
    latent_dim: int = field(
        default=0,
        metadata={
            "help": "if > 0, uses this dimensionality for latent variables. "
            "otherwise uses final_dim / latent_groups"
        },
    )

    # masking
    mask_length: int = field(default=10, metadata={"help": "mask length"})
    mask_prob: float = field(
        default=0.65, metadata={"help": "probability of replacing a token with mask"}
    )
    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose mask length"}
    )
    mask_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_overlap: bool = field(
        default=False, metadata={"help": "if set, masks are not allowed to overlap"}
    )
    mask_min_space: int = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )
    require_same_masks: bool = field(
        default=True,
        metadata={
            "help": "whether the number of masked timesteps must be the same across all "
            "examples in a batch"
        },
    )
    mask_dropout: float = field(
        default=0.0,
        metadata={"help": "percent of masks to unmask for each sample"},
    )

    # channel masking
    mask_channel_length: int = field(
        default=10, metadata={"help": "length of the mask for features (channels)"}
    )
    mask_channel_prob: float = field(
        default=0.0, metadata={"help": "probability of replacing a feature with 0"}
    )
    # When set, channel masking is applied before time masking (see apply_mask).
    mask_channel_before: bool = False
    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static",
        metadata={"help": "how to choose mask length for channel masking"},
    )
    mask_channel_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_channel_overlap: bool = field(
        default=False,
        metadata={"help": "if set, channel masks are not allowed to overlap"},
    )
    mask_channel_min_space: int = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )

    # negative selection
    num_negatives: int = field(
        default=100,
        metadata={"help": "number of negative examples from the same sample"},
    )
    negatives_from_everywhere: bool = field(
        default=False,
        metadata={"help": "sample negatives from everywhere, not just masked states"},
    )
    cross_sample_negatives: int = field(
        default=0, metadata={"help": "number of negative examples from any sample"}
    )
    codebook_negatives: int = field(
        default=0, metadata={"help": "number of negative examples codebook"}
    )

    # positional embeddings
    conv_pos: int = field(
        default=128,
        metadata={"help": "number of filters for convolutional positional embeddings"},
    )
    conv_pos_groups: int = field(
        default=16,
        metadata={"help": "number of groups for convolutional positional embedding"},
    )
    pos_conv_depth: int = field(
        default=1,
        metadata={"help": "depth of positional encoder network"},
    )

    latent_temp: Tuple[float, float, float] = field(
        default=(2, 0.5, 0.999995),
        metadata={
            "help": "temperature for latent variable sampling. "
            "can be tuple of 3 values (start, end, decay)"
        },
    )
    max_positions: int = field(default=100000, metadata={"help": "Max positions"})
    checkpoint_activations: bool = field(
        default=False,
        metadata={"help": "recompute activations and save memory for extra compute"},
    )

    # FP16 optimization
    required_seq_len_multiple: int = field(
        default=2,
        metadata={
            "help": "pad the input to encoder such that the sequence length is divisible by multiple"
        },
    )
    crop_seq_to_multiple: int = field(
        default=1,
        metadata={
            "help": "crop convolutional feature extractor output such that the sequence length is divisible by multiple"
        },
    )

    # Conformer
    depthwise_conv_kernel_size: int = field(
        default=31,
        metadata={
            "help": "depthwise-conv-kernel-size for convolution in conformer layer"
        },
    )
    attn_type: str = field(
        default="",
        metadata={"help": "if espnet use ESPNET MHA"},
    )
    pos_enc_type: str = field(
        default="abs",
        metadata={"help": "Positional encoding type to use in conformer"},
    )
    fp16: bool = field(default=False, metadata={"help": "If fp16 is being used"})

    # Adapter num
    adp_num: int = field(
        default=-1
    )
    adp_dim: int = field(
        default=64
    )
    adp_act_fn: str = field(
        default="relu"
    )
    adp_trf_idx: str = field(
        default="all",
    )
def upgrade_state_dict_named(self, state_dict, name):
    """Upgrade a (possibly old) state dict for new versions of fairseq.

    Calls the parent class's ``upgrade_state_dict_named`` with the same
    arguments and then returns the ``state_dict`` object that was passed in.

    Args:
        state_dict: the state dict being loaded.
        name: forwarded unchanged to the parent implementation.

    Returns:
        The same ``state_dict`` object.
    """
    # The description above used to be a bare string placed *after* this
    # call, which made it a no-op expression rather than the docstring.
    super().upgrade_state_dict_named(state_dict, name)
    return state_dict
def sample_negatives(self, y, num, padding_count=None):
    """Sample negative examples for every target position from ``y``.

    Args:
        y: tensor of shape (bsz, tsz, fsz) to draw negatives from.
        num: number of target positions per sample that need negatives.
        padding_count: optional number of trailing time steps per sample
            that are excluded from within-sample sampling.

    Returns:
        ``(negs, neg_idxs)`` where ``negs`` has shape
        (n_negatives + cross_sample_negatives, bsz, num, fsz) and
        ``neg_idxs`` holds the row indices into the flattened
        (bsz * tsz, fsz) view of ``y`` that were gathered.

        NOTE(review): when both ``n_negatives`` and
        ``cross_sample_negatives`` are 0 a single empty tensor is returned
        instead of a tuple; callers that unpack two values should confirm
        they never hit that path.
    """
    if self.n_negatives == 0 and self.cross_sample_negatives == 0:
        return y.new(0)

    bsz, tsz, fsz = y.shape
    y = y.view(-1, fsz)  # BTC => (BxT)C

    # Within-sample draws are limited to the first `high` steps of a sample;
    # cross-sample draws may come from any row of the flattened tensor.
    cross_high = tsz * bsz
    high = tsz - (padding_count or 0)
    with torch.no_grad():
        assert high > 1, f"{bsz,tsz,fsz}"

        if self.n_negatives > 0:
            # Index of the target each draw belongs to, repeated once per
            # negative: [0, 0, ..., 1, 1, ...].
            tszs = (
                buffered_arange(num)
                .unsqueeze(-1)
                .expand(-1, self.n_negatives)
                .flatten()
            )

            neg_idxs = torch.randint(
                low=0, high=high - 1, size=(bsz, self.n_negatives * num)
            )
            # Draw from one fewer index and shift draws at/above the
            # target's own index up by one, so index t is never sampled
            # as a negative for target t.
            neg_idxs[neg_idxs >= tszs] += 1

        if self.cross_sample_negatives > 0:
            tszs = (
                buffered_arange(num)
                .unsqueeze(-1)
                .expand(-1, self.cross_sample_negatives)
                .flatten()
            )

            cross_neg_idxs = torch.randint(
                low=0,
                high=cross_high - 1,
                size=(bsz, self.cross_sample_negatives * num),
            )
            cross_neg_idxs[cross_neg_idxs >= tszs] += 1

    if self.n_negatives > 0:
        # Convert per-sample time indices into rows of the flattened
        # (bsz * tsz) tensor. Sample b starts at row b * tsz regardless of
        # how many trailing steps are padding, so the stride must be `tsz`.
        # The previous stride of `high` was only correct when
        # padding_count was 0/None (then high == tsz).
        neg_idxs = neg_idxs + (torch.arange(bsz).unsqueeze(1) * tsz)
    else:
        neg_idxs = cross_neg_idxs

    if self.cross_sample_negatives > 0 and self.n_negatives > 0:
        neg_idxs = torch.cat([neg_idxs, cross_neg_idxs], dim=1)

    negs = y[neg_idxs.view(-1)]
    negs = negs.view(
        bsz, num, self.n_negatives + self.cross_sample_negatives, fsz
    ).permute(
        2, 0, 1, 3
    )  # to NxBxTxC
    return negs, neg_idxs
conv_cfg_list[i][1], conv_cfg_list[i][2] ) return input_lengths.to(torch.long) def forward( self, source, padding_mask=None, mask=True, features_only=False, layer=None, mask_indices=None, mask_channel_indices=None, padding_count=None, corpus_key=None, ): if self.feature_grad_mult > 0: features = self.feature_extractor(source) if self.feature_grad_mult != 1.0: features = GradMultiply.apply(features, self.feature_grad_mult) else: with torch.no_grad(): features = self.feature_extractor(source) features_pen = features.float().pow(2).mean() features = features.transpose(1, 2) features = self.layer_norm(features) unmasked_features = features.clone() if padding_mask is not None and padding_mask.any(): input_lengths = (1 - padding_mask.long()).sum(-1) # apply conv formula to get real output_lengths output_lengths = self._get_feat_extract_output_lengths(input_lengths) padding_mask = torch.zeros( features.shape[:2], dtype=features.dtype, device=features.device ) # these two operations makes sure that all values # before the output lengths indices are attended to padding_mask[ ( torch.arange(padding_mask.shape[0], device=padding_mask.device), output_lengths - 1, ) ] = 1 padding_mask = (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])).bool() else: padding_mask = None time_steps_to_drop = features.size(1) % self.crop_seq_to_multiple if time_steps_to_drop != 0: features = features[:, :-time_steps_to_drop] unmasked_features = unmasked_features[:, :-time_steps_to_drop] if padding_mask is not None: padding_mask = padding_mask[:, :-time_steps_to_drop] if self.post_extract_proj is not None: features = self.post_extract_proj(features) features = self.dropout_input(features) unmasked_features = self.dropout_features(unmasked_features) num_vars = None code_ppl = None prob_ppl = None curr_temp = None if self.input_quantizer: q = self.input_quantizer(features, produce_targets=False) features = q["x"] num_vars = q["num_vars"] code_ppl = q["code_perplexity"] prob_ppl = 
q["prob_perplexity"] curr_temp = q["temp"] features = self.project_inp(features) if mask: x, mask_indices = self.apply_mask( features, padding_mask, mask_indices=mask_indices, mask_channel_indices=mask_channel_indices, ) if not is_xla_tensor(x) and mask_indices is not None: # tpu-comment: reducing the size in a dynamic way causes # too many recompilations on xla. y = unmasked_features[mask_indices].view( unmasked_features.size(0), -1, unmasked_features.size(-1) ) else: y = unmasked_features else: x = features y = unmasked_features mask_indices = None x, layer_results = self.encoder( x, padding_mask=padding_mask, layer=layer, corpus_key=corpus_key ) if features_only: return { "x": x, "padding_mask": padding_mask, "features": unmasked_features, "layer_results": layer_results, } if self.quantizer: if self.negatives_from_everywhere: q = self.quantizer(unmasked_features, produce_targets=False) y = q["x"] num_vars = q["num_vars"] code_ppl = q["code_perplexity"] prob_ppl = q["prob_perplexity"] curr_temp = q["temp"] y = self.project_q(y) negs, _ = self.sample_negatives( y, mask_indices[0].sum(), padding_count=padding_count, ) y = y[mask_indices].view(y.size(0), -1, y.size(-1)) else: q = self.quantizer(y, produce_targets=False) y = q["x"] num_vars = q["num_vars"] code_ppl = q["code_perplexity"] prob_ppl = q["prob_perplexity"] curr_temp = q["temp"] y = self.project_q(y) negs, _ = self.sample_negatives( y, y.size(1), padding_count=padding_count, ) if self.codebook_negatives > 0: cb_negs = self.quantizer.sample_from_codebook( y.size(0) * y.size(1), self.codebook_negatives ) cb_negs = cb_negs.view( self.codebook_negatives, y.size(0), y.size(1), -1 ) # order doesnt matter cb_negs = self.project_q(cb_negs) negs = torch.cat([negs, cb_negs], dim=0) else: y = self.project_q(y) if self.negatives_from_everywhere: negs, _ = self.sample_negatives( unmasked_features, y.size(1), padding_count=padding_count, ) negs = self.project_q(negs) else: negs, _ = self.sample_negatives( y, y.size(1), 
class ConvFeatureExtractionModel(nn.Module):
    """Stack of 1-D convolution blocks applied to a (batch, time) input.

    Args:
        conv_layers: one ``(dim, kernel_size, stride)`` triple per block.
        dropout: dropout probability applied after every convolution.
        mode: ``"default"`` puts a group norm on the first block only;
            ``"layer_norm"`` puts a layer norm on every block.
        conv_bias: whether the convolutions have a bias term.

    Raises:
        AssertionError: if ``mode`` is not one of the two supported values
            or a layer definition does not have exactly three entries.
    """

    def __init__(
        self,
        conv_layers: List[Tuple[int, int, int]],
        dropout: float = 0.0,
        mode: str = "default",
        conv_bias: bool = False,
    ):
        super().__init__()

        assert mode in {"default", "layer_norm"}

        def block(
            n_in,
            n_out,
            k,
            stride,
            is_layer_norm=False,
            is_group_norm=False,
            conv_bias=False,
        ):
            """Build one conv -> dropout -> (optional norm) -> GELU block."""

            def make_conv():
                conv = nn.Conv1d(n_in, n_out, k, stride=stride, bias=conv_bias)
                nn.init.kaiming_normal_(conv.weight)
                return conv

            assert not (
                is_layer_norm and is_group_norm
            ), "layer norm and group norm are exclusive"

            # The normalisation width is this block's own output width.
            # Previously this read the enclosing loop variable `dim`, which
            # only happened to equal `n_out` at the call site below.
            if is_layer_norm:
                return nn.Sequential(
                    make_conv(),
                    nn.Dropout(p=dropout),
                    nn.Sequential(
                        TransposeLast(),
                        Fp32LayerNorm(n_out, elementwise_affine=True),
                        TransposeLast(),
                    ),
                    nn.GELU(),
                )
            elif is_group_norm:
                return nn.Sequential(
                    make_conv(),
                    nn.Dropout(p=dropout),
                    Fp32GroupNorm(n_out, n_out, affine=True),
                    nn.GELU(),
                )
            else:
                return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())

        in_d = 1  # forward() adds a single channel axis to the input
        self.conv_layers = nn.ModuleList()
        for i, cl in enumerate(conv_layers):
            assert len(cl) == 3, "invalid conv definition: " + str(cl)
            (dim, k, stride) = cl

            self.conv_layers.append(
                block(
                    in_d,
                    dim,
                    k,
                    stride,
                    is_layer_norm=mode == "layer_norm",
                    is_group_norm=mode == "default" and i == 0,
                    conv_bias=conv_bias,
                )
            )
            in_d = dim

    def forward(self, x):
        """Insert a channel axis at position 1 and run every block in order."""
        # BxT -> BxCxT
        x = x.unsqueeze(1)

        for conv in self.conv_layers:
            x = conv(x)

        return x
def __init__(
    self,
    args: Wav2Vec2Config,
    skip_pos_conv: bool = False,
    override_encoder_layer: int = None,
):
    """Build the positional convolution and the stack of encoder layers.

    Args:
        args: model configuration; the fields read here are ``dropout``,
            ``encoder_embed_dim``, ``required_seq_len_multiple``,
            ``conv_pos``, ``conv_pos_groups``, ``encoder_layers``,
            ``layer_norm_first``, ``encoder_layerdrop`` and, when present,
            ``pos_conv_depth`` and ``conv_pos_batch_norm``.
        skip_pos_conv: if true, no positional convolution is created. Only
            consulted when ``pos_conv_depth`` is not greater than 1.
        override_encoder_layer: number of layers to build instead of
            ``args.encoder_layers``; ``None`` keeps the configured value.
    """
    super().__init__()

    # max_positions() reads self.args; it was never assigned here, so the
    # method raised AttributeError on a plain TransformerEncoder.
    self.args = args

    self.dropout = args.dropout
    self.embedding_dim = args.encoder_embed_dim
    self.required_seq_len_multiple = args.required_seq_len_multiple

    # Configs that predate this option do not carry the attribute.
    pos_conv_depth = getattr(args, "pos_conv_depth", 1)
    if pos_conv_depth > 1:
        num_layers = pos_conv_depth
        # Split the kernel budget across the layers, never below 3.
        k = max(3, args.conv_pos // num_layers)

        def make_conv_block(e, k, g, l):
            """Stack ``l`` conv -> pad -> layer-norm -> GELU blocks."""
            return nn.Sequential(
                *[
                    nn.Sequential(
                        nn.Conv1d(
                            e,
                            e,
                            kernel_size=k,
                            padding=k // 2,
                            groups=g,
                        ),
                        SamePad(k),
                        TransposeLast(),
                        LayerNorm(e, elementwise_affine=False),
                        TransposeLast(),
                        nn.GELU(),
                    )
                    for _ in range(l)
                ]
            )

        self.pos_conv = make_conv_block(
            self.embedding_dim, k, args.conv_pos_groups, num_layers
        )
    elif skip_pos_conv:
        self.pos_conv = None
    else:
        self.pos_conv = make_conv_pos(
            self.embedding_dim,
            args.conv_pos,
            args.conv_pos_groups,
            is_batch_norm=getattr(args, "conv_pos_batch_norm", False),
        )

    if override_encoder_layer is None:
        encoder_layers = args.encoder_layers
    else:
        encoder_layers = override_encoder_layer

    # layer_idx lets build_encoder_layer decide per layer (e.g. whether a
    # given index gets an adapter).
    self.layers = nn.ModuleList(
        [self.build_encoder_layer(args, layer_idx=ii) for ii in range(encoder_layers)]
    )
    self.layer_norm_first = args.layer_norm_first
    self.layer_norm = LayerNorm(self.embedding_dim)
    self.layerdrop = args.encoder_layerdrop

    self.apply(init_bert_params)
class ConformerEncoder(TransformerEncoder):
    """Encoder that stacks conformer layers with rel_pos or rope encodings.

    The overridden methods accept the same keyword arguments the base class
    passes (``layer_idx`` to ``build_encoder_layer``; ``min_layer`` and
    ``corpus_key`` to ``extract_features``). Without them, constructing the
    encoder or calling the inherited ``forward`` raised ``TypeError``.
    """

    def build_encoder_layer(self, args, **kwargs):
        """Create one conformer layer.

        ``**kwargs`` absorbs ``layer_idx`` passed by
        ``TransformerEncoder.__init__``; it is not used here.
        """
        layer = ConformerWav2Vec2EncoderLayer(
            embed_dim=self.embedding_dim,
            ffn_embed_dim=args.encoder_ffn_embed_dim,
            attention_heads=args.encoder_attention_heads,
            dropout=args.dropout,
            depthwise_conv_kernel_size=args.depthwise_conv_kernel_size,
            activation_fn="swish",
            attn_type=args.attn_type,
            pos_enc_type=args.pos_enc_type,
            use_fp16=args.fp16,  # only used for rope
        )
        layer = fsdp_wrap(layer)
        if args.checkpoint_activations:
            layer = checkpoint_wrapper(layer)
        return layer

    def __init__(self, args):
        """Build the encoder.

        Raises:
            Exception: if ``args.pos_enc_type`` is neither ``"rel_pos"``
                nor ``"rope"``.
        """
        super().__init__(args)
        self.args = args
        self.dropout = args.dropout
        self.embedding_dim = args.encoder_embed_dim
        self.pos_enc_type = args.pos_enc_type
        max_source_positions = self.max_positions()

        if self.pos_enc_type == "rel_pos":
            self.embed_positions = RelPositionalEncoding(
                max_source_positions, self.embedding_dim
            )
        elif self.pos_enc_type == "rope":
            self.embed_positions = None
        else:
            raise Exception("Unsupported positional encoding type")

        # Replaces the layer list the parent constructor already built.
        self.layers = nn.ModuleList(
            [self.build_encoder_layer(args) for _ in range(args.encoder_layers)]
        )
        self.layer_norm_first = args.layer_norm_first
        self.layer_norm = LayerNorm(self.embedding_dim)
        self.layerdrop = args.encoder_layerdrop

        self.apply(init_bert_params)

    def extract_features(
        self,
        x,
        padding_mask=None,
        tgt_layer=None,
        min_layer=0,
        corpus_key=None,
    ):
        """Run the conformer stack.

        Args:
            x: input features, batch-first (transposed to time-first below).
            padding_mask: optional mask; masked positions are zeroed first.
            tgt_layer: if given, stop after this layer index and collect
                per-layer results.
            min_layer: first layer index whose result is collected
                (default 0 keeps the previous behaviour).
            corpus_key: accepted for signature compatibility with the base
                class; unused.

        Returns:
            ``(x, layer_results)`` with ``x`` batch-first again and
            ``layer_results`` a list of ``(x, z)`` pairs, empty when
            ``tgt_layer`` is ``None``.
        """
        if padding_mask is not None:
            x = index_put(x, padding_mask, 0)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # NOTE(review): the original comment here said "B X T X C", but x
        # has already been transposed to time-first at this point.
        position_emb = None
        if self.pos_enc_type == "rel_pos":
            position_emb = self.embed_positions(x)

        if not self.layer_norm_first:
            x = self.layer_norm(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        layer_results = []
        r = None
        for i, layer in enumerate(self.layers):
            # LayerDrop: during training a layer is skipped when the draw
            # does not exceed self.layerdrop.
            dropout_probability = np.random.random()
            if not self.training or (dropout_probability > self.layerdrop):
                x, z = layer(
                    x,
                    self_attn_padding_mask=padding_mask,
                    need_weights=False,
                    position_emb=position_emb,
                )
                if tgt_layer is not None and i >= min_layer:
                    layer_results.append((x, z))
            if i == tgt_layer:
                r = x
                break

        if r is not None:
            x = r

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, layer_results
def __init__(
    self,
    embedding_dim: int = 768,
    ffn_embedding_dim: int = 3072,
    num_attention_heads: int = 8,
    dropout: float = 0.1,
    attention_dropout: float = 0.1,
    activation_dropout: float = 0.1,
    activation_fn: str = "relu",
    layer_norm_first: bool = False,
) -> None:
    """Create the self-attention and feed-forward sub-modules of one layer.

    Args:
        embedding_dim: width of the layer input/output; used as the size of
            both layer norms and of the attention module.
        ffn_embedding_dim: hidden width of the feed-forward network
            (``fc1`` output / ``fc2`` input).
        num_attention_heads: number of heads passed to the attention module.
        dropout: probability for ``dropout1`` and ``dropout3``.
        attention_dropout: dropout passed to the attention module.
        activation_dropout: probability for ``dropout2``.
        activation_fn: name resolved through ``utils.get_activation_fn``.
        layer_norm_first: stored on the instance; selects the pre- or
            post-layer-norm path in ``forward``.
    """

    super().__init__()
    # Initialize parameters
    self.embedding_dim = embedding_dim
    self.dropout = dropout
    self.activation_dropout = activation_dropout

    # Initialize blocks
    self.activation_fn = utils.get_activation_fn(activation_fn)
    self.self_attn = MultiheadAttention(
        self.embedding_dim,
        num_attention_heads,
        dropout=attention_dropout,
        self_attention=True,
    )

    # Three separate dropout modules: two use `dropout`, one uses
    # `activation_dropout`.
    self.dropout1 = nn.Dropout(dropout)
    self.dropout2 = nn.Dropout(self.activation_dropout)
    self.dropout3 = nn.Dropout(dropout)

    self.layer_norm_first = layer_norm_first

    # layer norm associated with the self attention layer
    self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
    self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
    self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)

    # layer norm associated with the position wise feed-forward NN
    self.final_layer_norm = LayerNorm(self.embedding_dim)
class AdapterFast(nn.Module):
    """Bank of ``adapter_num`` bottleneck adapters stored as stacked tensors.

    Each adapter is layer-norm -> linear(input_dim -> hidden_dim) ->
    activation -> linear(hidden_dim -> input_dim). All adapters' weights
    live in single parameters indexed by adapter id on the first axis.
    """

    # Supported activation names and the modules they map to.
    _ACTIVATIONS = {"relu": nn.ReLU, "gelu": nn.GELU, "selu": nn.SELU}

    def __init__(self, adapter_num, input_dim, hidden_dim, act_fn):
        """
        Implements adapter modules directly with 3D tensor weight as parameters
        and without using ModuleList, to speed up training throughput.

        Args:
            adapter_num: number of independent adapters.
            input_dim: width of the adapter input and output.
            hidden_dim: bottleneck width.
            act_fn: one of ``"relu"``, ``"gelu"``, ``"selu"``.

        Raises:
            ValueError: if ``act_fn`` is not a supported name.
        """
        super().__init__()

        self.adapter_num = adapter_num
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.W_a = nn.Parameter(torch.empty(adapter_num, hidden_dim, input_dim))
        self.W_b = nn.Parameter(torch.empty(adapter_num, input_dim, hidden_dim))
        self.b_a = nn.Parameter(torch.empty(adapter_num, hidden_dim))
        self.b_b = nn.Parameter(torch.empty(adapter_num, input_dim))

        self.ln_W = nn.Parameter(torch.empty(adapter_num, input_dim))
        self.ln_b = nn.Parameter(torch.empty(adapter_num, input_dim))

        if act_fn not in self._ACTIVATIONS:
            raise ValueError(f"unsupported {act_fn}")
        self.act_fn = self._ACTIVATIONS[act_fn]()

        self.reset_parameters()

    @staticmethod
    def _bias_bound(fan_in):
        """Return the uniform bound 1/sqrt(fan_in), or 0 for an empty fan-in."""
        return 1 / math.sqrt(fan_in) if fan_in > 0 else 0

    def reset_parameters(self):
        """Initialise every adapter's linears and set its layer norm to identity."""
        # For a 2-D (out, in) weight the fan-in is its second dimension, so
        # it is read from the stored sizes instead of a private torch helper.
        bound_a = self._bias_bound(self.input_dim)  # W_a[ii] is (hidden, input)
        bound_b = self._bias_bound(self.hidden_dim)  # W_b[ii] is (input, hidden)
        for ii in range(self.adapter_num):
            # Call order is kept so the random stream is consumed as before.
            nn.init.kaiming_uniform_(self.W_a[ii], a=math.sqrt(5))
            nn.init.kaiming_uniform_(self.W_b[ii], a=math.sqrt(5))
            nn.init.uniform_(self.b_a[ii], -bound_a, bound_a)
            nn.init.uniform_(self.b_b[ii], -bound_b, bound_b)

        nn.init.ones_(self.ln_W)
        nn.init.zeros_(self.ln_b)

    def forward(self, x, adapter_id):
        """Apply adapter ``adapter_id`` to ``x``.

        Args:
            x: tensor whose last dimension is ``input_dim``.
            adapter_id: index into the first axis of the stacked parameters.

        Returns:
            Tensor with the same shape as ``x``. The residual connection is
            not added here.
        """
        ii = adapter_id
        h = F.layer_norm(x, (self.input_dim,), self.ln_W[ii], self.ln_b[ii])
        h = F.linear(h, self.W_a[ii], self.b_a[ii])
        h = self.act_fn(h)
        h = F.linear(h, self.W_b[ii], self.b_b[ii])
        return h

    def extra_repr(self):
        """Summarise the adapter bank's sizes for ``repr(module)``."""
        return ('adapter={}, input_dim={}, hidden_dim={}'.format(self.adapter_num, self.input_dim, self.hidden_dim))
def forward(
    self,
    x: torch.Tensor,
    self_attn_mask: torch.Tensor = None,
    self_attn_padding_mask: torch.Tensor = None,
    need_weights: bool = False,
    att_args=None,
    corpus_key=None,
):
    """Run the parent transformer layer, then add the selected adapter's output.

    Args:
        x, self_attn_mask, self_attn_padding_mask, need_weights, att_args:
            forwarded unchanged to the parent layer's ``forward``.
        corpus_key: non-empty sequence whose items must all be equal; its
            first item is passed to the adapter bank as the adapter id.

    Returns:
        ``(x, (attn, layer_result))`` where ``x`` is the parent layer's
        output plus the adapter output, and ``attn`` / ``layer_result`` are
        returned by the parent unchanged.

    Raises:
        AssertionError: if ``corpus_key`` is ``None`` or its items differ.
    """
    # Validate before the attention/FFN pass so a bad batch fails
    # immediately instead of after the whole layer has been computed.
    assert corpus_key is not None
    assert len(set(corpus_key)) == 1, f"corpus_key items are not same {corpus_key}"

    x, (attn, layer_result) = super().forward(
        x=x,
        self_attn_mask=self_attn_mask,
        self_attn_padding_mask=self_attn_padding_mask,
        need_weights=need_weights,
        att_args=att_args,
    )

    # Residual adapter: the adapter sees the layer output and its result is
    # added back onto that output.
    y = self.adapter_layer(x, corpus_key[0])
    x = x + y
    return x, (attn, layer_result)
@dataclass
class Wav2Vec2AsrConfig(FairseqDataclass):
    """Fine-tuning configuration shared by the wav2vec 2.0 based models in this file.

    Most fields are copied by ``Wav2VecEncoder.__init__`` into the overrides
    applied to the pre-trained checkpoint's own configuration.
    """

    w2v_path: str = field(
        default=MISSING, metadata={"help": "path to wav2vec 2.0 model"}
    )
    no_pretrained_weights: bool = field(
        default=False, metadata={"help": "if true, does not load pretrained weights"}
    )
    dropout_input: float = field(
        default=0.0,
        metadata={"help": "dropout to apply to the input (after feat extr)"},
    )
    final_dropout: float = field(
        default=0.0,
        metadata={"help": "dropout after transformer and before final projection"},
    )
    dropout: float = field(
        default=0.0, metadata={"help": "dropout probability inside wav2vec 2.0 model"}
    )
    attention_dropout: float = field(
        default=0.0,
        metadata={
            "help": "dropout probability for attention weights inside wav2vec 2.0 model"
        },
    )
    activation_dropout: float = field(
        default=0.0,
        metadata={
            "help": "dropout probability after activation in FFN inside wav2vec 2.0 model"
        },
    )

    # masking
    apply_mask: bool = field(
        default=False, metadata={"help": "apply masking during fine-tuning"}
    )
    mask_length: int = field(
        default=10, metadata={"help": "repeat the mask indices multiple times"}
    )
    mask_prob: float = field(
        default=0.5,
        metadata={
            "help": "probability of replacing a token with mask (normalized by length)"
        },
    )
    mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static", metadata={"help": "how to choose masks"}
    )
    mask_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_overlap: bool = field(
        default=False, metadata={"help": "whether to allow masks to overlap"}
    )
    mask_min_space: Optional[int] = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )
    require_same_masks: bool = field(
        default=True,
        metadata={
            "help": "whether the number of masked timesteps must be the same across all "
            "examples in a batch"
        },
    )
    mask_dropout: float = field(
        default=0.0,
        metadata={"help": "percent of masks to unmask for each sample"},
    )

    # channel masking
    mask_channel_length: int = field(
        default=10, metadata={"help": "length of the mask for features (channels)"}
    )
    mask_channel_prob: float = field(
        default=0.0, metadata={"help": "probability of replacing a feature with 0"}
    )
    mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
        default="static",
        metadata={"help": "how to choose mask length for channel masking"},
    )
    mask_channel_other: float = field(
        default=0,
        metadata={
            "help": "secondary mask argument (used for more complex distributions), "
            "see help in compute_mask_indices"
        },
    )
    no_mask_channel_overlap: bool = field(
        default=False, metadata={"help": "whether to allow channel masks to overlap"}
    )
    freeze_finetune_updates: int = field(
        default=0, metadata={"help": "dont finetune wav2vec for this many updates"}
    )
    feature_grad_mult: float = field(
        default=0.0, metadata={"help": "reset feature grad mult in wav2vec 2.0 to this"}
    )
    layerdrop: float = field(
        default=0.0, metadata={"help": "probability of dropping a layer in wav2vec 2.0"}
    )
    drop_path: float = 0
    mask_channel_min_space: Optional[int] = field(
        default=1,
        metadata={"help": "min space between spans (if no overlap is enabled)"},
    )
    mask_channel_before: bool = False
    # interpolated from the task configuration
    normalize: bool = II("task.normalize")
    update_alibi: bool = True
    data: str = II("task.data")
    # this holds the loaded wav2vec args
    w2v_args: Any = None
    offload_activations: bool = field(
        default=False, metadata={"help": "offload_activations"}
    )
    min_params_to_wrap: int = field(
        default=int(1e8),
        metadata={
            "help": "minimum number of params for a layer to be wrapped with FSDP() when "
            "training with --ddp-backend=fully_sharded. Smaller values will "
            "improve memory efficiency, but may make torch.distributed "
            "communication less efficient due to smaller input sizes. This option "
            "is set to 0 (i.e., always wrap) when --checkpoint-activations or "
            "--offload-activations are passed."
        },
    )
    checkpoint_activations: bool = field(
        default=False,
        metadata={"help": "recompute activations and save memory for extra compute"},
    )
    ddp_backend: str = II("distributed_training.ddp_backend")

    zero_mask: bool = False
    # when set, the encoder copies the checkpoint's "_ema" weights over the
    # regular "encoder.*" weights before loading them
    load_ema: bool = False

    # values below 1 make the encoder attach per-block "lr_scale" overrides
    layer_decay: float = 1

    layer_type: LAYER_TYPE_CHOICES = field(
        default="transformer", metadata={"help": "layer type in encoder"}
    )

    # Adapter settings; copied into the pre-trained model config by the
    # encoder (for models that are not data2vec_multi).
    adp_num: int = field(default=-1)
    adp_dim: int = field(default=64)
    adp_act_fn: str = field(default="relu")
    adp_trf_idx: str = field(
        default="all",
    )

    # parameters whose full name matches this regex (re.fullmatch) are frozen
    freeze_regex: Optional[str] = field(
        default=None,
    )
def get_logits(self, net_output, normalize=False):
    """Return the CTC logits with blank re-weighting and padding masking applied.

    Args:
        net_output (dict): encoder output holding ``"encoder_out"`` (the
            logits; indexed here as time x batch x classes) and
            ``"padding_mask"`` (batch x time boolean tensor, or None).
        normalize (bool): when True, return log-probabilities (log-softmax
            over the last dimension, computed in float32).

    Returns:
        Tensor: the adjusted logits, or log-probabilities if ``normalize``.

    Raises:
        ValueError: if a non-zero blank weight is combined with a blank mode
            other than "add" or "set".

    Note:
        ``net_output["encoder_out"]`` is no longer modified in place: the
        adjustments are applied to a copy, so calling this method several
        times on the same ``net_output`` gives the same result each time
        (previously every call with ``blank_mode == "add"`` added the blank
        weight again). ``net_output["padding_mask"]`` is still replaced by
        its left-padded version when that padding is needed, as before.
    """
    logits = net_output["encoder_out"]
    padding_mask = net_output["padding_mask"]

    adjust_blank = self.blank_weight != 0
    if adjust_blank and self.blank_mode not in ("add", "set"):
        raise ValueError(f"invalid blank mode {self.blank_mode}")
    has_padding = padding_mask is not None and bool(padding_mask.any())

    if adjust_blank or has_padding:
        # Never write into the caller's tensor.
        logits = logits.clone()

    if adjust_blank:
        # Class index 0 is the one treated as blank here.
        if self.blank_mode == "add":
            logits[..., 0] += self.blank_weight
        else:
            logits[..., 0] = self.blank_weight

    if has_padding:
        number_of_classes = logits.size(-1)
        # Padded frames may only emit class 0: -inf everywhere else.
        masking_tensor = torch.ones(
            number_of_classes, device=logits.device
        ) * float("-inf")
        masking_tensor[0] = 0

        if logits.size(0) > padding_mask.size(1):
            # More frames than mask columns: prepend one unpadded column and
            # publish the widened mask back to the caller, as before.
            padding_mask = F.pad(padding_mask, (1, 0), value=False)
            net_output["padding_mask"] = padding_mask

        logits[padding_mask.T] = masking_tensor.type_as(logits)

    if normalize:
        logits = utils.log_softmax(logits.float(), dim=-1)

    return logits
@classmethod
def build_model(cls, cfg: Wav2Vec2Seq2SeqConfig, task: FairseqTask):
    """Build a new model instance.

    Creates the decoder token embedding from the task's target dictionary,
    then the encoder and decoder through ``build_encoder`` /
    ``build_decoder``, and wraps them in ``cls`` so that subclasses get an
    instance of their own type.

    Args:
        cfg: model configuration; ``cfg.autoregressive`` must be true.
        task: task providing ``target_dictionary``.

    Returns:
        An instance of ``cls`` built from the new encoder and decoder.

    Raises:
        AssertionError: if ``cfg.autoregressive`` is false.
    """
    assert (
        cfg.autoregressive
    ), "Please set task.autoregressive=true for seq2seq asr models"

    tgt_dict = task.target_dictionary

    decoder_embed_tokens = Embedding(
        len(tgt_dict), cfg.decoder_embed_dim, tgt_dict.pad()
    )

    encoder = cls.build_encoder(cfg)
    decoder = cls.build_decoder(cfg, tgt_dict, decoder_embed_tokens)

    # Instantiate through cls, not a hard-coded class name, so subclasses
    # calling build_model receive their own type.
    return cls(encoder, decoder)
def __init__(self, cfg: Wav2Vec2AsrConfig, output_size=None):
    """Build the fine-tuning encoder around a pre-trained wav2vec-style model.

    Loads the pre-trained checkpoint named by ``cfg.w2v_path`` (unless
    ``cfg.w2v_args`` is already populated), rebuilds the pre-trained task and
    model from the stored configuration, strips the pre-training modules,
    optionally loads the weights, and adds dropout plus an optional output
    projection.

    Args:
        cfg: fine-tuning configuration. ``cfg.w2v_args`` is overwritten with
            the loaded (or converted) pre-training configuration.
        output_size: if given, a ``Linear`` projection from the model
            dimension to this size is added; otherwise a projection is added
            only when ``cfg.decoder_embed_dim`` exists and differs from the
            model dimension.

    Raises:
        AssertionError: on a normalization mismatch with the pre-trained
            model, on missing ``_ema`` entries when ``cfg.load_ema`` is set,
            or when layer decay finds other than one modality encoder.
    """
    # NOTE(review): attributes are assigned before super().__init__() runs
    # (it is only called further down, once `task` exists) -- presumably fine
    # for plain Python values; confirm against the base class.
    self.apply_mask = cfg.apply_mask

    # Values from the fine-tuning config handed to the checkpoint loader as
    # overrides for the stored pre-training arguments.
    arg_overrides = {
        "dropout": cfg.dropout,
        "activation_dropout": cfg.activation_dropout,
        "dropout_input": cfg.dropout_input,
        "attention_dropout": cfg.attention_dropout,
        "mask_length": cfg.mask_length,
        "mask_prob": cfg.mask_prob,
        "require_same_masks": getattr(cfg, "require_same_masks", True),
        "pct_holes": getattr(cfg, "mask_dropout", 0),
        "mask_selection": cfg.mask_selection,
        "mask_other": cfg.mask_other,
        "no_mask_overlap": cfg.no_mask_overlap,
        "mask_channel_length": cfg.mask_channel_length,
        "mask_channel_prob": cfg.mask_channel_prob,
        "mask_channel_before": cfg.mask_channel_before,
        "mask_channel_selection": cfg.mask_channel_selection,
        "mask_channel_other": cfg.mask_channel_other,
        "no_mask_channel_overlap": cfg.no_mask_channel_overlap,
        "encoder_layerdrop": cfg.layerdrop,
        "feature_grad_mult": cfg.feature_grad_mult,
        "checkpoint_activations": cfg.checkpoint_activations,
        "offload_activations": cfg.offload_activations,
        "min_params_to_wrap": cfg.min_params_to_wrap,
        # d2v multi args
        "encoder_dropout": cfg.dropout,
        "drop_path": getattr(cfg, "drop_path", 0),
        "mask_dropout": getattr(cfg, "mask_dropout", 0),
        "zero_mask": getattr(cfg, "zero_mask", False),
        "local_grad_mult": cfg.feature_grad_mult,
        "layerdrop": cfg.layerdrop,
        "prenet_layerdrop": cfg.layerdrop,
        "prenet_dropout": cfg.dropout,
        "post_mlp_drop": cfg.dropout,
        "encoder_zero_mask": getattr(cfg, "zero_mask", False),
        "inverse_mask": False,
        "learned_alibi_scale": getattr(cfg, "update_alibi", True),
    }

    if cfg.w2v_args is None:
        # First construction: read the checkpoint and take its configuration
        # ("cfg", or the legacy "args" namespace converted to omegaconf).
        state = checkpoint_utils.load_checkpoint_to_cpu(cfg.w2v_path, arg_overrides)
        w2v_args = state.get("cfg", None)
        if w2v_args is None:
            w2v_args = convert_namespace_to_omegaconf(state["args"])
        w2v_args.criterion = None
        w2v_args.lr_scheduler = None

        cfg.w2v_args = w2v_args

        logger.info(w2v_args)

    else:
        # Configuration already present on cfg: no checkpoint is read here,
        # so `state` stays None and no weights are loaded below.
        state = None
        w2v_args = cfg.w2v_args
        if isinstance(w2v_args, Namespace):
            cfg.w2v_args = w2v_args = convert_namespace_to_omegaconf(w2v_args)

    # NOTE(review): if the model config has no "_name" entry, .get() returns
    # None and the `in` test raises TypeError -- confirm "_name" is always set.
    self.is_d2v_multi = "data2vec_multi" in w2v_args.model.get("_name", None)

    if not self.is_d2v_multi:
        model_normalized = w2v_args.task.get(
            "normalize", w2v_args.model.get("normalize", False)
        )
        assert cfg.normalize == model_normalized, (
            "Fine-tuning works best when data normalization is the same. "
            "Please check that --normalize is set or unset for both pre-training and here"
        )

        # Copy the listed fine-tuning options (when present and not None)
        # into the pre-trained model configuration.
        with open_dict(w2v_args):
            args_replacement = ["checkpoint_activations", "layer_type", "adp_num", "adp_dim", "adp_act_fn", "adp_trf_idx"]
            for _args in args_replacement:
                if hasattr(cfg, _args) and getattr(cfg, _args, None) is not None:
                    w2v_args.model[_args] = getattr(cfg, _args, None)

        # NOTE(review): looks redundant -- "checkpoint_activations" is already
        # copied by the loop above.
        if hasattr(cfg, "checkpoint_activations") and cfg.checkpoint_activations:
            with open_dict(w2v_args):
                w2v_args.model.checkpoint_activations = cfg.checkpoint_activations

        w2v_args.task.data = cfg.data
        task = tasks.setup_task(w2v_args.task, from_checkpoint=True)
        model = task.build_model(w2v_args.model, from_checkpoint=True)

        model.remove_pretraining_modules()
        d = w2v_args.model.encoder_embed_dim
    else:
        assert cfg.normalize

        # The data path lives under task.audio when that sub-config exists.
        if hasattr(w2v_args.task, "audio"):
            w2v_args.task.audio.data = cfg.data
        else:
            w2v_args.task.data = cfg.data
        task = tasks.setup_task(w2v_args.task, from_checkpoint=True)

        model = task.build_model(w2v_args.model, from_checkpoint=True)

        model.remove_pretraining_modules(modality="audio")
        d = w2v_args.model.embed_dim

    if state is not None and not cfg.no_pretrained_weights:
        if cfg.load_ema:
            # Replace each "encoder.<k>" weight by the matching "_ema" entry.
            assert "_ema" in state["model"]
            for k in state["model"]["_ema"]:
                mk = "encoder." + k
                assert mk in state["model"], mk
                state["model"][mk] = state["model"]["_ema"][k]
        self.load_model_weights(state, model, cfg)

    # The base class needs the dictionary of the task built above.
    super().__init__(task.source_dictionary)

    self.w2v_model = model

    self.final_dropout = nn.Dropout(cfg.final_dropout)
    self.freeze_finetune_updates = cfg.freeze_finetune_updates
    self.num_updates = 0

    targ_d = None
    self.proj = None

    # Project to output_size when given; otherwise to the decoder embedding
    # size, but only if the config has one and it differs from d.
    if output_size is not None:
        targ_d = output_size
    elif getattr(cfg, "decoder_embed_dim", d) != d:
        targ_d = cfg.decoder_embed_dim

    if targ_d is not None:
        self.proj = Linear(d, targ_d)

    if cfg.freeze_regex is not None:
        self.freeze_regex(cfg.freeze_regex)

    layer_decay = getattr(cfg, "layer_decay", 1)
    if layer_decay < 1:
        # Layer-wise learning-rate decay. This path reads
        # model.modality_encoders and model.blocks, so it assumes a model
        # exposing both -- TODO confirm which model types reach it.
        mod_encs = list(model.modality_encoders.values())
        assert len(mod_encs) == 1, len(mod_encs)
        blocks = list(mod_encs[0].context_encoder.blocks) + list(model.blocks)
        num_layers = len(blocks) + 1
        # layer_scales[i] == layer_decay ** (num_layers - i); only the last
        # entry (i == num_layers) equals 1.
        layer_scales = list(
            layer_decay ** (num_layers - i) for i in range(num_layers + 1)
        )

        for i, b in enumerate(blocks):
            lid = i + 1
            if layer_scales[lid] == 1.0:
                continue

            for n, p in b.named_parameters():
                # Store the scale on the parameter under
                # optim_overrides["optimizer"]["lr_scale"], keeping any
                # overrides already attached to it.
                optim_override = getattr(p, "optim_overrides", {})
                if "optimizer" not in optim_override:
                    optim_override["optimizer"] = {}

                optim_override["optimizer"]["lr_scale"] = layer_scales[lid]
                p.optim_overrides = optim_override
def forward(self, source, padding_mask, **kwargs):
    """Extract features with the wrapped pre-trained model.

    Args:
        source: input passed to the wrapped model's ``extract_features``.
        padding_mask: padding mask passed alongside ``source``.
        **kwargs: only ``corpus_key`` is used (forwarded when present);
            everything else is ignored.

    Returns:
        dict with
            ``"encoder_out"``: features with their first two dimensions
                swapped (B x T x C -> T x B x C), after the final dropout
                and the optional projection;
            ``"padding_mask"``: the mask returned by the wrapped model (B x T);
            ``"layer_results"``: passed through from the wrapped model.
    """
    extractor_kwargs = dict(
        source=source,
        padding_mask=padding_mask,
        # Masking is applied only while training.
        mask=self.apply_mask and self.training,
    )
    if "corpus_key" in kwargs:
        extractor_kwargs["corpus_key"] = kwargs["corpus_key"]
    if self.is_d2v_multi:
        extractor_kwargs["mode"] = "AUDIO"

    # Until freeze_finetune_updates updates have happened, the wrapped model
    # runs without gradient tracking; ExitStack() is a no-op context.
    finetune_backbone = self.freeze_finetune_updates <= self.num_updates
    grad_context = contextlib.ExitStack() if finetune_backbone else torch.no_grad()
    with grad_context:
        res = self.w2v_model.extract_features(**extractor_kwargs)

        # B x T x C -> T x B x C
        features = res["x"].transpose(0, 1)
        out_padding_mask = res["padding_mask"]

    features = self.final_dropout(features)

    if self.proj:
        features = self.proj(features)

    result = {}
    result["encoder_out"] = features  # T x B x C
    result["padding_mask"] = out_padding_mask  # B x T
    result["layer_results"] = res["layer_results"]
    return result
def reorder_encoder_out(self, encoder_out, new_order):
    """Reorder the batch entries of an encoder output in place.

    Args:
        encoder_out (dict): holds ``"encoder_out"`` (batch on dimension 1)
            and ``"padding_mask"`` (batch on dimension 0); either may be None.
        new_order (LongTensor): indices selecting the new batch order.

    Returns:
        dict: the same ``encoder_out`` object, with its non-None tensors
        replaced by their reordered versions.
    """
    # (dictionary key, dimension that indexes the batch)
    batch_dims = (("encoder_out", 1), ("padding_mask", 0))
    for key, batch_dim in batch_dims:
        current = encoder_out[key]
        if current is None:
            continue
        encoder_out[key] = current.index_select(batch_dim, new_order)
    return encoder_out
""" def __init__( self, cfg: Wav2Vec2Seq2SeqConfig, dictionary, embed_tokens, no_encoder_attn=False, ): super().__init__(dictionary) self.dropout = cfg.decoder_dropout self.share_input_output_embed = cfg.share_decoder_input_output_embed input_embed_dim = embed_tokens.embedding_dim embed_dim = cfg.decoder_embed_dim self.output_embed_dim = cfg.decoder_embed_dim self.layerdrop = cfg.decoder_layerdrop self.padding_idx = embed_tokens.padding_idx self.max_target_positions = cfg.max_target_positions self.embed_tokens = embed_tokens self.embed_scale = math.sqrt(embed_dim) # todo: try with input_embed_dim self.project_in_dim = ( Linear(input_embed_dim, embed_dim, bias=False) if embed_dim != input_embed_dim else None ) self.embed_positions = ( PositionalEmbedding( cfg.max_target_positions, embed_dim, self.padding_idx, learned=cfg.decoder_learned_pos, ) if not cfg.no_token_positional_embeddings else None ) # TODO: update this when transformer gets converted to dataclass configs transformer_cfg = copy.deepcopy(cfg) with open_dict(transformer_cfg): transformer_cfg.dropout = transformer_cfg.decoder_dropout transformer_cfg.attention_dropout = ( transformer_cfg.decoder_attention_dropout ) transformer_cfg.activation_dropout = ( transformer_cfg.decoder_activation_dropout ) self.layers = nn.ModuleList([]) self.layers.extend( [ TransformerDecoderLayer(transformer_cfg, no_encoder_attn) for _ in range(transformer_cfg.decoder_layers) ] ) if not self.share_input_output_embed: self.embed_out = nn.Parameter( torch.Tensor(len(dictionary), self.output_embed_dim) ) nn.init.normal_(self.embed_out, mean=0, std=self.output_embed_dim**-0.5) if transformer_cfg.decoder_normalize_before: self.layer_norm = LayerNorm(embed_dim) else: self.layer_norm = None def forward( self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused ): """ Args: prev_output_tokens (LongTensor): previous decoder outputs of shape `(batch, tgt_len)`, for teacher forcing encoder_out (Tensor, optional): output 
from the encoder, used for encoder-side attention incremental_state (dict): dictionary used for storing state during :ref:`Incremental decoding` Returns: tuple: - the decoder's output of shape `(batch, tgt_len, vocab)` - a dictionary with any model-specific outputs """ if type(prev_output_tokens) == list: max_len = max((len(x) for x in prev_output_tokens)) tmp = torch.zeros( [len(prev_output_tokens), max_len], device=prev_output_tokens[0].device ) for (i, p) in enumerate(prev_output_tokens): tmp[i, : len(p)] = p prev_output_tokens = tmp prev_output_tokens = prev_output_tokens.long() x, extra = self.extract_features( prev_output_tokens, encoder_out, incremental_state ) x = self.output_layer(x) return x, extra def extract_features( self, prev_output_tokens, encoder_out=None, incremental_state=None, **unused ): """ Similar to *forward* but only return features. Returns: tuple: - the decoder's features of shape `(batch, tgt_len, embed_dim)` - a dictionary with any model-specific outputs """ # embed positions positions = ( self.embed_positions( prev_output_tokens, incremental_state=incremental_state ) if self.embed_positions is not None else None ) if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] if positions is not None: positions = positions[:, -1:] # embed tokens and positions x = self.embed_scale * self.embed_tokens(prev_output_tokens) if self.project_in_dim is not None: x = self.project_in_dim(x) if positions is not None: x += positions x = F.dropout(x, p=self.dropout, training=self.training) # B x T x C -> T x B x C x = x.transpose(0, 1) attn = None inner_states = [x] # decoder layers self_attn_padding_mask = None if prev_output_tokens.eq(self.padding_idx).any(): self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx) for layer in self.layers: dropout_probability = np.random.random() if not self.training or (dropout_probability > self.layerdrop): x, attn, _ = layer( x, encoder_out["encoder_out"] if encoder_out is not None 
def Embedding(num_embeddings, embedding_dim, padding_idx):
    """Create an ``nn.Embedding`` with scaled-normal initialisation.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation ``embedding_dim ** -0.5``; the row at ``padding_idx`` is then
    set to zero.

    Args:
        num_embeddings (int): number of rows in the table.
        embedding_dim (int): size of each embedding vector.
        padding_idx (int or None): index of the padding row. With ``None``
            no row is zeroed.

    Returns:
        nn.Embedding: the initialised module.
    """
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
    # Indexing with None selects the whole matrix (it only adds a leading
    # axis), so an unguarded fill would wipe every embedding.
    if padding_idx is not None:
        nn.init.constant_(m.weight[padding_idx], 0)
    return m
@dataclass
class Wav2Vec2ClassificationConfig(Wav2Vec2AsrConfig):
    """Configuration of the wav2vec classification model.

    Extends ``Wav2Vec2AsrConfig`` with the options of the pooling /
    classification head.
    """

    latent_embed_dim: Optional[int] = field(
        default=None, metadata={"help": "latent dim (encoder w2v -> latent -> class)"}
    )
    pooling: str = field(
        default="first_token",
        metadata={"help": "pooling layer choices"},
    )
    activation_fn: ChoiceEnum(utils.get_available_activation_fns()) = field(
        default="gelu", metadata={"help": "activation function to use"}
    )
def get_pooling_layer(
    cfg: Wav2Vec2ClassificationConfig,
    encoder_embed_dim: int,
    num_targets: int,
    encoder_layers: int,
):
    """Build the pooling/classification head selected by ``cfg.pooling``.

    Args:
        cfg: model configuration; ``cfg.pooling`` names the head.
        encoder_embed_dim: feature size of the encoder output.
        num_targets: number of output classes.
        encoder_layers: number of encoder layers (only used by ``"elmo"``).

    Returns:
        The pooling module for the requested strategy.

    Raises:
        NotImplementedError: if ``cfg.pooling`` is not a known strategy.
    """
    # A leftover ``assert cfg.pooling == 'mean'`` used to sit here.  It made
    # every other branch unreachable (including the config default,
    # "first_token") and vanished under ``python -O``; unknown names are
    # rejected explicitly by the final ``else`` instead.
    if cfg.pooling == "first_token":
        return FirstToken(cfg, encoder_embed_dim, num_targets)
    elif cfg.pooling == "mean":
        return MeanPoolingFast(cfg, encoder_embed_dim, num_targets)
    elif cfg.pooling == "mean_amsoftmax":
        return MeanPoolingFastAMSoftmax(cfg, encoder_embed_dim, num_targets)
    elif cfg.pooling == "max":
        return MaxPoolingFast(cfg, encoder_embed_dim, num_targets)
    elif cfg.pooling == "elmo":
        # NOTE(review): LayerWeightedMeanPooling.forward requires
        # ``all_layer_feats``; Wav2VecClassification.forward has that argument
        # commented out, so this head needs the model forward to pass it.
        return LayerWeightedMeanPooling(
            cfg, encoder_embed_dim, num_targets, encoder_layers
        )
    else:
        raise NotImplementedError(f"{cfg.pooling} has not been implemented yet.")


class Pooling(nn.Module):
    """Base class for pooling heads: owns the final class projection.

    Args:
        cfg: model configuration (unused here, kept for a uniform signature).
        encoder_embed_dim: feature size of the pooled representation.
        num_targets: number of output classes.
    """

    def __init__(
        self,
        cfg: Wav2Vec2ClassificationConfig,
        encoder_embed_dim: int,
        num_targets: int,
    ):
        super().__init__()
        self.projection = Linear(encoder_embed_dim, num_targets)

    def forward(self, last_layer_feats, **kwargs):
        """Pool ``last_layer_feats`` and project to class scores (abstract)."""
        raise NotImplementedError()


class FirstToken(Pooling):
    """Classify from the encoder output at the first time step."""

    def forward(self, last_layer_feats, **kwargs):
        """Project the first frame of each sequence to class scores.

        Args:
            last_layer_feats: encoder output laid out as T x B x C, the layout
                Wav2VecClassification.forward passes to the pooling layer.

        Returns:
            Tensor of shape B x num_targets.
        """
        # Time is the leading axis, so the first token is index 0 on dim 0.
        # ``[:, 0]`` would pick batch element 0 at every time step instead.
        return self.projection(last_layer_feats[0])
class MeanPoolingFast(nn.Module):
    """Masked mean pooling over time followed by a class projection.

    The encoder output is mapped to a latent space by a linear layer,
    averaged over the non-padded frames, passed through the configured
    activation and finally projected to ``num_targets`` scores.
    """

    def __init__(
        self,
        cfg: Wav2Vec2ClassificationConfig,
        encoder_embed_dim: int,
        num_targets: int,
        **kwargs,
    ):
        super().__init__()
        self.activation_fn = utils.get_activation_fn(cfg.activation_fn)

        # Use the encoder width when no latent size has been configured.
        latent_dim = cfg.latent_embed_dim
        if latent_dim is None:
            latent_dim = encoder_embed_dim
        self.latent_embed_dim = latent_dim
        logging.debug(f"| {self.latent_embed_dim=}")

        self.linear = Linear(encoder_embed_dim, self.latent_embed_dim)
        self.projection = Linear(self.latent_embed_dim, num_targets)

    def forward(self, last_layer_feats, padding_mask, **kwargs):
        """
        Arguments
            last_layer_feats - [TxBxD] Acoustic features
            padding_mask - [BxT] Padding mask (may be None)

        Returns class scores computed from the activated latent vector.
        """
        pooled = self.forward_latent(last_layer_feats, padding_mask)
        return self.projection(self.activation_fn(pooled))

    def forward_latent(self, last_layer_feats, padding_mask, **kwargs):
        """
        Arguments
            last_layer_feats - [TxBxD] Acoustic features
            padding_mask - [BxT] Padding mask (may be None)

        Returns the mean-pooled latent vector, before activation/projection.
        """
        valid = None
        if padding_mask is not None:
            # Invert the padding mask and cast it to the feature dtype so it
            # can be used as multiplicative weights.
            valid = (~padding_mask).to(last_layer_feats.dtype)
        return fn_mean(self.linear(last_layer_feats), valid)
def fn_max(x, mask):
    """Max-pool features over time while ignoring padded frames.

    Args:
        x: features laid out as TxBxD
        mask: BxT tensor that is non-zero/True on valid frames, or ``None``
            when every frame is valid

    Return:
        y: BxD element-wise maximum over the valid frames of each sequence.
        A sequence with no valid frame yields ``-inf``.
    """
    if mask is None:
        return x.max(0)[0]
    # BxT -> TxBx1 so the mask broadcasts over the feature dimension.
    valid = mask.t()[:, :, None].to(torch.bool)
    # Padded frames must never win the max.  The previous fill value of -1e-8
    # is practically zero, so it beat every negative activation.
    return x.masked_fill(~valid, float("-inf")).max(0)[0]
@register_model("wav2vec2_laser", dataclass=Wav2Vec2CtcConfig)
class Wav2VecLaser(Wav2VecCtc):
    """wav2vec 2.0 CTC model variant that max-pools encoder outputs over time."""

    def __init__(self, cfg: Wav2Vec2CtcConfig, w2v_encoder: BaseFairseqModel):
        super().__init__(cfg, w2v_encoder)
        self.num_updates = 0
        self.freeze_finetune_updates = cfg.freeze_finetune_updates

    @classmethod
    def build_model(cls, cfg: Wav2Vec2CtcConfig, task: FairseqTask):
        """Build a new model instance."""
        return cls(cfg, Wav2VecEncoder(cfg, 1024))

    def forward(self, **kwargs):
        """Run the parent model and return its max over time of the scaled output.

        The ``encoder_out`` entry of the parent output is multiplied by 0.01
        and reduced with a max over dimension 0.
        """
        parent_out = super().forward(**kwargs)
        scaled = parent_out["encoder_out"] * 0.01
        pad_mask = parent_out["padding_mask"]

        # Set padded outputs to -inf so they are not selected by max-pooling
        if pad_mask is not None and pad_mask.any():
            widened = scaled.float()
            widened.masked_fill_(pad_mask.T.unsqueeze(-1), float("-inf"))
            scaled = widened.type_as(scaled)

        pooled, _ = scaled.max(dim=0)
        return pooled
@register_model("xmod")
class XMODModel(XLMRModel):
    """X-MOD model: an XLM-R style model whose encoder takes a ``lang_id``."""

    @classmethod
    def hub_models(cls):
        """Return the mapping from released model names to archive URLs."""
        archives = {
            "xmod.base": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.81.1M.tar.gz",
            "xmod.large.prenorm": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.large.prenorm.81.500k.tar.gz",
            "xmod.base.13.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.13.125k.tar.gz",
            "xmod.base.30.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.125k.tar.gz",
            "xmod.base.30.195k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.30.195k.tar.gz",
            "xmod.base.60.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.125k.tar.gz",
            "xmod.base.60.265k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.60.265k.tar.gz",
            "xmod.base.75.125k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.125k.tar.gz",
            "xmod.base.75.269k": "https://dl.fbaipublicfiles.com/fairseq/models/xmod/xmod.base.75.269k.tar.gz",
        }
        return archives

    @classmethod
    def from_pretrained(
        cls,
        model_name_or_path,
        checkpoint_file="model.pt",
        data_name_or_path=".",
        bpe="sentencepiece",
        **kwargs,
    ):
        """Load a checkpoint through ``hub_utils`` and wrap it in an XMODHubInterface."""
        from fairseq import hub_utils

        loaded = hub_utils.from_pretrained(
            model_name_or_path,
            checkpoint_file,
            data_name_or_path,
            archive_map=cls.hub_models(),
            bpe=bpe,
            load_checkpoint_heads=True,
            **kwargs,
        )
        first_model = loaded["models"][0]
        return XMODHubInterface(loaded["args"], loaded["task"], first_model)

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        from omegaconf import OmegaConf

        # Struct mode is switched off while defaults are filled in and
        # switched back on once the encoder has been built.
        if OmegaConf.is_config(args):
            OmegaConf.set_struct(args, False)

        # make sure all arguments are present
        base_architecture(args)

        if not hasattr(args, "max_positions"):
            if not hasattr(args, "tokens_per_sample"):
                args.tokens_per_sample = task.max_positions()
            args.max_positions = args.tokens_per_sample

        xmod_encoder = XMODEncoder(args, task.source_dictionary)

        if OmegaConf.is_config(args):
            OmegaConf.set_struct(args, True)

        return cls(args, xmod_encoder)

    def forward(
        self,
        src_tokens,
        features_only=False,
        return_all_hiddens=False,
        classification_head_name=None,
        lang_id=None,
        **kwargs,
    ):
        """Encode ``src_tokens`` and optionally apply a classification head.

        Requesting a classification head forces ``features_only`` so the head
        receives encoder features.  Returns ``(output, extra)``.
        """
        use_head = classification_head_name is not None
        if use_head:
            features_only = True

        out, extra = self.encoder(
            src_tokens, features_only, return_all_hiddens, lang_id=lang_id, **kwargs
        )

        if use_head:
            out = self.classification_heads[classification_head_name](out)
        return out, extra
class XMODTransformerEncoder(TransformerEncoder):
    """Transformer encoder built from X-MOD layers that forwards ``lang_id`` to each layer."""

    def build_encoder_layer(self, cfg):
        """Create one X-MOD encoder layer, optionally checkpointed, then FSDP-wrapped."""
        layer = XMODTransformerEncoderLayerBase(cfg)
        checkpoint = cfg.checkpoint_activations
        if checkpoint:
            offload_to_cpu = cfg.offload_activations
            layer = checkpoint_wrapper(layer, offload_to_cpu=offload_to_cpu)
        # if we are checkpointing, enforce that FSDP always wraps the
        # checkpointed layer, regardless of layer size
        min_params_to_wrap = cfg.min_params_to_wrap if not checkpoint else 0
        layer = fsdp_wrap(layer, min_num_params=min_params_to_wrap)
        return layer

    def forward(
        self,
        src_tokens,
        src_lengths: Optional[torch.Tensor] = None,
        return_all_hiddens: bool = False,
        token_embeddings: Optional[torch.Tensor] = None,
        lang_id=None,
    ):
        """
        Thin wrapper that delegates to :meth:`forward_scriptable`.

        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).
            token_embeddings (torch.Tensor, optional): precomputed embeddings
                default `None` will recompute embeddings
            lang_id (optional): language identifier(s) passed unchanged to
                every encoder layer.

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
                  of shape `(batch, src_len, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
        """
        return self.forward_scriptable(
            src_tokens,
            src_lengths,
            return_all_hiddens,
            token_embeddings,
            lang_id=lang_id,
        )

    # TorchScript doesn't support super() method so that the scriptable Subclass
    # can't access the base class model in Torchscript.
    # Current workaround is to add a helper function with different name and
    # call the helper function from scriptable Subclass.
    def forward_scriptable(
        self,
        src_tokens,
        src_lengths: Optional[torch.Tensor] = None,
        return_all_hiddens: bool = False,
        token_embeddings: Optional[torch.Tensor] = None,
        lang_id=None,
    ):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (torch.LongTensor): lengths of each source sentence of
                shape `(batch)`. The passed value is not read; lengths are
                recomputed from `src_tokens` before being returned.
            return_all_hiddens (bool, optional): also return all of the
                intermediate hidden states (default: False).
            token_embeddings (torch.Tensor, optional): precomputed embeddings
                default `None` will recompute embeddings
            lang_id (optional): language identifier(s) passed unchanged to
                every encoder layer.

        Returns:
            dict:
                - **encoder_out** (Tensor): the last encoder layer's output of
                  shape `(src_len, batch, embed_dim)`
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
                  of shape `(batch, src_len, embed_dim)`
                - **encoder_states** (List[Tensor]): all intermediate
                  hidden states of shape `(src_len, batch, embed_dim)`.
                  Only populated if *return_all_hiddens* is True.
        """
        # compute padding mask
        encoder_padding_mask = src_tokens.eq(self.padding_idx)
        # On XLA devices padding is always treated as present, which skips the
        # data-dependent `.any()` check.
        has_pads = src_tokens.device.type == "xla" or encoder_padding_mask.any()

        x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)

        # account for padding while computing the representation
        if has_pads:
            x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        encoder_states = []

        # The first recorded state is the embedding output, before any layer.
        if return_all_hiddens:
            encoder_states.append(x)

        # encoder layers
        for layer in self.layers:
            x = layer(
                x,
                encoder_padding_mask=encoder_padding_mask if has_pads else None,
                lang_id=lang_id,
            )
            if return_all_hiddens:
                assert encoder_states is not None
                encoder_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        # The PyTorch Mobile lite interpreter does not support returning NamedTuple in
        # `forward` so we use a dictionary instead.
        # TorchScript does not support mixed values so the values are all lists.
        # The empty list is equivalent to None.
        src_lengths = (
            src_tokens.ne(self.padding_idx)
            .sum(dim=1, dtype=torch.int32)
            .reshape(-1, 1)
            .contiguous()
        )
        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": [encoder_embedding],  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [src_lengths],
        }
"adapter_layer_norm", False) args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) args.ln_before_adapter = getattr(args, "ln_before_adapter", True) args.languages = getattr( args, "languages", [ "af_ZA", "am_ET", "ar_AR", "be_BY", "bn_IN", "ca_ES", "cs_CZ", "cy_GB", "da_DK", "en_XX", "eo_EO", "et_EE", "eu_ES", "fa_IR", "fi_FI", "fr_XX", "ga_IE", "gl_ES", "gu_IN", "ha_NG", "hi_IN", "hr_HR", "hu_HU", "hy_AM", "id_ID", "is_IS", "it_IT", "ka_GE", "ko_KR", "ku_TR", "la_VA", "lt_LT", "lv_LV", "mk_MK", "ml_IN", "mn_MN", "ms_MY", "ne_NP", "nl_XX", "no_XX", "pl_PL", "ps_AF", "pt_XX", "ro_RO", "ru_RU", "sa_IN", "sd_PK", "si_LK", "sk_SK", "sl_SI", "so_SO", "sq_AL", "sr_RS", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "vi_VN", ], ) base_architecture(args) @register_model_architecture("xmod", "xmod_base_75") def roberta_base_architecture(args): args.ffn_modules = getattr(args, "ffn_modules", False) args.adapter_modules = getattr(args, "adapter_modules", True) args.adapter_layer_norm = getattr(args, "adapter_layer_norm", False) args.adapter_reuse_layer_norm = getattr(args, "adapter_reuse_layer_norm", True) args.ln_before_adapter = getattr(args, "ln_before_adapter", True) args.languages = getattr( args, "languages", [ "af_ZA", "am_ET", "ar_AR", "as_IN", "be_BY", "bn_IN", "br_FR", "bs_BA", "ca_ES", "cs_CZ", "cy_GB", "da_DK", "en_XX", "eo_EO", "et_EE", "eu_ES", "fa_IR", "fi_FI", "fr_XX", "fy_NL", "ga_IE", "gd_GB", "gl_ES", "gu_IN", "ha_NG", "hi_IN", "hr_HR", "hu_HU", "hy_AM", "id_ID", "is_IS", "it_IT", "jv_ID", "ka_GE", "kn_IN", "ko_KR", "ku_TR", "la_VA", "lt_LT", "lv_LV", "mg_MG", "mk_MK", "ml_IN", "mn_MN", "mr_IN", "ms_MY", "ne_NP", "nl_XX", "no_XX", "om_KE", "or_IN", "pa_IN", "pl_PL", "ps_AF", "pt_XX", "ro_RO", "ru_RU", "sa_IN", "sd_PK", "si_LK", "sk_SK", "sl_SI", "so_SO", "sq_AL", "sr_RS", "su_ID", "sv_SE", "sw_KE", "ta_IN", "te_IN", "th_TH", "tl_XX", "vi_VN", "xh_ZA", "yi_DE", ], ) base_architecture(args) 
# Default adapter languages shared by the "xmod_base" and "xmod_large_prenorm"
# architectures (81 entries).
_XMOD_81_LANGUAGES = (
    "en_XX", "id_ID", "vi_VN", "ru_RU", "fa_IR", "sv_SE", "ja_XX", "fr_XX",
    "de_DE", "ro_RO", "ko_KR", "hu_HU", "es_XX", "fi_FI", "uk_UA", "da_DK",
    "pt_XX", "no_XX", "th_TH", "pl_PL", "bg_BG", "nl_XX", "zh_CN", "he_IL",
    "el_GR", "it_IT", "sk_SK", "hr_HR", "tr_TR", "ar_AR", "cs_CZ", "lt_LT",
    "hi_IN", "zh_TW", "ca_ES", "ms_MY", "sl_SI", "lv_LV", "ta_IN", "bn_IN",
    "et_EE", "az_AZ", "sq_AL", "sr_RS", "kk_KZ", "ka_GE", "tl_XX", "ur_PK",
    "is_IS", "hy_AM", "ml_IN", "mk_MK", "be_BY", "la_VA", "te_IN", "eu_ES",
    "gl_ES", "mn_MN", "kn_IN", "ne_NP", "sw_KE", "si_LK", "mr_IN", "af_ZA",
    "gu_IN", "cy_GB", "eo_EO", "km_KH", "ky_KG", "uz_UZ", "ps_AF", "pa_IN",
    "ga_IE", "ha_NG", "am_ET", "lo_LA", "ku_TR", "so_SO", "my_MM", "or_IN",
    "sa_IN",
)


def _set_missing_xmod_defaults(args, **defaults):
    """Assign each default to ``args`` unless that attribute is already set."""
    for key, fallback in defaults.items():
        setattr(args, key, getattr(args, key, fallback))


@register_model_architecture("xmod", "xmod_base")
def roberta_base_architecture(args):
    """Fill in the ``xmod_base`` defaults, then the shared base architecture."""
    _set_missing_xmod_defaults(
        args,
        ffn_modules=False,
        adapter_modules=True,
        adapter_layer_norm=False,
        adapter_reuse_layer_norm=True,
        ln_before_adapter=True,
        languages=list(_XMOD_81_LANGUAGES),
    )
    base_architecture(args)


@register_model_architecture("xmod", "xmod_large_prenorm")
def roberta_base_architecture(args):
    """Fill in the ``xmod_large_prenorm`` defaults, then the shared base architecture."""
    _set_missing_xmod_defaults(
        args,
        ffn_modules=False,
        adapter_modules=True,
        adapter_layer_norm=True,
        adapter_reuse_layer_norm=False,
        ln_before_adapter=False,
        bottleneck=4,
        languages=list(_XMOD_81_LANGUAGES),
        encoder_normalize_before=True,
        encoder_layers=24,
        encoder_embed_dim=1024,
        encoder_ffn_embed_dim=4096,
        encoder_attention_heads=16,
    )
    base_architecture(args)
class Adapter(nn.Module):
    """Bottleneck feed-forward adapter: down-project, activate, up-project.

    Args:
        cfg: layer configuration; ``encoder_embed_dim`` is required, the
            quant-noise, activation and dropout options are optional.
        red_fac: reduction factor of the bottleneck
            (hidden size is ``embed_dim // red_fac``).
    """

    def __init__(self, cfg, red_fac=2):
        super().__init__()
        self.cfg = cfg
        self.embed_dim = cfg.encoder_embed_dim
        self.quant_noise = getattr(cfg, "quant_noise_pq", 0)
        self.quant_noise_block_size = getattr(cfg, "quant_noise_pq_block_size", 8) or 8

        activation_name = getattr(cfg, "activation_fn", "relu") or "relu"
        self.activation_fn = utils.get_activation_fn(activation=activation_name)

        hidden_dim = self.embed_dim // red_fac
        self.fc1 = quant_noise(
            nn.Linear(self.embed_dim, hidden_dim),
            p=self.quant_noise,
            block_size=self.quant_noise_block_size,
        )
        self.fc2 = quant_noise(
            nn.Linear(hidden_dim, self.embed_dim),
            p=self.quant_noise,
            block_size=self.quant_noise_block_size,
        )

        dropout_p = getattr(cfg, "activation_dropout", 0) or 0
        if dropout_p == 0:
            # for backwards compatibility with models that use cfg.relu_dropout
            dropout_p = getattr(cfg, "relu_dropout", 0) or 0
        self.activation_dropout_module = FairseqDropout(
            float(dropout_p), module_name=self.__class__.__name__
        )

    def forward(self, x):
        """Apply the adapter to ``x``; the output has the same last dimension."""
        hidden = self.activation_fn(self.fc1(x))
        # Dropout is applied unless cfg explicitly sets a falsy adapter_dropout.
        if getattr(self.cfg, "adapter_dropout", True):
            hidden = self.activation_dropout_module(hidden)
        return self.fc2(hidden)


class XMODTransformerEncoderLayerBase(TransformerEncoderLayer):
    """Transformer encoder layer with per-language adapter modules.

    The layer runs self-attention and the feed-forward block, then passes the
    result through the adapter of each batch element's language.  Layer
    normalisation is applied before each sub-block when
    ``self.normalize_before`` is set and after it otherwise.

    Args:
        cfg: layer configuration (adapter options are read from it)
    """

    def __init__(self, cfg):
        super().__init__(cfg)
        if not getattr(cfg, "adapter_modules", False):
            return

        export = getattr(cfg, "export", False)
        if cfg.adapter_layer_norm:
            self.adapter_layer_norm = LayerNorm(self.embed_dim, export=export)

        self.adapter_modules = nn.ModuleDict(dict())
        bottleneck = getattr(self.cfg, "bottleneck", 2)
        for language in cfg.languages:
            self.adapter_modules[str(language)] = Adapter(cfg, red_fac=bottleneck)

    def lang_adapter(self, lang_id, x):
        """Route each batch element through the adapter of its language.

        Args:
            lang_id: sequence with one language code per batch element, or
                ``None`` to use ``"en_XX"`` for the whole batch.
            x: tensor whose dimension 1 is the batch dimension.

        Returns:
            ``x`` unchanged when adapters are disabled, otherwise the adapter
            output after dropout and the residual connection.
        """
        # If language adapters exist pass through them
        if not getattr(self.cfg, "adapter_modules", False):
            return x

        if lang_id is None:
            lang_id = ["en_XX"] * x.shape[1]

        # Run-length encode consecutive identical languages so that each run
        # can be sent through its adapter as a single slice of the batch.
        run_langs = [lang_id[0]]
        run_sizes = [1]
        for code in lang_id[1:]:
            if code == run_langs[-1]:
                run_sizes[-1] += 1
            else:
                run_langs.append(code)
                run_sizes.append(1)

        # The residual is taken after the layer norm only when
        # cfg.ln_before_adapter is set, and before it otherwise.
        residual_after_norm = getattr(self.cfg, "ln_before_adapter", False)
        unnormalized = x
        if self.cfg.adapter_layer_norm:
            x = self.adapter_layer_norm(x)
        elif self.cfg.adapter_reuse_layer_norm:
            x = self.final_layer_norm(x)
        residual = x if residual_after_norm else unnormalized

        adapted = []
        for code, chunk in zip(run_langs, torch.split(x, run_sizes, 1)):
            # Suffixed variants share the adapter of their base language code.
            code = code.replace("_rom", "").replace("_zaw", "")
            adapted.append(self.adapter_modules[str(code)](chunk))
        x = torch.cat(adapted, 1)

        x = self.dropout_module(x)
        return self.residual_connection(x, residual)

    def forward(
        self,
        x,
        encoder_padding_mask: Optional[Tensor],
        attn_mask: Optional[Tensor] = None,
        lang_id: Optional[list] = None,
    ):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, seq_len)` where padding elements are indicated by ``1``.
            attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`;
                `attn_mask[tgt_i, src_j] = 1` excludes `src_j` when computing
                the output for `tgt_i`.
            lang_id (list, optional): language code of each batch element,
                forwarded to :meth:`lang_adapter`.

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        # Masked positions (attn_mask == 1) become -1e8 and the rest stay 0.
        # -inf is avoided on purpose: in some edge cases the pre-softmax
        # attention weight of a padded query element would become -inf, which
        # results in NaN in model parameters.
        if attn_mask is not None:
            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)

        # --- self-attention block ---
        shortcut = x
        if self.normalize_before:
            x = self.self_attn_layer_norm(x)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=encoder_padding_mask,
            need_weights=False,
            attn_mask=attn_mask,
        )
        x = self.dropout_module(x)
        x = self.residual_connection(x, shortcut)
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)

        # --- feed-forward block ---
        shortcut = x
        if self.normalize_before:
            x = self.final_layer_norm(x)
        x = self.activation_fn(self.fc1(x))
        x = self.activation_dropout_module(x)
        x = self.fc2(x)
        x = self.dropout_module(x)
        x = self.residual_connection(x, shortcut)

        # --- language adapter, then the closing post-norm if configured ---
        x = self.lang_adapter(lang_id, x)
        if not self.normalize_before:
            x = self.final_layer_norm(x)
        return x
"""isort:skip_file""" from .adaptive_input import AdaptiveInput from .adaptive_softmax import AdaptiveSoftmax from .base_layer import BaseLayer from .beamable_mm import BeamableMM from .character_token_embedder import CharacterTokenEmbedder from .conv_tbc import ConvTBC from .cross_entropy import cross_entropy from .downsampled_multihead_attention import DownsampledMultiHeadAttention from .dynamic_convolution import DynamicConv, DynamicConv1dTBC, DynamicConv_scripatable from .dynamic_crf_layer import DynamicCRF from .ema_module import EMAModuleConfig, EMAModule from .fairseq_dropout import FairseqDropout from .fp32_batch_norm import Fp32BatchNorm from .fp32_group_norm import Fp32GroupNorm from .fp32_instance_norm import Fp32InstanceNorm from .gelu import gelu, gelu_accurate from .grad_multiply import GradMultiply from .gumbel_vector_quantizer import GumbelVectorQuantizer from .kmeans_vector_quantizer import KmeansVectorQuantizer from .layer_drop import LayerDropModuleList from .layer_norm import Fp32LayerNorm, LayerNorm from .learned_positional_embedding import LearnedPositionalEmbedding from .lightweight_convolution import LightweightConv, LightweightConv1dTBC from .linearized_convolution import LinearizedConvolution from .location_attention import LocationAttention from .lstm_cell_with_zoneout import LSTMCellWithZoneOut from .multihead_attention import MultiheadAttention from .positional_embedding import PositionalEmbedding from .same_pad import SamePad, SamePad2d from .scalar_bias import ScalarBias from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding from .transformer_sentence_encoder_layer import TransformerSentenceEncoderLayer from .transformer_sentence_encoder import TransformerSentenceEncoder from .transpose_last import TransposeLast from .unfold import unfold1d from .transformer_layer import TransformerDecoderLayer, TransformerEncoderLayer from .vggblock import VGGBlock from .espnet_multihead_attention import ( 
ESPNETMultiHeadedAttention, RelPositionMultiHeadedAttention, RotaryPositionMultiHeadedAttention, ) from .rotary_positional_embedding import RotaryPositionalEmbedding from .positional_encoding import ( RelPositionalEncoding, ) __all__ = [ "AdaptiveInput", "AdaptiveSoftmax", "BaseLayer", "BeamableMM", "CharacterTokenEmbedder", "ConvTBC", "cross_entropy", "DownsampledMultiHeadAttention", "DynamicConv1dTBC", "DynamicConv", "DynamicConv_scripatable", "DynamicCRF", "EMAModule", "EMAModuleConfig", "FairseqDropout", "Fp32BatchNorm", "Fp32GroupNorm", "Fp32LayerNorm", "Fp32InstanceNorm", "gelu", "gelu_accurate", "GradMultiply", "GumbelVectorQuantizer", "KmeansVectorQuantizer", "LayerDropModuleList", "LayerNorm", "LearnedPositionalEmbedding", "LightweightConv1dTBC", "LightweightConv", "LinearizedConvolution", "LocationAttention", "LSTMCellWithZoneOut", "MultiheadAttention", "PositionalEmbedding", "SamePad", "SamePad2d", "ScalarBias", "SinusoidalPositionalEmbedding", "TransformerSentenceEncoderLayer", "TransformerSentenceEncoder", "TransformerDecoderLayer", "TransformerEncoderLayer", "TransposeLast", "VGGBlock", "unfold1d", "ESPNETMultiheadedAttention", "PositionalEmbedding", "RelPositionMultiHeadedAttention", "RelPositionalEncoding", "RotaryPositionalEmbedding", "RotaryPositionMultiHeadedAttention", ] ================================================ FILE: fairseq/modules/adaptive_input.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from typing import List import torch from torch import nn from fairseq.modules.quant_noise import quant_noise class AdaptiveInput(nn.Module): def __init__( self, vocab_size: int, padding_idx: int, initial_dim: int, factor: float, output_dim: int, cutoff: List[int], q_noise: float = 0, qn_block_size: int = 8, ): super().__init__() if vocab_size > cutoff[-1]: cutoff = cutoff + [vocab_size] else: assert ( vocab_size == cutoff[-1] ), "cannot specify cutoff larger than vocab size" self.cutoff = cutoff self.embedding_dim = output_dim self.padding_idx = padding_idx self.embeddings = nn.ModuleList() for i in range(len(self.cutoff)): prev = self.cutoff[i - 1] if i > 0 else 0 size = self.cutoff[i] - prev dim = int(initial_dim // (factor**i)) seq = nn.Sequential( nn.Embedding(size, dim, self.padding_idx), quant_noise( nn.Linear(dim, output_dim, bias=False), q_noise, qn_block_size ), ) self.embeddings.append(seq) self.padding_idx = None self.padding_idx = padding_idx def init_weights(m): if isinstance(m, nn.Embedding): nn.init.normal_(m.weight, mean=0, std=m.weight.shape[1] ** -0.5) nn.init.constant_(m.weight[padding_idx], 0) elif hasattr(m, "weight"): nn.init.xavier_uniform_(m.weight) self.apply(init_weights) self.register_buffer("_float_tensor", torch.FloatTensor(1)) def weights_for_band(self, band: int): return self.embeddings[band][0].weight, self.embeddings[band][1].weight def forward(self, input: torch.Tensor): result = self._float_tensor.new(input.shape + (self.embedding_dim,)) for i in range(len(self.cutoff)): mask = input.lt(self.cutoff[i]) if i > 0: mask.mul_(input.ge(self.cutoff[i - 1])) chunk_input = input[mask] - self.cutoff[i - 1] else: chunk_input = input[mask] if mask.any(): result[mask] = self.embeddings[i](chunk_input) return result ================================================ FILE: fairseq/modules/adaptive_softmax.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import functools
import operator

import torch
import torch.nn.functional as F
from fairseq.modules.fairseq_dropout import FairseqDropout
from fairseq.modules.quant_noise import quant_noise
from torch import nn


class TiedLinear(nn.Module):
    """Bias-free linear map that applies an externally supplied weight tensor.

    The weight is stored by reference (no copy), optionally used transposed.
    """

    def __init__(self, weight, transpose):
        super().__init__()
        self.weight = weight
        self.transpose = transpose

    def forward(self, input):
        return F.linear(input, self.weight.t() if self.transpose else self.weight)


class TiedHeadModule(nn.Module):
    """Head of the adaptive softmax when its word weights are tied to an embedding.

    The output concatenates word scores (computed with the tied embedding
    weight) and ``num_classes`` scores from a separate bias-free linear layer.
    """

    def __init__(self, weights, input_dim, num_classes, q_noise, qn_block_size):
        super().__init__()
        # Only the embedding weight of the pair is used here.
        tied_emb, _ = weights
        self.num_words, emb_dim = tied_emb.size()

        self.word_proj = quant_noise(
            TiedLinear(tied_emb, transpose=False), q_noise, qn_block_size
        )
        if input_dim != emb_dim:
            # Project the input down/up to the embedding width first.
            self.word_proj = nn.Sequential(
                quant_noise(
                    nn.Linear(input_dim, emb_dim, bias=False), q_noise, qn_block_size
                ),
                self.word_proj,
            )

        self.class_proj = quant_noise(
            nn.Linear(input_dim, num_classes, bias=False), q_noise, qn_block_size
        )
        self.out_dim = self.num_words + num_classes

        # Buffer used in forward() to allocate the output tensor.
        self.register_buffer("_float_tensor", torch.FloatTensor(1))

    def forward(self, input):
        # Flatten all leading dimensions into one.
        inp_sz = functools.reduce(operator.mul, input.shape[:-1], 1)
        out = self._float_tensor.new(inp_sz, self.out_dim)
        out[:, : self.num_words] = self.word_proj(input.view(inp_sz, -1))
        out[:, self.num_words :] = self.class_proj(input.view(inp_sz, -1))
        return out


class AdaptiveSoftmax(nn.Module):
    """
    This is an implementation of the efficient softmax approximation for
    graphical processing units (GPU), described in the paper "Efficient softmax
    approximation for GPUs" (http://arxiv.org/abs/1609.04309).
    """

    def __init__(
        self,
        vocab_size,
        input_dim,
        cutoff,
        dropout,
        factor=4.0,
        adaptive_inputs=None,
        tie_proj=False,
        q_noise=0,
        qn_block_size=8,
    ):
        super().__init__()

        # The last cutoff always has to coincide with the vocabulary size.
        if vocab_size > cutoff[-1]:
            cutoff = cutoff + [vocab_size]
        else:
            assert (
                vocab_size == cutoff[-1]
            ), "cannot specify cutoff larger than vocab size"

        # Head output: the cutoff[0] head words plus one score per tail cluster.
        output_dim = cutoff[0] + len(cutoff) - 1

        self.vocab_size = vocab_size
        self.cutoff = cutoff
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.input_dim = input_dim
        self.factor = factor
        self.q_noise = q_noise
        self.qn_block_size = qn_block_size

        self.lsm = nn.LogSoftmax(dim=1)

        if adaptive_inputs is not None:
            self.head = TiedHeadModule(
                adaptive_inputs.weights_for_band(0),
                input_dim,
                len(cutoff) - 1,
                self.q_noise,
                self.qn_block_size,
            )
        else:
            self.head = quant_noise(
                nn.Linear(input_dim, output_dim, bias=False),
                self.q_noise,
                self.qn_block_size,
            )

        self._make_tail(adaptive_inputs, tie_proj)

        def init_weights(m):
            # Tied modules hold weights owned elsewhere, so they are skipped.
            if (
                hasattr(m, "weight")
                and not isinstance(m, TiedLinear)
                and not isinstance(m, TiedHeadModule)
            ):
                nn.init.xavier_uniform_(m.weight)

        self.apply(init_weights)

        # Checked by upgrade_state_dict_named() when loading checkpoints.
        self.register_buffer("version", torch.LongTensor([1]))

    def _make_tail(self, adaptive_inputs=None, tie_proj=False):
        """Build ``self.tail``: one projection -> dropout -> output module per tail cluster."""
        self.tail = nn.ModuleList()
        for i in range(len(self.cutoff) - 1):
            dim = int(self.input_dim // self.factor ** (i + 1))

            tied_emb, tied_proj = (
                adaptive_inputs.weights_for_band(i + 1)
                if adaptive_inputs is not None
                else (None, None)
            )

            if tied_proj is not None:
                if tie_proj:
                    # Share the input projection weight, used transposed.
                    proj = quant_noise(
                        TiedLinear(tied_proj, transpose=True),
                        self.q_noise,
                        self.qn_block_size,
                    )
                else:
                    # Fresh projection sized from the input projection weight.
                    proj = quant_noise(
                        nn.Linear(tied_proj.size(0), tied_proj.size(1), bias=False),
                        self.q_noise,
                        self.qn_block_size,
                    )
            else:
                proj = quant_noise(
                    nn.Linear(self.input_dim, dim, bias=False),
                    self.q_noise,
                    self.qn_block_size,
                )

            if tied_emb is None:
                out_proj = nn.Linear(
                    dim, self.cutoff[i + 1] - self.cutoff[i], bias=False
                )
            else:
                out_proj = TiedLinear(tied_emb, transpose=False)

            m = nn.Sequential(
                proj,
                nn.Dropout(self.dropout_module.p),
                quant_noise(out_proj, self.q_noise, self.qn_block_size),
            )

            self.tail.append(m)

    def upgrade_state_dict_named(self, state_dict, name):
        """Reject state dicts that lack the ``<name>.version`` entry."""
        version_name = name + ".version"
        if version_name not in state_dict:
            raise Exception("This version of the model is no longer supported")

    def adapt_target(self, target):
        """
        In order to be efficient, the AdaptiveSoftMax does not compute the
        scores for all the word of the vocabulary for all the examples. It is
        thus necessary to call the method adapt_target of the AdaptiveSoftMax
        layer inside each forward pass.

        Returns:
            ``(new_target, target_idxs)``. ``new_target[0]`` is the flattened
            target with every tail word replaced by its cluster id
            ``cutoff[0] + i``. ``new_target[i + 1]`` holds the targets of
            cluster ``i`` re-based to start at 0, and ``target_idxs[i]`` their
            positions in the flattened target; both are ``None`` when no
            target falls into cluster ``i``.
        """
        target = target.view(-1)
        new_target = [target.clone()]
        target_idxs = []

        for i in range(len(self.cutoff) - 1):
            mask = target.ge(self.cutoff[i]).mul(target.lt(self.cutoff[i + 1]))
            new_target[0][mask] = self.cutoff[0] + i

            if mask.any():
                target_idxs.append(mask.nonzero(as_tuple=False).squeeze(1))
                new_target.append(target[mask].add(-self.cutoff[i]))
            else:
                target_idxs.append(None)
                new_target.append(None)

        return new_target, target_idxs

    def forward(self, input, target):
        """
        Args:
            input: (b x t x d)
            target: (b x t)
        Returns:
            2 lists: output for each cutoff section and new targets by cut off
        """

        input = input.contiguous().view(-1, input.size(-1))
        input = self.dropout_module(input)

        new_target, target_idxs = self.adapt_target(target)
        output = [self.head(input)]

        # Each tail is evaluated only on the rows whose target lies in it.
        for i in range(len(target_idxs)):
            if target_idxs[i] is not None:
                output.append(self.tail[i](input.index_select(0, target_idxs[i])))
            else:
                output.append(None)

        return output, new_target

    def get_log_prob(self, input, target):
        """
        Computes the log probabilities for all the words of the vocabulary,
        given a 2D tensor of hidden vectors.

        When *target* is given, tail clusters are only evaluated for the rows
        whose target falls into them; all other tail entries keep the zero
        they were initialized with.
        """

        bsz, length, dim = input.size()
        input = input.contiguous().view(-1, dim)

        if target is not None:
            _, target_idxs = self.adapt_target(target)
        else:
            target_idxs = None

        head_y = self.head(input)
        log_probs = head_y.new_zeros(input.size(0), self.vocab_size)

        head_sz = self.cutoff[0] + len(self.tail)
        log_probs[:, :head_sz] = self.lsm(head_y)
        # Cluster log-priors are cloned now because the columns they occupy
        # are overwritten with tail word scores in the loop below.
        tail_priors = log_probs[:, self.cutoff[0] : head_sz].clone()

        for i in range(len(self.tail)):
            start = self.cutoff[i]
            end = self.cutoff[i + 1]

            if target_idxs is None:
                tail_out = log_probs[:, start:end]
                tail_out.copy_(self.tail[i](input))
                log_probs[:, start:end] = self.lsm(tail_out).add_(
                    tail_priors[:, i, None]
                )
            elif target_idxs[i] is not None:
                idxs = target_idxs[i]
                tail_out = log_probs[idxs, start:end]
                tail_out.copy_(self.tail[i](input[idxs]))
                log_probs[idxs, start:end] = self.lsm(tail_out).add_(
                    tail_priors[idxs, i, None]
                )

        log_probs = log_probs.view(bsz, length, -1)
        return log_probs

================================================
FILE: fairseq/modules/base_layer.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch.nn as nn
import torch
import sys
from fairseq import utils
from fairseq.distributed import utils as distributed_utils
from fairseq.modules.layer_norm import LayerNorm


class BaseLayer(nn.Module):
    """Layer that routes each token to one expert network, one expert per data-parallel worker.

    Tokens are exchanged between workers with :class:`All2All`, processed by
    the local ``expert_network`` and sent back to their original position.
    """

    def __init__(self, args):
        super().__init__()
        self.num_workers = distributed_utils.get_data_parallel_world_size()
        # One centroid row per worker; token/centroid dot products drive routing.
        expert_centroids = torch.empty(self.num_workers, args.decoder_embed_dim)
        torch.nn.init.orthogonal_(expert_centroids, gain=0.1)
        self.register_parameter(
            "expert_centroids", torch.nn.Parameter(expert_centroids)
        )
        self.expert_network = nn.Sequential(
            *([BaseSublayer(args) for _ in range(args.base_sublayers)])
        )
        self.expert_id = distributed_utils.get_data_parallel_rank()
        self.shuffle = args.base_shuffle
        self.cpp = self.load_assignment()

        # Add a special attribute to the expert parameters, so we know not to sync their gradients
        for param in self.expert_network.parameters():
            param.expert = True

    def forward(self, input_features, *args, **kwargs):
        """Route tokens to experts and return ``(output, None, None)``.

        The output has the same size as *input_features*. Extra positional and
        keyword arguments are accepted and ignored.
        """
        features = input_features.reshape(-1, input_features.size(-1))
        # Training is detected from the input requiring grad, not self.training.
        is_training = input_features.requires_grad

        if self.shuffle and is_training:
            # Send each token to a random worker, to break correlations within the batch
            shuffle_sort = torch.randperm(features.size(0), device=features.device)
            features = All2All.apply(features[shuffle_sort])

        with torch.no_grad():
            # Compute similarity of each token to each expert, for routing
            token_expert_affinities = features.matmul(
                self.expert_centroids.transpose(0, 1)
            )

        # Compute which token goes to which expert
        sort_by_expert, input_splits, output_splits = (
            self.balanced_assignment(token_expert_affinities)
            if is_training
            else self.greedy_assignment(token_expert_affinities)
        )
        # Swap these tokens for the right ones for our expert
        routed_features = All2All.apply(
            features[sort_by_expert], output_splits, input_splits
        )

        if routed_features.size(0) > 0:
            # Mix in the expert network based on how appropriate it is for these tokens
            alpha = torch.sigmoid(
                routed_features.mv(self.expert_centroids[self.expert_id])
            ).unsqueeze(1)
            routed_features = (
                alpha * self.expert_network(routed_features)
                + (1 - alpha) * routed_features
            )
        # Return to original worker and ordering
        result = All2All.apply(routed_features, input_splits, output_splits)[
            self.inverse_sort(sort_by_expert)
        ]

        if self.shuffle and is_training:
            # Undo shuffling
            result = All2All.apply(result)[self.inverse_sort(shuffle_sort)]

        # Return additional Nones for compatibility with TransformerDecoderLayer
        return result.view(input_features.size()), None, None

    def inverse_sort(self, order):
        # Creates an index that undoes a sort: xs==xs[order][inverse_sort(order)]
        return torch.empty_like(order).scatter_(
            0, order, torch.arange(0, order.size(0), device=order.device)
        )

    def balanced_assignment(self, scores):
        """Assign tokens to experts with the compiled ``libbase`` routine.

        Non-finite scores are overwritten in place with the smallest finite
        score. The two ``None`` values stand in for the split sizes, which
        makes :class:`All2All` allocate an output the same size as its input.
        """
        ok = scores.isfinite()
        if not ok.all():
            # NaNs here can break the assignment algorithm
            scores[~ok] = scores[ok].min()
        return self.cpp.balanced_assignment(scores), None, None

    # Assigns each token to the top k experts
    def greedy_assignment(self, scores, k=1):
        token_to_workers = torch.topk(scores, dim=1, k=k, largest=True).indices.view(-1)
        # Sorting groups the tokens by destination worker.
        token_to_workers, sort_ordering = torch.sort(token_to_workers)
        worker2token = sort_ordering // k

        # Find how many tokens we're sending to each other worker (being careful for sending 0 tokens to some workers)
        output_splits = torch.zeros(
            (self.num_workers,), dtype=torch.long, device=scores.device
        )
        workers, counts = torch.unique_consecutive(token_to_workers, return_counts=True)
        output_splits[workers] = counts
        # Tell other workers how many tokens to expect from us
        input_splits = All2All.apply(output_splits)
        return worker2token, input_splits.tolist(), output_splits.tolist()

    def load_assignment(self):
        """Import and return the compiled ``fairseq.libbase`` extension.

        Raises:
            ImportError: re-raised after printing a build hint to stderr.
        """
        try:
            from fairseq import libbase

            return libbase

        except ImportError as e:
            sys.stderr.write(
                "ERROR: missing libbase. run `python setup.py build_ext --inplace`\n"
            )
            raise e


class BaseSublayer(nn.Module):
    """Residual feed-forward block (LayerNorm -> ff1 -> activation -> ff2) used as an expert."""

    def __init__(self, args):
        super().__init__()
        self.activation_fn = utils.get_activation_fn(
            activation=getattr(args, "activation_fn", "relu") or "relu"
        )
        self.norm = LayerNorm(args.decoder_embed_dim, export=False)
        self.ff1 = torch.nn.Linear(args.decoder_embed_dim, args.decoder_ffn_embed_dim)
        self.ff2 = torch.nn.Linear(args.decoder_ffn_embed_dim, args.decoder_embed_dim)
        # Zero the output weights so the residual branch starts out weightless.
        self.ff2.weight.data.zero_()

    def forward(self, xs):
        return xs + self.ff2(self.activation_fn(self.ff1(self.norm(xs))))


# Wraps torch.distributed.all_to_all_single as a function that supports autograd
class All2All(torch.autograd.Function):
    @staticmethod
    def forward(ctx, xs, input_splits=None, output_splits=None):
        # The splits are kept for backward(), which uses them swapped.
        ctx.input_splits = input_splits
        ctx.output_splits = output_splits

        # Without explicit splits the output has the same size as the input.
        ys = (
            torch.empty_like(xs)
            if output_splits is None
            else xs.new_empty(size=[sum(output_splits)] + list(xs.size()[1:]))
        )
        torch.distributed.all_to_all_single(
            ys, xs, output_split_sizes=output_splits, input_split_sizes=input_splits
        )
        return ys

    @staticmethod
    def backward(ctx, grad_output):
        result = (
            torch.empty_like(grad_output)
            if ctx.input_splits is None
            else grad_output.new_empty(
                size=[sum(ctx.input_splits)] + list(grad_output.size()[1:])
            )
        )
        # Gradients travel the reverse route, hence the swapped split sizes.
        torch.distributed.all_to_all_single(
            result,
            grad_output,
            output_split_sizes=ctx.input_splits,
            input_split_sizes=ctx.output_splits,
        )
        # No gradients for the two split-size arguments.
        return result, None, None

================================================
FILE: fairseq/modules/beamable_mm.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn


class BeamableMM(nn.Module):
    """This module provides an optimized MM for beam decoding with attention.

    It leverage the fact that the source-side of the input is replicated beam
    times and the target-side of the input is of width one. This layer speeds up
    inference by replacing the inputs {(bsz x 1 x nhu), (bsz x sz2 x nhu)}
    with smaller inputs {(bsz/beam x beam x nhu), (bsz/beam x sz2 x nhu)}.
    """

    def __init__(self, beam_size=None):
        super(BeamableMM, self).__init__()
        self.beam_size = beam_size

    def forward(self, input1, input2):
        """Batched matrix product of *input1* and *input2*.

        The beam-aware fast path is taken only when all four conditions below
        hold; otherwise this is a plain ``input1.bmm(input2)``.
        """
        if (
            not self.training  # test mode
            and self.beam_size is not None  # beam size is set
            and input1.dim() == 3  # only support batched input
            and input1.size(1) == 1  # single time step update
        ):
            bsz, beam = input1.size(0), self.beam_size

            # bsz x 1 x nhu --> bsz/beam x beam x nhu
            input1 = input1[:, 0, :].unfold(0, beam, beam).transpose(2, 1)

            # bsz x sz2 x nhu --> bsz/beam x sz2 x nhu
            input2 = input2.unfold(0, beam, beam)[:, :, :, 0]

            # use non batched operation if bsz = beam
            if input1.size(0) == 1:
                output = torch.mm(input1[0, :, :], input2[0, :, :])
            else:
                output = input1.bmm(input2)
            return output.view(bsz, 1, -1)
        else:
            return input1.bmm(input2)

    def set_beam_size(self, beam_size):
        self.beam_size = beam_size

================================================
FILE: fairseq/modules/character_token_embedder.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import logging
from typing import List, Tuple

import torch
import torch.nn.functional as F
from fairseq.data import Dictionary
from torch import nn


# Value of the first character slot that marks a padding / end-of-sentence
# word when raw character inputs are used (see CharacterTokenEmbedder.forward).
CHAR_PAD_IDX = 0
CHAR_EOS_IDX = 257

logger = logging.getLogger(__name__)


class CharacterTokenEmbedder(torch.nn.Module):
    """Build word embeddings from characters.

    Characters are embedded, passed through a bank of 1-D convolutions with
    max-pooling over positions, an optional :class:`Highway` stack and a final
    linear projection to ``word_embed_dim``.
    """

    def __init__(
        self,
        vocab: Dictionary,
        filters: List[Tuple[int, int]],
        char_embed_dim: int,
        word_embed_dim: int,
        highway_layers: int,
        max_char_len: int = 50,
        char_inputs: bool = False,
    ):
        super(CharacterTokenEmbedder, self).__init__()

        self.onnx_trace = False
        self.embedding_dim = word_embed_dim
        self.max_char_len = max_char_len
        self.char_embeddings = nn.Embedding(257, char_embed_dim, padding_idx=0)
        # Two learned vectors: row 0 for EOS words, row 1 for unknown words.
        self.symbol_embeddings = nn.Parameter(torch.FloatTensor(2, word_embed_dim))
        self.eos_idx, self.unk_idx = 0, 1
        self.char_inputs = char_inputs

        # Each filter is a (kernel width, output channels) pair.
        self.convolutions = nn.ModuleList()
        for width, out_c in filters:
            self.convolutions.append(
                nn.Conv1d(char_embed_dim, out_c, kernel_size=width)
            )

        last_dim = sum(f[1] for f in filters)

        self.highway = Highway(last_dim, highway_layers) if highway_layers > 0 else None

        self.projection = nn.Linear(last_dim, word_embed_dim)

        assert (
            vocab is not None or char_inputs
        ), "vocab must be set if not using char inputs"
        self.vocab = None
        if vocab is not None:
            self.set_vocab(vocab, max_char_len)

        self.reset_parameters()

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def set_vocab(self, vocab, max_char_len):
        """Precompute the word-id -> character-id table for *vocab*.

        Each non-special entry is encoded to bytes, every byte is shifted by
        one so that 0 stays free for padding, and the row is padded or
        truncated to *max_char_len*. Special symbols get an all-zero row.
        """
        word_to_char = torch.LongTensor(len(vocab), max_char_len)

        truncated = 0
        for i in range(len(vocab)):
            if i < vocab.nspecial:
                char_idxs = [0] * max_char_len
            else:
                chars = vocab[i].encode()
                # +1 for padding
                char_idxs = [c + 1 for c in chars] + [0] * (max_char_len - len(chars))
            if len(char_idxs) > max_char_len:
                truncated += 1
                char_idxs = char_idxs[:max_char_len]
            word_to_char[i] = torch.LongTensor(char_idxs)

        if truncated > 0:
            logger.info(
                "truncated {} words longer than {} characters".format(
                    truncated, max_char_len
                )
            )

        self.vocab = vocab
        self.word_to_char = word_to_char

    @property
    def padding_idx(self):
        return Dictionary().pad() if self.vocab is None else self.vocab.pad()

    def reset_parameters(self):
        nn.init.xavier_normal_(self.char_embeddings.weight)
        nn.init.xavier_normal_(self.symbol_embeddings)
        nn.init.xavier_uniform_(self.projection.weight)

        # Re-zero the padding row that the initialization above overwrote.
        nn.init.constant_(
            self.char_embeddings.weight[self.char_embeddings.padding_idx], 0.0
        )
        nn.init.constant_(self.projection.bias, 0.0)

    def forward(
        self,
        input: torch.Tensor,
    ):
        """Embed *input* and return a tensor of shape ``input.size()[:2] + (-1,)``.

        With ``char_inputs`` the input holds character ids, ``max_char_len``
        per word; otherwise it holds word ids that are looked up in the table
        built by :meth:`set_vocab`.
        """
        if self.char_inputs:
            chars = input.view(-1, self.max_char_len)
            pads = chars[:, 0].eq(CHAR_PAD_IDX)
            eos = chars[:, 0].eq(CHAR_EOS_IDX)
            if eos.any():
                if self.onnx_trace:
                    chars = torch.where(eos.unsqueeze(1), chars.new_zeros(1), chars)
                else:
                    # NOTE: chars is a view of input, so this writes into input.
                    chars[eos] = 0

            unk = None
        else:
            flat_words = input.view(-1)
            chars = self.word_to_char[flat_words.type_as(self.word_to_char)].type_as(
                input
            )
            pads = flat_words.eq(self.vocab.pad())
            eos = flat_words.eq(self.vocab.eos())
            unk = flat_words.eq(self.vocab.unk())

        word_embs = self._convolve(chars)
        # Overwrite the rows of pad / eos / unk words. The ONNX branch uses
        # torch.where instead of the masked assignment of the other branch.
        if self.onnx_trace:
            if pads.any():
                word_embs = torch.where(
                    pads.unsqueeze(1), word_embs.new_zeros(1), word_embs
                )
            if eos.any():
                word_embs = torch.where(
                    eos.unsqueeze(1), self.symbol_embeddings[self.eos_idx], word_embs
                )
            if unk is not None and unk.any():
                word_embs = torch.where(
                    unk.unsqueeze(1), self.symbol_embeddings[self.unk_idx], word_embs
                )
        else:
            if pads.any():
                word_embs[pads] = 0
            if eos.any():
                word_embs[eos] = self.symbol_embeddings[self.eos_idx]
            if unk is not None and unk.any():
                word_embs[unk] = self.symbol_embeddings[self.unk_idx]

        return word_embs.view(input.size()[:2] + (-1,))

    def _convolve(
        self,
        char_idxs: torch.Tensor,
    ):
        """Map character ids to word vectors (conv bank -> highway -> projection)."""
        char_embs = self.char_embeddings(char_idxs)
        char_embs = char_embs.transpose(1, 2)  # BTC -> BCT

        conv_result = []

        for conv in self.convolutions:
            x = conv(char_embs)
            # Max-pool over the last (position) dimension.
            x, _ = torch.max(x, -1)
            x = F.relu(x)
            conv_result.append(x)

        x = torch.cat(conv_result, dim=-1)

        if self.highway is not None:
            x = self.highway(x)
        x = self.projection(x)

        return x


class Highway(torch.nn.Module):
    """
    A `Highway layer <https://arxiv.org/abs/1505.00387>`_.
    Adopted from the AllenNLP implementation.
    """

    def __init__(self, input_dim: int, num_layers: int = 1):
        super(Highway, self).__init__()
        self.input_dim = input_dim
        # Each layer emits 2 * input_dim values: the transform and the gate.
        self.layers = nn.ModuleList(
            [nn.Linear(input_dim, input_dim * 2) for _ in range(num_layers)]
        )
        self.activation = nn.ReLU()

        self.reset_parameters()

    def reset_parameters(self):
        for layer in self.layers:
            # As per comment in AllenNLP:
            # We should bias the highway layer to just carry its input forward.  We do that by
            # setting the bias on `B(x)` to be positive, because that means `g` will be biased to
            # be high, so we will carry the input forward.  The bias on `B(x)` is the second half
            # of the bias vector in each Linear layer.
            nn.init.constant_(layer.bias[self.input_dim :], 1)

            nn.init.constant_(layer.bias[: self.input_dim], 0)
            nn.init.xavier_normal_(layer.weight)

    def forward(self, x: torch.Tensor):
        for layer in self.layers:
            projection = layer(x)
            proj_x, gate = projection.chunk(2, dim=-1)
            proj_x = self.activation(proj_x)
            gate = torch.sigmoid(gate)
            # gate close to 1 carries the input, close to 0 takes the transform.
            x = gate * x + (gate.new_tensor([1]) - gate) * proj_x
        return x

================================================
FILE: fairseq/modules/checkpoint_activations.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import functools
from typing import Any, Dict, List, Tuple, Union

import torch
import torch.utils.checkpoint as checkpoint
from fairseq import utils


def checkpoint_wrapper(m, offload_to_cpu=False):
    """
    A friendlier wrapper for performing activation checkpointing.

    Compared to the PyTorch version, this version:
    - wraps an nn.Module, so that all subsequent calls will use checkpointing
    - handles keyword arguments in the forward
    - handles non-Tensor outputs from the forward

    Usage::

        checkpointed_module = checkpoint_wrapper(my_module, offload_to_cpu=True)
        a, b = checkpointed_module(x, y=3, z=torch.Tensor([1]))
    """
    # Wrapping the same module twice is rejected: the original forward is
    # stashed on the module itself, under ``precheckpoint_forward``.
    assert not hasattr(
        m, "precheckpoint_forward"
    ), "checkpoint function has already been applied?"
    m.precheckpoint_forward = m.forward
    m.forward = functools.partial(
        _checkpointed_forward,
        m.precheckpoint_forward,  # original_forward
        offload_to_cpu,
    )
    return m


def unwrap_checkpoint(m: torch.nn.Module):
    """
    unwrap a module and its children from checkpoint_wrapper
    """
    for module in m.modules():
        if hasattr(module, "precheckpoint_forward"):
            module.forward = module.precheckpoint_forward
            del module.precheckpoint_forward
        if hasattr(module, "old_deepcopy_method"):
            module.__deepcopy__ = module.old_deepcopy_method
            del module.old_deepcopy_method
    return m


def _checkpointed_forward(original_forward, offload_to_cpu, *args, **kwargs):
    # Autograd Functions in PyTorch work best with positional args, since
    # the backward must return gradients (or None) for every input argument.
    # We can flatten keyword arguments to make this easier.
    kwarg_keys, flat_args = pack_kwargs(*args, **kwargs)
    # Also receives the non-Tensor outputs back from CheckpointFunction.forward.
    parent_ctx_dict = {"offload": offload_to_cpu}
    output = CheckpointFunction.apply(
        original_forward, parent_ctx_dict, kwarg_keys, *flat_args
    )
    if isinstance(output, torch.Tensor):
        return output
    else:
        packed_non_tensor_outputs = parent_ctx_dict["packed_non_tensor_outputs"]
        if packed_non_tensor_outputs:
            output = unpack_non_tensors(output, packed_non_tensor_outputs)
        return output


def pack_kwargs(*args, **kwargs) -> Tuple[List[str], List[Any]]:
    """
    Flatten positional and keyword arguments into one list, values of the
    keyword arguments last, and return the keyword names separately.

    Usage::

        kwarg_keys, flat_args = pack_kwargs(1, 2, a=3, b=4)
        args, kwargs = unpack_kwargs(kwarg_keys, flat_args)
        assert args == [1, 2]
        assert kwargs == {"a": 3, "b": 4}
    """
    kwarg_keys = []
    flat_args = list(args)
    for k, v in kwargs.items():
        kwarg_keys.append(k)
        flat_args.append(v)
    return kwarg_keys, flat_args


def unpack_kwargs(
    kwarg_keys: List[str], flat_args: List[Any]
) -> Tuple[List[Any], Dict[str, Any]]:
    """Inverse of :func:`pack_kwargs`: the last ``len(kwarg_keys)`` values become kwargs."""
    if len(kwarg_keys) == 0:
        return flat_args, {}
    args = flat_args[: -len(kwarg_keys)]
    kwargs = {k: v for k, v in zip(kwarg_keys, flat_args[-len(kwarg_keys) :])}
    return args, kwargs


def split_non_tensors(
    mixed: Union[torch.Tensor, Tuple[Any]]
) -> Tuple[Tuple[torch.Tensor], Dict[str, List[Any]]]:
    """
    Separate the Tensors of *mixed* from everything else, recording the
    original order so :func:`unpack_non_tensors` can restore it. A bare
    Tensor is returned as ``((mixed,), None)``.

    Usage::

        x = torch.Tensor([1])
        y = torch.Tensor([2])
        tensors, packed_non_tensors = split_non_tensors((x, y, None, 3))
        recon = unpack_non_tensors(tensors, packed_non_tensors)
        assert recon == (x, y, None, 3)
    """
    if isinstance(mixed, torch.Tensor):
        return (mixed,), None
    tensors = []
    packed_non_tensors = {"is_tensor": [], "objects": []}
    for o in mixed:
        if isinstance(o, torch.Tensor):
            packed_non_tensors["is_tensor"].append(True)
            tensors.append(o)
        else:
            packed_non_tensors["is_tensor"].append(False)
            packed_non_tensors["objects"].append(o)
    return tuple(tensors), packed_non_tensors


def unpack_non_tensors(
    tensors: Tuple[torch.Tensor],
    packed_non_tensors: Dict[str, List[Any]],
) -> Tuple[Any]:
    """Inverse of :func:`split_non_tensors`; returns *tensors* as is when nothing was packed."""
    if packed_non_tensors is None:
        return tensors
    assert isinstance(packed_non_tensors, dict)
    mixed = []
    is_tensor_list = packed_non_tensors["is_tensor"]
    objects = packed_non_tensors["objects"]
    assert len(tensors) + len(objects) == len(is_tensor_list)
    obj_i = tnsr_i = 0
    for is_tensor in is_tensor_list:
        if is_tensor:
            mixed.append(tensors[tnsr_i])
            tnsr_i += 1
        else:
            mixed.append(objects[obj_i])
            obj_i += 1
    return tuple(mixed)


class CheckpointFunction(torch.autograd.Function):
    """Similar to the torch version, but support non-Tensor outputs.

    The caller is expected to provide a dict (*parent_ctx_dict*) that will hold
    the non-Tensor outputs. These should be combined with the Tensor *outputs*
    by calling ``unpack_non_tensors``.
    """

    @staticmethod
    def forward(ctx, run_function, parent_ctx_dict, kwarg_keys, *args):
        if torch.is_grad_enabled():  # grad may be disabled, e.g., during validation
            checkpoint.check_backward_validity(args)

        ctx.run_function = run_function
        ctx.kwarg_keys = kwarg_keys
        # Saved so that backward() can replay the forward with the same RNG state.
        ctx.fwd_rng_state = utils.get_rng_state()

        tensor_inputs, packed_non_tensor_inputs = split_non_tensors(args)
        if parent_ctx_dict["offload"]:
            # Keep the saved inputs on CPU, remembering each one's device and
            # requires_grad flag so backward() can restore them.
            ctx.fwd_device = tuple(x.device for x in tensor_inputs)
            ctx.grad_requirements = tuple(x.requires_grad for x in tensor_inputs)
            tensor_inputs = tuple(
                x.to(torch.device("cpu"), non_blocking=True) for x in tensor_inputs
            )

        else:
            ctx.fwd_device, ctx.grad_requirements = None, None

        ctx.save_for_backward(*tensor_inputs)
        ctx.packed_non_tensor_inputs = packed_non_tensor_inputs

        # Run without building a graph; it is rebuilt in backward().
        with torch.no_grad():
            unpacked_args, unpacked_kwargs = unpack_kwargs(kwarg_keys, args)
            outputs = run_function(*unpacked_args, **unpacked_kwargs)

        if isinstance(outputs, torch.Tensor):
            return outputs
        else:
            # Autograd Functions don't like non-Tensor outputs. We can split the
            # non-Tensor and Tensor outputs, returning the former by reference
            # through *parent_ctx_dict* and returning the latter directly.
            outputs, packed_non_tensor_outputs = split_non_tensors(outputs)
            parent_ctx_dict["packed_non_tensor_outputs"] = packed_non_tensor_outputs
            return outputs

    @staticmethod
    def backward(ctx, *args):
        if not torch.autograd._is_checkpoint_valid():
            raise RuntimeError(
                "Checkpointing is not compatible with .grad(), please use .backward() if possible"
            )

        tensor_inputs: Tuple = ctx.saved_tensors
        tensor_inputs = checkpoint.detach_variable(tensor_inputs)
        if ctx.fwd_device is not None:
            # Offloaded inputs: move them back to their original devices and
            # restore their requires_grad flags.
            tensor_inputs = [
                t.to(ctx.fwd_device[i], non_blocking=True)
                for i, t in enumerate(tensor_inputs)
            ]
            for i, need_grad in enumerate(ctx.grad_requirements):
                tensor_inputs[i].requires_grad = need_grad
        inputs = unpack_non_tensors(tensor_inputs, ctx.packed_non_tensor_inputs)

        # Store the current states.
        bwd_rng_state = utils.get_rng_state()

        # Set the states to what it used to be before the forward pass.
        utils.set_rng_state(ctx.fwd_rng_state)

        # Recompute the forward pass, this time recording the graph.
        with torch.enable_grad():
            unpacked_args, unpacked_kwargs = unpack_kwargs(ctx.kwarg_keys, inputs)
            outputs = ctx.run_function(*unpacked_args, **unpacked_kwargs)
            tensor_outputs, _ = split_non_tensors(outputs)
        # Set the states back to what it was at the start of this function.
        utils.set_rng_state(bwd_rng_state)

        # Run backward() with only Tensors that require grad
        outputs_with_grad = []
        args_with_grad = []
        for i in range(len(tensor_outputs)):
            if tensor_outputs[i].requires_grad:
                outputs_with_grad.append(tensor_outputs[i])
                args_with_grad.append(args[i])
        if len(outputs_with_grad) == 0:
            raise RuntimeError(
                "None of the outputs have requires_grad=True, "
                "this checkpoint() is not necessary"
            )

        torch.autograd.backward(outputs_with_grad, args_with_grad)

        grads = tuple(
            inp.grad if isinstance(inp, torch.Tensor) else None for inp in inputs
        )
        # The three leading Nones are for run_function, parent_ctx_dict and
        # kwarg_keys, which take no gradient.
        return (None, None, None) + grads

================================================
FILE: fairseq/modules/conformer_layer.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
class ConvolutionModule(torch.nn.Module):
    """Convolution block used in the conformer block"""

    def __init__(
        self,
        embed_dim,
        channels,
        depthwise_kernel_size,
        dropout,
        activation_fn="swish",
        bias=False,
        export=False,
    ):
        """
        Args:
            embed_dim: Embedding dimension
            channels: Number of channels in depthwise conv layers
            depthwise_kernel_size: Depthwise conv layer kernel size (must be odd)
            dropout: dropout value
            activation_fn: Activation function to use after depthwise convolution kernel
            bias: If bias should be added to conv layers
            export: If layernorm should be exported to jit
        """
        super().__init__()
        assert (
            depthwise_kernel_size - 1
        ) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding"
        self.layer_norm = LayerNorm(embed_dim, export=export)
        # Produces 2 * channels so that the GLU can halve it back to `channels`.
        self.pointwise_conv1 = torch.nn.Conv1d(
            embed_dim,
            2 * channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.glu = torch.nn.GLU(dim=1)
        # groups=channels makes this a depthwise convolution; the symmetric
        # padding keeps the time dimension unchanged for odd kernel sizes.
        self.depthwise_conv = torch.nn.Conv1d(
            channels,
            channels,
            depthwise_kernel_size,
            stride=1,
            padding=(depthwise_kernel_size - 1) // 2,
            groups=channels,
            bias=bias,
        )
        self.batch_norm = torch.nn.BatchNorm1d(channels)
        self.activation = get_activation_fn(activation_fn)(channels)
        self.pointwise_conv2 = torch.nn.Conv1d(
            channels,
            embed_dim,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=bias,
        )
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, x):
        """
        Args:
            x: Input of shape B X T X C
        Returns:
            Tensor of shape B X T X C
        """
        x = self.layer_norm(x)
        # exchange the temporal dimension and the feature dimension: B x C x T
        x = x.transpose(1, 2)

        # GLU mechanism
        x = self.pointwise_conv1(x)  # (batch, 2*channels, time)
        x = self.glu(x)  # (batch, channels, time)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
        x = self.batch_norm(x)
        x = self.activation(x)

        x = self.pointwise_conv2(x)
        x = self.dropout(x)
        return x.transpose(1, 2)


class FeedForwardModule(torch.nn.Module):
    """Positionwise feed forward layer used in conformer"""

    def __init__(
        self,
        input_feat,
        hidden_units,
        dropout1,
        dropout2,
        activation_fn="swish",
        bias=True,
    ):
        """
        Args:
            input_feat: Input feature dimension
            hidden_units: Hidden unit dimension
            dropout1: dropout value for layer1
            dropout2: dropout value for layer2
            activation_fn: Name of activation function
            bias: If linear layers should have bias
        """
        super().__init__()
        self.layer_norm = LayerNorm(input_feat)
        self.w_1 = torch.nn.Linear(input_feat, hidden_units, bias=bias)
        self.w_2 = torch.nn.Linear(hidden_units, input_feat, bias=bias)
        self.dropout1 = torch.nn.Dropout(dropout1)
        self.dropout2 = torch.nn.Dropout(dropout2)
        self.activation = get_activation_fn(activation_fn)(hidden_units)

    def forward(self, x):
        """
        Args:
            x: Input Tensor of shape T X B X C
        Returns:
            Tensor of shape T X B X C
        """
        x = self.layer_norm(x)
        x = self.w_1(x)
        x = self.activation(x)
        x = self.dropout1(x)
        x = self.w_2(x)
        return self.dropout2(x)


class ConformerEncoderLayer(torch.nn.Module):
    """Conformer block based on https://arxiv.org/abs/2005.08100.

    The block is FFN -> self-attention -> convolution -> FFN, each wrapped in a
    residual connection (the two FFNs are added with a 0.5 weight), followed by
    a final layer norm. Relative ("rel_pos") and rotary ("rope") positional
    encodings are only available with the ESPNET attention implementations
    (``attn_type="espnet"``); any other ``attn_type`` uses fairseq's
    ``MultiheadAttention``.
    """

    def __init__(
        self,
        embed_dim,
        ffn_embed_dim,
        attention_heads,
        dropout,
        use_fp16,
        depthwise_conv_kernel_size=31,
        activation_fn="swish",
        attn_type=None,
        pos_enc_type="abs",
    ):
        """
        Args:
            embed_dim: Input embedding dimension
            ffn_embed_dim: FFN layer dimension
            attention_heads: Number of attention heads in MHA
            dropout: dropout value
            use_fp16: forwarded as ``precision`` to the rotary attention module
                (only used when attn_type="espnet" and pos_enc_type="rope")
            depthwise_conv_kernel_size: Size of kernel in depthwise conv layer in convolution module
            activation_fn: Activation function name to use in convolution block and feed forward block
            attn_type: MHA implementation from ESPNET vs fairseq
            pos_enc_type: Positional encoding type - abs, rope, rel_pos

        Raises:
            Exception: if attn_type is "espnet" and pos_enc_type is not one of
                "abs", "rope" or "rel_pos".
        """
        # Initialise the Module machinery before assigning any attribute.
        super().__init__()
        self.pos_enc_type = pos_enc_type

        # NOTE(review): ffn1 relies on FeedForwardModule's default activation
        # while ffn2 receives `activation_fn`; kept as-is to preserve behaviour.
        self.ffn1 = FeedForwardModule(
            embed_dim,
            ffn_embed_dim,
            dropout,
            dropout,
        )

        self.self_attn_layer_norm = LayerNorm(embed_dim, export=False)
        self.self_attn_dropout = torch.nn.Dropout(dropout)
        if attn_type == "espnet":
            if self.pos_enc_type == "rel_pos":
                self.self_attn = RelPositionMultiHeadedAttention(
                    embed_dim,
                    attention_heads,
                    dropout=dropout,
                )
            elif self.pos_enc_type == "rope":
                self.self_attn = RotaryPositionMultiHeadedAttention(
                    embed_dim, attention_heads, dropout=dropout, precision=use_fp16
                )
            elif self.pos_enc_type == "abs":
                self.self_attn = ESPNETMultiHeadedAttention(
                    embed_dim,
                    attention_heads,
                    dropout=dropout,
                )
            else:
                raise Exception(
                    f"Unsupported positional encoding type {self.pos_enc_type}"
                )
        else:
            # Default to fairseq MHA
            self.self_attn = MultiheadAttention(
                embed_dim,
                attention_heads,
                dropout=dropout,
            )

        self.conv_module = ConvolutionModule(
            embed_dim=embed_dim,
            channels=embed_dim,
            depthwise_kernel_size=depthwise_conv_kernel_size,
            dropout=dropout,
            activation_fn=activation_fn,
        )

        self.ffn2 = FeedForwardModule(
            embed_dim,
            ffn_embed_dim,
            dropout,
            dropout,
            activation_fn=activation_fn,
        )
        self.final_layer_norm = LayerNorm(embed_dim, export=False)

    def forward(
        self,
        x,
        encoder_padding_mask: Optional[torch.Tensor],
        position_emb: Optional[torch.Tensor] = None,
    ):
        """
        Args:
            x: Tensor of shape T X B X C
            encoder_padding_mask: Optional mask tensor, passed to the attention
                module as ``key_padding_mask``
            position_emb: Optional positional embedding, passed as ``pos_emb``
                to the relative-position attention module only
        Returns:
            Tuple ``(x, (attn, layer_result))`` where ``x`` is the layer output
            of shape T X B X C, ``attn`` is the second value returned by the
            attention module and ``layer_result`` is the output of the second
            feed-forward module before its residual connection.
        """
        residual = x
        x = self.ffn1(x)
        x = x * 0.5 + residual
        residual = x
        x = self.self_attn_layer_norm(x)
        # Only the relative-position attention accepts `pos_emb`. Checking the
        # module type as well avoids passing an unexpected keyword to the
        # fairseq MHA fallback when pos_enc_type="rel_pos" is combined with a
        # non-"espnet" attn_type.
        if self.pos_enc_type == "rel_pos" and isinstance(
            self.self_attn, RelPositionMultiHeadedAttention
        ):
            x, attn = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=encoder_padding_mask,
                pos_emb=position_emb,
                need_weights=False,
            )
        else:
            x, attn = self.self_attn(
                query=x,
                key=x,
                value=x,
                key_padding_mask=encoder_padding_mask,
                need_weights=False,
            )
        x = self.self_attn_dropout(x)
        x = x + residual

        residual = x
        # TBC to BTC
        x = x.transpose(0, 1)
        x = self.conv_module(x)
        # BTC to TBC
        x = x.transpose(0, 1)
        x = residual + x

        residual = x
        x = self.ffn2(x)

        layer_result = x

        x = x * 0.5 + residual

        x = self.final_layer_norm(x)
        return x, (attn, layer_result)


class ConformerWav2Vec2EncoderLayer(ConformerEncoderLayer):
    """Encoder layer for Wav2vec2 encoder"""

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None,
        need_weights: bool = False,
        att_args=None,
        position_emb=None,
    ):
        """Adapt the Wav2vec2 layer call signature to ``ConformerEncoderLayer``.

        Only ``x``, ``self_attn_padding_mask`` and ``position_emb`` are
        forwarded; ``self_attn_mask``, ``need_weights`` and ``att_args`` are
        accepted but ignored.
        """
        return super().forward(x, self_attn_padding_mask, position_emb)
class ConvTBC(torch.nn.Module):
    """1D convolution over an input of shape (time x batch x channel)

    The implementation uses gemm to perform the convolution. This
    implementation is faster than cuDNN for small kernel sizes.
    """

    def __init__(self, in_channels, out_channels, kernel_size, padding=0):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        # Both are normalised to 1-tuples; only element 0 is ever used.
        self.kernel_size = _single(kernel_size)
        self.padding = _single(padding)

        kernel_width = self.kernel_size[0]
        # Weight layout is (kernel width, in channels, out channels).
        self.weight = torch.nn.Parameter(
            torch.Tensor(kernel_width, in_channels, out_channels)
        )
        self.bias = torch.nn.Parameter(torch.Tensor(out_channels))

        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-normal initialise the weight and zero the bias."""
        nn.init.zeros_(self.bias)
        nn.init.xavier_normal_(self.weight)

    def conv_tbc(self, input: Tensor):
        """Apply ``torch.conv_tbc`` to a contiguous copy/view of *input*."""
        pad = self.padding[0]
        return torch.conv_tbc(input.contiguous(), self.weight, self.bias, pad)

    def forward(self, input: Tensor):
        return self.conv_tbc(input)

    def __repr__(self):
        pieces = [
            "{name}({in_channels}, {out_channels}, kernel_size={kernel_size}",
            ", padding={padding}",
        ]
        if self.bias is None:
            pieces.append(", bias=False")
        pieces.append(")")
        template = "".join(pieces)
        return template.format(
            name=self.__class__.__name__,
            in_channels=self.in_channels,
            out_channels=self.out_channels,
            kernel_size=self.kernel_size,
            padding=self.padding,
        )
def _cross_entropy_pytorch(logits, target, ignore_index=None, reduction="mean"):
    """Cross entropy computed with plain PyTorch ops.

    The log-softmax is always taken in float32, regardless of the dtype of
    *logits*.

    Args:
        logits: unnormalised scores; the class dimension is the last one.
        target: class indices.
        ignore_index: target value that does not contribute to the loss.
            ``None`` means "use PyTorch's default" (-100).
        reduction: ``"mean"``, ``"sum"`` or ``"none"``, as in ``F.nll_loss``.

    Returns:
        The loss tensor produced by ``F.nll_loss``.
    """
    if ignore_index is None:
        # F.nll_loss requires an int; forwarding None raises. -100 is the
        # default used by F.nll_loss itself.
        ignore_index = -100
    lprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
    return F.nll_loss(
        lprobs,
        target,
        ignore_index=ignore_index,
        reduction=reduction,
    )


try:
    # Both imports must succeed for the fused kernel to be usable; otherwise
    # the pure PyTorch fallback in the except branch is installed.
    import xentropy_cuda
    from apex.contrib import xentropy

    def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
        """Cross entropy using apex's fused kernel for non-CPU tensors.

        CPU inputs are routed to the plain PyTorch implementation.

        Args:
            logits: unnormalised scores; the class dimension is the last one.
            target: class indices.
            ignore_index: target value to ignore (``None`` is treated as -100).
            reduction: ``"mean"``, ``"sum"`` or ``"none"``.

        Raises:
            NotImplementedError: for any other *reduction* on the fused path.
        """
        if ignore_index is None:
            # Keep the fused path consistent with the PyTorch path and avoid
            # comparing None with an int below.
            ignore_index = -100
        if logits.device == torch.device("cpu"):
            return _cross_entropy_pytorch(logits, target, ignore_index, reduction)
        else:
            if not getattr(cross_entropy, "_has_logged_once", False):
                logger.info("using fused cross entropy")
                cross_entropy._has_logged_once = True

            half_to_float = logits.dtype == torch.half
            losses = xentropy.SoftmaxCrossEntropyLoss.apply(
                logits,
                target,
                0.0,
                ignore_index,
                half_to_float,
            )
            if reduction == "sum":
                return losses.sum()
            elif reduction == "mean":
                if ignore_index >= 0:
                    # Average over the non-ignored targets only.
                    return losses.sum() / target.ne(ignore_index).sum()
                else:
                    return losses.mean()
            elif reduction == "none":
                return losses
            else:
                raise NotImplementedError

except ImportError:

    def cross_entropy(logits, target, ignore_index=-100, reduction="mean"):
        """Cross entropy via plain PyTorch (fused apex kernel unavailable).

        Args:
            logits: unnormalised scores; the class dimension is the last one.
            target: class indices.
            ignore_index: target value to ignore (``None`` is treated as -100).
            reduction: ``"mean"``, ``"sum"`` or ``"none"``.
        """
        return _cross_entropy_pytorch(logits, target, ignore_index, reduction)
// Integer ceiling division: returns (a + b - 1) / b.
// Usable on both host and device, and in constant expressions.
template <typename U, typename V>
constexpr __host__ __device__ auto divUp(U a, V b) -> decltype(a + b) {
  return (a + b - 1) / b;
}

// Zero the overhang regions of a buffer, one element per thread (indexed by
// threadIdx.x).
//
// Template parameters:
//   FS        - size of the overhang (left + right) region
//   SB        - size of the main region
//   padding_l - number of leading elements that form the left overhang
template <int FS, int SB, int padding_l, typename scalar_t>
__inline__ __device__ void zeroSharedMem(scalar_t* data) {
  /*
    Given an array of length FS + SB, zero out the first padding_l and last
    (FS - padding_l) values in the array
  */

  int tid = threadIdx.x;

  if (FS < SB) {
    // zero all if we have enough threads in a block to do all of them
    // NOTE(review): the second condition writes data[tid] for tid in
    // (SB - FS + padding_l - 1, ...), i.e. indices below SB rather than the
    // tail of the FS + SB array that the `else` branch targets - confirm
    // this is intended.
    if (tid < padding_l || tid > SB - FS + padding_l - 1) {
      data[tid] = scalar_t(0.0);
    }
  } else {
    // otherwise zero out one block at a time
    const int numIterations = divUp<int, int>(FS, SB);
    for (int i = 0; i < numIterations; i++) {
      int offset = i * SB;
      if (tid + offset < padding_l) {
        // left overhang: indices [0, padding_l)
        data[tid + offset] = scalar_t(0.0);
      } else if (tid + offset < FS) {
        // right overhang: indices [SB + padding_l, SB + FS)
        data[SB + tid + offset] = scalar_t(0.0);
      }
    }
  }
}

// Sum `data` across the lanes of a warp with XOR shuffles at offsets
// 16, 8, 4, 2 and 1.
template <typename scalar_t>
__inline__ __device__ scalar_t warpReduce(scalar_t data) {
  /*
    Reduce a value within each warp. After processing, all lanes in the warp
    will contain the sum of all original values in that warp.

    data - this lane's value to reduce

    Relies on SHFL_MASK, which is defined outside this file - presumably the
    full-warp mask; confirm at the include site.
  */
  data += __shfl_xor_sync(SHFL_MASK, data, 16);
  data += __shfl_xor_sync(SHFL_MASK, data, 8);
  data += __shfl_xor_sync(SHFL_MASK, data, 4);
  data += __shfl_xor_sync(SHFL_MASK, data, 2);
  data += __shfl_xor_sync(SHFL_MASK, data, 1);
  return data;
}

// Sum `data` across the whole thread block via per-warp partial sums staged
// in shared memory.
template <typename scalar_t>
__inline__ __device__ scalar_t blockReduce(scalar_t data) {
  /*
    Reduce a value on the block level. After processing, the threads of
    warp 0 hold the reduced sum; other threads return their own partial
    value `v` without the final reduction.

    data - this thread's value to reduce
  */

  // One slot per warp (at most 32 warps).
  static __shared__ scalar_t warpSum[32];
  const int tid = threadIdx.x;
  int wid = tid / 32;
  int lane = tid % 32;

  __syncthreads();

  // reduce each warp then write to shared memory
  scalar_t sum = warpReduce(data);
  if (lane == 0) {
    warpSum[wid] = sum;
  }

  __syncthreads();

  scalar_t v;
  // perform final sum of partial warp sums
  // NOTE(review): blockDim.x / 32 truncates, so this appears to assume
  // blockDim.x is a multiple of 32 - confirm against the launch configs.
  if (tid < blockDim.x / 32) {
    v = warpSum[lane];
  } else {
    v = scalar_t(0.0);
  }

  if (wid == 0) {
    v = warpReduce(v);
  }
  __syncthreads();

  return v;
}

// Print the CUDA error string and the given line number to std::cout, then
// terminate the process with exit code 1, if `status` is not cudaSuccess.
void checkCudaStatus(cudaError_t status, int lineNumber = -1) {
  if (status != cudaSuccess) {
    std::cout << cudaGetErrorString(status) << " at line " << lineNumber
              << std::endl;
    std::cout << "Exiting" << std::endl;
    exit(1);
  }
}

// Load one tile of `input` plus its left/right overhang into `output`.
//
// Layout written into `output`:
//   [0, padding_l)                     left overhang
//   [padding_l, padding_l + SB)        the tile itself
//   [padding_l + SB, FS + SB)          right overhang
template <int FS, int SB, int padding_l, typename scalar_t>
__device__ void load_input_to_shared(
    const scalar_t* input, // global memory
    int inputOffset,
    int sequenceLength,
    int iteration,
    int numIterations,
    bool no_prev,
    scalar_t* output /* shared memory */) {
  /*
    Load a block size of input into shared memory with
    right and left overhang of total size FS. If previously
    loaded memory, overlap will be shifted over to reduce
    global memory access

    input - pointer to start of channel sequence
    inputOffset - how far in the sequence to start loading
    sequenceLength - total length of sequence
    iteration - which block of sequence we are loading
    numIterations - total number of blocks to load
    no_prev - whether to load the whole block if the previous block
              wasn't loaded
    output - shared memory to write input to
  */

  const int tid = threadIdx.x;

  // Load the left "overhang" of input
  // (skipped for the first tile; nothing is written there in that case)
  if (iteration > 0) {
    if (padding_l < SB) {
      // load all at once
      if (tid < padding_l) {
        // Either re-read from global memory, or reuse the tail of the
        // previous tile that is still sitting SB elements further along.
        output[tid] = (no_prev) ? input[inputOffset - padding_l + tid]
                                : output[tid + SB];
      }
    } else {
      // load in chunks of size SB
      // (this local intentionally shadows the numIterations parameter)
      int numIterations = divUp<int, int>(padding_l, SB);
      for (int i = 0; i < numIterations; i++) {
        int offset = i * SB;
        if ((tid + offset) < padding_l) {
          output[tid + offset] = (no_prev)
              ? input[inputOffset - padding_l + tid + offset]
              : output[tid + offset + SB];
        }
      }
    }
  }

  // Load the right "overhang" of input
  if (iteration < (numIterations - 1)) {
    // Number of sequence elements remaining after this tile; positions past
    // it are filled with zeros.
    const int elementsLeft = sequenceLength - (iteration + 1) * SB;

    if ((FS - padding_l) < SB) {
      // load all at once
      if (tid < (FS - padding_l)) {
        output[padding_l + SB + tid] = (tid < elementsLeft)
            ? input[inputOffset + SB + tid]
            : scalar_t(0.0);
      }
    } else {
      // load in chunks of size SB
      int numIterations = divUp<int, int>(FS - padding_l, SB);
      for (int i = 0; i < numIterations; i++) {
        int offset = i * SB;
        if ((tid + offset) < (FS - padding_l)) {
          output[padding_l + SB + tid + offset] =
              ((tid + offset) < elementsLeft)
              ? input[inputOffset + SB + tid + offset]
              : scalar_t(0.0);
        }
      }
    }
  }

  // We should also clear out the right "overhang"
  // (on the last tile there is no input to the right)
  if (iteration == (numIterations - 1)) {
    if ((FS - padding_l) < SB) {
      // clear out all at once
      if (tid < (FS - padding_l)) {
        output[padding_l + SB + tid] = scalar_t(0.0);
      }
    } else {
      // clear in chunks of size SB
      int numIterations = divUp<int, int>(FS - padding_l, SB);
      for (int i = 0; i < numIterations; i++) {
        int offset = i * SB;
        if ((tid + offset) < (FS - padding_l)) {
          output[padding_l + SB + tid + offset] = scalar_t(0.0);
        }
      }
    }
  }
  // Load the tile itself, zero-filling positions beyond the sequence end.
  // NOTE(review): every thread writes one element here, which presumably
  // assumes blockDim.x == SB - confirm against the kernel launches.
  output[tid + padding_l] = ((inputOffset + tid) < sequenceLength)
      ? input[inputOffset + tid]
      : scalar_t(0.0);
}
class SingleHeadAttention(nn.Module):
    """
    Single-head attention that supports Gating and Downsampling
    """

    def __init__(
        self,
        out_channels,
        embed_dim,
        head_dim,
        head_index,
        dropout=0.0,
        bias=True,
        project_input=True,
        gated=False,
        downsample=False,
        num_heads=1,
    ):
        """
        Args:
            out_channels: output size of the final projection when not
                downsampling
            embed_dim: input embedding dimension
            head_dim: dimension of a single head
            head_index: index of this head; when downsampling, keys and values
                keep every ``head_index + 1``-th time step
            dropout: dropout applied to the attention weights
            bias: whether the projections carry a bias
            project_input: whether to project query/key/value before attending
            gated: use ``GatedLinear`` instead of ``Linear`` input projections
            downsample: downsample keys/values and act as one head of size
                ``head_dim``; otherwise all ``num_heads`` heads are computed
                by this one module
            num_heads: total number of heads
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.head_index = head_index
        self.head_dim = head_dim
        self.project_input = project_input
        self.gated = gated
        self.downsample = downsample
        self.num_heads = num_heads
        self.projection = None

        k_layers = []
        v_layers = []
        if self.downsample:
            k_layers.append(Downsample(self.head_index))
            v_layers.append(Downsample(self.head_index))
            out_proj_size = self.head_dim
        else:
            out_proj_size = self.head_dim * self.num_heads
        # The k / q / v construction order is kept so that parameter
        # initialisation consumes random numbers in the same order as before.
        if self.gated:
            k_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
            self.in_proj_q = GatedLinear(self.embed_dim, out_proj_size, bias=bias)
            v_layers.append(GatedLinear(self.embed_dim, out_proj_size, bias=bias))
        else:
            k_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))
            self.in_proj_q = Linear(self.embed_dim, out_proj_size, bias=bias)
            v_layers.append(Linear(self.embed_dim, out_proj_size, bias=bias))

        self.in_proj_k = nn.Sequential(*k_layers)
        self.in_proj_v = nn.Sequential(*v_layers)

        if self.downsample:
            self.out_proj = Linear(out_proj_size, self.head_dim, bias=bias)
        else:
            self.out_proj = Linear(out_proj_size, out_channels, bias=bias)

        self.scaling = self.head_dim**-0.5

    def forward(
        self,
        query,
        key,
        value,
        mask_future_timesteps=False,
        key_padding_mask=None,
        use_scalar_bias=False,
    ):
        """Input shape: Time x Batch x Channel
        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.

        Returns:
            Tuple ``(attn, attn_weights)``: the projected attention output and
            the post-softmax, post-dropout attention weights.
        """
        src_len, bsz, out_channels = key.size()
        tgt_len = query.size(0)
        assert list(query.size()) == [tgt_len, bsz, out_channels]
        assert key.size() == value.size()

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        # `size` is the batch dimension used for the bmm calls below.
        if self.downsample:
            size = bsz
        else:
            size = bsz * self.num_heads

        k = key
        v = value
        q = query
        if self.project_input:
            q = self.in_proj_q(q)
            k = self.in_proj_k(k)
            v = self.in_proj_v(v)
            # Downsampling may have shortened the key sequence.
            src_len = k.size()[0]
        # NOTE(review): in-place scaling; with project_input=False this
        # mutates the caller's `query` tensor. Left unchanged to preserve
        # existing numerical behaviour.
        q *= self.scaling

        if not self.downsample:
            q = q.view(tgt_len, size, self.head_dim)
            k = k.view(src_len, size, self.head_dim)
            v = v.view(src_len, size, self.head_dim)

        q = q.transpose(0, 1)
        k = k.transpose(0, 1)
        v = v.transpose(0, 1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        if mask_future_timesteps:
            assert (
                query.size() == key.size()
            ), "mask_future_timesteps only applies to self-attention"
            # Keep only strictly-past positions (diagonal=-1) ...
            attn_weights *= torch.tril(
                attn_weights.data.new([1]).expand(tgt_len, tgt_len).clone(),
                diagonal=-1,
            )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0)
            # ... and push the current and future positions to -inf.
            attn_weights += torch.triu(
                attn_weights.data.new([-math.inf]).expand(tgt_len, tgt_len).clone(),
                diagonal=0,
            )[:, :: self.head_index + 1 if self.downsample else 1].unsqueeze(0)
        if use_scalar_bias:
            attn_weights = scalar_bias(attn_weights, 2)
            v = scalar_bias(v, 1)

        if key_padding_mask is not None:
            # don't attend to padding symbols
            if key_padding_mask.max() > 0:
                if self.downsample:
                    attn_weights = attn_weights.view(bsz, 1, tgt_len, src_len)
                else:
                    # attn_weights holds bsz * num_heads matrices; split that
                    # leading dimension into (bsz, num_heads) so the
                    # (bsz, 1, 1, src_len) mask broadcasts over heads. Using
                    # `size` here would request num_heads times too many
                    # elements whenever num_heads > 1.
                    attn_weights = attn_weights.view(
                        bsz, self.num_heads, tgt_len, src_len
                    )
                attn_weights = attn_weights.masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    -math.inf,
                )
                attn_weights = attn_weights.view(size, tgt_len, src_len)
        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_weights = self.dropout_module(attn_weights)

        attn = torch.bmm(attn_weights, v)
        if self.downsample:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.head_dim)
        else:
            attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim)

        attn = self.out_proj(attn)

        return attn, attn_weights


class DownsampledMultiHeadAttention(nn.ModuleList):
    """
    Multi-headed attention with Gating and Downsampling
    """

    def __init__(
        self,
        out_channels,
        embed_dim,
        num_heads,
        dropout=0.0,
        bias=True,
        project_input=True,
        gated=False,
        downsample=False,
    ):
        """
        Args:
            out_channels: output dimension of the attention block
            embed_dim: input embedding dimension; must be divisible by
                ``num_heads``
            num_heads: number of attention heads
            dropout: dropout applied to the attention weights
            bias: whether the projections carry a bias
            project_input: whether each head projects query/key/value
            gated: use gated input projections
            downsample: build one ``SingleHeadAttention`` per head, each with
                its own downsampling rate, instead of one shared module
        """
        # Plain (non-Module, non-Parameter) attributes; they are needed to
        # build the heads that are handed to ModuleList.__init__ below.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.downsample = downsample
        self.gated = gated
        self.project_input = project_input
        assert self.head_dim * num_heads == embed_dim

        if self.downsample:
            attention_heads = []
            for index in range(self.num_heads):
                attention_heads.append(
                    SingleHeadAttention(
                        out_channels,
                        self.embed_dim,
                        self.head_dim,
                        index,
                        dropout,
                        bias,
                        self.project_input,
                        self.gated,
                        self.downsample,
                        self.num_heads,
                    )
                )
            super().__init__(modules=attention_heads)
            self.out_proj = Linear(embed_dim, out_channels, bias=bias)
        else:
            # either we have a list of attention heads, or just one attention head
            # if not being downsampled, we can do the heads with one linear layer instead of separate ones
            super().__init__()
            self.attention_module = SingleHeadAttention(
                out_channels,
                self.embed_dim,
                self.head_dim,
                1,
                dropout,
                bias,
                self.project_input,
                self.gated,
                self.downsample,
                self.num_heads,
            )

    def forward(
        self,
        query,
        key,
        value,
        mask_future_timesteps=False,
        key_padding_mask=None,
        use_scalar_bias=False,
    ):
        """Attend over *key*/*value* with every head and merge the results.

        Inputs are Time x Batch x Channel, as for ``SingleHeadAttention``.

        Returns:
            Tuple ``(attn, attn_weights)``. When downsampling, the weights are
            a copy of the first head's weights; otherwise they are averaged
            over heads.
        """
        src_len, bsz, embed_dim = key.size()
        tgt_len = query.size(0)
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert key.size() == value.size()

        tgt_size = tgt_len
        if use_scalar_bias:
            tgt_size += 1

        attn = []
        attn_weights = []
        if self.downsample:
            for attention_head_number in range(self.num_heads):
                # call the forward of each attention head
                _attn, _attn_weight = self[attention_head_number](
                    query,
                    key,
                    value,
                    mask_future_timesteps,
                    key_padding_mask,
                    use_scalar_bias,
                )
                attn.append(_attn)
                attn_weights.append(_attn_weight)
            full_attn = torch.cat(attn, dim=2)
            full_attn = self.out_proj(full_attn)
            return full_attn, attn_weights[0].clone()
        else:
            _attn, _attn_weight = self.attention_module(
                query,
                key,
                value,
                mask_future_timesteps,
                key_padding_mask,
                use_scalar_bias,
            )
            attn.append(_attn)
            attn_weights.append(_attn_weight)
            full_attn = torch.cat(attn, dim=2)
            full_attn_weights = torch.cat(attn_weights)
            full_attn_weights = full_attn_weights.view(
                bsz, self.num_heads, tgt_size, src_len
            )
            full_attn_weights = full_attn_weights.sum(dim=1) / self.num_heads
            return full_attn, full_attn_weights


class Downsample(nn.Module):
    """
    Selects every nth element along the first dimension, where n is the
    index + 1
    """

    def __init__(self, index):
        super().__init__()
        self.index = index

    def forward(self, x):
        return x[:: self.index + 1]


def Linear(in_features, out_features, dropout=0.0, bias=True):
    """Weight-normalized Linear layer (input: B x T x C)

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        dropout: dropout rate used only to scale the weight initialisation
            (std = sqrt((1 - dropout) / in_features))
        bias: whether the layer has a bias; when present it is zero-initialised

    Returns:
        The ``nn.Linear`` module wrapped with ``nn.utils.weight_norm``.
    """
    m = nn.Linear(in_features, out_features, bias=bias)
    m.weight.data.normal_(mean=0, std=math.sqrt((1 - dropout) / in_features))
    if m.bias is not None:
        # With bias=False nn.Linear sets m.bias to None, so there is nothing
        # to zero.
        m.bias.data.zero_()
    return nn.utils.weight_norm(m)


def GatedLinear(in_features, out_features, dropout=0.0, bias=True):
    """Weight-normalized Linear layer (input: B x T x C) with interspersed GLU units

    Each GLU halves the feature dimension, hence the 4x and 2x widths of the
    first two linear layers.
    """
    return nn.Sequential(
        Linear(in_features, out_features * 4, dropout, bias),
        nn.GLU(),
        Linear(out_features * 2, out_features * 2, dropout, bias),
        nn.GLU(),
        Linear(out_features, out_features, dropout, bias),
    )
def DynamicConv(
    input_size,
    kernel_size=1,
    padding_l=None,
    num_heads=1,
    weight_dropout=0.0,
    weight_softmax=False,
    renorm_padding=False,
    bias=False,
    conv_bias=False,
    query_size=None,
    in_proj=False,
):
    """Build a dynamic convolution layer, preferring the CUDA implementation.

    When CUDA is available the function tries to import and return
    ``DynamicconvLayer``; if that import fails the error is printed and the
    pure PyTorch ``DynamicConv1dTBC`` is returned instead (as it is when CUDA
    is not available).

    NOTE(review): ``in_proj`` is accepted but not forwarded to either
    implementation, so it currently has no effect - confirm whether this is
    intended.
    """
    if torch.cuda.is_available():
        try:
            from fairseq.modules.dynamicconv_layer import DynamicconvLayer

            return DynamicconvLayer(
                input_size,
                kernel_size=kernel_size,
                padding_l=padding_l,
                num_heads=num_heads,
                weight_dropout=weight_dropout,
                weight_softmax=weight_softmax,
                renorm_padding=renorm_padding,
                bias=bias,
                conv_bias=conv_bias,
                query_size=query_size,
            )
        except ImportError as e:
            # Best effort: report the problem and fall through to the
            # PyTorch implementation below.
            print(e)
    return DynamicConv1dTBC(
        input_size,
        kernel_size=kernel_size,
        padding_l=padding_l,
        num_heads=num_heads,
        weight_dropout=weight_dropout,
        weight_softmax=weight_softmax,
        renorm_padding=renorm_padding,
        bias=bias,
        conv_bias=conv_bias,
        query_size=query_size,
    )


def Linear(in_features, out_features, bias=True):
    """Return an ``nn.Linear`` with Xavier-uniform weights and a zero bias."""
    m = nn.Linear(in_features, out_features, bias)
    nn.init.xavier_uniform_(m.weight)
    if bias:
        nn.init.constant_(m.bias, 0.0)
    return m


@with_incremental_state
class DynamicConv1dTBC(nn.Module):
    """Dynamic lightweight convolution taking T x B x C inputs

    The convolution filters are not fixed parameters: they are predicted per
    time step by ``weight_linear`` from the query (or from the input itself).

    Args:
        input_size: # of channels of the input
        kernel_size: width of the convolution kernel
        padding_l: padding to the left when using "same" padding
        num_heads: number of heads used. One filter of width kernel_size is
            predicted per head and shared by the channels of that head
        weight_dropout: the drop rate of the DropConnect to drop the weight
        weight_softmax: normalize the weight with softmax before the convolution
        renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1)
        bias: use bias in the filter-predicting linear layer (ignored when
            in_proj is set)
        conv_bias: add a learnable bias to the output of the convolution
        query_size: specified when feeding a different input as the query
        in_proj: project the input and generate the filter together

    Shape:
        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
        Output: TxBxC, i.e. (timesteps, batch_size, input_size)

    Attributes:
        weight_linear: the linear layer predicting the filters (and, with
            in_proj, the projected input)
        conv_bias: the learnable output bias of shape `(input_size)`, or None
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        num_heads=1,
        weight_dropout=0.0,
        weight_softmax=False,
        renorm_padding=False,
        bias=False,
        conv_bias=False,
        query_size=None,
        in_proj=False,
    ):
        super().__init__()
        self.input_size = input_size
        self.query_size = input_size if query_size is None else query_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )
        self.weight_softmax = weight_softmax
        self.renorm_padding = renorm_padding

        if in_proj:
            # One matmul yields both the projected input (input_size values)
            # and the filters (num_heads * kernel_size values).
            self.weight_linear = Linear(
                self.input_size, self.input_size + num_heads * kernel_size * 1
            )
        else:
            self.weight_linear = Linear(
                self.query_size, num_heads * kernel_size * 1, bias=bias
            )
        if conv_bias:
            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.conv_bias = None
        self.reset_parameters()

    @property
    def in_proj(self):
        """Whether ``weight_linear`` also projects the input.

        Derived from the output width of ``weight_linear`` rather than stored
        as a separate flag.
        """
        return (
            self.weight_linear.out_features
            == self.input_size + self.num_heads * self.kernel_size
        )

    def reset_parameters(self):
        """Re-initialise ``weight_linear`` and zero ``conv_bias`` if present."""
        self.weight_linear.reset_parameters()
        if self.conv_bias is not None:
            nn.init.constant_(self.conv_bias, 0.0)

    def forward(self, x, incremental_state=None, query=None, unfold=None):
        """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
        args:
            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
            incremental_state: A dict to keep the state
            unfold: unfold the input or not. If not, we use the matrix trick instead
            query: use the specified query to predict the conv filters
        """
        unfold = (
            x.size(0) > 512 if unfold is None else unfold
        )  # use unfold mode as default for long sequence to save memory
        # Incremental decoding always uses the unfolded implementation.
        unfold = unfold or (incremental_state is not None)
        # A separate query cannot be combined with in_proj, which derives the
        # filters from x itself.
        assert query is None or not self.in_proj

        if query is None:
            query = x
        if unfold:
            output = self._forward_unfolded(x, incremental_state, query)
        else:
            output = self._forward_expanded(x, incremental_state, query)

        if self.conv_bias is not None:
            output = output + self.conv_bias.view(1, 1, -1)
        return output

    def _forward_unfolded(self, x, incremental_state, query):
        """The conventional implementation of convolutions.
        Unfolding the input by having a window shifting to the right."""
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H  # channels per head
        assert R * H == C == self.input_size

        if self.in_proj:
            # Split the joint projection into the new input and the filters.
            proj = self.weight_linear(x)
            x = proj.narrow(2, 0, self.input_size).contiguous()
            weight = (
                proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1)
            )
        else:
            weight = self.weight_linear(query).view(T * B * H, -1)

        # renorm_padding is only implemented in _forward_expanded
        assert not self.renorm_padding or incremental_state is not None

        if incremental_state is not None:
            # Append the new step to the buffered history and keep at most
            # kernel_size - 1 past steps for the next call.
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                input_buffer = x.new()
            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
            if self.kernel_size > 1:
                self._set_input_buffer(
                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
                )
            x_unfold = x_unfold.view(T * B * H, R, -1)
        else:
            padding_l = self.padding_l
            # When the kernel is longer than the sequence, drop the filter
            # taps that could only ever see left padding.
            if K > T and padding_l == K - 1:
                weight = weight.narrow(1, K - T, T)
                K, padding_l = T, T - 1
            # unfold the input: T x B x C --> T' x B x C x K
            x_unfold = unfold1d(x, K, padding_l, 0)
            x_unfold = x_unfold.view(T * B * H, R, K)

        if self.weight_softmax and not self.renorm_padding:
            weight = F.softmax(weight, dim=1)
        weight = weight.narrow(1, 0, K)

        if incremental_state is not None:
            # Early decoding steps have fewer than K buffered inputs; keep
            # only the matching last filter taps.
            weight = weight[:, -x_unfold.size(2) :]
            K = weight.size(1)

        if self.weight_softmax and self.renorm_padding:
            weight = F.softmax(weight, dim=1)

        weight = self.weight_dropout_module(weight, inplace=False)

        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output

    def _forward_expanded(self, x, incremental_stat, query):
        """Turn the convolution filters into band matrices and do matrix multiplication.
        This is faster when the sequence is short, but less memory efficient.
        This is not used in the decoder during inference.

        ``incremental_stat`` is accepted for signature parity with
        ``_forward_unfolded`` but is not used here.
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H  # channels per head
        assert R * H == C == self.input_size
        if self.in_proj:
            proj = self.weight_linear(x)
            x = proj.narrow(2, 0, self.input_size).contiguous()
            weight = (
                proj.narrow(2, self.input_size, H * K).contiguous().view(T * B * H, -1)
            )
        else:
            weight = self.weight_linear(query).view(T * B * H, -1)

        if not self.renorm_padding:
            if self.weight_softmax:
                weight = F.softmax(weight, dim=1)
            weight = self.weight_dropout_module(weight, inplace=False)
        weight = weight.narrow(1, 0, K).contiguous()
        weight = weight.view(T, B * H, K).transpose(0, 1)

        x = x.view(T, B * H, R).transpose(0, 1)
        if self.weight_softmax and self.renorm_padding:
            # turn the convolution filters into band matrices
            # (-inf fill so that positions outside the band get zero weight
            # after the softmax below)
            weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf"))
            # The strided view steps T + K per row, i.e. one full row plus one
            # column, so row t's K taps land at columns t .. t + K - 1.
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
            # normalize the weight over valid positions like self-attention
            weight_expanded = F.softmax(weight_expanded, dim=2)
            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
        else:
            P = self.padding_l
            # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length
            if K > T and P == K - 1:
                weight = weight.narrow(2, K - T, T)
                K, P = T, T - 1
            # turn the convolution filters into band matrices
            weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
        output = torch.bmm(weight_expanded, x)
        output = output.transpose(0, 1).contiguous().view(T, B, C)
        return output

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the buffered inputs along dimension 1 according to *new_order*."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        """Fetch this module's ``"input_buffer"`` entry from *incremental_state*."""
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        """Store *new_buffer* as this module's ``"input_buffer"`` entry."""
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def extra_repr(self):
        """Summarise the configuration; optional fields appear only when non-default."""
        s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format(
            self.input_size,
            self.kernel_size,
            self.padding_l,
            self.num_heads,
            self.weight_softmax,
            self.conv_bias is not None,
            self.renorm_padding,
            self.in_proj,
        )

        if self.query_size != self.input_size:
            s += ", query_size={}".format(self.query_size)
        if self.weight_dropout_module.p > 0.0:
            s += ", weight_dropout={}".format(self.weight_dropout_module.p)
        return s
The weight is of shape (num_heads, 1, kernel_size) weight_dropout: the drop rate of the DropConnect to drop the weight weight_softmax: normalize the weight with softmax before the convolution renorm_padding: re-normalize the filters to ignore the padded part (only the non-padding parts sum up to 1) bias: use bias conv_bias: bias of the convolution query_size: specified when feeding a different input as the query in_proj: project the input and generate the filter together Shape: Input: TxBxC, i.e. (timesteps, batch_size, input_size) Output: TxBxC, i.e. (timesteps, batch_size, input_size) Attributes: weight: the learnable weights of the module of shape `(num_heads, 1, kernel_size)` bias: the learnable bias of the module of shape `(input_size)` """ def __init__( self, input_size, kernel_size=1, padding_l=None, num_heads=1, weight_dropout=0.0, weight_softmax=False, renorm_padding=False, bias=False, conv_bias=False, query_size=None, in_proj=False, ): super().__init__() self.input_size = input_size self.query_size = input_size if query_size is None else query_size self.kernel_size = kernel_size self.padding_l = padding_l self.num_heads = num_heads self.weight_dropout_module = FairseqDropout( weight_dropout, module_name=self.__class__.__name__ ) self.weight_softmax = weight_softmax self.renorm_padding = renorm_padding if in_proj: self.weight_linear = Linear( self.input_size, self.input_size + num_heads * kernel_size * 1 ) else: self.weight_linear = Linear( self.query_size, num_heads * kernel_size * 1, bias=bias ) self.in_proj = ( self.weight_linear.out_features == self.input_size + self.num_heads * self.kernel_size ) self.has_conv_bias = conv_bias self.conv_bias = nn.Parameter(torch.Tensor(input_size).view(1, 1, -1)) self.init_incremental_state() self.reset_parameters() def reset_parameters(self): self.weight_linear.reset_parameters() if self.has_conv_bias: nn.init.constant_(self.conv_bias, 0.0) def forward( self, x, incremental_state: Optional[Dict[str, Dict[str, 
Optional[Tensor]]]] = None, query: Optional[Tensor] = None, ): """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C args: x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size) incremental_state: A dict to keep the state unfold: unfold the input or not. If not, we use the matrix trick instead query: use the specified query to predict the conv filters """ assert query is None or not self.in_proj if query is None: query = x output = self._forward_unfolded(x, incremental_state, query) if self.has_conv_bias: output = output + self.conv_bias return output def _forward_unfolded( self, x, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], query, ): """The conventional implementation of convolutions. Unfolding the input by having a window shifting to the right.""" T, B, C = x.size() K, H = self.kernel_size, self.num_heads R = C // H assert R * H == C == self.input_size TxBxH = T * B * H if self.in_proj: proj = self.weight_linear(x) x = proj.narrow(2, 0, self.input_size).contiguous() weight = proj.narrow(2, self.input_size, H * K).contiguous().view(TxBxH, -1) else: weight = self.weight_linear(query).view(TxBxH, -1) # renorm_padding is only implemented in _forward_expanded assert not self.renorm_padding or incremental_state is not None if incremental_state is not None: input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3) else: x_unfold = x.unsqueeze(3).clone() if self.kernel_size > 1: self._set_input_buffer( incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :] ) x_unfold = x_unfold.view(TxBxH, R, -1) else: padding_l = self.padding_l if K > T and padding_l == K - 1: weight = weight.narrow(1, K - T, T) K, padding_l = T, T - 1 # unfold the input: T x B x C --> T' x B x C x K x_unfold = unfold1d(x, K, padding_l, 0.0) x_unfold = x_unfold.view(TxBxH, R, K) if self.weight_softmax and not 
self.renorm_padding: weight = F.softmax(weight, dim=1) weight = weight.narrow(1, 0, K) if incremental_state is not None: weight = weight[:, -(x_unfold.size(2)) :] K = weight.size(1) if self.weight_softmax and self.renorm_padding: weight = F.softmax(weight, dim=1) weight = self.weight_dropout_module(weight, inplace=False) output = torch.bmm(x_unfold, weight.unsqueeze(2)) # T x B x H x R x 1 output = output.view(T, B, C) return output def reorder_incremental_state( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], new_order: Tensor, ): input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: input_buffer = input_buffer.index_select(1, new_order) self._set_input_buffer(incremental_state, input_buffer) def _get_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ): result = self.get_incremental_state(incremental_state, "input_buffer") if result is not None and "input_buffer" in result: return result["input_buffer"] else: return None def _set_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], new_buffer: Optional[Tensor], ): result = self.set_incremental_state( incremental_state, "input_buffer", {"input_buffer": new_buffer} ) if result is not None: incremental_state = result return incremental_state def extra_repr(self): s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, conv_bias={}, renorm_padding={}, in_proj={}".format( # noqa self.input_size, self.kernel_size, self.padding_l, self.num_heads, self.weight_softmax, self.conv_bias is not None, self.renorm_padding, self.in_proj, ) if self.query_size != self.input_size: s += ", query_size={}".format(self.query_size) if self.weight_dropout_module.p > 0.0: s += ", weight_dropout={}".format(self.weight_dropout_module.p) return s ================================================ FILE: fairseq/modules/dynamic_crf_layer.py ================================================ # Copyright 
def logsumexp(x, dim=1):
    """Return log-sum-exp of ``x`` over ``dim``, computed in fp32 and cast back."""
    return torch.logsumexp(x.float(), dim=dim).type_as(x)


class DynamicCRF(nn.Module):
    """Dynamic CRF layer is used to approximate the traditional
    Conditional Random Fields (CRF)
    $P(y | x) = 1/Z(x) exp(sum_i s(y_i, x) + sum_i t(y_{i-1}, y_i, x))$

    where in this function, we assume the emission scores (s) are given,
    and the transition score is a |V| x |V| matrix $M$

    in the following two aspects:
     (1) it used a low-rank approximation for the transition matrix:
         $M = E_1 E_2^T$
     (2) it used a beam to estimate the normalizing factor Z(x)
    """

    def __init__(self, num_embedding, low_rank=32, beam_size=64):
        super().__init__()

        self.E1 = nn.Embedding(num_embedding, low_rank)
        self.E2 = nn.Embedding(num_embedding, low_rank)

        self.vocb = num_embedding
        self.rank = low_rank
        self.beam = beam_size

    def extra_repr(self):
        return "vocab_size={}, low_rank={}, beam_size={}".format(
            self.vocb, self.rank, self.beam
        )

    def forward(self, emissions, targets, masks, beam=None):
        """
        Compute the conditional log-likelihood of a sequence of target tokens given emission scores

        Args:
            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
            targets (`~torch.LongTensor`): Sequence of target token indices
                ``(batch_size, seq_len)``
            masks (`~torch.Tensor`): Bool (or byte) mask tensor with the same size as targets

        Returns:
            `~torch.Tensor`: approximated log-likelihood
        """
        numerator = self._compute_score(emissions, targets, masks)
        denominator = self._compute_normalizer(emissions, targets, masks, beam)
        return numerator - denominator

    def forward_decoder(self, emissions, masks=None, beam=None):
        """
        Find the most likely output sequence using Viterbi algorithm.

        Args:
            emissions (`~torch.Tensor`): Emission score are usually the unnormalized decoder output
                ``(batch_size, seq_len, vocab_size)``. We assume batch-first
            masks (`~torch.Tensor`): Bool (or byte) mask tensor with the same size as targets

        Returns:
            a ``(scores, tokens)`` pair: per-position score increments along
            the best path and the decoded `~torch.LongTensor` token sequence
        """
        return self._viterbi_decode(emissions, masks, beam)

    def _compute_score(self, emissions, targets, masks=None):
        """Return the unnormalized score of ``targets``: emissions plus transitions."""
        batch_size, seq_len = targets.size()
        emission_scores = emissions.gather(2, targets[:, :, None])[:, :, 0]  # B x T
        transition_scores = (self.E1(targets[:, :-1]) * self.E2(targets[:, 1:])).sum(2)

        # gather() returned a fresh tensor, so the in-place add does not
        # modify the caller's emissions.
        scores = emission_scores
        scores[:, 1:] += transition_scores

        if masks is not None:
            scores = scores * masks.type_as(scores)
        return scores.sum(-1)

    def _compute_normalizer(self, emissions, targets=None, masks=None, beam=None):
        """Approximate log Z(x) with a forward pass restricted to a per-step beam."""
        # HACK: we include "target" which is a heuristic for training
        # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?)

        beam = beam if beam is not None else self.beam
        batch_size, seq_len = emissions.size()[:2]
        if masks is not None:
            # torch.where needs a boolean condition; accept legacy byte masks.
            masks = masks.bool()

        if targets is not None:
            # Force the gold token into the beam by giving it an infinite
            # score for selection only; the real scores are gathered after.
            # (Builtin float replaces np.float, removed in NumPy 1.24.)
            _emissions = emissions.scatter(2, targets[:, :, None], float("inf"))
            beam_targets = _emissions.topk(beam, 2)[1]
            beam_emission_scores = emissions.gather(2, beam_targets)
        else:
            beam_emission_scores, beam_targets = emissions.topk(beam, 2)
        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
        beam_transition_score2 = self.E2(beam_targets[:, 1:])  # B x (T-1) x K x D
        beam_transition_matrix = torch.bmm(
            beam_transition_score1.view(-1, beam, self.rank),
            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2),
        )
        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)

        # compute the normalizer in the log-space
        score = beam_emission_scores[:, 0]  # B x K
        for i in range(1, seq_len):
            next_score = score[:, :, None] + beam_transition_matrix[:, i - 1]
            next_score = logsumexp(next_score, dim=1) + beam_emission_scores[:, i]

            if masks is not None:
                # Masked-out positions carry the previous score forward.
                score = torch.where(masks[:, i : i + 1], next_score, score)
            else:
                score = next_score

        # Sum (log-sum-exp) over all possible tags
        return logsumexp(score, dim=1)

    def _viterbi_decode(self, emissions, masks=None, beam=None):
        """Viterbi search over the per-step beam; returns ``(scores, tokens)``."""
        # HACK: we use a beam of tokens to approximate the normalizing factor (which is bad?)

        beam = beam if beam is not None else self.beam
        batch_size, seq_len = emissions.size()[:2]
        if masks is not None:
            # torch.where needs a boolean condition; accept legacy byte masks.
            masks = masks.bool()

        beam_emission_scores, beam_targets = emissions.topk(beam, 2)
        beam_transition_score1 = self.E1(beam_targets[:, :-1])  # B x (T-1) x K x D
        beam_transition_score2 = self.E2(beam_targets[:, 1:])  # B x (T-1) x K x D
        beam_transition_matrix = torch.bmm(
            beam_transition_score1.view(-1, beam, self.rank),
            beam_transition_score2.view(-1, beam, self.rank).transpose(1, 2),
        )
        beam_transition_matrix = beam_transition_matrix.view(batch_size, -1, beam, beam)

        traj_tokens, traj_scores = [], []
        finalized_tokens, finalized_scores = [], []

        # run the max-product recursion in the log-space
        score = beam_emission_scores[:, 0]  # B x K
        # Identity back-pointer used at masked positions.
        dummy = (
            torch.arange(beam, device=score.device).expand(*score.size()).contiguous()
        )

        for i in range(1, seq_len):
            traj_scores.append(score)
            _score = score[:, :, None] + beam_transition_matrix[:, i - 1]
            _score, _index = _score.max(dim=1)
            _score = _score + beam_emission_scores[:, i]

            if masks is not None:
                score = torch.where(masks[:, i : i + 1], _score, score)
                index = torch.where(masks[:, i : i + 1], _index, dummy)
            else:
                score, index = _score, _index
            traj_tokens.append(index)

        # now running the back-tracing and find the best
        best_score, best_index = score.max(dim=1)
        finalized_tokens.append(best_index[:, None])
        finalized_scores.append(best_score[:, None])

        for idx, scs in zip(reversed(traj_tokens), reversed(traj_scores)):
            previous_index = finalized_tokens[-1]
            finalized_tokens.append(idx.gather(1, previous_index))
            finalized_scores.append(scs.gather(1, previous_index))

        finalized_tokens.reverse()
        finalized_tokens = torch.cat(finalized_tokens, 1)
        # Map beam slots back to vocabulary ids.
        finalized_tokens = beam_targets.gather(2, finalized_tokens[:, :, None])[:, :, 0]

        finalized_scores.reverse()
        finalized_scores = torch.cat(finalized_scores, 1)
        # Turn cumulative path scores into per-position increments.
        finalized_scores[:, 1:] = finalized_scores[:, 1:] - finalized_scores[:, :-1]

        return finalized_scores, finalized_tokens
def gen_forward(output_path="dynamicconv_cuda_forward.cu"):
    """Generate the CUDA host function that dispatches the forward kernel.

    The emitted ``dynamicconv_cuda_forward`` switches on the filter size and
    then on ``padding_l`` to launch a fully specialised
    ``dynamicconv_forward_kernel`` instantiation.

    Args:
        output_path: file to write; defaults to the name that
            ``dynamicconv_cuda_kernel.cu`` includes.
    """
    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
    blocks = [32, 64, 128, 256]

    head = """
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "dynamicconv_cuda.cuh"

std::vector<at::Tensor> dynamicconv_cuda_forward(at::Tensor input, at::Tensor weight, int padding_l) {

    at::DeviceGuard g(input.device());
    const auto minibatch = input.size(0);
    const auto numFeatures = input.size(1);
    const auto sequenceLength = input.size(2);

    const auto numHeads = weight.size(1);
    const auto filterSize = weight.size(2);

    const auto numFiltersInBlock = numFeatures / numHeads;
    const dim3 blocks(minibatch, numFeatures);

    auto output = at::zeros_like(input);
    auto stream = at::cuda::getCurrentCUDAStream();
"""

    switch = """
    switch(filterSize) {
"""

    case_k = """
        case {k}:
"""

    # data_ptr<T>() replaces the deprecated Tensor::data<T>() accessor.
    main_block = """
            if (padding_l == {pad}) {{
                AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "dynamicconv_forward", ([&] {{
                    dynamicconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t>
                    <<<blocks, {b_size}, 0, stream>>>(
                            input.data_ptr<scalar_t>(),
                            weight.data_ptr<scalar_t>(),
                            minibatch,
                            sequenceLength,
                            numFeatures,
                            numFiltersInBlock,
                            numHeads,
                            output.data_ptr<scalar_t>());
                }}));
            }} else
"""

    bad_padding = """
            {
                std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl;
            }
            break;\n
"""

    end = """
        default:
            std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl;
    }

    return {output};
}
"""

    with open(output_path, "w") as forward:
        forward.write(head)
        forward.write(switch)
        for k in kernels:
            # Smallest thread-block size strictly larger than the filter,
            # falling back to 32 if none qualifies.
            b_size = next((b for b in blocks if b > k), 32)
            forward.write(case_k.format(k=k))
            # Only "same" (k // 2) and causal (k - 1) paddings are generated.
            for pad in [k // 2, k - 1]:
                forward.write(main_block.format(k=k, b_size=b_size, pad=pad))
            forward.write(bad_padding)
        forward.write(end)


def gen_backward(output_path="dynamicconv_cuda_backward.cu"):
    """Generate the CUDA host function that dispatches the backward kernel.

    The emitted ``dynamicconv_cuda_backward`` first buckets the sequence
    length (multiples of 32 up to 512, plus a catch-all), then switches on
    the filter size and ``padding_l`` to launch a specialised
    ``dynamicconv_backward_kernel`` instantiation.

    Args:
        output_path: file to write; defaults to the name that
            ``dynamicconv_cuda_kernel.cu`` includes.
    """
    kernels = [3, 5, 7, 15, 31, 63, 127, 255]
    # Per filter size: largest sequence bucket that still uses one block per
    # sequence, and the block size used beyond that threshold.
    thresh = [512, 512, 512, 512, 512, 380, 256, 256]
    min_block = [64, 64, 64, 64, 64, 64, 128, 256]
    seqs = [32 * x for x in range(1, 17)]

    head = """
/**
 * Copyright (c) Facebook, Inc. and its affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "dynamicconv_cuda.cuh"

std::vector<at::Tensor> dynamicconv_cuda_backward(at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor weight) {

    at::DeviceGuard g(input.device());
    const auto minibatch = input.size(0);
    const auto numFeatures = input.size(1);
    const auto sequenceLength = input.size(2);

    const auto numHeads = weight.size(1);
    const auto filterSize = weight.size(2);

    const auto numFiltersInBlock = numFeatures / numHeads;
    auto numChunks = 1;

    auto gradInput = at::zeros_like(input);
    auto gradWeight = at::zeros_like(weight);
    auto stream = at::cuda::getCurrentCUDAStream();

    dim3 blocks(minibatch, numHeads, numChunks);
"""

    sequence_if = """
    if (sequenceLength < {seq}) {{
        switch(filterSize) {{
"""

    case_k = """
            case {k}:
"""

    chunks_reset = """
                numChunks = int(ceilf(sequenceLength/float({b_size})));
                blocks = dim3(minibatch, numHeads, numChunks);
"""

    # data_ptr<T>() replaces the deprecated Tensor::data<T>() accessor.
    main_block = """
                if (padding_l == {p}) {{
                    AT_DISPATCH_FLOATING_TYPES_AND_HALF(gradOutput.scalar_type(), "dynamicconv_backward", ([&] {{
                        dynamicconv_backward_kernel<{k}, {b_size}, {p}, scalar_t>
                        <<<blocks, {b_size}, 0, stream>>>(
                                    gradOutput.data_ptr<scalar_t>(),
                                    input.data_ptr<scalar_t>(),
                                    weight.data_ptr<scalar_t>(),
                                    minibatch,
                                    sequenceLength,
                                    numFeatures,
                                    numFiltersInBlock,
                                    numHeads,
                                    gradWeight.data_ptr<scalar_t>(),
                                    gradInput.data_ptr<scalar_t>());
                    }}));
                }} else
"""

    bad_padding = """
                {
                    std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl;
                }
                break;\n
"""

    bad_filter = """
            default:
                std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl;
        }
"""

    con_else = """
    } else
"""

    final_else = """
    {
        switch(filterSize) {
"""

    last_return = """
    }
    return {gradInput, gradWeight};
}
"""

    with open(output_path, "w") as backward:
        backward.write(head)
        for seq in seqs:
            backward.write(sequence_if.format(seq=seq))
            for k, t, m in zip(kernels, thresh, min_block):
                backward.write(case_k.format(k=k))
                b_size = seq if seq <= t else m
                backward.write(chunks_reset.format(b_size=b_size))
                for p in [k // 2, k - 1]:
                    backward.write(main_block.format(k=k, b_size=b_size, p=p))
                backward.write(bad_padding)
            backward.write(bad_filter)
            backward.write(con_else)
        # Catch-all branch for sequences at or beyond the largest bucket.
        backward.write(final_else)
        for k, m in zip(kernels, min_block):
            backward.write(case_k.format(k=k))
            backward.write(chunks_reset.format(b_size=m))
            for p in [k // 2, k - 1]:
                backward.write(main_block.format(k=k, b_size=m, p=p))
            backward.write(bad_padding)
        backward.write(bad_filter)
        backward.write(last_return)


if __name__ == "__main__":
    gen_forward()
    gen_backward()
*/ #include <torch/extension.h> #include <vector> std::vector<at::Tensor> dynamicconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l); std::vector<at::Tensor> dynamicconv_cuda_backward( at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor filters); #define CHECK_CUDA(x) \ AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) \ AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) std::vector<at::Tensor> dynamicconv_forward(at::Tensor input, at::Tensor filters, int padding_l) { CHECK_INPUT(input); CHECK_INPUT(filters); return dynamicconv_cuda_forward(input, filters, padding_l); } std::vector<at::Tensor> dynamicconv_backward( at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor filters) { CHECK_INPUT(gradOutput); CHECK_INPUT(input); CHECK_INPUT(filters); return dynamicconv_cuda_backward(gradOutput, padding_l, input, filters); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &dynamicconv_forward, "dynamicconv forward (CUDA)"); m.def("backward", &dynamicconv_backward, "dynamicconv backward (CUDA)"); } ================================================ FILE: fairseq/modules/dynamicconv_layer/dynamicconv_cuda.cuh ================================================ /** * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. 
#define SHFL_MASK 0xffffffff

// Forward kernel declaration.
// Template parameters: FS is the filter size, SB the thread-block size the
// kernel is launched with, padding_l the left padding.
// Index arithmetic in the definition treats input/output as B * C * T and
// weight as B * H * FS * T.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void dynamicconv_forward_kernel(
    const scalar_t* input,
    const scalar_t* weight,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    int numHeads,
    scalar_t* output);

// Backward kernel declaration; same template parameters as the forward kernel.
// The definition indexes gradInput with the same offsets as input (B * C * T)
// and gradWeight with the same offsets as weight (B * H * k * T).
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void dynamicconv_backward_kernel(
    const scalar_t* gradOutput, // B * C * T
    const scalar_t* input, // B * C * T
    const scalar_t* weight,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    int numHeads,
    scalar_t* gradWeight, // B * H * k * T
    scalar_t* gradInput); // B * C * T
// FS is filter size and kernels are specialized for filter sizes
// SB is the thread-block size (asserted equal to blockDim.x) and padding_l is
// the left padding. The grid is indexed as (batch, feature): each block
// convolves one feature row of one batch element along the time axis.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void dynamicconv_forward_kernel(
    const scalar_t* input,
    const scalar_t* weight,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    int numHeads,
    scalar_t* output) {
  assert(blockDim.x == SB);

  const int tid = threadIdx.x;
  const int batchIdx = blockIdx.x;
  const int featureIdx = blockIdx.y;
  // Consecutive groups of numFiltersInBlock features share one head's filters.
  const int head = featureIdx / numFiltersInBlock;

  // Start of this (batch, feature) row; input/output are addressed as
  // B * C * T with time as the fastest-moving index.
  const int IOOffset =
      batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
  const scalar_t* inputFeature = &input[IOOffset];
  scalar_t* outputFeature = &output[IOOffset];

  // Per-thread copy of the FS filter taps for the current time step.
  scalar_t filter[FS];

  // Shared staging buffer for one chunk of SB inputs plus room for the filter
  // window. zeroSharedMem / load_input_to_shared / divUp are defined outside
  // this file section (presumably in ../cuda_utils.cu — confirm there for the
  // exact padding/halo layout).
  __shared__ scalar_t tempInput[SB + FS];
  zeroSharedMem<FS, SB, padding_l>(tempInput);

  // Walk the sequence in chunks of SB time steps.
  const int numIterations = divUp<int, int>(sequenceLength, SB);

  for (int i = 0; i < numIterations; ++i) {
    // Make sure every thread is done with the previous chunk before the
    // shared buffer is overwritten.
    __syncthreads();
    const int inputOffset = i * SB;
    load_input_to_shared<FS, SB, padding_l>(
        inputFeature,
        inputOffset,
        sequenceLength,
        i,
        numIterations,
        false,
        tempInput);
    // Wait until the whole chunk is staged before reading it.
    __syncthreads();
    // Each thread produces the output for time step inputOffset + tid.
    if (inputOffset + tid < sequenceLength) {
#pragma unroll
      for (int k = 0; k < FS; ++k) {
        // weight is addressed as [batch][head][k][t], i.e. B * H * FS * T.
        const int filterOffset = batchIdx * numHeads * FS * sequenceLength +
            head * FS * sequenceLength + k * sequenceLength + i * SB + tid;
        filter[k] = weight[filterOffset];
      }

      // Dot product of this time step's filter with its input window.
      scalar_t out = scalar_t(0.0);
#pragma unroll
      for (int k = 0; k < FS; ++k) {
        out += filter[k] * tempInput[tid + k];
      }

      outputFeature[inputOffset + tid] = out;
    }
  }
}
T assert(blockDim.x == SB); // each block operates on a single batch and filter head const int tid = threadIdx.x; const int batchIdx = blockIdx.x; const int headIdx = blockIdx.y; const int chunkIdx = blockIdx.z; const int numChunks = divUp<int, int>(sequenceLength, SB); const int inputOffset = chunkIdx * SB; // initialize shared memory for output gradient and input __shared__ scalar_t tempGradOutput[SB + FS]; __shared__ scalar_t tempInput[SB + FS]; const int padding = FS - padding_l - 1; zeroSharedMem<FS, SB, padding>(tempGradOutput); zeroSharedMem<FS, SB, padding_l>(tempInput); // initialize local filter and weight gradient sum arrays scalar_t tempGradSum[FS]; scalar_t bfilter[FS]; for (int k = 0; k < FS; ++k) { tempGradSum[k] = scalar_t(0.0); int idxOffset = inputOffset + tid + k - padding; if (idxOffset >= 0 && idxOffset < sequenceLength) { int bfilterOffset = batchIdx * numHeads * FS * sequenceLength + headIdx * FS * sequenceLength + (FS - k - 1) * sequenceLength + idxOffset; bfilter[k] = weight[bfilterOffset]; } else { bfilter[k] = scalar_t(0.0); } } // iterate over filter block for (int featureIdx = 0; featureIdx < numFiltersInBlock; ++featureIdx) { __syncthreads(); // load input and output gradient for this channel and chunk const int IOOffset = batchIdx * numFeatures * sequenceLength + (headIdx * numFiltersInBlock + featureIdx) * sequenceLength; const scalar_t* inputFeature = &input[IOOffset]; const scalar_t* gradOutputFeature = &gradOutput[IOOffset]; scalar_t* gradInputFeature = &gradInput[IOOffset]; load_input_to_shared<FS, SB, padding>( gradOutputFeature, inputOffset, sequenceLength, chunkIdx, numChunks, true, tempGradOutput); load_input_to_shared<FS, SB, padding_l>( inputFeature, inputOffset, sequenceLength, chunkIdx, numChunks, true, tempInput); __syncthreads(); // sum input and weight gradients scalar_t out = scalar_t(0.0); #pragma unroll for (int k = 0; k < FS; ++k) { tempGradSum[k] += tempInput[tid + k] * tempGradOutput[tid + padding]; out += 
@with_incremental_state
class DynamicconvLayer(nn.Module):
    """Dynamic convolution layer backed by the ``dynamicconv_cuda`` extension.

    Filters are predicted per time step by ``weight_linear``. When no
    ``incremental_state`` is given the custom CUDA kernel is used; with an
    ``incremental_state`` (inference) the pure-PyTorch BMM path is used.

    Args:
        input_size: # of channels of the input
        kernel_size: width of the convolution kernel
        padding_l: padding to the left
        weight_softmax: normalize the filters with softmax
        num_heads: number of heads; channels within a head share filters
        weight_dropout: drop rate applied to the filters
        bias: use a bias in the filter-predicting linear layer
        renorm_padding: re-normalize the filters to ignore the padded part
        conv_bias: add a learnable bias to the convolution output
        query_size: recorded on the module; ``forward`` asserts that no
            separate query is passed on the incremental path
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        weight_softmax=False,
        num_heads=1,
        weight_dropout=0.0,
        bias=False,
        renorm_padding=False,
        conv_bias=False,
        query_size=None,
    ):
        super(DynamicconvLayer, self).__init__()
        self.input_size = input_size
        self.query_size = input_size if query_size is None else query_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_softmax = weight_softmax
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )
        self.renorm_padding = renorm_padding
        self.bias = bias

        self.weight_linear = nn.Linear(input_size, num_heads * kernel_size, bias)
        if conv_bias:
            self.conv_bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.conv_bias = None
        self.reset_parameters()

    def reset_parameters(self):
        """Xavier-initialise the filter projection and zero all biases."""
        nn.init.xavier_uniform_(self.weight_linear.weight)
        if self.conv_bias is not None:
            nn.init.constant_(self.conv_bias, 0.0)
        # The original referenced the misspelled `self.weight_linaer`, raising
        # AttributeError whenever conv_bias was enabled. The linear bias is
        # None when the layer was built with bias=False, so guard for that.
        if self.weight_linear.bias is not None:
            nn.init.constant_(self.weight_linear.bias, 0.0)

    def forward(self, x, incremental_state=None, query=None, unfold=None):
        """Apply the dynamic convolution to ``x`` of shape T x B x C.

        Args:
            x: input of shape T x B x C
            incremental_state: decoding state; when given, the PyTorch path
                is used instead of the CUDA kernel
            query: must be None on the incremental path (asserted)
            unfold: explicit choice of the unfolded implementation on the
                incremental path

        Returns:
            tensor of shape T x B x C
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        # R = C // H

        # during inference time, incremental BMM is faster
        if incremental_state is not None:
            unfold = (
                x.size(0) > 512 if unfold is None else unfold
            )  # use unfold mode as default for long sequence to save memory
            # NOTE: incremental_state is never None in this branch, so this
            # always selects the unfolded implementation.
            unfold = unfold or (incremental_state is not None)
            assert query is None

            if query is None:
                query = x
            if unfold:
                output = self._forward_unfolded(x, incremental_state, query)
            else:
                output = self._forward_expanded(x, incremental_state, query)

            if self.conv_bias is not None:
                output = output + self.conv_bias.view(1, 1, -1)

            return output

        # during training time, use CUDA kernel
        else:
            weight = self.weight_linear(x).view(T, B, H, K)
            if self.weight_softmax:
                weight = F.softmax(weight, dim=-1)
            if self.weight_dropout_module.p:
                weight = self.weight_dropout_module(weight)

            # The kernel expects weight as B x H x K x T and x as B x C x T.
            weight = weight.permute(1, 2, 3, 0).contiguous()
            self.filters = weight
            x = x.permute(1, 2, 0).contiguous()
            output = dynamicconvFunction.apply(x, weight, self.padding_l).permute(
                2, 0, 1
            )
            if self.conv_bias is not None:
                output = output + self.conv_bias.view(1, 1, -1)
            return output

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the cached input buffer along dim 1 according to ``new_order``."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def _forward_unfolded(self, x, incremental_state, query):
        """The conventional implementation of convolutions.
        Unfolding the input by having a window shifting to the right."""
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H
        assert R * H == C == self.input_size

        weight = self.weight_linear(query).view(T * B * H, -1)

        # renorm_padding is only implemented in _forward_expanded
        assert not self.renorm_padding or incremental_state is not None

        if incremental_state is not None:
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                input_buffer = x.new()
            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
            if self.kernel_size > 1:
                # Keep only the last kernel_size - 1 steps for the next call.
                self._set_input_buffer(
                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
                )
            x_unfold = x_unfold.view(T * B * H, R, -1)
        else:
            padding_l = self.padding_l
            if K > T and padding_l == K - 1:
                # The kernel is longer than the sequence: drop the taps that
                # could only ever see padding.
                weight = weight.narrow(1, K - T, T)
                K, padding_l = T, T - 1
            # unfold the input: T x B x C --> T' x B x C x K
            x_unfold = unfold1d(x, K, padding_l, 0)
            x_unfold = x_unfold.view(T * B * H, R, K)

        if self.weight_softmax and not self.renorm_padding:
            weight = F.softmax(weight, dim=1)
        weight = weight.narrow(1, 0, K)

        if incremental_state is not None:
            # Early decoding steps have fewer buffered inputs than taps.
            weight = weight[:, -x_unfold.size(2) :]
            K = weight.size(1)

        if self.weight_softmax and self.renorm_padding:
            weight = F.softmax(weight, dim=1)

        weight = self.weight_dropout_module(weight, inplace=False)

        output = torch.bmm(x_unfold, weight.unsqueeze(2))  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output

    def _forward_expanded(self, x, incremental_stat, query):
        """Turn the convolution filters into band matrices and do matrix multiplication.
        This is faster when the sequence is short, but less memory efficient.
        This is not used in the decoder during inference.
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H
        assert R * H == C == self.input_size
        weight = self.weight_linear(query).view(T * B * H, -1)

        if not self.renorm_padding:
            if self.weight_softmax:
                weight = F.softmax(weight, dim=1)
            weight = self.weight_dropout_module(weight, inplace=False)
        weight = weight.narrow(1, 0, K).contiguous()
        weight = weight.view(T, B * H, K).transpose(0, 1)

        x = x.view(T, B * H, R).transpose(0, 1)
        if self.weight_softmax and self.renorm_padding:
            # turn the convolution filters into band matrices
            weight_expanded = weight.new(B * H, T, T + K - 1).fill_(float("-inf"))
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, self.padding_l, T)
            # normalize the weight over valid positions like self-attention
            weight_expanded = F.softmax(weight_expanded, dim=2)
            weight_expanded = self.weight_dropout_module(weight_expanded, inplace=False)
        else:
            P = self.padding_l
            # For efficiency, we cut the kernel size and reduce the padding when the kernel is larger than the length
            if K > T and P == K - 1:
                weight = weight.narrow(2, K - T, T)
                K, P = T, T - 1
            # turn the convolution filters into band matrices
            weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
            weight_expanded.as_strided(
                (B * H, T, K), (T * (T + K - 1), T + K, 1)
            ).copy_(weight)
            weight_expanded = weight_expanded.narrow(2, P, T)  # B*H x T x T
        output = torch.bmm(weight_expanded, x)
        output = output.transpose(0, 1).contiguous().view(T, B, C)
        return output
#include <vector> std::vector<float*> dynamicconv_cpu_forward(float* input, float* filters, int padding_l); std::vector<float*> dynamicconv_cpu_backward( float* gradOutput, int padding_l, float* input, float* filters); std::vector<float*> dynamicconv_forward(float* input, float* filters, int padding_l) { return dynamicconv_cpu_forward(input, filters, padding_l); } std::vector<float*> dynamicconv_backward( float* gradOutput, int padding_l, float* input, float* filters) { return dynamicconv_cpu_backward(gradOutput, padding_l, input, filters); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &dynamicconv_forward, "dynamicconv forward (CPU)"); m.def("backward", &dynamicconv_backward, "dynamicconv backward (CPU)"); } ================================================ FILE: fairseq/modules/dynamicconv_layer/setup.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from setuptools import setup from torch.utils.cpp_extension import BuildExtension, CUDAExtension setup( name="dynamicconv_layer", ext_modules=[ CUDAExtension( name="dynamicconv_cuda", sources=[ "dynamicconv_cuda.cpp", "dynamicconv_cuda_kernel.cu", ], ), ], cmdclass={"build_ext": BuildExtension}, ) ================================================ FILE: fairseq/modules/ema_module.py ================================================ #!/usr/bin/env python3 """ Used for EMA tracking a given pytorch module. 
The user is responsible for calling step() and setting the appropriate decay """ import copy from dataclasses import dataclass, field import logging import torch from omegaconf import II from fairseq.dataclass import FairseqDataclass try: from amp_C import multi_tensor_l2norm multi_tensor_l2norm_available = True except ImportError: multi_tensor_l2norm_available = False logger = logging.getLogger(__name__) @dataclass class EMAModuleConfig(FairseqDataclass): ema_decay: float = field( default=0.9999, metadata={"help": "decay for exponential moving average model"} ) ema_fp32: bool = field( default=False, metadata={"help": "If true, store EMA model in fp32 even if model is in fp16"}, ) add_missing_params: bool = True log_norms: bool = False class EMAModule: """Exponential Moving Average of Fairseq Models""" def __init__( self, model, config: EMAModuleConfig, copy_model=True, device=None, skip_keys=None, ): """ @param model model to initialize the EMA with @param config EMAConfig object with configuration like ema_decay, ema_update_freq, ema_fp32 @param device If provided, copy EMA to this device (e.g. gpu). Otherwise EMA is in the same device as the model. """ self.config = config if copy_model: self.model = copy.deepcopy(model) self.model.requires_grad_(False) else: self.model = model self.config = config self.decay = config.ema_decay self.skip_keys = skip_keys or set() self.add_missing_params = config.add_missing_params self.fp32_params = {} if device is not None: logging.info(f"Copying EMA model to device {device}") self.model = self.model.to(device=device) if self.config.ema_fp32: self.build_fp32_params() self.log_norms = config.log_norms and multi_tensor_l2norm_available self.logs = {} def build_fp32_params(self, state_dict=None): """ Store a copy of the EMA params in fp32. If state dict is passed, the EMA params is copied from the provided state dict. Otherwise, it is copied from the current EMA model parameters. 
""" if not self.config.ema_fp32: raise RuntimeError( "build_fp32_params should not be called if ema_fp32=False. " "Use ema_fp32=True if this is really intended." ) if state_dict is None: state_dict = self.model.state_dict() def _to_float(t): return t.float() if torch.is_floating_point(t) else t for param_key in state_dict: if param_key in self.fp32_params: if param_key == "__sq_mom": self.fp32_params[param_key] = state_dict[param_key] else: self.fp32_params[param_key].copy_(state_dict[param_key]) else: self.fp32_params[param_key] = _to_float(state_dict[param_key]) if "__sq_mom" in self.fp32_params: self.fp32_params["__sq_mom"][param_key] = torch.zeros_like( self.fp32_params[param_key] ) def restore(self, state_dict, build_fp32_params=False): """Load data from a model spec into EMA model""" self.model.load_state_dict(state_dict, strict=False) if build_fp32_params: self.build_fp32_params(state_dict) def set_decay(self, decay, weight_decay=None): self.decay = decay if weight_decay is not None: self.weight_decay = weight_decay def get_decay(self): return self.decay def _step_internal(self, new_model): """One update of the EMA model based on new model weights""" decay = self.decay ema_state_dict = {} ema_params = ( self.fp32_params if self.config.ema_fp32 else self.model.state_dict() ) new_p = [] ema_p = [] for key, param in new_model.named_parameters(): if isinstance(param, dict): continue if not self.add_missing_params and key not in ema_params: continue try: ema_param = ema_params[key] except KeyError: ema_param = ( param.float().clone() if param.ndim == 1 else copy.deepcopy(param) ) ema_params[key] = ema_param if param.shape != ema_param.shape: raise ValueError( "incompatible tensor shapes between model param and ema param" + "{} vs. 
{}".format(param.shape, ema_param.shape) ) if "version" in key: # Do not decay a model.version pytorch param continue lr = 1 - decay if key in self.skip_keys or not param.requires_grad: ema_params[key].copy_(param.to(dtype=ema_param.dtype).data) ema_param = ema_params[key] else: if self.log_norms: new_p.append(param) ema_p.append(ema_param) ema_param.mul_(1 - lr) ema_param.add_(param.data.to(dtype=ema_param.dtype), alpha=lr) ema_state_dict[key] = ema_param for key, param in new_model.named_buffers(): ema_state_dict[key] = param if self.log_norms: if "model_norm" in self.logs: self.prev_model_norm = self.logs["model_norm"] chunk_size = 2048 * 32 has_inf = torch.zeros( (1, 1), dtype=torch.int, device=next(new_model.parameters()).device ) new_norm = multi_tensor_l2norm(chunk_size, has_inf, [new_p], False) old_norm = multi_tensor_l2norm(chunk_size, has_inf, [ema_p], False) self.logs["model_norm"] = new_norm[0] self.logs["ema_norm"] = old_norm[0] self.restore(ema_state_dict, build_fp32_params=False) @torch.no_grad() def step(self, new_model): self._step_internal(new_model) def reverse(self, model): """ Load the model parameters from EMA model. Useful for inference or fine-tuning from the EMA model. """ d = self.model.state_dict() if "_ema" in d: del d["_ema"] model.load_state_dict(d, strict=False) return model ================================================ FILE: fairseq/modules/espnet_multihead_attention.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # Copyright 2019 Shigeki Karita # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """Multi-Head Attention layer definition.""" import math import torch from torch import nn from fairseq.modules.rotary_positional_embedding import ( RotaryPositionalEmbedding, apply_rotary_pos_emb, ) class ESPNETMultiHeadedAttention(nn.Module): """Multi-Head Attention layer. Args: n_head: The number of heads. n_feat: The number of features. dropout: Dropout rate. 
""" def __init__(self, n_feat, n_head, dropout): """Construct an MultiHeadedAttention object.""" super(ESPNETMultiHeadedAttention, self).__init__() assert n_feat % n_head == 0 # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head self.linear_q = nn.Linear(n_feat, n_feat) self.linear_k = nn.Linear(n_feat, n_feat) self.linear_v = nn.Linear(n_feat, n_feat) self.linear_out = nn.Linear(n_feat, n_feat) self.attn = None self.dropout = nn.Dropout(p=dropout) def forward_qkv(self, query, key, value, **kwargs): """Transform query, key and value. Args: query: Query tensor B X T1 X C key: Key tensor B X T2 X C value: Value tensor B X T2 X C Returns: torch.Tensor: Transformed query tensor B X n_head X T1 X d_k torch.Tensor: Transformed key tensor B X n_head X T2 X d_k torch.Tensor: Transformed value tensor B X n_head X T2 X d_k """ n_batch = query.size(0) q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k) k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k) v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k) q = q.transpose(1, 2) # (batch, head, time1, d_k) k = k.transpose(1, 2) # (batch, head, time2, d_k) v = v.transpose(1, 2) # (batch, head, time2, d_k) return q, k, v def forward_attention(self, value, scores, mask): """Compute attention context vector. Args: value: Transformed value B X n_head X T2 X d_k. 
scores: Attention score B X n_head X T1 X T2 mask: Mask T2 X B Returns: torch.Tensor: Transformed value B X T1 X d_model weighted by the attention score B X T1 X T2 """ n_batch = value.size(0) if mask is not None: scores = scores.masked_fill( mask.unsqueeze(1).unsqueeze(2).to(bool), float("-inf"), # (batch, head, time1, time2) ) self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) else: self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2) p_attn = self.dropout(self.attn) x = torch.matmul(p_attn, value) # (batch, head, time1, d_k) x = ( x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k) ) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model) def forward(self, query, key, value, key_padding_mask=None, **kwargs): """Compute scaled dot product attention. Args: query (torch.Tensor): Query tensor T X B X C key (torch.Tensor): Key tensor T X B X C value (torch.Tensor): Value tensor T X B X C mask (torch.Tensor): Mask tensor T X B Returns: torch.Tensor: Output tensor T X B X D. """ query = query.transpose(0, 1) key = key.transpose(0, 1) value = value.transpose(0, 1) q, k, v = self.forward_qkv(query, key, value) scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) scores = self.forward_attention(v, scores, key_padding_mask) scores = scores.transpose(0, 1) return scores, None class RelPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): """Multi-Head Attention layer with relative position encoding. Paper: https://arxiv.org/abs/1901.02860 Args: n_head: The number of heads. n_feat: The number of features. dropout: Dropout rate. zero_triu: Whether to zero the upper triangular part of attention matrix. 
""" def __init__(self, n_feat, n_head, dropout, zero_triu=False): """Construct an RelPositionMultiHeadedAttention object.""" super().__init__(n_feat, n_head, dropout) self.zero_triu = zero_triu # linear transformation for positional encoding self.linear_pos = nn.Linear(n_feat, n_feat, bias=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 self.pos_bias_u = nn.Parameter(torch.zeros(self.h, self.d_k)) self.pos_bias_v = nn.Parameter(torch.zeros(self.h, self.d_k)) torch.nn.init.xavier_uniform_(self.pos_bias_u) torch.nn.init.xavier_uniform_(self.pos_bias_v) def rel_shift(self, x): """Compute relative positional encoding. Args: x: Input tensor B X n_head X T X 2T-1 Returns: torch.Tensor: Output tensor. """ zero_pad = torch.zeros((*x.size()[:3], 1), device=x.device, dtype=x.dtype) x_padded = torch.cat([zero_pad, x], dim=-1) x_padded = x_padded.view(*x.size()[:2], x.size(3) + 1, x.size(2)) x = x_padded[:, :, 1:].view_as(x)[ :, :, :, : x.size(-1) // 2 + 1 ] # only keep the positions from 0 to time2 if self.zero_triu: ones = torch.ones((x.size(2), x.size(3)), device=x.device) x = x * torch.tril(ones, x.size(3) - x.size(2))[None, None, :, :] return x def forward(self, query, key, value, pos_emb, key_padding_mask=None, **kwargs): """Compute scaled dot product attention. Args: query: Query tensor T X B X C key: Key tensor T X B X C value: Value tensor T X B X C pos_emb: Positional embedding tensor B X 2T-1 X C key_padding_mask: Mask tensor T X B Returns: torch.Tensor: Output tensor T X B X C. 
""" query = query.transpose(0, 1) key = key.transpose(0, 1) value = value.transpose(0, 1) pos_emb = pos_emb.transpose(0, 1) q, k, v = self.forward_qkv(query, key, value) q = q.transpose(1, 2) # (batch, time1, head, d_k) n_batch_pos = pos_emb.size(0) p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k) p = p.transpose(1, 2) # (batch, head, 2*time1-1, d_k) # (batch, head, time1, d_k) q_with_bias_u = (q + self.pos_bias_u).transpose(1, 2) # (batch, head, time1, d_k) q_with_bias_v = (q + self.pos_bias_v).transpose(1, 2) # compute attention score # first compute matrix a and matrix c # as described in https://arxiv.org/abs/1901.02860 Section 3.3 # (batch, head, time1, time2) matrix_ac = torch.matmul(q_with_bias_u, k.transpose(-2, -1)) # compute matrix b and matrix d # (batch, head, time1, 2*time1-1) matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) matrix_bd = self.rel_shift(matrix_bd) scores = (matrix_ac + matrix_bd) / math.sqrt( self.d_k ) # (batch, head, time1, time2) scores = self.forward_attention(v, scores, key_padding_mask) scores = scores.transpose(0, 1) return scores, None class RotaryPositionMultiHeadedAttention(ESPNETMultiHeadedAttention): def __init__( self, n_feat, n_head, dropout, precision, rotary_emd_base=10000, ): """Construct an RotaryPositionMultiHeadedAttention object.""" super().__init__(n_feat, n_head, dropout) precision = torch.float self.rotary_ndims = self.d_k # also try self.d_k//2 if precision == "fp16": precision = torch.half self.rotary_emb = RotaryPositionalEmbedding( self.rotary_ndims, base=rotary_emd_base, precision=precision ) def forward(self, query, key, value, key_padding_mask=None, **kwargs): """Compute rotary position attention. Args: query: Query tensor T X B X C key: Key tensor T X B X C value: Value tensor T X B X C key_padding_mask: Mask tensor T X B Returns: torch.Tensor: Output tensor T X B X D. 
Notes: Assumes self attn """ T, B, C = value.size() query = query.view(T, B, self.h, self.d_k) key = key.view(T, B, self.h, self.d_k) value = value.view(T, B, self.h, self.d_k) cos, sin = self.rotary_emb(value, seq_len=T) query, key = apply_rotary_pos_emb( query, key, cos, sin, offset=0 ) # offset is based on layer_past query = query.view(T, B, self.h * self.d_k) key = key.view(T, B, self.h * self.d_k) value = value.view(T, B, self.h * self.d_k) # TBD to BTD query = query.transpose(0, 1) key = key.transpose(0, 1) value = value.transpose(0, 1) q, k, v = self.forward_qkv(query, key, value) scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) scores = self.forward_attention(v, scores, key_padding_mask) scores = scores.transpose(0, 1) return scores, None ================================================ FILE: fairseq/modules/fairseq_dropout.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging from typing import List, Optional import torch.nn as nn import torch.nn.functional as F logger = logging.getLogger(__name__) class FairseqDropout(nn.Module): def __init__(self, p, module_name=None): super().__init__() self.p = p self.module_name = module_name self.apply_during_inference = False def forward(self, x, inplace: bool = False): if self.p > 0 and (self.training or self.apply_during_inference): return F.dropout(x, p=self.p, training=True, inplace=inplace) else: return x def make_generation_fast_( self, name: str, retain_dropout: bool = False, retain_dropout_modules: Optional[List[str]] = None, **kwargs ): if retain_dropout: if retain_dropout_modules is not None and self.module_name is None: logger.warning( "Cannot enable dropout during inference for module {} " "because module_name was not set".format(name) ) elif ( retain_dropout_modules is None # if None, apply to all modules or self.module_name in retain_dropout_modules ): logger.info( "Enabling dropout during inference for module: {}".format(name) ) self.apply_during_inference = True else: logger.info("Disabling dropout for module: {}".format(name)) ================================================ FILE: fairseq/modules/fp32_batch_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
""" batch norm done in fp32 (for fp16 training) """ import torch import torch.nn as nn class Fp32BatchNorm(nn.Module): def __init__(self, sync=False, *args, **kwargs): super().__init__() if sync: from fairseq.distributed import utils if utils.get_global_world_size() == 1: sync = False if sync: self.bn = nn.SyncBatchNorm(*args, **kwargs) else: self.bn = nn.BatchNorm1d(*args, **kwargs) self.sync = sync def forward(self, input): if self.bn.running_mean.dtype != torch.float: if self.sync: self.bn.running_mean = self.bn.running_mean.float() self.bn.running_var = self.bn.running_var.float() if self.bn.affine: try: self.bn.weight = self.bn.weight.float() self.bn.bias = self.bn.bias.float() except: self.bn.float() else: self.bn.float() output = self.bn(input.float()) return output.type_as(input) ================================================ FILE: fairseq/modules/fp32_group_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Layer norm done in fp32 (for fp16 training) """ import torch.nn as nn import torch.nn.functional as F class Fp32GroupNorm(nn.GroupNorm): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def forward(self, input): output = F.group_norm( input.float(), self.num_groups, self.weight.float() if self.weight is not None else None, self.bias.float() if self.bias is not None else None, self.eps, ) return output.type_as(input) ================================================ FILE: fairseq/modules/fp32_instance_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
""" Layer norm done in fp32 (for fp16 training) """ import torch.nn as nn import torch.nn.functional as F class Fp32InstanceNorm(nn.InstanceNorm1d): def __init__(self, *args, **kwargs): self.transpose_last = "transpose_last" in kwargs and kwargs["transpose_last"] if "transpose_last" in kwargs: del kwargs["transpose_last"] super().__init__(*args, **kwargs) def forward(self, input): if self.transpose_last: input = input.transpose(1, 2) output = F.instance_norm( input.float(), running_mean=self.running_mean, running_var=self.running_var, weight=self.weight.float() if self.weight is not None else None, bias=self.bias.float() if self.bias is not None else None, use_input_stats=self.training or not self.track_running_stats, momentum=self.momentum, eps=self.eps, ) if self.transpose_last: output = output.transpose(1, 2) return output.type_as(input) ================================================ FILE: fairseq/modules/gelu.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ See "Gaussian Error Linear Units (GELUs)" by Dan Hendrycks and Kevin Gimpel with the corresponding GitHub repo: https://github.com/hendrycks/GELUs """ import math import torch import torch.nn as nn def gelu_accurate(x): if not hasattr(gelu_accurate, "_a"): gelu_accurate._a = math.sqrt(2 / math.pi) return ( 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) ) def gelu(x: torch.Tensor) -> torch.Tensor: return torch.nn.functional.gelu(x.float()).type_as(x) ================================================ FILE: fairseq/modules/grad_multiply.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch class GradMultiply(torch.autograd.Function): @staticmethod def forward(ctx, x, scale): ctx.scale = scale res = x.new(x) return res @staticmethod def backward(ctx, grad): return grad * ctx.scale, None ================================================ FILE: fairseq/modules/gumbel_vector_quantizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch import torch.nn as nn import torch.nn.functional as F class GumbelVectorQuantizer(nn.Module): def __init__( self, dim, num_vars, temp, groups, combine_groups, vq_dim, time_first, activation=nn.GELU(), weight_proj_depth=1, weight_proj_factor=1, hard=True, std=0, ): """Vector quantization using gumbel softmax Args: dim: input dimension (channels) num_vars: number of quantized vectors per group temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor) groups: number of groups for vector quantization combine_groups: whether to use the vectors for all groups vq_dim: dimensionality of the resulting quantized vector time_first: if true, expect input in BxTxC format, otherwise in BxCxT activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1 weight_proj_depth: number of layers (with activation in between) to project input before computing logits weight_proj_factor: this is used only if weight_proj_depth is > 1. 
scales the inner dimensionality of projections by this factor """ super().__init__() self.groups = groups self.combine_groups = combine_groups self.input_dim = dim self.num_vars = num_vars self.time_first = time_first self.hard = hard assert ( vq_dim % groups == 0 ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation" var_dim = vq_dim // groups num_groups = groups if not combine_groups else 1 self.vars = nn.Parameter(torch.FloatTensor(1, num_groups * num_vars, var_dim)) if std == 0: nn.init.uniform_(self.vars) else: nn.init.normal_(self.vars, mean=0, std=std) if weight_proj_depth > 1: def block(input_dim, output_dim): return nn.Sequential(nn.Linear(input_dim, output_dim), activation) inner_dim = self.input_dim * weight_proj_factor self.weight_proj = nn.Sequential( *[ block(self.input_dim if i == 0 else inner_dim, inner_dim) for i in range(weight_proj_depth - 1) ], nn.Linear(inner_dim, groups * num_vars), ) else: self.weight_proj = nn.Linear(self.input_dim, groups * num_vars) nn.init.normal_(self.weight_proj.weight, mean=0, std=1) nn.init.zeros_(self.weight_proj.bias) if isinstance(temp, str): import ast temp = ast.literal_eval(temp) assert len(temp) == 3, f"{temp}, {len(temp)}" self.max_temp, self.min_temp, self.temp_decay = temp self.curr_temp = self.max_temp self.codebook_indices = None def set_num_updates(self, num_updates): self.curr_temp = max( self.max_temp * self.temp_decay**num_updates, self.min_temp ) def get_codebook_indices(self): if self.codebook_indices is None: from itertools import product p = [range(self.num_vars)] * self.groups inds = list(product(*p)) self.codebook_indices = torch.tensor( inds, dtype=torch.long, device=self.vars.device ).flatten() if not self.combine_groups: self.codebook_indices = self.codebook_indices.view( self.num_vars**self.groups, -1 ) for b in range(1, self.groups): self.codebook_indices[:, b] += self.num_vars * b self.codebook_indices = self.codebook_indices.flatten() return self.codebook_indices def 
codebook(self): indices = self.get_codebook_indices() return ( self.vars.squeeze(0) .index_select(0, indices) .view(self.num_vars**self.groups, -1) ) def sample_from_codebook(self, b, n): indices = self.get_codebook_indices() indices = indices.view(-1, self.groups) cb_size = indices.size(0) assert ( n < cb_size ), f"sample size {n} is greater than size of codebook {cb_size}" sample_idx = torch.randint(low=0, high=cb_size, size=(b * n,)) indices = indices[sample_idx] z = self.vars.squeeze(0).index_select(0, indices.flatten()).view(b, n, -1) return z def to_codebook_index(self, indices): res = indices.new_full(indices.shape[:-1], 0) for i in range(self.groups): exponent = self.groups - i - 1 res += indices[..., i] * (self.num_vars**exponent) return res def forward_idx(self, x): res = self.forward(x, produce_targets=True) return res["x"], res["targets"] def forward(self, x, produce_targets=False): result = {"num_vars": self.num_vars * self.groups} if not self.time_first: x = x.transpose(1, 2) bsz, tsz, fsz = x.shape x = x.reshape(-1, fsz) x = self.weight_proj(x) x = x.view(bsz * tsz * self.groups, -1) with torch.no_grad(): _, k = x.max(-1) hard_x = ( x.new_zeros(*x.shape) .scatter_(-1, k.view(-1, 1), 1.0) .view(bsz * tsz, self.groups, -1) ) hard_probs = torch.mean(hard_x.float(), dim=0) result["code_perplexity"] = torch.exp( -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1) ).sum() avg_probs = torch.softmax( x.view(bsz * tsz, self.groups, -1).float(), dim=-1 ).mean(dim=0) result["prob_perplexity"] = torch.exp( -torch.sum(avg_probs * torch.log(avg_probs + 1e-7), dim=-1) ).sum() result["temp"] = self.curr_temp if self.training: x = F.gumbel_softmax(x.float(), tau=self.curr_temp, hard=self.hard).type_as( x ) else: x = hard_x x = x.view(bsz * tsz, -1) vars = self.vars if self.combine_groups: vars = vars.repeat(1, self.groups, 1) if produce_targets: result["targets"] = ( x.view(bsz * tsz * self.groups, -1) .argmax(dim=-1) .view(bsz, tsz, self.groups) .detach() 
) x = x.unsqueeze(-1) * vars x = x.view(bsz * tsz, self.groups, self.num_vars, -1) x = x.sum(-2) x = x.view(bsz, tsz, -1) if not self.time_first: x = x.transpose(1, 2) # BTC -> BCT result["x"] = x return result ================================================ FILE: fairseq/modules/kmeans_attention.py ================================================ import math from functools import reduce, wraps from inspect import isfunction from operator import mul import torch import torch.nn as nn import torch.nn.functional as F from aml.multimodal_video.utils.einops.lib import rearrange, repeat from aml.multimodal_video.utils.einops.lib.layers.torch import Rearrange from fairseq.modules.local_attention import LocalAttention # constants TOKEN_SELF_ATTN_VALUE = -5e4 KMEAN_INIT_ITERS = 10 # helper functions def exists(val): return val is not None def identity(x, *args, **kwargs): return x def default(x, d): if not exists(x): return d if not isfunction(d) else d() return x def cast_tuple(x): return x if isinstance(x, tuple) else (x,) def cache_fn(f): cache = None @wraps(f) def cached_fn(*args, **kwargs): nonlocal cache if exists(cache): return cache cache = f(*args, **kwargs) return cache return cached_fn def to(t): return {"device": t.device, "dtype": t.dtype} def find_modules(nn_module, type): return [module for module in nn_module.modules() if isinstance(module, type)] def is_empty(t): return t.nelement() == 0 def max_neg_value(tensor): return -torch.finfo(tensor.dtype).max def batched_index_select(values, indices): last_dim = values.shape[-1] return values.gather(2, expand_dim(indices, -1, last_dim)) def merge_dims(ind_from, ind_to, tensor): shape = list(tensor.shape) arr_slice = slice(ind_from, ind_to + 1) shape[arr_slice] = [reduce(mul, shape[arr_slice])] return tensor.reshape(*shape) def expand_dim(t, dim, k): t = t.unsqueeze(dim) expand_shape = [-1] * len(t.shape) expand_shape[dim] = k return t.expand(*expand_shape) def scatter_mean(src, t, index, dim, eps=1e-5): numer = 
src.scatter_add(dim, index, t) denom = src.scatter_add(dim, index, torch.ones_like(t)) return numer / (denom + eps) def split_at_index(dim, index, t): pre_slices = (slice(None),) * dim l = (*pre_slices, slice(None, index)) r = (*pre_slices, slice(index, None)) return t[l], t[r] def reshape_dim(t, dim, split_dims): shape = list(t.shape) num_dims = len(shape) dim = (dim + num_dims) % num_dims shape[dim : dim + 1] = split_dims return t.reshape(shape) def ema(old, new, decay): if not exists(old): return new return old * decay + new * (1 - decay) def ema_inplace(moving_avg, new, decay): if is_empty(moving_avg): moving_avg.data.copy_(new) return moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay)) # helper classes def map_first_tuple_or_el(x, fn): if isinstance(x, tuple): return (fn(x[0]),) + x[1:] return fn(x) class Chunk(nn.Module): def __init__(self, chunks, fn, along_dim=-1): super().__init__() self.dim = along_dim self.chunks = chunks self.fn = fn def forward(self, x, **kwargs): if self.chunks <= 1: return self.fn(x, **kwargs) chunks = x.chunk(self.chunks, dim=self.dim) return torch.cat([self.fn(c, **kwargs) for c in chunks], dim=self.dim) class PreNorm(nn.ModuleList): def __init__(self, norm_class, dim, fn): super().__init__() self.norm = norm_class(dim) self.fn = fn def forward(self, x, **kwargs): x = self.norm(x) return self.fn(x, **kwargs) class ReZero(nn.Module): def __init__(self, fn): super().__init__() self.residual_weight = nn.Parameter(torch.zeros(1)) self.fn = fn def forward(self, x, **kwargs): x = self.fn(x, **kwargs) return map_first_tuple_or_el(x, lambda t: t * self.residual_weight) class ScaleNorm(nn.Module): def __init__(self, dim, eps=1e-5): super().__init__() self.g = nn.Parameter(torch.ones(1)) self.eps = eps def forward(self, x): def norm(t): n = torch.norm(t, dim=-1, keepdim=True).clamp(min=self.eps) return t / n * self.g return map_first_tuple_or_el(x, norm) class ProjectInOut(nn.Module): def __init__(self, fn, dim_in, dim_out, 
project_out=True): super().__init__() self.fn = fn self.project_in = nn.Linear(dim_in, dim_out) self.project_out = nn.Linear(dim_out, dim_in) if project_out else identity def forward(self, x, **kwargs): x = self.project_in(x) x, loss = self.fn(x, **kwargs) x = self.project_out(x) return x, loss class MatrixMultiply(nn.Module): def __init__(self, tensor, transpose=False): super().__init__() self.tensor = tensor self.transpose = transpose def forward(self, x): tensor = self.tensor if self.transpose: tensor = tensor.t() return x @ tensor # positional embeddings class DepthWiseConv1d(nn.Module): def __init__(self, dim_in, dim_out, kernel_size, stride=1, bias=True, causal=False): super().__init__() self.padding = ( ((kernel_size - 1), 0) if causal else (kernel_size // 2, kernel_size // 2) ) self.net = nn.Sequential( nn.Conv1d( dim_in, dim_in, kernel_size=kernel_size, groups=dim_in, stride=stride, bias=bias, ), nn.Conv1d(dim_in, dim_out, 1, bias=bias), ) def forward(self, x): x = F.pad(x, self.padding, value=0.0) return self.net(x) class FixedPositionalEmbedding(nn.Module): def __init__(self, dim, max_seq_len): super().__init__() inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim)) position = torch.arange(0, max_seq_len, dtype=torch.float) sinusoid_inp = torch.einsum("i,j->ij", position, inv_freq) emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) self.register_buffer("emb", emb) def forward(self, x): return self.emb[None, : x.shape[1], :].to(x) def rotate_every_two(x): x = rearrange(x, "... (d j) -> ... d j", j=2) x1, x2 = x.unbind(dim=-1) x = torch.stack((-x2, x1), dim=-1) return rearrange(x, "... d j -> ... 
(d j)") def apply_rotary_pos_emb(q, k, sinu_pos): sinu_pos = rearrange(sinu_pos, "() n (j d) -> n j d", j=2) sin, cos = sinu_pos.unbind(dim=-2) sin, cos = map(lambda t: repeat(t, "b n -> b (n j)", j=2), (sin, cos)) q, k = map(lambda t: (t * cos) + (rotate_every_two(t) * sin), (q, k)) return q, k # kmeans related function and class def update_kmeans_on_backwards(module): module.kmean_modules = find_modules(module, Kmeans) def hook(_, grad_in, grad_out): for m in module.kmean_modules: m.update() return module.register_backward_hook(hook) def similarity(x, means): return torch.einsum("bhld,hcd->bhlc", x, means) def dists_and_buckets(x, means): dists = similarity(x, means) _, buckets = torch.max(dists, dim=-1) return dists, buckets def batched_bincount(index, num_classes, dim=-1): shape = list(index.shape) shape[dim] = num_classes out = index.new_zeros(shape) out.scatter_add_(dim, index, torch.ones_like(index, dtype=index.dtype)) return out def kmeans_iter(x, means, buckets=None): b, h, _, d, dtype, num_clusters = *x.shape, x.dtype, means.shape[1] if not exists(buckets): _, buckets = dists_and_buckets(x, means) bins = batched_bincount(buckets, num_clusters).sum(0, keepdim=True) zero_mask = bins.long() == 0 means_ = buckets.new_zeros(b, h, num_clusters, d, dtype=dtype) means_.scatter_add_(-2, expand_dim(buckets, -1, d), x) means_ = F.normalize(means_.sum(0, keepdim=True), dim=-1).type(dtype) means = torch.where(zero_mask.unsqueeze(-1), means, means_) means = means.squeeze(0) return means def distribution(dists, window_size): _, topk_indices = dists.topk(k=window_size, dim=-2) indices = topk_indices.transpose(-2, -1) return indices.reshape(*indices.size()[:2], -1) class Kmeans(nn.Module): def __init__( self, num_heads, head_dim, num_clusters, ema_decay=0.999, commitment=1e-4 ): super().__init__() self.commitment = commitment self.ema_decay = ema_decay self.register_buffer("means", torch.randn(num_heads, num_clusters, head_dim)) self.register_buffer("initted", 
torch.tensor(False)) self.num_new_means = 0 self.new_means = None @torch.no_grad() def init(self, x): if self.initted: return _, h, _, d, device, _ = *x.shape, x.device, x.dtype num_clusters = self.means.shape[1] means = x.transpose(0, 1).contiguous().view(h, -1, d) num_samples = means.shape[1] if num_samples >= num_clusters: indices = torch.randperm(num_samples, device=device)[:num_clusters] else: indices = torch.randint(0, num_samples, (num_clusters,), device=device) means = means[:, indices] for _ in range(KMEAN_INIT_ITERS): means = kmeans_iter(x, means) self.num_new_means = 0 self.means.data.copy_(means) self.initted.data.copy_(torch.tensor(True)) @torch.no_grad() def update(self, new_means=None): new_means = default(new_means, self.new_means) assert exists(new_means), "new kmeans has not been supplied" ema_inplace(self.means, new_means, self.ema_decay) del self.new_means self.new_means = None self.num_new_means = 0 def forward(self, x, update_means=False): self.init(x) b, dtype = x.shape[0], x.dtype means = self.means.type(dtype) x = F.normalize(x, 2, dim=-1).type(dtype) with torch.no_grad(): dists, buckets = dists_and_buckets(x, means) routed_means = batched_index_select(expand_dim(means, 0, b), buckets) loss = F.mse_loss(x, routed_means) * self.commitment if update_means: with torch.no_grad(): means = kmeans_iter(x, means, buckets) self.new_means = ema( self.new_means, means, self.num_new_means / (self.num_new_means + 1) ) self.num_new_means += 1 return dists, loss # kmeans attention class class KmeansAttention(nn.Module): def __init__( self, num_clusters, window_size, num_heads, head_dim, causal=False, dropout=0.0, ema_decay=0.999, commitment=1e-4, context_window_size=None, receives_context=False, num_mem_kv=0, shared_qk=False, ): super().__init__() self.num_heads = num_heads self.num_clusters = num_clusters self.head_dim = head_dim self.window_size = window_size self.context_window_size = default(context_window_size, window_size) self.causal = causal 
self.shared_qk = shared_qk self.receives_context = receives_context self.kmeans = Kmeans(num_heads, head_dim, num_clusters, ema_decay, commitment) self.dropout = nn.Dropout(dropout) self.num_mem_kv = max(num_mem_kv, 1 if causal and not shared_qk else 0) self.mem_key = nn.Parameter( torch.randn(num_heads, num_clusters, self.num_mem_kv, head_dim) ) self.mem_value = nn.Parameter( torch.randn(num_heads, num_clusters, self.num_mem_kv, head_dim) ) def forward(self, q, k, v, query_mask=None, key_mask=None, **kwargs): b, h, t, d, kv_t, wsz, c_wsz, nc, device, dtype = ( *q.shape, k.shape[2], self.window_size, self.context_window_size, self.num_clusters, q.device, q.dtype, ) is_reverse = kwargs.pop("_reverse", False) out = torch.zeros_like(q, dtype=dtype) update_kmeans = self.training and not is_reverse key_mask = ( default(key_mask, query_mask) if not self.receives_context else key_mask ) kv_wsz = wsz if not self.receives_context else c_wsz wsz = min(wsz, t) kv_wsz = min(kv_wsz, kv_t) if not self.shared_qk or self.receives_context: dists, aux_loss = self.kmeans(torch.cat((q, k), dim=2), update_kmeans) q_dists, k_dists = split_at_index(2, t, dists) indices = distribution(q_dists, wsz) kv_indices = distribution(k_dists, kv_wsz) else: dists, aux_loss = self.kmeans(q, update_kmeans) k = F.normalize(k, dim=-1).to(q) indices = distribution(dists, wsz) kv_indices = indices q = batched_index_select(q, indices) k = batched_index_select(k, kv_indices) v = batched_index_select(v, kv_indices) reshape_with_window = lambda x: x.reshape(b, h, nc, -1, d) q, k, v = map(reshape_with_window, (q, k, v)) m_k, m_v = map( lambda x: expand_dim(x, 0, b).to(q), (self.mem_key, self.mem_value) ) k, v = map(lambda x: torch.cat(x, dim=3), ((m_k, k), (m_v, v))) dots = torch.einsum("bhnid,bhnjd->bhnij", q, k) * (d**-0.5) mask_value = max_neg_value(dots) if exists(query_mask) or exists(key_mask): query_mask = default( query_mask, lambda: torch.ones((b, t), device=device).bool() ) key_mask = default( 
# feedforward


class GELU_(nn.Module):
    """Tanh approximation of GELU, used when ``torch.nn`` has no ``GELU``."""

    def forward(self, x):
        inner = math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


# Prefer the built-in activation when this torch version provides one.
GELU = nn.GELU if hasattr(nn, "GELU") else GELU_


class FeedForward(nn.Module):
    """Two-layer position-wise feed-forward block with optional GLU gating.

    Args:
        dim: input and output width.
        mult: hidden width multiplier (hidden size is ``dim * mult``).
        dropout: dropout probability applied to the hidden activations.
        activation: activation class (called with no arguments); falls back
            to ``GELU`` through the file-level ``default`` helper.
        glu: if True the first projection is twice as wide and is split into
            a gated half (passed through the activation) and a value half,
            which are multiplied together.
    """

    def __init__(self, dim, mult=4, dropout=0.0, activation=None, glu=False):
        super().__init__()
        activation = default(activation, GELU)
        hidden_dim = dim * mult

        self.glu = glu
        self.w1 = nn.Linear(dim, hidden_dim * (2 if glu else 1))
        self.act = activation()
        self.dropout = nn.Dropout(dropout)
        self.w2 = nn.Linear(hidden_dim, dim)

    def forward(self, x, **kwargs):
        """Project up, activate (optionally gated), drop out, project down.

        Extra keyword arguments are accepted and ignored.
        """
        hidden = self.w1(x)
        if self.glu:
            gate, value = hidden.chunk(2, dim=-1)
            hidden = self.act(gate) * value
        else:
            hidden = self.act(hidden)
        return self.w2(self.dropout(hidden))
heads, local_attn_heads, window_size, dim_head=None, local_attn_window_size=None, local_attn_radius_blocks=1, causal=False, attn_dropout=0.0, dropout=0.0, kmeans_ema_decay=0.999, commitment_factor=1e-4, receives_context=False, context_window_size=None, rel_pos_emb=True, num_mem_kv=0, shared_qk=False, conv_query_kernel=9, ): super().__init__() assert ( dim_head or (dim % heads) == 0 ), "hidden dimension must be divisible by number of heads" assert ( max_seq_len % window_size ) == 0, "maximum sequence length must be divisible by the target window size" assert ( local_attn_heads <= heads ), "number of local attention heads must be less than total heads" assert not ( receives_context and local_attn_heads > 0 ), "local attention cannot be used for self attention with context" assert not ( receives_context and causal ), "contextual attention layer cannot be causal" local_attn_window_size = default(local_attn_window_size, window_size) context_window_size = default(context_window_size, window_size) self.shared_qk = shared_qk self.receives_context = receives_context self.heads = heads self.local_attn_heads = local_attn_heads self.global_attn_heads = heads - local_attn_heads self.causal = causal self.window_size = window_size dim_head = default(dim_head, dim // heads) dim_heads = dim_head * heads self.dim_head = dim_head num_clusters = max_seq_len // window_size # local local_dim_heads = dim_head * self.local_attn_heads if self.local_attn_heads > 0: rel_pos_emb_config = (dim_head, local_attn_heads) if rel_pos_emb else None self.local_attn = LocalAttention( dim=dim_head, window_size=local_attn_window_size, causal=causal, dropout=attn_dropout, rel_pos_emb_config=rel_pos_emb_config, look_backward=local_attn_radius_blocks, look_forward=0 if causal else local_attn_radius_blocks, ) self.local_to_qkv = nn.Linear(dim, 3 * local_dim_heads) # global global_dim_heads = dim_head * self.global_attn_heads if self.global_attn_heads > 0: self.global_attn = KmeansAttention( num_clusters, 
window_size, self.global_attn_heads, dim_head, causal=causal, dropout=attn_dropout, ema_decay=kmeans_ema_decay, commitment=commitment_factor, receives_context=receives_context, num_mem_kv=num_mem_kv, shared_qk=shared_qk, ) self.to_q = nn.Sequential( Rearrange("b n c -> b c n"), DepthWiseConv1d(dim, global_dim_heads, conv_query_kernel, causal=causal), Rearrange("b c n -> b n c"), ) self.to_v = nn.Linear(dim, global_dim_heads, bias=False) if not self.shared_qk: self.to_k = nn.Linear(dim, global_dim_heads, bias=False) # out self.to_out = nn.Linear(dim_heads, dim, bias=False) self.dropout = nn.Dropout(dropout) def forward( self, query, key, value, context=None, key_padding_mask=None, context_mask=None, pos_emb=None, **kwargs ): assert not ( self.receives_context and not exists(context) ), "context must be passed if self attention is set to receive context" input_mask = key_padding_mask x = query.transpose(0, 1) b, t, _, h, dh = *x.shape, self.heads, self.dim_head has_local, has_global = map( lambda x: x > 0, (self.local_attn_heads, self.global_attn_heads) ) split_heads = ( lambda v: reshape_dim(v, -1, (-1, dh)).transpose(1, 2).contiguous() ) if has_local: local_qkv = self.local_to_qkv(x).chunk(3, dim=-1) lq, lk, lv = map(split_heads, local_qkv) if has_global: kv_input = x if not self.receives_context else context q, v = self.to_q(x), self.to_v(kv_input) if not self.shared_qk: k = self.to_k(kv_input) else: k = self.to_q(kv_input) if self.receives_context else q q, k, v = map(split_heads, (q, k, v)) out = [] total_loss = torch.tensor(0.0, requires_grad=True, **to(x)) if has_local: local_out = self.local_attn(lq, lk, lv, input_mask=input_mask) out.append(local_out) if has_global: if not self.receives_context and exists(pos_emb): q, k = apply_rotary_pos_emb(q, k, pos_emb) global_out, loss = self.global_attn( q, k, v, query_mask=input_mask, key_mask=context_mask ) total_loss = total_loss + loss out.append(global_out) out = torch.cat(out, dim=1) out = out.reshape(b, h, t, 
class KmeansVectorQuantizer(nn.Module):
    """Vector quantization using straight pass-through estimator (i.e. kmeans).

    Args:
        dim: input dimension (channels)
        num_vars: number of quantized vectors per group
        groups: number of groups for vector quantization
        combine_groups: whether to use the vectors for all groups
        vq_dim: dimensionality of the resulting quantized vector
        time_first: if true, expect input in BxTxC format, otherwise in BxCxT
        gamma: commitment loss coefficient
    """

    def __init__(
        self, dim, num_vars, groups, combine_groups, vq_dim, time_first, gamma=0.25
    ):
        super().__init__()

        assert (
            vq_dim % groups == 0
        ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"

        self.groups = groups
        self.combine_groups = combine_groups
        self.input_dim = dim
        self.num_vars = num_vars
        self.vq_dim = vq_dim
        self.time_first = time_first
        self.var_dim = vq_dim // groups

        # A single shared codebook group is stored when groups are combined.
        stored_groups = 1 if combine_groups else groups
        self.embedding = nn.Parameter(
            0.01 * torch.randn(num_vars, stored_groups, self.var_dim)
        )
        self.projection = nn.Sequential(
            nn.Conv1d(dim, dim, kernel_size=1, groups=groups, bias=False),
            Fp32GroupNorm(groups, dim),
        )
        self.gamma = gamma
        self.mse_mean = nn.MSELoss(reduction="mean")

    def _pass_grad(self, x, y):
        """Return ``y``'s values while routing the gradient to ``x``.

        The forward value equals ``y``; in the backward pass ``y`` is
        detached, so the incoming gradient flows unchanged into ``x``.
        """
        return y.detach() + (x - x.detach())

    @property
    def expand_embedding(self):
        """Codebook viewed as ``(num_vars, groups, var_dim)``."""
        if not self.combine_groups:
            return self.embedding
        return self.embedding.expand(self.num_vars, self.groups, self.var_dim)

    def forward_idx(self, x):
        """Quantize ``x`` and return ``(quantized, code_indices)``."""
        out = self.forward(x, produce_targets=True)
        return out["x"], out["targets"]

    def forward(self, x, produce_targets=False):
        """Quantize ``x`` against the codebook.

        Returns:
            dict with ``num_vars``, ``code_perplexity``, ``x`` (quantized
            output, in the same axis order as the input), ``kmeans_loss`` and,
            when ``produce_targets`` is set, ``targets`` (selected indices).
        """
        result = {"num_vars": self.num_vars}

        if self.time_first:
            x = x.transpose(1, 2)

        bsz, _, tsz = x.shape
        codebook = self.expand_embedding

        ze = self.projection(x)
        grouped = ze.view(bsz, self.groups, self.var_dim, tsz).permute(0, 3, 1, 2)

        # L2 distance of every grouped vector to every codebook entry.
        offsets = grouped.unsqueeze(0) - codebook.unsqueeze(1).unsqueeze(1)
        d = offsets.view(self.num_vars, bsz, tsz, self.groups, -1).norm(dim=-1, p=2)
        idx = d.argmin(dim=0)

        chosen = [codebook[idx[..., g], g] for g in range(self.groups)]
        zq = (
            torch.stack(chosen, dim=-2)
            .view(bsz, tsz, self.groups * self.var_dim)
            .permute(0, 2, 1)
        )
        assert ze.shape == zq.shape, (ze.shape, zq.shape)

        x = self._pass_grad(ze, zq)

        with torch.no_grad():
            one_hot = idx.new_zeros(bsz * tsz * self.groups, self.num_vars)
            one_hot = one_hot.scatter_(-1, idx.view(-1, 1), 1.0)
            hard_x = one_hot.view(bsz * tsz, self.groups, -1)
            hard_probs = torch.mean(hard_x.float(), dim=0)
            entropy = -torch.sum(hard_probs * torch.log(hard_probs + 1e-7), dim=-1)
            result["code_perplexity"] = torch.exp(entropy).sum()

        if produce_targets:
            result["targets"] = idx

        if self.time_first:
            x = x.transpose(1, 2)  # BCT -> BTC
        result["x"] = x

        ze = ze.float()
        zq = zq.float()
        latent_loss = self.mse_mean(zq, ze.detach())
        commitment_loss = self.mse_mean(ze, zq.detach())
        result["kmeans_loss"] = latent_loss + self.gamma * commitment_loss

        return result
""" import torch import torch.nn as nn class LayerDropModuleList(nn.ModuleList): """ A LayerDrop implementation based on :class:`torch.nn.ModuleList`. We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During evaluation we always iterate over all layers. Usage:: layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3]) for layer in layers: # this might iterate over layers 1 and 3 x = layer(x) for layer in layers: # this might iterate over all layers x = layer(x) for layer in layers: # this might not iterate over any layers x = layer(x) Args: p (float): probability of dropping out each layer modules (iterable, optional): an iterable of modules to add """ def __init__(self, p, modules=None): super().__init__(modules) self.p = p def __iter__(self): dropout_probs = torch.empty(len(self)).uniform_() for i, m in enumerate(super().__iter__()): if not self.training or (dropout_probs[i] > self.p): yield m ================================================ FILE: fairseq/modules/layer_norm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
# Use apex's fused kernel when it is installed; otherwise fall back to the
# native implementation everywhere.
try:
    from apex.normalization import FusedLayerNorm as _FusedLayerNorm
except ImportError:
    has_fused_layernorm = False
else:
    has_fused_layernorm = True

    class FusedLayerNorm(_FusedLayerNorm):
        """apex ``FusedLayerNorm`` that runs under the input's CUDA device."""

        @torch.jit.unused
        def forward(self, x):
            if x.is_cuda:
                # Make the input's device current for the fused kernel.
                with torch.cuda.device(x.device):
                    return super().forward(x)
            return super().forward(x)


def LayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True, export=False):
    """Build a layer-norm module, preferring the fused apex version.

    ``torch.nn.LayerNorm`` is returned when ``export`` is set, when running
    under TorchScript scripting or tracing, when CUDA is unavailable, or when
    apex could not be imported. Otherwise a ``FusedLayerNorm`` is returned.
    """
    exporting = export or torch.jit.is_scripting() or torch.jit.is_tracing()
    can_fuse = torch.cuda.is_available() and has_fused_layernorm
    if exporting or not can_fuse:
        return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)
    return FusedLayerNorm(normalized_shape, eps, elementwise_affine)


class Fp32LayerNorm(nn.LayerNorm):
    """Layer norm computed in float32 and cast back to the input's type."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def forward(self, input):
        # Affine parameters may be absent (elementwise_affine=False).
        weight = None if self.weight is None else self.weight.float()
        bias = None if self.bias is None else self.bias.float()
        normed = F.layer_norm(
            input.float(), self.normalized_shape, weight, bias, self.eps
        )
        return normed.type_as(input)
""" def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int): super().__init__(num_embeddings, embedding_dim, padding_idx) self.onnx_trace = False if self.padding_idx is not None: self.max_positions = self.num_embeddings - self.padding_idx - 1 else: self.max_positions = self.num_embeddings def forward( self, input: Tensor, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, positions: Optional[Tensor] = None, ): """Input is expected to be of size [bsz x seqlen].""" assert (positions is None) or ( self.padding_idx is None ), "If positions is pre-computed then padding_idx should not be set." if positions is None: if incremental_state is not None: # positions is the same for every token when decoding a single step # Without the int() cast, it doesn't work in some cases when exporting to ONNX positions = torch.zeros( (1, 1), device=input.device, dtype=input.dtype ).fill_(int(self.padding_idx + input.size(1))) else: positions = utils.make_positions( input, self.padding_idx, onnx_trace=self.onnx_trace ) return F.embedding( positions, self.weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse, ) ================================================ FILE: fairseq/modules/lightconv_layer/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .lightconv_layer import LightconvLayer # noqa ================================================ FILE: fairseq/modules/lightconv_layer/cuda_function_gen.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def gen_forward(): kernels = [3, 5, 7, 15, 31, 63, 127, 255] seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] head = """ /** * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include "lightconv_cuda.cuh" std::vector<at::Tensor> lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l) { at::DeviceGuard g(input.device()); const auto minibatch = input.size(0); const auto numFeatures = input.size(1); const auto sequenceLength = input.size(2); const auto numHeads = filters.size(0); const auto filterSize = filters.size(1); const auto numFiltersInBlock = numFeatures / numHeads; const dim3 blocks(minibatch, numFeatures); auto output = at::zeros_like(input); auto stream = at::cuda::getCurrentCUDAStream(); """ sequence_if = """ if (sequenceLength <= {seq}) {{ switch(filterSize) {{ """ case_k = """ case {k}: """ main_block = """ if (padding_l == {pad}) {{ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_forward", ([&] {{ lightconv_forward_kernel<{k}, {b_size}, {pad}, scalar_t> <<<blocks, {b_size}, 0, stream>>>( input.data<scalar_t>(), filters.data<scalar_t>(), minibatch, sequenceLength, numFeatures, numFiltersInBlock, output.data<scalar_t>()); }})); }} else """ bad_padding = """ { std::cout << "WARNING: Unsupported padding size - skipping forward pass" << std::endl; } break; """ bad_filter = """ default: std::cout << "WARNING: Unsupported filter length passed - skipping forward pass" << std::endl; } """ con_else = """ } else """ final_else = """ { switch(filterSize) { """ final_return = """ } return {output}; } """ with open("lightconv_cuda_forward.cu", "w") as forward: forward.write(head) for seq in seqs: forward.write(sequence_if.format(seq=seq)) for k in kernels: forward.write(case_k.format(k=k)) for pad in [k // 2, k - 1]: forward.write(main_block.format(k=k, b_size=seq, 
pad=pad)) forward.write(bad_padding) forward.write(bad_filter) forward.write(con_else) forward.write(final_else) for k in kernels: forward.write(case_k.format(k=k)) for pad in [k // 2, k - 1]: forward.write(main_block.format(k=k, b_size=seq, pad=pad)) forward.write(bad_padding) forward.write(bad_filter) forward.write(final_return) def gen_backward(): head = """ /** * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include "lightconv_cuda.cuh" std::vector<at::Tensor> lightconv_cuda_backward( at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor filters) { // gradWrtInput const int minibatch = input.size(0); const int numFeatures = input.size(1); const int sequenceLength = input.size(2); const int numHeads = filters.size(0); const int filterSize = filters.size(1); const dim3 gradBlocks(minibatch, numFeatures); const dim3 weightGradFirstpassShortBlocks(minibatch, numHeads); const dim3 weightGradSecondpassBlocks(numHeads, filterSize); const int numFiltersInBlock = numFeatures / numHeads; auto gradInput = at::zeros_like(input); auto gradFilters = at::zeros_like(filters); at::DeviceGuard g(input.device()); auto stream = at::cuda::getCurrentCUDAStream(); switch(filterSize) { """ sequence_if = """ if (sequenceLength <= {seq}) {{ """ case_k = """ case {k}: """ main_block = """ if (padding_l == {p}) {{ AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.scalar_type(), "lightconv_backward", ([&] {{ lightconv_grad_wrt_input_kernel<{k}, {b_size}, {p}, scalar_t> <<<gradBlocks, {b_size}, 0, stream>>>( gradOutput.data<scalar_t>(), filters.data<scalar_t>(), minibatch, sequenceLength, numFeatures, numFiltersInBlock, gradInput.data<scalar_t>()); """ weight_grad_short = """ at::Tensor tempSumGradFilters = at::zeros({{minibatch, numHeads, filterSize}}, input.options().dtype(at::kFloat)); lightconv_grad_wrt_weights_firstpass_short_kernel<{k}, {b_size}, 
{p}, scalar_t> <<<weightGradFirstpassShortBlocks, {b_size}, 0, stream>>>( input.data<scalar_t>(), gradOutput.data<scalar_t>(), minibatch, sequenceLength, numFeatures, numFiltersInBlock, numHeads, tempSumGradFilters.data<float>() ); lightconv_grad_wrt_weights_secondpass_short_kernel<{k}, {b_size}, scalar_t> <<<weightGradSecondpassBlocks, {b_size}, 0, stream>>>( tempSumGradFilters.data<float>(), minibatch, numFiltersInBlock, gradFilters.data<scalar_t>() ); }})); }} else """ weight_grad = """ at::Tensor tempSumGradFilters = at::zeros({{minibatch, numFeatures, filterSize}}, input.options().dtype(at::kFloat)); lightconv_grad_wrt_weights_firstpass_kernel<{k}, {b_size}, {p}, scalar_t> <<<gradBlocks, {b_size}, 0, stream>>>( input.data<scalar_t>(), gradOutput.data<scalar_t>(), minibatch, sequenceLength, numFeatures, numFiltersInBlock, tempSumGradFilters.data<float>() ); lightconv_grad_wrt_weights_secondpass_kernel<{k}, {b_size}, scalar_t> <<<weightGradSecondpassBlocks, {b_size}, 0, stream>>>( tempSumGradFilters.data<float>(), minibatch, numFiltersInBlock, gradFilters.data<scalar_t>() ); }})); }} else """ bad_padding = """ { std::cout << "WARNING: Unsupported padding size - skipping backward pass" << std::endl; } """ breakout = """ break; """ bad_filter = """ default: std::cout << "WARNING: Unsupported filter length passed - skipping backward pass" << std::endl; """ con_else = """ } else """ final_else = """ { switch(filterSize) { """ last_return = """ } return {gradInput, gradFilters}; } """ kernels = [3, 5, 7, 15, 31, 63, 127, 255] seqs = [32 * x for x in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]] thresh = [32, 32, 64, 128, 256, -1, -1, -1] max_mem = [-1, -1, -1, -1, -1, 192, 96, 64] with open("lightconv_cuda_backward.cu", "w") as backward: backward.write(head) for (k, t, mem) in zip(kernels, thresh, max_mem): backward.write(case_k.format(k=k)) for seq in seqs: if (t == -1 or seq <= t) and (mem == -1 or seq < mem): backward.write(sequence_if.format(seq=seq)) 
for p in [k // 2, k - 1]: backward.write(main_block.format(k=k, b_size=seq, p=p)) backward.write(weight_grad_short.format(k=k, b_size=seq, p=p)) backward.write(bad_padding) else: for p in [k // 2, k - 1]: backward.write(main_block.format(k=k, b_size=32, p=p)) backward.write(weight_grad.format(k=k, b_size=32, p=p)) backward.write(bad_padding) backward.write(breakout) break backward.write(con_else) backward.write(bad_filter) backward.write(last_return) if __name__ == "__main__": gen_forward() gen_backward() ================================================ FILE: fairseq/modules/lightconv_layer/lightconv_cuda.cpp ================================================ /** * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include <torch/extension.h> #include <vector> std::vector<at::Tensor> lightconv_cuda_forward(at::Tensor input, at::Tensor filters, int padding_l); std::vector<at::Tensor> lightconv_cuda_backward( at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor filters); #define CHECK_CUDA(x) \ AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") #define CHECK_CONTIGUOUS(x) \ AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") #define CHECK_INPUT(x) \ CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) std::vector<at::Tensor> lightconv_forward(at::Tensor input, at::Tensor filters, int padding_l) { CHECK_INPUT(input); CHECK_INPUT(filters); return lightconv_cuda_forward(input, filters, padding_l); } std::vector<at::Tensor> lightconv_backward( at::Tensor gradOutput, int padding_l, at::Tensor input, at::Tensor filters) { CHECK_INPUT(gradOutput); CHECK_INPUT(input); CHECK_INPUT(filters); return lightconv_cuda_backward(gradOutput, padding_l, input, filters); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("forward", &lightconv_forward, "lighconv forward (CUDA)"); m.def("backward", &lightconv_backward, "lighconv backward (CUDA)"); } 
================================================ FILE: fairseq/modules/lightconv_layer/lightconv_cuda.cuh ================================================ /** * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ #include <ATen/ATen.h> #include <c10/cuda/CUDAStream.h> #include <cuda.h> #include <cuda_runtime.h> #include <algorithm> #include <functional> #include <iostream> #include <stdexcept> #include <utility> #include <vector> #include <assert.h> #include <stdlib.h> #define SHFL_MASK 0xffffffff template <int FS, int SB, int padding_l, typename scalar_t> __global__ void lightconv_forward_kernel( const scalar_t* input, const scalar_t* filters, int minibatch, int sequenceLength, int numFeatures, int numFiltersInBlock, scalar_t* output); template <int FS, int SB, int padding_l, typename scalar_t> __global__ void lightconv_grad_wrt_input_kernel( const scalar_t* input, const scalar_t* filters, int minibatch, int sequenceLength, int numFeatures, int numFiltersInBlock, scalar_t* output); template <int FS, int SB, int padding_l, typename scalar_t> __global__ void lightconv_grad_wrt_weights_firstpass_short_kernel( const scalar_t* input, const scalar_t* gradInput, int minibatch, int sequenceLength, int numFeatures, int numFiltersInBlock, int numHeads, float* output); template <int FS, int SB, typename scalar_t> __global__ void lightconv_grad_wrt_weights_secondpass_short_kernel( const float* input, const int minibatch, const int numFiltersInBlock, scalar_t* output); template <int FS, int SB, int padding_l, typename scalar_t> __global__ void lightconv_grad_wrt_weights_firstpass_kernel( const scalar_t* input, const scalar_t* gradInput, int minibatch, int sequenceLength, int numFeatures, int numFiltersInBlock, float* output); template <int FS, int SB, typename scalar_t> __global__ void lightconv_grad_wrt_weights_secondpass_kernel( const float* input, 
// Forward pass of the lightweight convolution for one (batch, feature) row.
//
// Indexing as read by this kernel: blockIdx.x selects the batch element,
// blockIdx.y selects the feature, and the block must have exactly SB threads
// (asserted below). Across loop iterations thread `tid` produces output
// positions tid, tid + SB, tid + 2 * SB, ... of that row.
//
// Template parameters:
//   FS        - filter length (number of taps).
//   SB        - threads per block; also the number of positions handled per
//               loop iteration.
//   padding_l - forwarded to zeroSharedMem / load_input_to_shared. Those
//               helpers are defined outside this chunk, so its exact effect
//               on the shared-memory layout is not visible here.
//               NOTE(review): presumably the left zero padding - confirm.
//
// Arguments:
//   input, output     - indexed as [batch][feature][position] with rows of
//                       sequenceLength elements.
//   filters           - FS taps per filter; feature f uses filter
//                       f / numFiltersInBlock.
//   minibatch         - not read by this kernel.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void lightconv_forward_kernel(
    const scalar_t* input,
    const scalar_t* filters,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    scalar_t* output) {
  const int tid = threadIdx.x;
  const int batchIdx = blockIdx.x;
  const int featureIdx = blockIdx.y;
  // Consecutive groups of numFiltersInBlock features share one filter.
  const int filterIdx = featureIdx / numFiltersInBlock;

  // Start of this (batch, feature) row; input and output use the same offset.
  const int IOOffset =
      numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
  const scalar_t* inputFeature = &input[IOOffset];
  scalar_t* outputFeature = &output[IOOffset];
  const scalar_t* inputFilter = &filters[filterIdx * FS];

  assert(blockDim.x == SB);

  // Each thread keeps its own copy of the FS filter taps.
  scalar_t filter[FS];
#pragma unroll
  for (int i = 0; i < FS; ++i) {
    filter[i] = inputFilter[i];
  }

  // Shared staging buffer of SB + FS elements for the current input tile.
  __shared__ scalar_t temp[SB + FS];
  // NOTE(review): unlike lightconv_grad_wrt_input_kernel there is no
  // __syncthreads() between zeroSharedMem and the first load. Whether that is
  // safe depends on which elements each helper writes - confirm against the
  // helper definitions.
  zeroSharedMem<FS, SB, padding_l>(temp);

  const int numIterations = divUp<int, int>(sequenceLength, SB);

  for (int i = 0; i < numIterations; ++i) {
    // Read input into shared memory
    const int inputOffset = i * SB;

    // The sixth argument is true only when the whole sequence is covered by
    // a single iteration; its meaning inside the helper is not visible here.
    load_input_to_shared<FS, SB, padding_l>(
        inputFeature,
        inputOffset,
        sequenceLength,
        i,
        numIterations,
        (numIterations == 1),
        temp);

    // Barrier between the load above and the reads of temp below.
    __syncthreads();

    // Dot product of the FS taps with temp[tid .. tid + FS - 1].
    scalar_t out = 0;
#pragma unroll
    for (int j = 0; j < FS; ++j) {
      out += filter[j] * temp[tid + j];
    }

    // Write output
    const int outputOffset = inputOffset;
    // Skip positions past the end of the sequence.
    if ((outputOffset + tid) < sequenceLength) {
      outputFeature[outputOffset + tid] = out;
    }

    // Barrier so every thread is done reading temp before the next
    // iteration's load.
    __syncthreads();
  }
}

// Gradient with respect to the input for one (batch, feature) row.
//
// Same structure and indexing as lightconv_forward_kernel. The launcher
// template in cuda_function_gen.py passes gradOutput as `input` and gradInput
// as `output`. Differences from the forward kernel visible here:
//   - the filter taps are loaded in reverse order;
//   - the helpers are instantiated with FS - padding_l - 1 instead of
//     padding_l;
//   - there is a __syncthreads() after zeroSharedMem;
//   - the sixth argument of load_input_to_shared is always false.
// `minibatch` is not read by this kernel.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void lightconv_grad_wrt_input_kernel(
    const scalar_t* input,
    const scalar_t* filters,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    scalar_t* output) {
  // input grad kernel is similar to forward kernel
  const int tid = threadIdx.x;
  const int batchIdx = blockIdx.x;
  const int featureIdx = blockIdx.y;
  const int filterIdx = featureIdx / numFiltersInBlock;

  // Start of this (batch, feature) row; input and output use the same offset.
  const int IOOffset =
      numFeatures * sequenceLength * batchIdx + featureIdx * sequenceLength;
  const scalar_t* inputFeature = &input[IOOffset];
  scalar_t* outputFeature = &output[IOOffset];
  const scalar_t* inputFilter = &filters[filterIdx * FS];

  assert(blockDim.x == SB);

  scalar_t filter[FS];

  // Taps are loaded in reverse order relative to the forward kernel.
#pragma unroll
  for (int i = 0; i < FS; ++i) {
    filter[i] = inputFilter[FS - i - 1];
  }

  __shared__ scalar_t temp[SB + FS];
  // Padding value handed to the helpers in place of padding_l.
  const int padding = FS - padding_l - 1;
  zeroSharedMem<FS, SB, padding>(temp);

  __syncthreads();

  const int numIterations = divUp<int, int>(sequenceLength, SB);

  for (int i = 0; i < numIterations; ++i) {
    // Read input into shared memory
    const int inputOffset = i * SB;

    load_input_to_shared<FS, SB, padding>(
        inputFeature,
        inputOffset,
        sequenceLength,
        i,
        numIterations,
        false,
        temp);

    // Barrier between the load above and the reads of temp below.
    __syncthreads();

    // Dot product of the reversed taps with temp[tid .. tid + FS - 1].
    scalar_t out = 0;
#pragma unroll
    for (int j = 0; j < FS; ++j) {
      out += filter[j] * temp[tid + j];
    }

    // Write output
    const int outputOffset = inputOffset;
    // Skip positions past the end of the sequence.
    if ((outputOffset + tid) < sequenceLength) {
      outputFeature[outputOffset + tid] = out;
    }

    // Barrier so every thread is done reading temp before the next
    // iteration's load.
    __syncthreads();
  }
}
// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
//
// First pass of the weight-gradient computation ("short" variant): one block
// per (batch element, filter), looping over every feature that shares the
// filter, and writing one partial sum per (filter, tap, batch element).
//
// Template parameters:
//   FS         filter size (number of taps)
//   SB         threads per block; asserted equal to blockDim.x
//   padding_l  left padding applied when staging `input`
//   scalar_t   element type of `input` and `gradInput`
//
// Launch layout (from the indexing below): blockIdx.x = batch element,
// blockIdx.y = filter index.
//
// `output` is written at [filterIdx][filterWeightIdx][batchIdx]
// (strides FS * minibatch, minibatch, 1) in float, regardless of scalar_t.
// `numHeads` is not read in this kernel.
// NOTE(review): `gradInput` is presumably the gradient w.r.t. the forward
// output -- confirm against the host-side launcher.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void lightconv_grad_wrt_weights_firstpass_short_kernel(
    const scalar_t* input,
    const scalar_t* gradInput,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    int numHeads,
    float* output) {
  const int tid = threadIdx.x;
  const int batchIdx = blockIdx.x;
  const int filterIdx = blockIdx.y;

  // Number of SB-wide chunks needed to cover the sequence.
  const int numIterations = divUp<int, int>(sequenceLength, SB);

  // Start of this filter's FS x minibatch slab in the partial-sum buffer.
  float* tempOutputGradWeight = &output[filterIdx * FS * minibatch];

  assert(blockDim.x == SB);

  // Shared staging buffers: SB samples plus FS extra slots each.
  __shared__ scalar_t tempInput[SB + FS];
  __shared__ scalar_t tempGradInput[SB + FS];

  // local weight accumulation
  // Accumulated in float even when scalar_t is a narrower type.
  float accumWeights[FS];

  // Initialize memory
  for (int i = 0; i < FS; ++i) {
    accumWeights[i] = float(0.0);
  }

  // loop over each sequence within filterblock
  for (int idxInFilterBlock = 0; idxInFilterBlock < numFiltersInBlock;
       ++idxInFilterBlock) {
    // Row of the feature (filterIdx * numFiltersInBlock + idxInFilterBlock)
    // for this batch element; same offset is used for input and gradInput.
    const int featureOffset = batchIdx * numFeatures * sequenceLength +
        (filterIdx * numFiltersInBlock + idxInFilterBlock) * sequenceLength;
    const scalar_t* inputFeature = &input[featureOffset];
    const scalar_t* gradInputFeature = &gradInput[featureOffset];

    // zeroSharedMem is defined elsewhere; presumably resets the padding
    // slots before a new feature row is staged -- TODO confirm.
    zeroSharedMem<FS, SB, padding_l>(tempInput);
    zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput);
    __syncthreads();

    for (int i = 0; i < numIterations; ++i) {
      const int inputOffset = i * SB;

      // Stage the input chunk with the forward padding ...
      load_input_to_shared<FS, SB, padding_l>(
          inputFeature,
          inputOffset,
          sequenceLength,
          i,
          numIterations,
          false,
          tempInput);
      // ... and the gradient chunk with a fixed padding of FS / 2.
      load_input_to_shared<FS, SB, (FS / 2)>(
          gradInputFeature,
          inputOffset,
          sequenceLength,
          i,
          numIterations,
          false,
          tempGradInput);

      // Both buffers must be fully populated before they are read.
      __syncthreads();

      // This thread's gradient sample, skipping the FS / 2 padding slots.
      const int gradIndex = (FS / 2) + tid;
      scalar_t tempGrad = tempGradInput[gradIndex];

      // Tap j pairs the gradient at this position with the input j slots
      // into this thread's window.
#pragma unroll
      for (int j = 0; j < FS; j++) {
        const int inputIndex = tid + j;
        accumWeights[j] += tempInput[inputIndex] * tempGrad;
      }

      // Keep the next load from overwriting buffers that are still in use.
      __syncthreads();
    }
  }

  // Row-major sum
  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
    float temp;
    // Threads with tid >= sequenceLength contribute zero to the reduction.
    if (tid < sequenceLength) {
      temp = accumWeights[filterWeightIdx];
    } else {
      temp = float(0.0);
    }

    const int outputOffset = filterWeightIdx * minibatch + batchIdx;

    // blockReduce is defined elsewhere; presumably returns the block-wide
    // sum to (at least) thread 0 -- TODO confirm.
    temp = blockReduce(temp);

    // A single thread publishes the per-block partial sum.
    if (tid == 0) {
      tempOutputGradWeight[outputOffset] = temp;
    }
  }
}
// Can be 16x slower than the forward or grad_wrt_input when filter size is 31
//
// First pass of the weight-gradient computation: one block per
// (batch element, feature). Each block writes, for every filter tap, one
// partial sum into a float buffer indexed by
//   [filterIdx][filterWeightIdx][batchIdx][idxInFilterBlock]
// (strides FS * minibatch * numFiltersInBlock, minibatch * numFiltersInBlock,
// numFiltersInBlock, 1).
//
// Template parameters:
//   FS         filter size (number of taps)
//   SB         threads per block; asserted equal to blockDim.x
//   padding_l  left padding applied when staging `input`
//   scalar_t   element type of `input` and `gradInput`
//
// Launch layout (from the indexing below): blockIdx.x = batch element,
// blockIdx.y = feature index.
// NOTE(review): `gradInput` is presumably the gradient w.r.t. the forward
// output -- confirm against the host-side launcher.
template <int FS, int SB, int padding_l, typename scalar_t>
__global__ void lightconv_grad_wrt_weights_firstpass_kernel(
    const scalar_t* input,
    const scalar_t* gradInput,
    int minibatch,
    int sequenceLength,
    int numFeatures,
    int numFiltersInBlock,
    float* output) {
  assert(blockDim.x == SB);

  const int tid = threadIdx.x;
  const int batchIdx = blockIdx.x;
  const int featureIdx = blockIdx.y;
  // Which filter this feature uses, and its position among the features
  // that share that filter.
  const int filterIdx = featureIdx / numFiltersInBlock;
  const int idxInFilterBlock = featureIdx % numFiltersInBlock;

  // Number of SB-wide chunks needed to cover the sequence.
  const int numIterations = divUp<int, int>(sequenceLength, SB);

  float temp;

  // Shared staging buffers: SB samples plus FS extra slots each.
  __shared__ scalar_t tempInput[SB + FS];
  __shared__ scalar_t tempGradInput[SB + FS];
  // zeroSharedMem is defined elsewhere; presumably clears the padding slots
  // -- TODO confirm.
  zeroSharedMem<FS, SB, padding_l>(tempInput);
  zeroSharedMem<FS, SB, (FS / 2)>(tempGradInput);
  __syncthreads();

  // Per-thread accumulators, kept in float even for narrower scalar_t.
  float accumWeights[FS];

  for (int i = 0; i < FS; ++i) {
    accumWeights[i] = float(0.0);
  }

  // Start of this (batch, feature) row; shared by input and gradInput.
  const int IOOffset =
      batchIdx * numFeatures * sequenceLength + featureIdx * sequenceLength;
  const scalar_t* inputFeature = &input[IOOffset];
  const scalar_t* gradInputFeature = &gradInput[IOOffset];
  // Start of this filter's slab in the partial-sum buffer.
  float* tempOutputGradWeight =
      &output[filterIdx * FS * minibatch * numFiltersInBlock];

  for (int i = 0; i < numIterations; ++i) {
    const int inputOffset = i * SB;

    // Stage the input chunk with the forward padding ...
    load_input_to_shared<FS, SB, padding_l>(
        inputFeature,
        inputOffset,
        sequenceLength,
        i,
        numIterations,
        false,
        tempInput);
    // ... and the gradient chunk with a fixed padding of FS / 2.
    load_input_to_shared<FS, SB, (FS / 2)>(
        gradInputFeature,
        inputOffset,
        sequenceLength,
        i,
        numIterations,
        false,
        tempGradInput);
    // Both buffers must be fully populated before they are read.
    __syncthreads();

    // Tap j pairs this thread's gradient sample (offset by the FS / 2
    // padding) with the input j slots into its window.
#pragma unroll
    for (int j = 0; j < FS; ++j) {
      accumWeights[j] += tempInput[tid + j] * tempGradInput[tid + (FS / 2)];
    }

    // Keep the next load from overwriting buffers that are still in use.
    __syncthreads();
  }

  // Row-major sum
  for (int filterWeightIdx = 0; filterWeightIdx < FS; ++filterWeightIdx) {
    // Pick this thread's contribution to the reduction; threads with
    // tid >= sequenceLength contribute zero.
    if (tid < sequenceLength) {
      temp = accumWeights[filterWeightIdx];
    } else {
      temp = float(0.0);
    }

    // blockReduce is defined elsewhere; presumably returns the block-wide
    // sum to (at least) thread 0 -- TODO confirm.
    temp = blockReduce(temp);

    const int outputOffset = filterWeightIdx * minibatch * numFiltersInBlock +
        batchIdx * numFiltersInBlock + idxInFilterBlock;

    // A single thread publishes the per-block partial sum.
    if (tid == 0) {
      tempOutputGradWeight[outputOffset] = temp;
    }
  }
}
@with_incremental_state
class LightconvLayer(nn.Module):
    """Lightweight convolution backed by the ``lightconv_cuda`` extension.

    Without ``incremental_state`` the layer calls the custom CUDA kernel via
    ``lightconvFunction``; with ``incremental_state`` it processes the new
    time step(s) with a batched matrix multiply over a rolling input buffer.

    Args:
        input_size: number of channels of the input and output
        kernel_size: number of taps of each convolution filter
        padding_l: left padding forwarded to the CUDA kernel
        weight_softmax: normalize each filter with a softmax over its taps
        num_heads: number of distinct filters; channels are split evenly
            across heads
        weight_dropout: dropout probability applied to the filter weights
        bias: add a learnable per-channel bias to the output

    Shape:
        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
        Output: TxBxC, i.e. (timesteps, batch_size, input_size)

    Attributes:
        weight: learnable filters of shape ``(num_heads, kernel_size)``
        bias: learnable bias of shape ``(input_size)``, or ``None``
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        weight_softmax=False,
        num_heads=1,
        weight_dropout=0.0,
        bias=False,
    ):
        super(LightconvLayer, self).__init__()
        self.input_size = input_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_softmax = weight_softmax
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )

        self.weight = nn.Parameter(torch.Tensor(num_heads, kernel_size))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.bias = None
        self.reset_parameters()

    def upgrade_state_dict_named(self, state_dict, name):
        """Squeeze old ``(num_heads, 1, kernel_size)`` weights to 2-D in place."""
        prefix = name + "." if name != "" else ""
        # Only values of existing keys are replaced, so mutating the dict
        # while iterating over it is safe here.
        for k, v in state_dict.items():
            if k.endswith(prefix + "weight"):
                if v.dim() == 3 and v.size(1) == 1:
                    state_dict[k] = v.squeeze(1)

    def reset_parameters(self):
        """Xavier-initialize the filters and zero the bias (if present)."""
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.constant_(self.bias, 0.0)

    def forward(self, x, incremental_state=None):
        """Apply the convolution to ``x`` of shape T x B x C.

        Args:
            x: input of shape T x B x C
            incremental_state: if not ``None``, a dict used to keep the
                rolling input buffer between decoding steps

        Returns:
            Tensor of shape T x B x C.
        """
        # during inference time, incremental BMM is faster
        if incremental_state is not None:
            output = self._forward_incremental(x, incremental_state)
        # during training time, use CUDA kernel
        else:
            output = self._forward_cuda(x)

        # The bias used to be created but never applied, unlike in
        # LightweightConv1dTBC. It is zero-initialised and previously received
        # no gradient, so adding it leaves existing checkpoints unchanged.
        if self.bias is not None:
            output = output + self.bias.view(1, 1, -1)
        return output

    def _forward_incremental(self, x, incremental_state):
        """Convolve the newest step(s) against the buffered history via bmm."""
        T, B, C = x.size()
        H = self.num_heads
        R = C // H
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is None:
            input_buffer = x.new()
        # Append the new frame(s) along a trailing "history" dimension.
        x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
        if self.kernel_size > 1:
            # Keep only the last kernel_size - 1 entries for the next step.
            self._set_input_buffer(
                incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
            )
        x_unfold = x_unfold.view(T * B * H, R, -1)

        weight = self.weight
        if self.weight_softmax:
            # Softmax in float32, then cast back to the parameter dtype.
            weight = F.softmax(weight.float(), dim=1).type_as(weight)

        # Early decoding steps have fewer than kernel_size buffered frames;
        # use only the matching trailing taps.
        weight = weight[:, -x_unfold.size(2) :]

        K = weight.size(1)

        weight = (
            weight.view(1, H, K)
            .expand(T * B, H, K)
            .contiguous()
            .view(T * B * H, K, 1)
        )

        weight = self.weight_dropout_module(weight)
        output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
        return output.view(T, B, C)

    def _forward_cuda(self, x):
        """Run the fused CUDA kernel on the full sequence."""
        # The kernel is called with the time dimension last: T x B x C -> B x C x T.
        x = x.permute(1, 2, 0).contiguous()
        weight = self.weight
        if self.weight_softmax:
            weight = F.softmax(self.weight, -1)
        if self.weight_dropout_module.p:
            weight = self.weight_dropout_module(weight)
        # B x C x T -> T x B x C
        return lightconvFunction.apply(x, weight, self.padding_l).permute(2, 0, 1)

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the buffered inputs along dim 1 according to ``new_order``."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def half(self):
        """Cast every floating-point tensor of the module to half precision."""
        return self._apply(lambda t: t.half() if t.is_floating_point() else t)
def LightweightConv(
    input_size,
    kernel_size=1,
    padding_l=None,
    num_heads=1,
    weight_dropout=0.0,
    weight_softmax=False,
    bias=False,
):
    """Build a lightweight convolution module for T x B x C inputs.

    When CUDA is available the fused ``LightconvLayer`` is tried first. If it
    cannot be imported, the ImportError is printed and the pure-PyTorch
    ``LightweightConv1dTBC`` is returned instead. Both receive the same
    arguments.
    """
    layer_options = dict(
        kernel_size=kernel_size,
        padding_l=padding_l,
        num_heads=num_heads,
        weight_dropout=weight_dropout,
        weight_softmax=weight_softmax,
        bias=bias,
    )

    if torch.cuda.is_available():
        try:
            from fairseq.modules.lightconv_layer import LightconvLayer

            return LightconvLayer(input_size, **layer_options)
        except ImportError as e:
            # Fall through to the pure-PyTorch implementation below.
            print(e)

    return LightweightConv1dTBC(input_size, **layer_options)
@with_incremental_state
class LightweightConv1dTBC(nn.Module):
    """Lightweight Convolution assuming the input is TxBxC

    The channels are split evenly across ``num_heads`` heads and every
    channel of a head is convolved with that head's single filter.

    Args:
        input_size: # of channels of the input
        kernel_size: number of taps of each convolution filter
        padding_l: padding to the left when using "same" padding
        num_heads: number of heads used. The weight is of shape (num_heads, 1, kernel_size)
        weight_dropout: the drop rate of the DropConnect to drop the weight
        weight_softmax: normalize the weight with softmax before the convolution
        bias: use bias

    Shape:
        Input: TxBxC, i.e. (timesteps, batch_size, input_size)
        Output: TxBxC, i.e. (timesteps, batch_size, input_size)

    Attributes:
        weight: the learnable weights of the module of shape
            `(num_heads, 1, kernel_size)`
        bias:   the learnable bias of the module of shape `(input_size)`
    """

    def __init__(
        self,
        input_size,
        kernel_size=1,
        padding_l=None,
        num_heads=1,
        weight_dropout=0.0,
        weight_softmax=False,
        bias=False,
    ):
        super().__init__()
        self.input_size = input_size
        self.kernel_size = kernel_size
        self.padding_l = padding_l
        self.num_heads = num_heads
        self.weight_dropout_module = FairseqDropout(
            weight_dropout, module_name=self.__class__.__name__
        )
        self.weight_softmax = weight_softmax

        self.weight = nn.Parameter(torch.Tensor(num_heads, 1, kernel_size))
        if bias:
            self.bias = nn.Parameter(torch.Tensor(input_size))
        else:
            self.bias = None

        self.reset_parameters()
        # Switched on by prepare_for_onnx_export_; forwarded to utils.softmax.
        self.onnx_trace = False

    def reset_parameters(self):
        """Xavier-initialize the filters and zero the bias (if present)."""
        nn.init.xavier_uniform_(self.weight)
        if self.bias is not None:
            nn.init.constant_(self.bias, 0.0)

    def forward(self, x, incremental_state=None, unfold=False):
        """Assuming the input, x, of the shape T x B x C and producing an output in the shape T x B x C
        args:
            x: Input of shape T x B x C, i.e. (timesteps, batch_size, input_size)
            incremental_state: A dict to keep the state
            unfold: unfold the input or not. If not, we use the matrix trick instead
        """
        # Incremental decoding is only implemented by the unfolded path.
        unfold = unfold or (incremental_state is not None)

        if unfold:
            output = self._forward_unfolded(x, incremental_state)
        else:
            output = self._forward_expanded(x, incremental_state)

        if self.bias is not None:
            # Broadcast the per-channel bias over time and batch.
            output = output + self.bias.view(1, 1, -1)
        return output

    def prepare_for_onnx_export_(self):
        """Mark the module as being traced for ONNX export."""
        self.onnx_trace = True

    def _forward_unfolded(self, x, incremental_state):
        """The conventional implementation of convolutions.
        Unfolding the input by having a window shifting to the right."""
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H  # channels per head
        assert R * H == C == self.input_size

        weight = self.weight.view(H, K)
        if incremental_state is not None:
            input_buffer = self._get_input_buffer(incremental_state)
            if input_buffer is None:
                # First step: start from an empty tensor so the cat below
                # yields just the current frame.
                input_buffer = x.new()
            # Append the new frame(s) along a trailing "history" dimension.
            x_unfold = torch.cat([input_buffer, x.unsqueeze(3)], dim=3)
            if self.kernel_size > 1:
                # Keep only the last kernel_size - 1 entries for the next step.
                self._set_input_buffer(
                    incremental_state, x_unfold[:, :, :, -self.kernel_size + 1 :]
                )
            x_unfold = x_unfold.view(T * B * H, R, -1)
        else:
            # unfold the input: T x B x C --> T' x B x C x K
            # NOTE(review): the trailing 0 is presumably the pad value of
            # unfold1d -- confirm against fairseq.modules.unfold.
            x_unfold = unfold1d(x, self.kernel_size, self.padding_l, 0)
            x_unfold = x_unfold.view(T * B * H, R, K)

        if self.weight_softmax:
            weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as(
                weight
            )

        if incremental_state is not None:
            # Early decoding steps have fewer than K buffered frames; use
            # only the matching trailing taps.
            weight = weight[:, -x_unfold.size(2) :]
            K = weight.size(1)

        # Replicate each head's filter for every (time, batch) position.
        weight = (
            weight.view(1, H, K).expand(T * B, H, K).contiguous().view(T * B * H, K, 1)
        )

        weight = self.weight_dropout_module(weight)
        output = torch.bmm(x_unfold, weight)  # T*B*H x R x 1
        output = output.view(T, B, C)
        return output

    def _forward_expanded(self, x, incremental_state):
        """Turn the convolution filters into band matrices and do matrix multiplication.
        This is faster when the sequence is short, but less memory efficient.
        This is not used in the decoder during inference.
        """
        T, B, C = x.size()
        K, H = self.kernel_size, self.num_heads
        R = C // H  # channels per head
        assert R * H == C == self.input_size

        weight = self.weight.view(H, K)
        if self.weight_softmax:
            weight = utils.softmax(weight, dim=1, onnx_trace=self.onnx_trace).type_as(
                weight
            )
        # Replicate the filters so each output position has its own copy:
        # (H, K) -> (T*B, H, K) -> (B*H, T, K).
        weight = weight.view(1, H, K).expand(T * B, H, K).contiguous()
        weight = weight.view(T, B * H, K).transpose(0, 1)

        # (T, B, C) -> (B*H, T, R)
        x = x.view(T, B * H, R).transpose(0, 1)
        P = self.padding_l
        if K > T and P == K - 1:
            # Filter longer than the sequence with padding K - 1: keep only
            # the last T taps and shrink K and P to match.
            weight = weight.narrow(2, K - T, T)
            K, P = T, T - 1
        # turn the convolution filters into band matrices
        weight_expanded = weight.new_zeros(B * H, T, T + K - 1, requires_grad=False)
        # Rows are T + K - 1 wide but the view uses a row stride of T + K, so
        # row t starts one column further right than row t - 1.
        weight_expanded.as_strided((B * H, T, K), (T * (T + K - 1), T + K, 1)).copy_(
            weight
        )
        # Drop the P padded columns on the left, leaving a T x T matrix.
        weight_expanded = weight_expanded.narrow(2, P, T)
        weight_expanded = self.weight_dropout_module(weight_expanded)

        output = torch.bmm(weight_expanded, x)
        output = output.transpose(0, 1).contiguous().view(T, B, C)
        return output

    def reorder_incremental_state(self, incremental_state, new_order):
        """Reorder the buffered inputs along dim 1 according to ``new_order``."""
        input_buffer = self._get_input_buffer(incremental_state)
        if input_buffer is not None:
            input_buffer = input_buffer.index_select(1, new_order)
            self._set_input_buffer(incremental_state, input_buffer)

    def _get_input_buffer(self, incremental_state):
        """Fetch this module's "input_buffer" entry from ``incremental_state``."""
        return utils.get_incremental_state(self, incremental_state, "input_buffer")

    def _set_input_buffer(self, incremental_state, new_buffer):
        """Store ``new_buffer`` as this module's "input_buffer" entry."""
        return utils.set_incremental_state(
            self, incremental_state, "input_buffer", new_buffer
        )

    def extra_repr(self):
        """Summarize the configuration; dropout is listed only when non-zero."""
        s = "{}, kernel_size={}, padding_l={}, num_heads={}, weight_softmax={}, bias={}".format(
            self.input_size,
            self.kernel_size,
            self.padding_l,
            self.num_heads,
            self.weight_softmax,
            self.bias is not None,
        )
        if self.weight_dropout_module.p > 0.0:
            s += ", weight_dropout={}".format(self.weight_dropout_module.p)
        return s
@torch.jit.export
def forward(
    self,
    input,
    incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
):
    """
    Run the convolution, either over a whole sequence or one step at a time.

    Args:
        incremental_state: Used to buffer signal; if not None, then input is
            expected to contain a single frame. If the input order changes
            between time steps, call reorder_incremental_state.
    Input:
        Time x Batch x Channel during training
        Batch x Time x Channel during inference
    """
    if incremental_state is None:
        # Full-sequence path: delegate to ConvTBC.
        conv_out = self.conv_tbc(input)
        if self.kernel_size[0] > 1 and self.padding[0] > 0:
            # remove future timesteps added by padding
            conv_out = conv_out[: -self.padding[0], :, :]
        return conv_out

    # Incremental path: the convolution becomes a single linear layer
    # applied to the last `kernel_width` frames.
    kernel_width = self.kernel_size[0]
    flat_weight = self._get_linearized_weight()

    batch = input.size(0)  # input: bsz x len x dim
    if kernel_width > 1:
        step = input.data
        window = self._get_input_buffer(incremental_state)
        if window is not None:
            # shift buffer one frame to the left
            window[:, :-1, :] = window[:, 1:, :].clone()
        else:
            # first step: start from an all-zero history and register it
            window = step.new(batch, kernel_width, step.size(2)).zero_()
            self._set_input_buffer(incremental_state, window)
        # append next input
        window[:, -1, :] = step[:, -1, :]
        input = window
    with torch.no_grad():
        projected = F.linear(input.view(batch, -1), flat_weight, self.bias)
    return projected.view(batch, 1, -1)
class LocationAttention(nn.Module):
    """
    Attention-Based Models for Speech Recognition
    https://arxiv.org/pdf/1506.07503.pdf

    :param int attn_dim: attention dimension
    :param int encoder_dim: # projection-units of encoder
    :param int decoder_dim: # units of decoder
    :param int attn_state_kernel_size: # input channels of the attention
        convolution, i.e. K of the ``B x K x T`` attention state
    :param int conv_dim: # channels of attention convolution
    :param int conv_kernel_size: half-width of the attention convolution; the
        filter has ``2 * conv_kernel_size + 1`` taps
    :param float scaling: factor applied to the energies before the softmax
    """

    def __init__(
        self,
        attn_dim,
        encoder_dim,
        decoder_dim,
        attn_state_kernel_size,
        conv_dim,
        conv_kernel_size,
        scaling=2.0,
    ):
        super().__init__()
        self.attn_dim = attn_dim
        self.decoder_dim = decoder_dim
        self.scaling = scaling
        self.proj_enc = nn.Linear(encoder_dim, attn_dim)
        self.proj_dec = nn.Linear(decoder_dim, attn_dim, bias=False)
        self.proj_attn = nn.Linear(conv_dim, attn_dim, bias=False)
        # Padding of conv_kernel_size on both sides keeps the time length.
        self.conv = nn.Conv1d(
            attn_state_kernel_size,
            conv_dim,
            2 * conv_kernel_size + 1,
            padding=conv_kernel_size,
            bias=False,
        )
        self.proj_out = nn.Sequential(nn.Tanh(), nn.Linear(attn_dim, 1))

        self.proj_enc_out = None  # cache

    def clear_cache(self):
        """Drop the cached encoder projection.

        Must be called before ``forward`` is used with a different
        ``encoder_out``, because the projection is computed only once.
        """
        self.proj_enc_out = None

    def forward(self, encoder_out, encoder_padding_mask, decoder_h, attn_state):
        """
        :param torch.Tensor encoder_out: padded encoder hidden state B x T x D
        :param torch.Tensor encoder_padding_mask: encoder padding mask B x T
            (positions set to True receive zero attention weight), or
            ``None`` when there is no padding
        :param torch.Tensor decoder_h: decoder hidden state B x D, or ``None``
            to use an all-zero state
        :param torch.Tensor attn_state: attention state B x K x T fed to the
            location convolution
        :return: attention weighted encoder state (B, D)
        :rtype: torch.Tensor
        :return: attention weights (B x T)
        :rtype: torch.Tensor
        """
        bsz, seq_len, _ = encoder_out.size()
        if self.proj_enc_out is None:
            # Computed once and reused until clear_cache() is called.
            self.proj_enc_out = self.proj_enc(encoder_out)

        # B x K x T -> B x C x T
        attn = self.conv(attn_state)
        # B x C x T -> B x T x C -> B x T x D
        attn = self.proj_attn(attn.transpose(1, 2))

        if decoder_h is None:
            decoder_h = encoder_out.new_zeros(bsz, self.decoder_dim)
        dec_h = self.proj_dec(decoder_h).view(bsz, 1, self.attn_dim)

        # Energies: B x T x D -> B x T x 1 -> B x T
        out = self.proj_out(attn + self.proj_enc_out + dec_h).squeeze(2)
        if encoder_padding_mask is not None:
            # -inf energies become exactly zero weight after the softmax.
            out.masked_fill_(encoder_padding_mask, -float("inf"))

        w = F.softmax(self.scaling * out, dim=1)
        # Weighted sum of encoder states over time: B x T x D -> B x D
        c = torch.sum(encoder_out * w.view(bsz, seq_len, 1), dim=1)
        return c, w
) def zoneout(self, h, next_h, prob): if isinstance(h, tuple): return tuple([self.zoneout(h[i], next_h[i], prob) for i in range(len(h))]) if self.training: mask = h.new_zeros(*h.size()).bernoulli_(prob) return mask * h + (1 - mask) * next_h return prob * h + (1 - prob) * next_h def forward(self, x, h): return self.zoneout(h, self.lstm_cell(x, h), self.prob) ================================================ FILE: fairseq/modules/multihead_attention.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import math from typing import Dict, List, Optional, Tuple import torch import torch.nn.functional as F from torch import Tensor, nn from torch.nn import Parameter try: from xformers.components.attention import build_attention from xformers.components.attention.utils import maybe_merge_masks _xformers_available = True except ImportError: _xformers_available = False from fairseq import utils from fairseq.modules.fairseq_dropout import FairseqDropout from fairseq.modules.quant_noise import quant_noise from fairseq.models.fairseq_incremental_decoder import FairseqIncrementalDecoder # TODO: move this into xformers? # TODO: uint8 input type should just output a bool def _mask_for_xformers(mask: Tensor, to_dtype: Optional[torch.dtype] = None): """ call to pytorch multihead accepts three mask types: - ByteTensor where non-zero means to mask - FloatTensor which is an additive mask - BoolTensor where True means to mask xFormers currently accepts boolean and additive maks. For boolean masks the values have opposite meaning. For a BoolTensor True mean to keep the value. """ float_types = [torch.float, torch.float16] # If an input mask is a float it is an additive mask. Otherwise it is either uint8 or bool. additive = mask.dtype in float_types # If to_dype is not specified, keep same dtype as mask. 
to_dtype = mask.dtype if to_dtype is None else to_dtype to_additive = to_dtype in float_types if additive: if to_additive: return mask.to(to_dtype) mask = mask < 0 if to_additive: # return additive mask new_mask = torch.zeros_like(mask, dtype=to_dtype) new_mask = new_mask.masked_fill_(mask, -float("inf")) return new_mask # In xFormers True is value to keep rather than value to mask mask = ~mask.to(torch.bool) mask = mask.to(to_dtype) return mask class MultiheadAttention(FairseqIncrementalDecoder): """Multi-headed attention. See "Attention Is All You Need" for more details. """ def __init__( self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, dictionary=None, q_noise=0.0, qn_block_size=8, # TODO: pass in config rather than string. # config defined in xformers.components.attention.AttentionConfig xformers_att_config: Optional[str] = None, xformers_blocksparse_layout: Optional[ torch.Tensor ] = None, # This should be part of the config xformers_blocksparse_blocksize: Optional[ int ] = 16, # This should be part of the config ): super().__init__(dictionary) xformers_att_config = utils.eval_str_dict(xformers_att_config) self.use_xformers = xformers_att_config is not None if self.use_xformers and not _xformers_available: raise ImportError("\n\n Please install xFormers.") self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim self.num_heads = num_heads self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__ ) self.head_dim = embed_dim // num_heads assert ( self.head_dim * num_heads == self.embed_dim ), "embed_dim must be divisible by num_heads" self.scaling = self.head_dim**-0.5 self.self_attention = self_attention self.encoder_decoder_attention = encoder_decoder_attention assert not 
self.self_attention or self.qkv_same_dim, ( "Self-attention requires query, key and " "value to be of the same size" ) self.k_proj = quant_noise( nn.Linear(self.kdim, embed_dim, bias=bias), q_noise, qn_block_size ) self.v_proj = quant_noise( nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size ) self.q_proj = quant_noise( nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size ) self.out_proj = quant_noise( nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size ) if add_bias_kv: self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) else: self.bias_k = self.bias_v = None self.add_zero_attn = add_zero_attn self.beam_size = 1 self.reset_parameters() if self.use_xformers: xformers_att_config["dropout"] = xformers_att_config.get("dropout", dropout) xformers_att_config["num_heads"] = xformers_att_config.get( "num_heads", num_heads ) if xformers_blocksparse_layout is not None: # Could be part of a single config passed only once xformers_att_config["block_size"] = xformers_blocksparse_blocksize xformers_att_config["layout"] = xformers_blocksparse_layout xformers_att_config["name"] = "blocksparse" self.attention = build_attention(xformers_att_config) self.onnx_trace = False self.skip_embed_dim_check = False self.init_incremental_state() def prepare_for_onnx_export_(self): self.onnx_trace = True def reset_parameters(self): if self.qkv_same_dim: # Empirically observed the convergence to be much better with # the scaled initialization nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) else: nn.init.xavier_uniform_(self.k_proj.weight) nn.init.xavier_uniform_(self.v_proj.weight) nn.init.xavier_uniform_(self.q_proj.weight) nn.init.xavier_uniform_(self.out_proj.weight) if self.out_proj.bias is not None: nn.init.constant_(self.out_proj.bias, 
0.0) if self.bias_k is not None: nn.init.xavier_normal_(self.bias_k) if self.bias_v is not None: nn.init.xavier_normal_(self.bias_v) def _get_reserve_head_index(self, num_heads_to_keep: int): k_proj_heads_norm = [] q_proj_heads_norm = [] v_proj_heads_norm = [] for i in range(self.num_heads): start_idx = i * self.head_dim end_idx = (i + 1) * self.head_dim k_proj_heads_norm.append( torch.sum( torch.abs( self.k_proj.weight[ start_idx:end_idx, ] ) ).tolist() + torch.sum(torch.abs(self.k_proj.bias[start_idx:end_idx])).tolist() ) q_proj_heads_norm.append( torch.sum( torch.abs( self.q_proj.weight[ start_idx:end_idx, ] ) ).tolist() + torch.sum(torch.abs(self.q_proj.bias[start_idx:end_idx])).tolist() ) v_proj_heads_norm.append( torch.sum( torch.abs( self.v_proj.weight[ start_idx:end_idx, ] ) ).tolist() + torch.sum(torch.abs(self.v_proj.bias[start_idx:end_idx])).tolist() ) heads_norm = [] for i in range(self.num_heads): heads_norm.append( k_proj_heads_norm[i] + q_proj_heads_norm[i] + v_proj_heads_norm[i] ) sorted_head_index = sorted( range(self.num_heads), key=lambda k: heads_norm[k], reverse=True ) reserve_head_index = [] for i in range(num_heads_to_keep): start = sorted_head_index[i] * self.head_dim end = (sorted_head_index[i] + 1) * self.head_dim reserve_head_index.append((start, end)) return reserve_head_index def _adaptive_prune_heads(self, reserve_head_index: List[Tuple[int, int]]): new_q_weight = [] new_q_bias = [] new_k_weight = [] new_k_bias = [] new_v_weight = [] new_v_bias = [] new_out_proj_weight = [] for ele in reserve_head_index: start_idx, end_idx = ele new_q_weight.append( self.q_proj.weight[ start_idx:end_idx, ] ) new_q_bias.append(self.q_proj.bias[start_idx:end_idx]) new_k_weight.append( self.k_proj.weight[ start_idx:end_idx, ] ) new_k_bias.append(self.k_proj.bias[start_idx:end_idx]) new_v_weight.append( self.v_proj.weight[ start_idx:end_idx, ] ) new_v_bias.append(self.v_proj.bias[start_idx:end_idx]) new_out_proj_weight.append(self.out_proj.weight[:, 
start_idx:end_idx]) new_q_weight = torch.cat(new_q_weight).detach() new_k_weight = torch.cat(new_k_weight).detach() new_v_weight = torch.cat(new_v_weight).detach() new_out_proj_weight = torch.cat(new_out_proj_weight, dim=-1).detach() new_q_weight.requires_grad = True new_k_weight.requires_grad = True new_v_weight.requires_grad = True new_out_proj_weight.requires_grad = True new_q_bias = torch.cat(new_q_bias).detach() new_q_bias.requires_grad = True new_k_bias = torch.cat(new_k_bias).detach() new_k_bias.requires_grad = True new_v_bias = torch.cat(new_v_bias).detach() new_v_bias.requires_grad = True self.q_proj.weight = torch.nn.Parameter(new_q_weight) self.q_proj.bias = torch.nn.Parameter(new_q_bias) self.k_proj.weight = torch.nn.Parameter(new_k_weight) self.k_proj.bias = torch.nn.Parameter(new_k_bias) self.v_proj.weight = torch.nn.Parameter(new_v_weight) self.v_proj.bias = torch.nn.Parameter(new_v_bias) self.out_proj.weight = torch.nn.Parameter(new_out_proj_weight) self.num_heads = len(reserve_head_index) self.embed_dim = self.head_dim * self.num_heads self.q_proj.out_features = self.embed_dim self.k_proj.out_features = self.embed_dim self.v_proj.out_features = self.embed_dim def _set_skip_embed_dim_check(self): self.skip_embed_dim_check = True def _pad_masks( self, key_padding_mask: Optional[Tensor], attn_mask: Optional[Tensor], ) -> Tuple[Optional[Tensor], Optional[Tensor]]: if attn_mask is not None: shape = attn_mask.size()[:-1] + torch.Size([1]) attn_mask = torch.cat([attn_mask, attn_mask.new_zeros(shape)], dim=-1) if key_padding_mask is not None: shape = key_padding_mask.size()[:-1] + torch.Size([1]) key_padding_mask = torch.cat( [ key_padding_mask, key_padding_mask.new_zeros(shape), ], dim=-1, ) return key_padding_mask, attn_mask def _add_bias( self, k: Tensor, v: Tensor, key_padding_mask: Optional[Tensor], attn_mask: Optional[Tensor], bsz: int, ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: assert self.bias_k is not None assert self.bias_v 
is not None k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) key_padding_mask, attn_mask = self._pad_masks( key_padding_mask=key_padding_mask, attn_mask=attn_mask ) return k, v, key_padding_mask, attn_mask def _append_zero_attn( self, k: Tensor, v: Tensor, key_padding_mask: Optional[Tensor], attn_mask: Optional[Tensor], ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]: zero_attn_shape = k.size()[:-2] + torch.Size([1]) + k.size()[-1:] k = torch.cat( [k, torch.zeros(zero_attn_shape, dtype=k.dtype, device=k.device)], dim=-2 ) v = torch.cat( [v, torch.zeros(zero_attn_shape, dtype=v.dtype, device=v.device)], dim=-2 ) key_padding_mask, attn_mask = self._pad_masks( key_padding_mask=key_padding_mask, attn_mask=attn_mask ) return k, v, key_padding_mask, attn_mask def _xformers_attn_forward( self, query, key: Optional[Tensor], value: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, need_weights: bool = True, attn_mask: Optional[Tensor] = None, ) -> Tuple[Tensor, Optional[Tensor]]: tgt_len, bsz, embed_dim = query.size() if key_padding_mask is not None: assert key_padding_mask.size(0) == bsz assert key_padding_mask.size(1) == tgt_len if self.self_attention: key = query value = query elif self.encoder_decoder_attention: value = key q = self.q_proj(query) k = self.k_proj(key) v = self.v_proj(value) if self.bias_k is not None: assert self.bias_v is not None k, v, attn_mask, key_padding_mask = self._add_bias( k, v, attn_mask, key_padding_mask, bsz ) def fold_heads(x): return ( x.contiguous() .view(-1, bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) def split_heads(x): return ( x.contiguous() .view(-1, bsz, self.num_heads, self.head_dim) .transpose(0, 1) .transpose(1, 2) ) massage = split_heads if self.attention.requires_head_dimension else fold_heads q = massage(q) if k is not None: k = massage(k) if v is not None: v = massage(v) if self.add_zero_attn: k, v, key_padding_mask, attn_mask = 
self._append_zero_attn( k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask ) kwargs = {} if attn_mask is not None and self.attention.supports_attention_mask: attn_mask = _mask_for_xformers(attn_mask, to_dtype=q.dtype) kwargs["att_mask"] = attn_mask if key_padding_mask is not None: key_padding_mask = _mask_for_xformers(key_padding_mask, to_dtype=torch.bool) if not self.attention.requires_separate_masks: attn_mask = maybe_merge_masks( attn_mask, key_padding_mask, batch_size=bsz, src_len=k.size(-2), tgt_len=q.size(-2), num_heads=self.num_heads, ) key_padding_mask = None kwargs["att_mask"] = attn_mask if self.attention.supports_key_padding_mask: kwargs["key_padding_mask"] = key_padding_mask y = self.attention(q, k, v, **kwargs) y = ( y.view(bsz, self.num_heads, tgt_len, self.head_dim) .transpose(1, 2) .flatten(start_dim=2, end_dim=3) .transpose(0, 1) ) assert list(y.size()) == [tgt_len, bsz, embed_dim] # Dropout not needed because already applied in attention. # It is applied to the attention weights before matmul with v. y = self.out_proj(y) # TODO: support returning attention weights if needed. return y, None def forward( self, query: Tensor, key: Optional[Tensor], value: Optional[Tensor], key_padding_mask: Optional[Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, need_weights: bool = True, static_kv: bool = False, attn_mask: Optional[Tensor] = None, before_softmax: bool = False, need_head_weights: bool = False, ) -> Tuple[Tensor, Optional[Tensor]]: """Input shape: Time x Batch x Channel Args: key_padding_mask (ByteTensor, optional): mask to exclude keys that are pads, of shape `(batch, src_len)`, where padding elements are indicated by 1s. need_weights (bool, optional): return the attention weights, averaged over heads (default: False). attn_mask (ByteTensor, optional): typically used to implement causal attention, where the mask prevents the attention from looking forward in time (default: None). 
before_softmax (bool, optional): return the raw attention weights and values before the attention softmax. need_head_weights (bool, optional): return the attention weights for each head. Implies *need_weights*. Default: return the average attention weights over all heads. """ if need_head_weights: need_weights = True is_tpu = query.device.type == "xla" tgt_len, bsz, embed_dim = query.size() src_len = tgt_len if not self.skip_embed_dim_check: assert ( embed_dim == self.embed_dim ), f"query dim {embed_dim} != {self.embed_dim}" assert list(query.size()) == [tgt_len, bsz, embed_dim] if key is not None: src_len, key_bsz, _ = key.size() if not torch.jit.is_scripting(): assert value is not None assert src_len, key_bsz == value.shape[:2] if ( not self.onnx_trace and not is_tpu # don't use PyTorch version on TPUs and incremental_state is None and not static_kv # A workaround for quantization to work. Otherwise JIT compilation # treats bias in linear module as method. and not torch.jit.is_scripting() # The Multihead attention implemented in pytorch forces strong dimension check # for input embedding dimention and K,Q,V projection dimension. 
# Since pruning will break the dimension check and it is not easy to modify the pytorch API, # it is preferred to bypass the pytorch MHA when we need to skip embed_dim_check and not self.skip_embed_dim_check ): assert key is not None and value is not None if self.use_xformers: return self._xformers_attn_forward( query, key, value, key_padding_mask, need_weights, attn_mask ) else: return F.multi_head_attention_forward( query, key, value, self.embed_dim, self.num_heads, torch.empty([0]), torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)), self.bias_k, self.bias_v, self.add_zero_attn, self.dropout_module.p, self.out_proj.weight, self.out_proj.bias, self.training or self.dropout_module.apply_during_inference, key_padding_mask.bool() if key_padding_mask is not None else None, need_weights, attn_mask, use_separate_proj_weight=True, q_proj_weight=self.q_proj.weight, k_proj_weight=self.k_proj.weight, v_proj_weight=self.v_proj.weight, ) if incremental_state is not None: saved_state = self._get_input_buffer(incremental_state) if saved_state is not None and "prev_key" in saved_state: # previous time steps are cached - no need to recompute # key and value if they are static if static_kv: assert self.encoder_decoder_attention and not self.self_attention key = value = None else: saved_state = None if self.self_attention: q = self.q_proj(query) k = self.k_proj(query) v = self.v_proj(query) elif self.encoder_decoder_attention: # encoder-decoder attention q = self.q_proj(query) if key is None: assert value is None k = v = None else: if self.beam_size > 1 and bsz == key.size(1): # key is [T, bsz*beam_size, C], reduce to [T, bsz, C] key = key.view(key.size(0), -1, self.beam_size, key.size(2))[ :, :, 0, : ] if key_padding_mask is not None: key_padding_mask = key_padding_mask.view( -1, self.beam_size, key_padding_mask.size(1) )[:, 0, :] k = self.k_proj(key) v = self.v_proj(key) else: assert key is not None and value is not None q = self.q_proj(query) k = self.k_proj(key) 
v = self.v_proj(value) q *= self.scaling if self.bias_k is not None: assert self.bias_v is not None k, v, attn_mask, key_padding_mask = self._add_bias( k, v, attn_mask, key_padding_mask, bsz ) q = ( q.contiguous() .view(tgt_len, bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) kv_bsz = bsz # need default value for scripting if k is not None: kv_bsz = k.size(1) k = ( k.contiguous() .view(-1, kv_bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) if v is not None: v = ( v.contiguous() .view(-1, kv_bsz * self.num_heads, self.head_dim) .transpose(0, 1) ) if saved_state is not None: # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) if "prev_key" in saved_state: _prev_key = saved_state["prev_key"] assert _prev_key is not None kv_bsz = _prev_key.size(0) prev_key = _prev_key.view(kv_bsz * self.num_heads, -1, self.head_dim) if static_kv: k = prev_key else: assert k is not None k = torch.cat([prev_key, k], dim=1) src_len = k.size(1) if "prev_value" in saved_state: _prev_value = saved_state["prev_value"] assert _prev_value is not None assert kv_bsz == _prev_value.size(0) prev_value = _prev_value.view( kv_bsz * self.num_heads, -1, self.head_dim ) if static_kv: v = prev_value else: assert v is not None v = torch.cat([prev_value, v], dim=1) prev_key_padding_mask: Optional[Tensor] = None if "prev_key_padding_mask" in saved_state: prev_key_padding_mask = saved_state["prev_key_padding_mask"] assert k is not None and v is not None key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( key_padding_mask=key_padding_mask, prev_key_padding_mask=prev_key_padding_mask, batch_size=kv_bsz, src_len=k.size(1), static_kv=static_kv, ) saved_state["prev_key"] = k.view(kv_bsz, self.num_heads, -1, self.head_dim) saved_state["prev_value"] = v.view( kv_bsz, self.num_heads, -1, self.head_dim ) saved_state["prev_key_padding_mask"] = key_padding_mask # In this branch incremental_state is never None assert incremental_state is not None incremental_state = 
self._set_input_buffer(incremental_state, saved_state) assert k is not None assert k.size(1) == src_len # This is part of a workaround to get around fork/join parallelism # not supporting Optional types. if key_padding_mask is not None and key_padding_mask.dim() == 0: key_padding_mask = None if key_padding_mask is not None: assert key_padding_mask.size(0) == kv_bsz assert key_padding_mask.size(1) == src_len if self.add_zero_attn: assert v is not None src_len += 1 k, v, key_padding_mask, attn_mask = self._append_zero_attn( k=k, v=v, key_padding_mask=key_padding_mask, attn_mask=attn_mask ) if self.encoder_decoder_attention and bsz != kv_bsz: attn_weights = torch.einsum( "bxhtd,bhsd->bxhts", q.view((kv_bsz, -1, self.num_heads) + q.size()[1:]), k.view((kv_bsz, self.num_heads) + k.size()[1:]), ) attn_weights = attn_weights.reshape((-1,) + attn_weights.size()[-2:]) else: attn_weights = torch.bmm(q, k.transpose(1, 2)) attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] if attn_mask is not None: attn_mask = attn_mask.unsqueeze(0) if self.onnx_trace: attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1) attn_weights += attn_mask if key_padding_mask is not None: # don't attend to padding symbols attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) if not is_tpu: attn_weights = attn_weights.view( kv_bsz, -1, self.num_heads, tgt_len, src_len ) attn_weights = attn_weights.masked_fill( key_padding_mask.unsqueeze(1) .unsqueeze(2) .unsqueeze(3) .to(torch.bool), float("-inf"), ) else: attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if before_softmax: return attn_weights, v attn_weights_float = utils.softmax( attn_weights, dim=-1, onnx_trace=self.onnx_trace ) attn_weights = 
attn_weights_float.type_as(attn_weights) attn_probs = self.dropout_module(attn_weights) assert v is not None attn: Optional[Tensor] = None if self.encoder_decoder_attention and bsz != kv_bsz: attn = torch.einsum( "bxhts,bhsd->bxhtd", attn_probs.view( ( kv_bsz, -1, self.num_heads, ) + attn_probs.size()[1:] ), v.view( ( kv_bsz, self.num_heads, ) + v.size()[1:] ), ) attn = attn.reshape((-1,) + attn.size()[-2:]) else: attn = torch.bmm(attn_probs, v) assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] if self.onnx_trace and attn.size(1) == 1: # when ONNX tracing a single decoder step (sequence length == 1) # the transpose is a no-op copy before view, thus unnecessary attn = attn.contiguous().view(tgt_len, bsz, self.embed_dim) else: attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, self.embed_dim) attn = self.out_proj(attn) attn_weights: Optional[Tensor] = None if need_weights: attn_weights = attn_weights_float.view( bsz, self.num_heads, tgt_len, src_len ).transpose(1, 0) if not need_head_weights: # average attention weights over heads attn_weights = attn_weights.mean(dim=0) return attn, attn_weights @staticmethod def _append_prev_key_padding_mask( key_padding_mask: Optional[Tensor], prev_key_padding_mask: Optional[Tensor], batch_size: int, src_len: int, static_kv: bool, ) -> Optional[Tensor]: # saved key padding masks have shape (bsz, seq_len) if prev_key_padding_mask is not None and static_kv: new_key_padding_mask = prev_key_padding_mask elif prev_key_padding_mask is not None and key_padding_mask is not None: new_key_padding_mask = torch.cat( [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1 ) # During incremental decoding, as the padding token enters and # leaves the frame, there will be a time when prev or current # is None elif prev_key_padding_mask is not None: if src_len > prev_key_padding_mask.size(1): filler = torch.zeros( (batch_size, src_len - prev_key_padding_mask.size(1)), device=prev_key_padding_mask.device, ) 
new_key_padding_mask = torch.cat( [prev_key_padding_mask.float(), filler.float()], dim=1 ) else: new_key_padding_mask = prev_key_padding_mask.float() elif key_padding_mask is not None: if src_len > key_padding_mask.size(1): filler = torch.zeros( (batch_size, src_len - key_padding_mask.size(1)), device=key_padding_mask.device, ) new_key_padding_mask = torch.cat( [filler.float(), key_padding_mask.float()], dim=1 ) else: new_key_padding_mask = key_padding_mask.float() else: new_key_padding_mask = prev_key_padding_mask return new_key_padding_mask @torch.jit.export def reorder_incremental_state( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], new_order: Tensor, ): """Reorder buffered internal state (for incremental generation).""" input_buffer = self._get_input_buffer(incremental_state) if input_buffer is not None: for k in input_buffer.keys(): input_buffer_k = input_buffer[k] if input_buffer_k is not None: if self.encoder_decoder_attention: if input_buffer_k.size(0) * self.beam_size == new_order.size(0): return incremental_state elif self.beam_size > 1: input_buffer[k] = input_buffer_k.index_select( 0, new_order.reshape(-1, self.beam_size)[:, 0] // self.beam_size, ) else: input_buffer[k] = input_buffer_k.index_select(0, new_order) else: input_buffer[k] = input_buffer_k.index_select(0, new_order) incremental_state = self._set_input_buffer(incremental_state, input_buffer) return incremental_state def set_beam_size(self, beam_size): """Used for effiecient beamable enc-dec attention""" self.beam_size = beam_size def _get_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] ) -> Dict[str, Optional[Tensor]]: result = self.get_incremental_state(incremental_state, "attn_state") if result is not None: return result else: empty_result: Dict[str, Optional[Tensor]] = {} return empty_result def _set_input_buffer( self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]], buffer: Dict[str, 
Optional[Tensor]], ): return self.set_incremental_state(incremental_state, "attn_state", buffer) def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): return attn_weights def upgrade_state_dict_named(self, state_dict, name): prefix = name + "." if name != "" else "" items_to_add = {} keys_to_remove = [] for k in state_dict.keys(): if k.endswith(prefix + "in_proj_weight"): # in_proj_weight used to be q + k + v with same dimensions dim = int(state_dict[k].shape[0] / 3) items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim] items_to_add[prefix + "k_proj.weight"] = state_dict[k][dim : 2 * dim] items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim :] keys_to_remove.append(k) k_bias = prefix + "in_proj_bias" if k_bias in state_dict.keys(): dim = int(state_dict[k].shape[0] / 3) items_to_add[prefix + "q_proj.bias"] = state_dict[k_bias][:dim] items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][ dim : 2 * dim ] items_to_add[prefix + "v_proj.bias"] = state_dict[k_bias][2 * dim :] keys_to_remove.append(prefix + "in_proj_bias") for k in keys_to_remove: del state_dict[k] for key, value in items_to_add.items(): state_dict[key] = value ================================================ FILE: fairseq/modules/positional_embedding.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import torch.nn as nn

from .learned_positional_embedding import LearnedPositionalEmbedding
from .sinusoidal_positional_embedding import SinusoidalPositionalEmbedding


def PositionalEmbedding(
    num_embeddings: int,
    embedding_dim: int,
    padding_idx: int,
    learned: bool = False,
    auto_expand: bool = True,
):
    """Build a learned or a sinusoidal positional-embedding module.

    Args:
        num_embeddings: number of positions, not counting the offset
            reserved for ``padding_idx``.
        embedding_dim: size of each embedding vector.
        padding_idx: padding index; the table is enlarged by
            ``padding_idx + 1`` entries.
        learned: build a ``LearnedPositionalEmbedding`` instead of a
            ``SinusoidalPositionalEmbedding``.
        auto_expand: forwarded to ``SinusoidalPositionalEmbedding`` only.

    Returns:
        The constructed embedding module.
    """
    if learned:
        # if padding_idx is specified then offset the embedding ids by
        # this index and adjust num_embeddings appropriately
        # TODO: The right place for this offset would be inside
        # LearnedPositionalEmbedding. Move this there for a cleaner implementation.
        if padding_idx is not None:
            num_embeddings = num_embeddings + padding_idx + 1
        m = LearnedPositionalEmbedding(num_embeddings, embedding_dim, padding_idx)
        nn.init.normal_(m.weight, mean=0, std=embedding_dim**-0.5)
        if padding_idx is not None:
            # the padding position gets an all-zero embedding
            nn.init.constant_(m.weight[padding_idx], 0)
    else:
        # NOTE(review): unlike the learned branch, this expression is not
        # guarded against padding_idx being None and would raise a TypeError
        # in that case - confirm callers always pass an int here.
        m = SinusoidalPositionalEmbedding(
            embedding_dim,
            padding_idx,
            init_size=num_embeddings + padding_idx + 1,
            auto_expand=auto_expand,
        )
    return m


================================================
FILE: fairseq/modules/positional_encoding.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch.nn as nn
import math
import torch


class PositionalEncoding(nn.Module):
    """Positional encoding.

    Args:
        d_model: Embedding dimension.
        dropout_rate: Dropout rate.
        max_len: Maximum input length.
        reverse: Whether to reverse the input position.
    """

    def __init__(self, d_model, dropout_rate, max_len=5000, reverse=False):
        """Construct a PositionalEncoding object."""
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.reverse = reverse
        self.xscale = math.sqrt(self.d_model)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.pe = None
        # Pre-compute the table for max_len positions; only size(1) of the
        # dummy tensor is used.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Make sure ``self.pe`` covers ``x.size(1)`` positions.

        If the cached table is already long enough it is only moved to the
        dtype/device of ``x``; otherwise it is rebuilt from scratch.
        """
        if self.pe is not None:
            if self.pe.size(1) >= x.size(1):
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        pe = torch.zeros(x.size(1), self.d_model)
        if self.reverse:
            # positions count down from x.size(1) - 1 to 0
            position = torch.arange(
                x.size(1) - 1, -1, -1.0, dtype=torch.float32
            ).unsqueeze(1)
        else:
            position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        # even channels hold sine, odd channels hold cosine
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor):
        """Add positional encoding.

        The input is scaled by ``sqrt(d_model)`` before the encoding is
        added, and dropout is applied to the sum.

        Args:
            x (torch.Tensor): Input tensor B X T X C
        Returns:
            torch.Tensor: Encoded tensor B X T X C
        """
        self.extend_pe(x)
        x = x * self.xscale + self.pe[:, : x.size(1)]
        return self.dropout(x)


class RelPositionalEncoding(nn.Module):
    """Relative positional encoding module (new implementation).

    Args:
        max_len: Maximum input length.
        d_model: Embedding dimension.
    """

    def __init__(self, max_len, d_model):
        """Construct a RelPositionalEncoding object."""
        super(RelPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.pe = None
        # Pre-compute the table for max_len positions; only size(1) of the
        # dummy tensor is used.
        self.extend_pe(torch.tensor(0.0).expand(1, max_len))

    def extend_pe(self, x):
        """Make sure ``self.pe`` covers ``2 * x.size(1) - 1`` relative positions."""
        if self.pe is not None:
            # self.pe contains both positive and negative parts
            # the length of self.pe is 2 * input_len - 1
            if self.pe.size(1) >= x.size(1) * 2 - 1:
                if self.pe.dtype != x.dtype or self.pe.device != x.device:
                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
                return
        # Suppose `i` means the position of the query vector and `j` means the
        # position of the key vector. We use positive relative positions when keys
        # are to the left (i>j) and negative relative positions otherwise (i<j).
        pe_positive = torch.zeros(x.size(1), self.d_model)
        pe_negative = torch.zeros(x.size(1), self.d_model)
        position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, dtype=torch.float32)
            * -(math.log(10000.0) / self.d_model)
        )
        pe_positive[:, 0::2] = torch.sin(position * div_term)
        pe_positive[:, 1::2] = torch.cos(position * div_term)
        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)

        # Reverse the order of positive indices and concat both positive and
        # negative indices. This is used to support the shifting trick
        # as in https://arxiv.org/abs/1901.02860
        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
        # position 0 is already part of pe_positive, so drop it here
        pe_negative = pe_negative[1:].unsqueeze(0)
        pe = torch.cat([pe_positive, pe_negative], dim=1)
        self.pe = pe.to(device=x.device, dtype=x.dtype)

    def forward(self, x: torch.Tensor):
        """Return the relative positional embeddings for the length of ``x``.

        ``x`` itself is not modified or returned; only its time dimension
        (and dtype/device) is used.

        Args:
            x : Input tensor T X B X C.
        Returns:
            torch.Tensor: Positional embeddings (2 * T - 1) X 1 X C.
        """
        x = x.transpose(0, 1)  # Change TBC to BTC
        self.extend_pe(x)
        # take the 2 * T - 1 entries centred on the middle of the table
        pos_emb = self.pe[
            :,
            self.pe.size(1) // 2 - x.size(1) + 1 : self.pe.size(1) // 2 + x.size(1),
        ]
        pos_emb = pos_emb.transpose(0, 1)  # change to TBC
        return pos_emb


================================================
FILE: fairseq/modules/quant_noise.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
import torch.nn as nn


def quant_noise(module, p, block_size):
    """
    Wraps modules and applies quantization noise to the weights for
    subsequent quantization with Iterative Product Quantization as
    described in "Training with Quantization Noise for Extreme Model Compression"

    Args:
        - module: nn.Module
        - p: amount of Quantization Noise
        - block_size: size of the blocks for subsequent quantization with iPQ

    Returns:
        - the same module object; when p > 0 a forward pre-hook has been
          registered on it

    Remarks:
        - Module weights must have the right sizes wrt the block size
        - Only Linear, Embedding and Conv2d modules are supported for the moment
        - For more detail on how to quantize by blocks with convolutional weights,
          see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
        - We implement the simplest form of noise here as stated in the paper
          which consists in randomly dropping blocks
    """

    # if no quantization noise, don't register hook
    if p <= 0:
        return module

    # supported modules
    assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))

    # test whether module.weight has the right sizes wrt block_size
    is_conv = module.weight.ndim == 4

    # 2D matrix
    if not is_conv:
        assert (
            module.weight.size(1) % block_size == 0
        ), "Input features must be a multiple of block sizes"

    # 4D matrix
    else:
        # 1x1 convolutions
        if module.kernel_size == (1, 1):
            assert (
                module.in_channels % block_size == 0
            ), "Input channels must be a multiple of block sizes"
        # regular convolutions
        else:
            k = module.kernel_size[0] * module.kernel_size[1]
            assert k % block_size == 0, "Kernel size must be a multiple of block size"

    def _forward_pre_hook(mod, input):
        # no noise for evaluation
        if mod.training:
            if not is_conv:
                # gather weight and sizes
                weight = mod.weight
                in_features = weight.size(1)
                out_features = weight.size(0)

                # split weight matrix into blocks and randomly drop selected blocks
                mask = torch.zeros(
                    in_features // block_size * out_features, device=weight.device
                )
                mask.bernoulli_(p)
                mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)

            else:
                # gather weight and sizes
                weight = mod.weight
                in_channels = mod.in_channels
                out_channels = mod.out_channels

                # split weight matrix into blocks and randomly drop selected blocks
                if mod.kernel_size == (1, 1):
                    mask = torch.zeros(
                        int(in_channels // block_size * out_channels),
                        device=weight.device,
                    )
                    mask.bernoulli_(p)
                    mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
                else:
                    # one draw per (out, in) channel pair, repeated over the
                    # whole kernel window
                    mask = torch.zeros(
                        weight.size(0), weight.size(1), device=weight.device
                    )
                    mask.bernoulli_(p)
                    mask = (
                        mask.unsqueeze(2)
                        .unsqueeze(3)
                        .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
                    )

            # scale weights and apply mask
            mask = mask.to(
                torch.bool
            )  # x.bool() is not currently supported in TorchScript
            # rescale the surviving weights by 1 / (1 - p)
            s = 1 / (1 - p)
            mod.weight.data = s * weight.masked_fill(mask, 0)

    module.register_forward_pre_hook(_forward_pre_hook)
    return module


================================================
FILE: fairseq/modules/quantization/__init__.py
================================================


================================================
FILE: fairseq/modules/quantization/pq/__init__.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
class EM:
    """
    EM algorithm used to quantize the columns of W to minimize

                         ||W - W_hat||^2

    Args:
        - W: weight matrix of size (in_features x out_features)
        - n_iter: number of k-means iterations
        - n_centroids: number of centroids (size of codebook)
        - eps: for cluster reassignment when an empty cluster is found
        - max_tentatives for cluster reassignment when an empty cluster is found
        - verbose: print error after each iteration

    Remarks:
        - If one cluster is empty, the most populated cluster is split into
          two clusters
        - All the relevant dimensions are specified in the code
    """

    def __init__(
        self, W, n_centroids=256, n_iter=20, eps=1e-6, max_tentatives=30, verbose=True
    ):
        self.W = W
        self.n_centroids = n_centroids
        self.n_iter = n_iter
        self.eps = eps
        self.max_tentatives = max_tentatives
        self.verbose = verbose
        # placeholders until initialize_centroids() / step() / load() run
        self.centroids = torch.Tensor()
        self.assignments = torch.Tensor()
        self.objective = []

    def initialize_centroids(self):
        """
        Initializes the centroids by sampling random columns from W.
        """

        in_features, out_features = self.W.size()
        indices = torch.randint(
            low=0, high=out_features, size=(self.n_centroids,)
        ).long()
        self.centroids = self.W[:, indices].t()  # (n_centroids x in_features)

    def step(self, i):
        """
        There are two standard steps for each iteration: expectation (E) and
        minimization (M). The E-step (assignment) is performed with an exhaustive
        search and the M-step (centroid computation) is performed with
        the exact solution.

        Args:
            - i: step number (only used in the log message)

        Raises:
            - EmptyClusterResolveError: propagated from
              resolve_empty_clusters() when empty clusters remain

        Remarks:
            - The E-step heavily uses PyTorch broadcasting to speed up computations
              and reduce the memory overhead
        """

        # assignments (E-step)
        distances = self.compute_distances()  # (n_centroids x out_features)
        self.assignments = torch.argmin(distances, dim=0)  # (out_features)
        n_empty_clusters = self.resolve_empty_clusters()

        # centroids (M-step)
        for k in range(self.n_centroids):
            W_k = self.W[:, self.assignments == k]  # (in_features x size_of_cluster_k)
            self.centroids[k] = W_k.mean(dim=1)  # (in_features)

        # book-keeping
        obj = (self.centroids[self.assignments].t() - self.W).norm(p=2).item()
        self.objective.append(obj)
        if self.verbose:
            logging.info(
                f"Iteration: {i},\t"
                f"objective: {obj:.6f},\t"
                f"resolved empty clusters: {n_empty_clusters}"
            )

    def resolve_empty_clusters(self):
        """
        If one cluster is empty, the most populated cluster is split into
        two clusters by shifting the respective centroids. This is done
        iteratively for a fixed number of tentatives.

        Returns:
            - the number of clusters that were empty on entry

        Raises:
            - EmptyClusterResolveError: if empty clusters remain once the
              tentative budget is exhausted
        """

        # empty clusters
        counts = Counter(self.assignments.tolist())
        empty_clusters = set(range(self.n_centroids)) - set(counts.keys())
        n_empty_clusters = len(empty_clusters)

        tentatives = 0
        while len(empty_clusters) > 0:
            # given an empty cluster, find most populated cluster and split it into two
            k = random.choice(list(empty_clusters))
            m = counts.most_common(1)[0][0]
            e = torch.randn_like(self.centroids[m]) * self.eps
            self.centroids[k] = self.centroids[m].clone()
            self.centroids[k] += e
            self.centroids[m] -= e

            # recompute assignments
            distances = self.compute_distances()  # (n_centroids x out_features)
            self.assignments = torch.argmin(distances, dim=0)  # (out_features)

            # check for empty clusters
            counts = Counter(self.assignments.tolist())
            empty_clusters = set(range(self.n_centroids)) - set(counts.keys())

            # Only give up when clusters are still empty: a split that succeeds
            # on the last allowed tentative must not be reported as a failure.
            if empty_clusters and tentatives == self.max_tentatives:
                logging.info(
                    f"Could not resolve all empty clusters, {len(empty_clusters)} remaining"
                )
                raise EmptyClusterResolveError
            tentatives += 1

        return n_empty_clusters

    def compute_distances(self):
        """
        For every centroid m, computes

                          ||M - m[None, :]||_2

        Returns:
            - a tensor of size (n_centroids x out_features)

        Raises:
            - RuntimeError: if the computation still fails once the centroids
              are processed one chunk per centroid (e.g. a shape mismatch, or
              a single centroid that does not fit in memory)

        Remarks:
            - We rely on PyTorch's broadcasting to speed up computations
              and reduce the memory overhead
            - Without chunking, the sizes in the broadcasting are modified as:
              (n_centroids x n_samples x out_features) -> (n_centroids x out_features)
            - The broadcasting computation is automatically chunked so that
              the tensors fit into the memory of the GPU
        """

        # chunk() never returns more chunks than there are centroids, so
        # doubling past that point cannot reduce memory any further.
        max_chunks = max(self.centroids.size(0), 1)
        nb_centroids_chunks = 1

        while True:
            try:
                return torch.cat(
                    [
                        (self.W[None, :, :] - centroids_c[:, :, None]).norm(p=2, dim=1)
                        for centroids_c in self.centroids.chunk(
                            nb_centroids_chunks, dim=0
                        )
                    ],
                    dim=0,
                )
            except RuntimeError:
                if nb_centroids_chunks >= max_chunks:
                    # finer chunking is impossible: surface the real error
                    # instead of retrying forever
                    raise
                nb_centroids_chunks *= 2

    def assign(self):
        """
        Assigns each column of W to its closest centroid, thus essentially
        performing the E-step in train().

        Remarks:
            - The function must be called after train() or after loading
              centroids using self.load(), otherwise it will return empty tensors
        """

        distances = self.compute_distances()  # (n_centroids x out_features)
        self.assignments = torch.argmin(distances, dim=0)  # (out_features)

    def save(self, path, layer):
        """
        Saves centroids, assignments and the objective history.

        Args:
            - path: folder used to save centroids and assignments
            - layer: prefix used for the three file names
        """

        torch.save(self.centroids, os.path.join(path, "{}_centroids.pth".format(layer)))
        torch.save(
            self.assignments, os.path.join(path, "{}_assignments.pth".format(layer))
        )
        torch.save(self.objective, os.path.join(path, "{}_objective.pth".format(layer)))

    def load(self, path, layer):
        """
        Loads centroids, assignments and the objective history from a given path.

        Args:
            - path: folder use to load centroids and assignments
            - layer: prefix used for the three file names
        """

        self.centroids = torch.load(
            os.path.join(path, "{}_centroids.pth".format(layer))
        )
        self.assignments = torch.load(
            os.path.join(path, "{}_assignments.pth".format(layer))
        )
        self.objective = torch.load(
            os.path.join(path, "{}_objective.pth".format(layer))
        )


class EmptyClusterResolveError(Exception):
    """Raised when empty clusters remain after max_tentatives splits."""

    pass
def __init__(
    self,
    centroids,
    assignments,
    bias,
    in_channels,
    out_channels,
    kernel_size,
    stride=1,
    padding=0,
    dilation=1,
    groups=1,
    padding_mode="zeros",
):
    """
    Builds the quantized convolution from a codebook and its assignments.

    Args:
        - centroids: 2D tensor; size(0) is stored as n_centroids and
          size(1) as block_size
        - assignments: 1D tensor of centroid indices, stored as a buffer
        - bias: tensor wrapped in an nn.Parameter, or None
        - in_channels, out_channels, kernel_size, stride, padding, dilation,
          groups, padding_mode: stored on the module; kernel_size, stride,
          padding and dilation go through _pair first

    Raises:
        - ValueError: if the sizes are incompatible with block_size,
          out_channels or groups
    """
    super(PQConv2d, self).__init__()
    self.block_size = centroids.size(1)
    self.n_centroids = centroids.size(0)
    self.in_channels = in_channels
    self.out_channels = out_channels
    self.kernel_size = _pair(kernel_size)
    self.stride = _pair(stride)
    self.padding = _pair(padding)
    self.dilation = _pair(dilation)
    self.groups = groups
    self.padding_mode = padding_mode
    # check compatibility
    # the flattened per-filter size must split evenly into blocks
    if in_channels // groups * np.prod(self.kernel_size) % self.block_size != 0:
        raise ValueError("Wrong PQ sizes")
    if len(assignments) % out_channels != 0:
        raise ValueError("Wrong PQ sizes")
    if in_channels % groups != 0:
        raise ValueError("in_channels must be divisible by groups")
    if out_channels % groups != 0:
        raise ValueError("out_channels must be divisible by groups")
    # define parameters
    self.centroids = nn.Parameter(centroids, requires_grad=True)
    self.register_buffer("assignments", assignments)
    # number of blocks assigned to each centroid index
    # NOTE(review): bincount is called without minlength, so counts may be
    # shorter than n_centroids when the highest indices are unused, and a
    # centroid with no assignment has count 0 -- confirm the gradient hook
    # below never meets either case.
    self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
    if bias is not None:
        self.bias = nn.Parameter(bias)
    else:
        self.register_parameter("bias", None)
    # register hook for averaging gradients per centroids instead of summing
    self.centroids.register_hook(lambda x: x / self.counts[:, None])
class PQEmbedding(nn.Module):
    """
    Product-quantized replacement for nn.Embedding. Only the codebook
    (centroids) and the block-to-centroid assignments are stored; the dense
    embedding matrix is rebuilt from them every time `weight` is read, which
    happens on each forward pass.

    Args:
        - centroids: codebook of size n_centroids x block_size
        - assignments: flat tensor of centroid indices, one per block of the
          embedding matrix, laid out as n_blocks x num_embeddings
        - num_embeddings, embedding_dim, padding_idx, max_norm, norm_type,
          scale_grad_by_freq, sparse: same meaning as for nn.Embedding; they
          are forwarded to F.embedding
        - _weight: accepted for signature compatibility and ignored

    Remarks:
        - See the official nn.Embedding documentation for the behavior of the
          forwarded arguments
        - Performance tests on GPU show that this implementation is 10% slower
          than the non-quantized nn.Embedding module for a standard training loop.
    """

    def __init__(
        self,
        centroids,
        assignments,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
    ):
        super().__init__()
        self.block_size = centroids.size(1)
        self.n_centroids = centroids.size(0)
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim

        # validate padding_idx and map a negative index to its positive twin
        if padding_idx is not None and padding_idx != 0:
            in_range = -self.num_embeddings <= padding_idx < self.num_embeddings
            assert in_range, "Padding_idx must be within num_embeddings"
            if padding_idx < 0:
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx

        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        self.sparse = sparse

        # the codebook must tile the embedding matrix exactly
        blocks_fit = self.embedding_dim % self.block_size == 0
        if not blocks_fit:
            raise ValueError("Wrong PQ sizes")
        rows_fit = len(assignments) % self.num_embeddings == 0
        if not rows_fit:
            raise ValueError("Wrong PQ sizes")

        # learnable codebook plus fixed index buffers
        self.centroids = nn.Parameter(centroids, requires_grad=True)
        self.register_buffer("assignments", assignments)
        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))

    @property
    def weight(self):
        """Dense (num_embeddings x embedding_dim) matrix rebuilt from the codebook."""
        # one codebook row per block
        looked_up = self.centroids[self.assignments]
        # (n_blocks, num_embeddings, block_size)
        per_block = looked_up.reshape(-1, self.num_embeddings, self.block_size)
        # (num_embeddings, n_blocks, block_size), then merge the last two dims
        per_row = per_block.permute(1, 0, 2)
        return per_row.flatten(1, 2)

    def forward(self, input):
        """Looks up `input` indices in the rebuilt embedding matrix."""
        dense = self.weight
        return F.embedding(
            input,
            dense,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )

    def extra_repr(self):
        """Summary shown by repr(); options at their default are omitted."""
        pieces = ["{num_embeddings}, {embedding_dim}"]
        if self.padding_idx is not None:
            pieces.append("padding_idx={padding_idx}")
        if self.max_norm is not None:
            pieces.append("max_norm={max_norm}")
        if self.norm_type != 2:
            pieces.append("norm_type={norm_type}")
        if self.scale_grad_by_freq is not False:
            pieces.append("scale_grad_by_freq={scale_grad_by_freq}")
        if self.sparse is not False:
            pieces.append("sparse=True")
        pieces.append("n_centroids={n_centroids}, block_size={block_size}")
        return ", ".join(pieces).format(**self.__dict__)
class PQLinear(nn.Module):
    """
    Quantized counterpart of nn.Linear module. Stores the centroid, the assignments
    and the non-quantized biases. The full weight is re-instantiated at each forward
    pass.

    Args:
        - centroids: centroids of size n_centroids x block_size
        - assignments: assignments of the centroids to the subvectors
          of size self.out_features x n_blocks
        - bias: the non-quantized bias (tensor) or None
        - in_features: size of each input sample
        - out_features: size of each output sample

    Raises:
        - ValueError: if in_features is not a multiple of block_size or
          len(assignments) is not a multiple of out_features

    Remarks:
        - We refer the reader to the official documentation of the nn.Linear module
          for the other arguments and the behavior of the module
        - Performance tests on GPU show that this implementation is 15% slower than
          the non-quantized nn.Linear module for a standard training loop.
    """

    def __init__(self, centroids, assignments, bias, in_features, out_features):
        super(PQLinear, self).__init__()
        self.block_size = centroids.size(1)
        self.n_centroids = centroids.size(0)
        self.in_features = in_features
        self.out_features = out_features
        # check compatibility
        if self.in_features % self.block_size != 0:
            raise ValueError("Wrong PQ sizes")
        if len(assignments) % self.out_features != 0:
            raise ValueError("Wrong PQ sizes")
        # define parameters
        self.centroids = nn.Parameter(centroids, requires_grad=True)
        self.register_buffer("assignments", assignments)
        # number of blocks assigned to each centroid index
        self.register_buffer("counts", torch.bincount(assignments).type_as(centroids))
        if bias is not None:
            self.bias = nn.Parameter(bias)
        else:
            self.register_parameter("bias", None)

    @property
    def weight(self):
        """Dense (out_features x in_features) weight rebuilt from the codebook."""
        return (
            self.centroids[self.assignments]
            .reshape(-1, self.out_features, self.block_size)
            .permute(1, 0, 2)
            .flatten(1, 2)
        )

    def forward(self, x):
        return F.linear(
            x,
            self.weight,
            self.bias,
        )

    def extra_repr(self):
        # Built from separate pieces: a backslash continuation inside a single
        # f-string would embed the source indentation in the repr text.
        return ", ".join(
            (
                f"in_features={self.in_features}",
                f"out_features={self.out_features}",
                f"n_centroids={self.n_centroids}",
                f"block_size={self.block_size}",
                f"bias={self.bias is not None}",
            )
        )
- If W is fully-connected (2D), its columns are split into blocks of size block_size. - If W is convolutional (4D), its filters are split along the spatial dimension. (2) We apply the standard EM/k-means algorithm to the resulting reshaped matrix. Args: - W: weight matrix to quantize of size (in_features x out_features) - block_size: size of the blocks (subvectors) - n_centroids: number of centroids - n_iter: number of k-means iterations - eps: for cluster reassignment when an empty cluster is found - max_tentatives for cluster reassignment when an empty cluster is found - verbose: print information after each iteration Remarks: - block_size be compatible with the shape of W """ def __init__( self, W, block_size, n_centroids=256, n_iter=20, eps=1e-6, max_tentatives=30, verbose=True, ): self.block_size = block_size W_reshaped = self._reshape(W) super(PQ, self).__init__( W_reshaped, n_centroids=n_centroids, n_iter=n_iter, eps=eps, max_tentatives=max_tentatives, verbose=verbose, ) def _reshape(self, W): """ Reshapes the matrix W as expained in step (1). """ # fully connected: by convention the weight has size out_features x in_features if len(W.size()) == 2: self.out_features, self.in_features = W.size() assert ( self.in_features % self.block_size == 0 ), "Linear: n_blocks must be a multiple of in_features" return ( W.reshape(self.out_features, -1, self.block_size) .permute(2, 1, 0) .flatten(1, 2) ) # convolutional: we reshape along the spatial dimension elif len(W.size()) == 4: self.out_channels, self.in_channels, self.k_h, self.k_w = W.size() assert ( self.in_channels * self.k_h * self.k_w ) % self.block_size == 0, ( "Conv2d: n_blocks must be a multiple of in_channels * k_h * k_w" ) return ( W.reshape(self.out_channels, -1, self.block_size) .permute(2, 1, 0) .flatten(1, 2) ) # not implemented else: raise NotImplementedError(W.size()) def encode(self): """ Performs self.n_iter EM steps. 
def quantize_model_(
    model,
    size_tracker,
    layers_to_quantize,
    block_sizes_config,
    n_centroids_config,
    step=0,
    n_iter=15,
    eps=1e-6,
    max_tentatives=100,
    remove_weights=False,
    verbose=True,
    state_dict=None,
):
    """
    Quantize a model in-place by stages. All the targeted
    layers are replaced by their quantized counterpart,
    and the model is ready for the finetuning of the
    centroids in a standard training loop (no modifications
    required). Note that we do not quantize biases.

    Args:
        - model: a nn.Module
        - size_tracker: useful for tracking quantization statistics
        - layers_to_quantize: a list containing regexps for
          filtering the layers to quantize at each stage according
          to their name (as in model.named_parameters())
        - block_sizes_config: dict like
          {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
              'Linear': ('in_features', {'*': 8})
          }
          For instance, all conv2d layers with kernel size 3x3 have
          a block size of 9 and all Linear layers are quantized with
          a block size of 8, irrespective of their size.
        - n_centroids_config: dict like
          {
              'Conv2d': ('kernel_size', {'*': 256}),
              'Linear': ('in_features', {'*': 256})
          }
          For instance, all conv2d layers are quantized with 256 centroids
        - step: the layers to quantize inplace corresponding
          to layers_to_quantize[step]
        - n_iter, eps, max_tentatives: forwarded to the PQ quantizer
        - remove_weights: forwarded to get_layers
        - verbose: log progress (only effective on the master process)
        - state_dict: optional checkpoint; when given together with
          n_iter == 0, centroids and assignments are filled with random
          placeholders of the checkpoint's sizes

    Returns:
        - the names of the layers that were quantized

    Raises:
        - ValueError: if a selected module is not a Linear, Embedding or Conv2d
    """

    quantized_layers = get_layers(
        model, layers_to_quantize[step], remove_weights=remove_weights
    )

    # book-keeping: only the master process logs
    is_master_process = (not dist.is_initialized()) or dist.get_rank() == 0
    verbose = verbose and is_master_process

    for layer in quantized_layers:
        # get block size and centroids
        module = attrgetter(layer)(model)
        block_size = get_param(module, layer, block_sizes_config)
        n_centroids = get_param(module, layer, n_centroids_config)
        if verbose:
            logging.info(
                f"Quantizing layer {layer} with block size {block_size} and {n_centroids} centroids"
            )

        # quantize layer
        weight = module.weight.data.clone()
        is_bias = any(name == "bias" for name, _ in module.named_parameters())
        bias = module.bias.data.clone() if is_bias else None
        quantizer = PQ(
            weight,
            block_size,
            n_centroids=n_centroids,
            n_iter=n_iter,
            eps=eps,
            max_tentatives=max_tentatives,
            verbose=verbose,
        )

        # quantization performed on all GPUs with same seed
        quantizer.encode()
        centroids = quantizer.centroids.contiguous()
        assignments = quantizer.assignments.contiguous()

        # If n_iter = 0 and state_dict is provided, then
        # we initialize random assignments and centroids to
        # random values of the appropriate dimensions
        # because the quantized model parameters will be
        # overwritten by the state_dict later on.
        if n_iter == 0 and state_dict:
            # Tensor.cuda() returns a copy and leaves the tensor it is called
            # on untouched, so the placeholders are created directly on the
            # device of the layer being quantized.
            device = weight.device

            # Initialize random centroids of the correct size
            centroids = torch.rand(centroids.size(), device=device)

            # Get counts and assignment keys from layer in loaded checkpoint.
            counts_key = layer + "." + "counts"
            assignment_key = layer + "." + "assignments"

            # Get number of different bins to include.
            counts = state_dict[counts_key].shape[0]

            # Initialize random assignments of the correct size
            # with an appropriate number of bins.
            num_assignments = state_dict[assignment_key].shape[0]
            num_extra = num_assignments - counts
            logging.debug(
                f"Placeholder assignments for {layer}: {counts} bins, "
                f"{num_assignments} assignments ({num_extra} drawn at random)"
            )

            # every bin appears at least once so that bincount has the
            # same length as the checkpoint's counts buffer
            assignments_bins = torch.arange(counts, device=device)
            # the upper bound of randint is exclusive
            assignments_rand = torch.randint(0, counts, (num_extra,), device=device)
            assignments = torch.cat((assignments_bins, assignments_rand), 0)

        # broadcast results to make sure weights are up-to-date
        if dist.is_initialized():
            dist.broadcast(centroids, 0)
            dist.broadcast(assignments, 0)

        # instantiate the quantized counterpart
        if isinstance(module, nn.Linear):
            quantized_module = PQLinear(
                centroids,
                assignments,
                bias,
                module.in_features,
                module.out_features,
            )
        elif isinstance(module, nn.Embedding):
            quantized_module = PQEmbedding(
                centroids,
                assignments,
                module.num_embeddings,
                module.embedding_dim,
            )
        elif isinstance(module, nn.Conv2d):
            quantized_module = PQConv2d(
                centroids,
                assignments,
                bias,
                module.in_channels,
                module.out_channels,
                module.kernel_size,
                stride=module.stride,
                padding=module.padding,
                dilation=module.dilation,
                groups=module.groups,
                padding_mode=module.padding_mode,
            )
        else:
            raise ValueError(f"Module {module} not yet supported for quantization")

        # replace layer by its quantized counterpart
        attrsetter(layer)(model, quantized_module)

        # update statistics
        size_tracker.update(weight, block_size, n_centroids)

    # return name of quantized layers
    return quantized_layers
class SizeTracker(object):
    """
    Running estimate (in MB) of the model size as layers get quantized
    with iPQ.

    Args:
        - model: a nn.Module

    Remarks:
        - The compressed size is the sum of three components
          for each layer in the network:
              (1) Storing the centroids given by iPQ in fp16
              (2) Storing the assignments of the blocks in int8
              (3) Storing all non-compressed elements such as biases
        - This cost is only valid if we use 256 centroids (then
          indexing can indeed be done with int8).
    """

    def __init__(self, model):
        self.model = model
        # both figures start at the fp32 size of the whole model
        initial_size = self.compute_size()
        self.size_non_compressed_model = initial_size
        self.size_non_quantized = initial_size
        self.size_index = 0
        self.size_centroids = 0
        self.n_quantized_layers = 0

    def compute_size(self):
        """
        Computes the size of the model (in MB), counting 4 bytes
        per parameter.
        """

        n_params = sum(param.numel() for _, param in self.model.named_parameters())
        return n_params * 4 / 1024 / 1024

    def update(self, W, block_size, n_centroids):
        """
        Updates the running statistics when quantizing a new layer.

        Args:
            - W: weight of the layer that was quantized
            - block_size: size of the subvectors
            - n_centroids: size of the codebook
        """

        n_weights = W.numel()
        self.n_quantized_layers += 1

        # indexing cost: log2(n_centroids) bits shared by block_size weights (in MB)
        bits_per_weight = np.log2(n_centroids) / block_size
        self.size_index += bits_per_weight * n_weights / 8 / 1024 / 1024

        # codebook cost: centroids stored in float16 (in MB)
        self.size_centroids += n_centroids * block_size * 2 / 1024 / 1024

        # the fp32 weight no longer counts as non-quantized (in MB)
        self.size_non_quantized -= n_weights * 4 / 1024 / 1024

    def __repr__(self):
        total = self.size_index + self.size_centroids + self.size_non_quantized
        ratio = self.size_non_compressed_model / total  # NOQA
        return (
            f"Non-compressed model size: {self.size_non_compressed_model:.2f} MB. "
            f"After quantizing {self.n_quantized_layers} layers, size "
            f"(indexing + centroids + other): {self.size_index:.2f} MB + "
            f"{self.size_centroids:.2f} MB + {self.size_non_quantized:.2f} MB = "
            f"{total:.2f} MB, compression ratio: {ratio:.2f}x"
        )
`key` and `value` into a two argument tuple of those values.""" return (yaml_dictionary["key"], yaml_dictionary["value"]) ================================================ FILE: fairseq/modules/quantization/scalar/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .utils import quantize_model_ # NOQA ================================================ FILE: fairseq/modules/quantization/scalar/modules/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from .qact import ActivationQuantizer # NOQA from .qconv import IntConv2d # NOQA from .qemb import IntEmbedding # NOQA from .qlinear import IntLinear # NOQA ================================================ FILE: fairseq/modules/quantization/scalar/modules/qact.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import torch from ..ops import emulate_int class ActivationQuantizer: """ Fake scalar quantization of the activations using a forward hook. Args: - module. 
a nn.Module for which we quantize the *post-activations* - p: proportion of activations to quantize, set by default to 1 - update_step: to recompute quantization parameters - bits: number of bits for quantization - method: choose among {"tensor", "histogram", "channel"} - clamp_threshold: to prevent gradients overflow Remarks: - Parameters scale and zero_point are recomputed every update_step forward pass to reduce the overhead - For the list of quantization methods and number of bits, see ops.py - To remove the hook from the module, simply call self.handle.remove() - At test time, the activations are fully quantized - We use the straight-through estimator so that the gradients back-propagate nicely in the network, this is implemented with the detach() trick - The activations are hard-clamped in [-clamp_threshold, clamp_threshold] to prevent overflow during the backward pass """ def __init__( self, module, p=1, update_step=1000, bits=8, method="histogram", clamp_threshold=5, ): self.module = module self.p = p self.update_step = update_step self.counter = 0 self.bits = bits self.method = method self.clamp_threshold = clamp_threshold self.handle = None self.register_hook() def register_hook(self): # forward hook def quantize_hook(module, x, y): # update parameters every 1000 iterations if self.counter % self.update_step == 0: self.scale = None self.zero_point = None self.counter += 1 # train with QuantNoise and evaluate the fully quantized network p = self.p if self.module.training else 1 # quantize activations y_q, self.scale, self.zero_point = emulate_int( y.detach(), bits=self.bits, method=self.method, scale=self.scale, zero_point=self.zero_point, ) # mask to apply noise mask = torch.zeros_like(y) mask.bernoulli_(1 - p) noise = (y_q - y).masked_fill(mask.bool(), 0) # using straight-through estimator (STE) clamp_low = -self.scale * self.zero_point clamp_high = self.scale * (2**self.bits - 1 - self.zero_point) return torch.clamp(y, clamp_low.item(), 
class IntConv2d(_ConvNd):
    """
    Quantized counterpart of the nn.Conv2d module that applies QuantNoise during training.

    Args:
        - standard nn.Conv2d parameters
        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
        - bits: number of bits
        - method: choose among {"tensor", "histogram", "channel"}
        - update_step: recompute scale and zero_point every update_steps iterations

    Remarks:
        - We use the straight-through estimator so that the gradients
          back-propagate nicely in the network, this is implemented with
          the detach() trick
        - Parameters scale and zero_point are recomputed every update_step
          forward pass to reduce the overhead
        - At test time, the weights are fully quantized
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        stride=1,
        padding=0,
        dilation=1,
        groups=1,
        bias=True,
        padding_mode="zeros",
        p=0,
        bits=8,
        method="histogram",
        update_step=1000,
    ):
        kernel_size = _pair(kernel_size)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        super(IntConv2d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding,
            dilation,
            False,
            _pair(0),
            groups,
            bias,
            padding_mode,
        )

        # quantization parameters
        self.p = p
        self.bits = bits
        self.method = method
        self.update_step = update_step
        self.counter = 0

    def _conv_forward(self, input, weight):
        """Run the 2d convolution of ``input`` with the given ``weight``.

        For padding modes other than "zeros" the input is padded explicitly
        with F.pad and the convolution itself then uses no padding.
        """
        if self.padding_mode != "zeros":
            # F.pad expects the padding of the last dimension first, i.e.
            # (w, w, h, h) for self.padding == (h, w). Prefer the tuple that
            # _ConvNd precomputes; build it here when the running torch
            # version does not provide that attribute.
            pad = getattr(self, "_reversed_padding_repeated_twice", None)
            if pad is None:
                pad = tuple(x for x in reversed(self.padding) for _ in range(2))
            return F.conv2d(
                F.pad(input, pad, mode=self.padding_mode),
                weight,
                self.bias,
                self.stride,
                _pair(0),
                self.dilation,
                self.groups,
            )
        return F.conv2d(
            input,
            weight,
            self.bias,
            self.stride,
            self.padding,
            self.dilation,
            self.groups,
        )

    def forward(self, input):
        """Convolve ``input`` with the (partially) quantized weight."""
        # train with QuantNoise and evaluate the fully quantized network
        p = self.p if self.training else 1

        # reset scale / zero_point every `update_step` forward passes so that
        # emulate_int recomputes them
        if self.counter % self.update_step == 0:
            self.scale = None
            self.zero_point = None
        self.counter += 1

        # quantize weight
        weight_quantized, self.scale, self.zero_point = emulate_int(
            self.weight.detach(),
            bits=self.bits,
            method=self.method,
            scale=self.scale,
            zero_point=self.zero_point,
        )

        # mask to apply noise: entries where the mask is 1 keep the original
        # (non-quantized) weight
        mask = torch.zeros_like(self.weight)
        mask.bernoulli_(1 - p)
        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)

        # using straight-through estimator (STE)
        clamp_low = -self.scale * self.zero_point
        clamp_high = self.scale * (2**self.bits - 1 - self.zero_point)
        weight = (
            torch.clamp(self.weight, clamp_low.item(), clamp_high.item())
            + noise.detach()
        )

        # return output
        output = self._conv_forward(input, weight)
        return output

    def extra_repr(self):
        return (
            "in_channels={}, out_channels={}, kernel_size={}, stride={}, "
            "padding={}, dilation={}, groups={}, bias={}, quant_noise={}, "
            "bits={}, method={}".format(
                self.in_channels,
                self.out_channels,
                self.kernel_size,
                self.stride,
                self.padding,
                self.dilation,
                self.groups,
                self.bias is not None,
                self.p,
                self.bits,
                self.method,
            )
        )
class IntEmbedding(nn.Module):
    """
    Quantized counterpart of the nn.Embedding module that applies QuantNoise during training.

    Args:
        - num_embeddings: number of tokens
        - embedding_dim: embedding dimension
        - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights)
        - bits: number of bits
        - method: choose among {"tensor", "histogram", "channel"}
        - update_step: recompute scale and zero_point every update_steps iterations

    Remarks:
        - We use the straight-through estimator so that the gradients
          back-propagate nicely in the network, this is implemented with
          the detach() trick
        - Parameters scale and zero_point are recomputed every update_step
          forward pass to reduce the overhead
        - At test time, the weights are fully quantized
    """

    def __init__(
        self,
        num_embeddings,
        embedding_dim,
        padding_idx=None,
        max_norm=None,
        norm_type=2.0,
        scale_grad_by_freq=False,
        sparse=False,
        _weight=None,
        p=0,
        update_step=1000,
        bits=8,
        method="histogram",
    ):
        super(IntEmbedding, self).__init__()
        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        if padding_idx is not None:
            if padding_idx > 0:
                assert (
                    padding_idx < self.num_embeddings
                ), "Padding_idx must be within num_embeddings"
            elif padding_idx < 0:
                assert (
                    padding_idx >= -self.num_embeddings
                ), "Padding_idx must be within num_embeddings"
                # normalize a negative index to its non-negative equivalent
                padding_idx = self.num_embeddings + padding_idx
        self.padding_idx = padding_idx
        self.max_norm = max_norm
        self.norm_type = norm_type
        self.scale_grad_by_freq = scale_grad_by_freq
        if _weight is None:
            self.weight = nn.Parameter(torch.Tensor(num_embeddings, embedding_dim))
            self.reset_parameters()
        else:
            assert list(_weight.shape) == [
                num_embeddings,
                embedding_dim,
            ], "Shape of weight does not match num_embeddings and embedding_dim"
            self.weight = nn.Parameter(_weight)
        self.sparse = sparse

        # quantization parameters
        self.p = p
        self.bits = bits
        self.method = method
        self.update_step = update_step
        self.counter = 0

    def reset_parameters(self):
        """Draw the weights from a normal distribution and zero the padding row."""
        nn.init.normal_(self.weight)
        if self.padding_idx is not None:
            with torch.no_grad():
                self.weight[self.padding_idx].fill_(0)

    def forward(self, input):
        """Look up ``input`` in the (partially) quantized embedding table."""
        # train with QuantNoise and evaluate the fully quantized network
        p = self.p if self.training else 1

        # reset scale / zero_point every `update_step` forward passes so that
        # emulate_int recomputes them
        if self.counter % self.update_step == 0:
            self.scale = None
            self.zero_point = None
        self.counter += 1

        # quantize weight
        weight_quantized, self.scale, self.zero_point = emulate_int(
            self.weight.detach(),
            bits=self.bits,
            method=self.method,
            scale=self.scale,
            zero_point=self.zero_point,
        )

        # mask to apply noise: entries where the mask is 1 keep the original
        # (non-quantized) weight
        mask = torch.zeros_like(self.weight)
        mask.bernoulli_(1 - p)
        noise = (weight_quantized - self.weight).masked_fill(mask.bool(), 0)

        # using straight-through estimator (STE)
        clamp_low = -self.scale * self.zero_point
        clamp_high = self.scale * (2**self.bits - 1 - self.zero_point)
        weight = (
            torch.clamp(self.weight, clamp_low.item(), clamp_high.item())
            + noise.detach()
        )

        # return output
        output = F.embedding(
            input,
            weight,
            self.padding_idx,
            self.max_norm,
            self.norm_type,
            self.scale_grad_by_freq,
            self.sparse,
        )
        return output

    def extra_repr(self):
        s = "{num_embeddings}, {embedding_dim}"
        if self.padding_idx is not None:
            s += ", padding_idx={padding_idx}"
        if self.max_norm is not None:
            s += ", max_norm={max_norm}"
        if self.norm_type != 2:
            s += ", norm_type={norm_type}"
        if self.scale_grad_by_freq is not False:
            s += ", scale_grad_by_freq={scale_grad_by_freq}"
        if self.sparse is not False:
            s += ", sparse=True"
        # leading ", " keeps the quantization fields separated from whatever
        # field was emitted last
        s += ", quant_noise={p}, bits={bits}, method={method}"
        return s.format(**self.__dict__)
import torch import torch.nn as nn import torch.nn.functional as F from ..ops import emulate_int class IntLinear(nn.Module): """ Quantized counterpart of the nn.Linear module that applies QuantNoise during training. Args: - in_features: input features - out_features: output features - bias: bias or not - p: amount of noise to inject (0 = no quantization, 1 = quantize all the weights) - bits: number of bits - method: choose among {"tensor", "histogram", "channel"} - update_step: recompute scale and zero_point every update_steps iterations Remarks: - We use the straight-through estimator so that the gradients back-propagate nicely in the network, this is implemented with the detach() trick. - Parameters scale and zero_point are recomputed every update_step forward pass to reduce the overhead - At test time, the weights are fully quantized """ def __init__( self, in_features, out_features, bias=True, p=0, update_step=3000, bits=8, method="histogram", ): super(IntLinear, self).__init__() self.in_features = int(in_features) self.out_features = int(out_features) self.weight = torch.nn.Parameter(torch.Tensor(out_features, in_features)) self.chosen_bias = bias if self.chosen_bias: self.bias = torch.nn.Parameter(torch.Tensor(out_features)) else: self.register_parameter("bias", None) self.reset_parameters() # quantization parameters self.p = p self.bits = bits self.method = method self.update_step = update_step self.counter = 0 def reset_parameters(self): nn.init.xavier_uniform_(self.weight) if self.chosen_bias: nn.init.constant_(self.bias, 0.0) return def forward(self, input): # train with QuantNoise and evaluate the fully quantized network p = self.p if self.training else 1 # update parameters every 100 iterations if self.counter % self.update_step == 0: self.scale = None self.zero_point = None self.counter += 1 # quantize weight weight_quantized, self.scale, self.zero_point = emulate_int( self.weight.detach(), bits=self.bits, method=self.method, scale=self.scale, 
def emulate_int(w, bits, method, scale=None, zero_point=None):
    """Fake-quantize ``w`` with the module-level function ``emulate_int8_<method>``.

    Args:
        w: tensor to quantize.
        bits: number of bits, forwarded to the selected function.
        method: suffix of the function to use; this module defines
            "histogram", "channel" and "tensor".
        scale, zero_point: previously computed quantization parameters, or
            None to have them (re)computed from ``w``.

    Returns:
        (quantized tensor, scale, zero_point)

    Raises:
        KeyError: if no ``emulate_int8_<method>`` function exists.
    """
    q = globals()[f"emulate_int8_{method}"]
    return q(w, scale=scale, zero_point=zero_point, bits=bits)


def quantize(w, scale, zero_point, bits=8):
    """Round ``w`` onto the integer grid defined by (scale, zero_point) and map it back.

    The integer code is clamped to [0, 2**bits - 1].
    """
    # In the default behavior, max_val = 255.
    max_val = 2**bits - 1
    return (
        torch.clamp(torch.round(w / scale + zero_point), 0, max_val) - zero_point
    ) * scale


def _qparams_like(w, scale, zero_point):
    """Move freshly observed qparams to the device and dtype of ``w``."""
    # Follow w instead of forcing .cuda(): keeps CPU tensors usable and keeps
    # the parameters on the same GPU as w in multi-GPU setups.
    return (
        scale.to(device=w.device, dtype=w.dtype),
        zero_point.to(device=w.device, dtype=w.dtype),
    )


# NOTE(review): in the three functions below the observers are built without
# reference to `bits`; `bits` only affects the clamp range used in `quantize`.
# Confirm this is intended for bits != 8.


def emulate_int8_histogram(w, scale=None, zero_point=None, bits=8):
    """Fake-quantize ``w``; missing qparams come from a HistogramObserver."""
    if scale is None:
        obs = quantization.observer.HistogramObserver()
        obs.to(device=w.device)
        _ = obs(w.float())
        scale, zero_point = obs.calculate_qparams()
        scale, zero_point = _qparams_like(w, scale, zero_point)
    return quantize(w, scale, zero_point, bits=bits), scale, zero_point


def emulate_int8_channel(w, scale=None, zero_point=None, bits=8):
    """Fake-quantize ``w``; missing qparams come from a symmetric
    PerChannelMinMaxObserver over the last axis of ``w``."""
    if scale is None:
        obs = quantization.observer.PerChannelMinMaxObserver(
            ch_axis=-1, qscheme=torch.per_channel_symmetric
        )
        obs.to(device=w.device)
        _ = obs(w)
        # calculate_qparams() is used rather than the legacy get_qparams(),
        # whose extra ch_axis return value was discarded here anyway.
        scale, zero_point = obs.calculate_qparams()
        scale, zero_point = _qparams_like(w, scale, zero_point)
    return quantize(w, scale, zero_point, bits=bits), scale, zero_point


def emulate_int8_tensor(w, scale=None, zero_point=None, bits=8):
    """Fake-quantize ``w``; missing qparams come from a per-tensor MinMaxObserver."""
    if scale is None:
        obs = quantization.observer.MinMaxObserver()
        obs.to(device=w.device)
        _ = obs(w)
        scale, zero_point = obs.calculate_qparams()
        scale, zero_point = _qparams_like(w, scale, zero_point)
    return quantize(w, scale, zero_point, bits=bits), scale, zero_point
import logging from operator import attrgetter import torch.distributed as dist import torch.nn as nn from ..pq.utils import attrsetter, get_layers from .modules import ActivationQuantizer, IntConv2d, IntEmbedding, IntLinear MAPPING = {nn.Linear: IntLinear, nn.Embedding: IntEmbedding, nn.Conv2d: IntConv2d} def quantize_model_( model, p=0.2, bits=8, update_step=3000, method="histogram", remove_weights=False ): """ Replaces all modules with their scalar quantized counterpart and registers hooks to quantize the post-ativations of those modules. Args: - model: a nn.Module - p: amount of noise (0 for no noise, 1 to quantize all the weights/activations) - bits: number of bits - update_step: update quantization parameters every update_step steps """ # quantize all layers # remove weights indicates whether the weights extension should be removed, in addition to # weight_orig and weight extension on names quantized_layers = get_layers(model, "(.*?)", remove_weights=remove_weights) for layer in quantized_layers: # book-keeping is_master_process = (not dist.is_initialized()) or ( dist.is_initialized() and dist.get_rank() == 0 ) # recover module module = attrgetter(layer)(model) if is_master_process: logging.info( f"Quantizing layer {layer} with bits={bits} and QuantNoise={p}" ) # quantization params q_params = { "p": p, "update_step": update_step, "bits": bits, "method": method, "counter": 0, } # instantiate the quantized counterpart if isinstance(module, tuple(MAPPING.keys())): QuantizedModule = MAPPING[module.__class__] quantized_module = QuantizedModule.__new__(QuantizedModule) params = module.__dict__ params.update(q_params) quantized_module.__dict__.update(params) else: if is_master_process: logging.info(f"Module {module} not yet supported for quantization") continue # activation quantization a_q = ActivationQuantizer(quantized_module, p=0, bits=bits, method=method) # replace layer by its quantized counterpart attrsetter(layer)(model, quantized_module) # return name of 
class RotaryPositionalEmbedding(torch.nn.Module):
    def __init__(self, dim, base=10000, precision=torch.half):
        """Rotary positional embedding
        Reference : https://blog.eleuther.ai/rotary-embeddings/
        Paper: https://arxiv.org/pdf/2104.09864.pdf
        Args:
            dim: Dimension of embedding
            base: Base value for exponential
            precision: precision to use for numerical values
                (stored on the module; not otherwise used in this class)
        """
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq)
        # cos/sin tables are plain attributes (not buffers) that are rebuilt
        # lazily in forward()
        self.seq_len_cached = 0
        self.cos_cached = torch.empty(self.seq_len_cached, 1, 1, dim)
        self.sin_cached = torch.empty(self.seq_len_cached, 1, 1, dim)
        self.precision = precision

    def forward(self, x, seq_len: int = 0):
        """
        Args:
            x: Input x with T X B X C
            seq_len: Sequence length of input x
        Returns:
            (cos, sin) tables of shape (L, 1, 1, dim) with L >= seq_len; the
            tables may be longer than seq_len when a longer request was
            served before.
        """
        # The tables are not moved by Module.to(), so a populated cache that
        # lives on another device than x has to be rebuilt as well.
        wrong_device = (
            self.seq_len_cached > 0 and self.cos_cached.device != x.device
        )
        if seq_len > self.seq_len_cached or wrong_device:
            # never shrink the cache when rebuilding because of a device change
            self.seq_len_cached = max(seq_len, self.seq_len_cached)
            t = torch.arange(self.seq_len_cached, device=x.device).type_as(
                self.inv_freq
            )
            freqs = torch.einsum("i,j->ij", t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)
            self.cos_cached = emb.cos().view(emb.size(0), 1, 1, emb.size(1))
            self.sin_cached = emb.sin().view(emb.size(0), 1, 1, emb.size(1))
        return self.cos_cached, self.sin_cached


# rotary pos emb helpers:
def rotate_half(x):
    """Split the last dimension of ``x`` in two halves (x1, x2) and return (-x2, x1)."""
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat(
        (-x2, x1), dim=x1.ndim - 1
    )  # dim=-1 triggers a bug in earlier torch versions


def apply_rotary_pos_emb(q, k, cos, sin, offset: int = 0):
    """Rotate ``q`` and ``k`` with the rows [offset, offset + q.shape[0]) of the
    cos/sin tables and return the rotated (q, k) pair."""
    cos, sin = (
        cos[offset : q.shape[0] + offset, ...],
        sin[offset : q.shape[0] + offset, ...],
    )
    return (q * cos) + (rotate_half(q) * sin), (k * cos) + (rotate_half(k) * sin)
class ScalarBias(torch.autograd.Function):
    """
    Prepends one slice filled with a scalar along a given dimension. Used in
    the self-attention mechanism so that the model can optionally attend to
    this extra slot instead of the past.
    """

    @staticmethod
    def forward(ctx, input, dim, bias_init):
        # remember which dimension grew so that backward can undo it
        ctx.dim = dim

        grown_shape = list(input.size())
        grown_shape[dim] += 1

        # allocate the enlarged tensor, fill everything with the bias value,
        # then overwrite all but the first slice along `dim` with the input
        output = input.new(*grown_shape)
        output.fill_(bias_init)
        payload = output.narrow(dim, 1, grown_shape[dim] - 1)
        payload.copy_(input)
        return output

    @staticmethod
    def backward(ctx, grad):
        # drop the gradient of the prepended slice; `dim` and `bias_init`
        # receive no gradient
        n_original = grad.size(ctx.dim) - 1
        grad_input = grad.narrow(ctx.dim, 1, n_original)
        return grad_input, None, None


def scalar_bias(input, dim, bias_init=0):
    """Functional entry point for :class:`ScalarBias`."""
    return ScalarBias.apply(input, dim, bias_init)
class SinusoidalPositionalEmbedding(nn.Module):
    """This module produces sinusoidal positional embeddings of any length.

    Padding symbols are ignored.
    """

    def __init__(self, embedding_dim, padding_idx, init_size=1024, auto_expand=True):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.padding_idx = padding_idx if padding_idx is not None else 0
        # non-persistent: the table is recomputed, never stored in checkpoints
        self.register_buffer(
            "weights",
            SinusoidalPositionalEmbedding.get_embedding(
                init_size, embedding_dim, padding_idx
            ),
            persistent=False,
        )
        self.max_positions = int(1e5)
        self.auto_expand = auto_expand
        self.onnx_trace = False

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        # Ignore some deprecated keys that were used in older versions
        deprecated_keys = ["weights", "_float_tensor"]
        for key in deprecated_keys:
            if prefix + key in state_dict:
                del state_dict[prefix + key]
        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)

    @staticmethod
    def get_embedding(
        num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None
    ):
        """Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly
        from the description in Section 3.5 of "Attention Is All You Need".

        Returns a float tensor of shape (num_embeddings, embedding_dim): the
        first half of each row holds the sines, the second half the cosines,
        followed by one zero column when embedding_dim is odd. The row at
        ``padding_idx`` (if given) is all zeros.
        """
        half_dim = embedding_dim // 2
        # With a single frequency (embedding_dim of 2 or 3) the increment is
        # irrelevant, but half_dim - 1 would be a zero divisor; clamp it to 1.
        emb = math.log(10000) / max(half_dim - 1, 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(
            1
        ) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(
            num_embeddings, -1
        )
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0
        return emb

    def forward(
        self,
        input,
        incremental_state: Optional[Any] = None,
        timestep: Optional[Tensor] = None,
        positions: Optional[Any] = None,
    ):
        """Input is expected to be of size [bsz x seqlen]."""
        bspair = torch.onnx.operators.shape_as_tensor(input)
        bsz, seq_len = bspair[0], bspair[1]
        max_pos = self.padding_idx + 1 + seq_len
        weights = self.weights

        if max_pos > self.weights.size(0):
            # If the input is longer than the number of pre-computed embeddings,
            # compute the extra embeddings on the fly.
            # Only store the expanded embeddings if auto_expand=True.
            # In multithreading environments, mutating the weights of a module
            # may cause trouble. Set auto_expand=False if this happens.
            weights = SinusoidalPositionalEmbedding.get_embedding(
                max_pos, self.embedding_dim, self.padding_idx
            ).to(self.weights)
            if self.auto_expand:
                self.weights = weights

        if incremental_state is not None:
            # positions is the same for every token when decoding a single step
            pos = timestep.view(-1)[0] + 1 if timestep is not None else seq_len
            if self.onnx_trace:
                return (
                    weights.index_select(index=self.padding_idx + pos, dim=0)
                    .unsqueeze(1)
                    .repeat(bsz, 1, 1)
                )
            return weights[self.padding_idx + pos, :].expand(bsz, 1, -1)

        # the `positions` argument is not consulted; positions are always
        # derived from `input`
        positions = utils.make_positions(
            input, self.padding_idx, onnx_trace=self.onnx_trace
        )
        if self.onnx_trace:
            flat_embeddings = weights.detach().index_select(0, positions.view(-1))
            embedding_shape = torch.cat(
                (bsz.view(1), seq_len.view(1), torch.tensor([-1], dtype=torch.long))
            )
            embeddings = torch.onnx.operators.reshape_from_tensor_shape(
                flat_embeddings, embedding_shape
            )
            return embeddings
        return (
            weights.index_select(0, positions.view(-1)).view(bsz, seq_len, -1).detach()
        )
""" def __init__( self, embed_dim, num_heads, kdim=None, vdim=None, dropout=0.0, bias=True, add_bias_kv=False, add_zero_attn=False, self_attention=False, encoder_decoder_attention=False, stride=32, expressivity=8, is_bidirectional=True, ): super().__init__( embed_dim, num_heads, kdim, vdim, dropout, bias, add_bias_kv, add_zero_attn, self_attention, encoder_decoder_attention, ) self.is_bidirectional = is_bidirectional self.stride = stride self.expressivity = expressivity assert self.stride > 0 and self.stride >= self.expressivity # Used for Ai(2) calculations - beginning of [l-c, l] range def compute_checkpoint(self, word_index): if word_index % self.stride == 0 and word_index != 0: checkpoint_index = word_index - self.expressivity else: checkpoint_index = ( math.floor(word_index / self.stride) * self.stride + self.stride - self.expressivity ) return checkpoint_index # Computes Ai(2) def compute_subset_summaries(self, absolute_max): checkpoint_index = self.compute_checkpoint(0) subset_two = set() while checkpoint_index <= absolute_max - 1: summary = set( range( checkpoint_index, min(checkpoint_index + self.expressivity + 1, absolute_max), ) ) subset_two = subset_two.union(summary) checkpoint_index = self.compute_checkpoint(checkpoint_index + self.stride) return subset_two # Sparse Transformer Fixed Attention Pattern: https://arxiv.org/pdf/1904.10509.pdf def compute_fixed_attention_subset(self, word_index, tgt_len): # +1s account for range function; [min, max) -> [min, max] if not self.is_bidirectional: absolute_max = word_index + 1 else: absolute_max = tgt_len # Subset 1 - whole window rounded_index = ( math.floor((word_index + self.stride) / self.stride) * self.stride ) if word_index % self.stride == 0 and word_index != 0: subset_one = set( range(word_index - self.stride, min(absolute_max, word_index + 1)) ) else: subset_one = set( range( max(0, rounded_index - self.stride), min(absolute_max, rounded_index + 1), ) ) # Subset 2 - summary per window # If 
bidirectional, subset 2 is the same for every index subset_two = set() if not self.is_bidirectional: subset_two = self.compute_subset_summaries(absolute_max) return subset_one.union(subset_two) # Compute sparse mask - if bidirectional, can pre-compute and store def buffered_sparse_mask(self, tensor, tgt_len, src_len): assert tgt_len > self.stride sparse_mask = torch.empty((tgt_len, src_len)).float().fill_(float("-inf")) # If bidirectional, subset 2 is the same for every index subset_summaries = set() if self.is_bidirectional: subset_summaries = self.compute_subset_summaries(tgt_len) for i in range(tgt_len): fixed_attention_subset = self.compute_fixed_attention_subset(i, tgt_len) fixed_attention_subset = fixed_attention_subset.union(subset_summaries) included_word_indices = torch.LongTensor(list(fixed_attention_subset)) sparse_mask[i].index_fill_(0, included_word_indices, 0) return sparse_mask.type_as(tensor) def apply_sparse_mask(self, attn_weights, tgt_len, src_len, bsz): sparse_mask = self.buffered_sparse_mask(attn_weights, tgt_len, src_len) sparse_mask = sparse_mask.unsqueeze(0).expand( bsz * self.num_heads, tgt_len, src_len ) attn_weights += sparse_mask ================================================ FILE: fairseq/modules/sparse_transformer_sentence_encoder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class SparseTransformerSentenceEncoder(TransformerSentenceEncoder):
    """
    TransformerSentenceEncoder whose layer stack is made of
    SparseTransformerSentenceEncoderLayer modules - see SparseMultiheadAttention
    """

    def __init__(
        self,
        padding_idx: int,
        vocab_size: int,
        num_encoder_layers: int = 6,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        max_seq_len: int = 256,
        num_segments: int = 2,
        use_position_embeddings: bool = True,
        offset_positions_by_padding: bool = True,
        encoder_normalize_before: bool = False,
        apply_bert_init: bool = False,
        activation_fn: str = "relu",
        learned_pos_embedding: bool = True,
        embed_scale: float = None,
        freeze_embeddings: bool = False,
        n_trans_layers_to_freeze: int = 0,
        export: bool = False,
        is_bidirectional: bool = True,
        stride: int = 32,
        expressivity: int = 8,
    ) -> None:
        # Everything except the three sparse-attention options is handed to
        # the parent constructor unchanged, in the same positional order.
        super().__init__(
            padding_idx,
            vocab_size,
            num_encoder_layers,
            embedding_dim,
            ffn_embedding_dim,
            num_attention_heads,
            dropout,
            attention_dropout,
            activation_dropout,
            max_seq_len,
            num_segments,
            use_position_embeddings,
            offset_positions_by_padding,
            encoder_normalize_before,
            apply_bert_init,
            activation_fn,
            learned_pos_embedding,
            embed_scale,
            freeze_embeddings,
            n_trans_layers_to_freeze,
            export,
        )

        # Overwrite `self.layers` with a fresh stack of sparse layers
        # (presumably replacing the one the parent constructor built - verify
        # against TransformerSentenceEncoder).
        sparse_layers = []
        for _ in range(num_encoder_layers):
            sparse_layers.append(
                SparseTransformerSentenceEncoderLayer(
                    embedding_dim=self.embedding_dim,
                    ffn_embedding_dim=ffn_embedding_dim,
                    num_attention_heads=num_attention_heads,
                    dropout=dropout,
                    attention_dropout=attention_dropout,
                    activation_dropout=activation_dropout,
                    activation_fn=activation_fn,
                    export=export,
                    is_bidirectional=is_bidirectional,
                    stride=stride,
                    expressivity=expressivity,
                )
            )
        self.layers = nn.ModuleList(sparse_layers)

        # Freeze the first `n_trans_layers_to_freeze` of the new layers.
        for frozen_idx in range(n_trans_layers_to_freeze):
            frozen_layer = self.layers[frozen_idx]
            if frozen_layer is None:
                continue
            for param in frozen_layer.parameters():
                param.requires_grad = False
from typing import Dict, List, Optional import torch import torch.nn as nn from torch import Tensor from fairseq import utils from fairseq.models.transformer import TransformerConfig from fairseq.modules import LayerNorm, MultiheadAttention from fairseq.modules.fairseq_dropout import FairseqDropout from fairseq.modules.quant_noise import quant_noise class TransformerEncoderLayerBase(nn.Module): """Encoder layer block. In the original paper each operation (multi-head attention or FFN) is postprocessed with: `dropout -> add residual -> layernorm`. In the tensor2tensor code they suggest that learning is more robust when preprocessing each layer with layernorm and postprocessing with: `dropout -> add residual`. We default to the approach in the paper, but the tensor2tensor approach can be enabled by setting *cfg.encoder.normalize_before* to ``True``. Args: cfg (argparse.Namespace): parsed command-line arguments """ def __init__(self, cfg, return_fc=False): super().__init__() self.cfg = cfg self.return_fc = return_fc self.embed_dim = cfg.encoder.embed_dim self.quant_noise = cfg.quant_noise.pq self.quant_noise_block_size = cfg.quant_noise.pq_block_size self.self_attn = self.build_self_attention(self.embed_dim, cfg) self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) self.dropout_module = FairseqDropout( cfg.dropout, module_name=self.__class__.__name__ ) self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn) activation_dropout_p = cfg.activation_dropout if activation_dropout_p == 0: # for backwards compatibility with models that use cfg.relu_dropout activation_dropout_p = cfg.relu_dropout or 0 self.activation_dropout_module = FairseqDropout( float(activation_dropout_p), module_name=self.__class__.__name__ ) self.normalize_before = cfg.encoder.normalize_before self.fc1 = self.build_fc1( self.embed_dim, cfg.encoder.ffn_embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.fc2 = self.build_fc2( cfg.encoder.ffn_embed_dim, 
self.embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise( nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size ) def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise( nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size ) def _get_fc_rank(self, remove_num: int) -> List[int]: f1_filter_param = [] for i in range(self.fc1.out_features): f1_filter_param.append( torch.sum(torch.abs(self.fc1.weight[i])) + torch.sum(torch.abs(self.fc2.weight[:, i])) + torch.abs(self.fc1.bias[i]) ) return sorted( range(len(f1_filter_param)), key=lambda k: f1_filter_param[k], reverse=False )[0:remove_num] def _prune_fc_layer(self, remove_index: List[int]): new_fc1_weight = [] new_fc1_bias = [] for i in range(self.fc1.out_features): if i not in remove_index: new_fc1_weight.append(self.fc1.weight[i]) new_fc1_bias.append(self.fc1.bias[i]) new_fc1_weight = torch.stack(new_fc1_weight).detach() new_fc1_weight.requires_grad = True new_fc1_bias = torch.stack(new_fc1_bias).detach() new_fc1_bias.requires_grad = True self.fc1 = quant_noise( nn.Linear(self.fc1.in_features, self.fc1.out_features - len(remove_index)), p=self.quant_noise, block_size=self.quant_noise_block_size, ) self.fc1.weight = torch.nn.Parameter(new_fc1_weight) self.fc1.bias = torch.nn.Parameter(new_fc1_bias) new_fc2_weight = [] new_fc2_bias = [] for i in range(self.fc2.in_features): if i not in remove_index: new_fc2_weight.append(self.fc2.weight[:, i]) new_fc2_bias = self.fc2.bias.detach() new_fc2_weight = torch.stack(new_fc2_weight, dim=-1).detach() new_fc2_weight.requires_grad = True new_fc2_bias = self.fc2.bias.detach() new_fc2_bias.requires_grad = True self.fc2 = quant_noise( nn.Linear(self.fc2.in_features - len(remove_index), self.fc2.out_features), p=self.quant_noise, 
block_size=self.quant_noise_block_size, ) self.fc2.weight = torch.nn.Parameter(new_fc2_weight) self.fc2.bias = torch.nn.Parameter(new_fc2_bias) def build_self_attention(self, embed_dim, cfg): return MultiheadAttention( embed_dim, cfg.encoder.attention_heads, dropout=cfg.attention_dropout, self_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, xformers_att_config=cfg.encoder.xformers_att_config, ) def residual_connection(self, x, residual): return residual + x def upgrade_state_dict_named(self, state_dict, name): """ Rename layer norm states from `...layer_norms.0.weight` to `...self_attn_layer_norm.weight` and `...layer_norms.1.weight` to `...final_layer_norm.weight` """ layer_norm_map = {"0": "self_attn_layer_norm", "1": "final_layer_norm"} for old, new in layer_norm_map.items(): for m in ("weight", "bias"): k = "{}.layer_norms.{}.{}".format(name, old, m) if k in state_dict: state_dict["{}.{}.{}".format(name, new, m)] = state_dict[k] del state_dict[k] def forward( self, x, encoder_padding_mask: Optional[Tensor], attn_mask: Optional[Tensor] = None, ): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor): binary ByteTensor of shape `(batch, seq_len)` where padding elements are indicated by ``1``. attn_mask (ByteTensor): binary tensor of shape `(tgt_len, src_len)`, where `tgt_len` is the length of output and `src_len` is the length of input, though here both are equal to `seq_len`. `attn_mask[tgt_i, src_j] = 1` means that when calculating the embedding for `tgt_i`, we exclude (mask out) `src_j`. This is useful for strided self-attention. 
Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ # anything in original attn_mask = 1, becomes -1e8 # anything in original attn_mask = 0, becomes 0 # Note that we cannot use -inf here, because at some edge cases, # the attention weight (before softmax) for some padded element in query # will become -inf, which results in NaN in model parameters if attn_mask is not None: attn_mask = attn_mask.masked_fill( attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4 ) residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) x, _ = self.self_attn( query=x, key=x, value=x, key_padding_mask=encoder_padding_mask, need_weights=False, attn_mask=attn_mask, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.self_attn_layer_norm(x) residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) x = self.fc2(x) fc_result = x x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) if self.return_fc and not torch.jit.is_scripting(): return x, fc_result return x # backward compatible with the legacy argparse format class TransformerEncoderLayer(TransformerEncoderLayerBase): def __init__(self, args): super().__init__(TransformerConfig.from_namespace(args)) self.args = args def build_self_attention(self, embed_dim, args): return super().build_self_attention( embed_dim, TransformerConfig.from_namespace(args) ) class TransformerDecoderLayerBase(nn.Module): """Decoder layer block. In the original paper each operation (multi-head attention, encoder attention or FFN) is postprocessed with: `dropout -> add residual -> layernorm`. In the tensor2tensor code they suggest that learning is more robust when preprocessing each layer with layernorm and postprocessing with: `dropout -> add residual`. 
We default to the approach in the paper, but the tensor2tensor approach can be enabled by setting *cfg.decoder.normalize_before* to ``True``. Args: args (argparse.Namespace): parsed command-line arguments no_encoder_attn (bool, optional): whether to attend to encoder outputs (default: False). """ def __init__( self, cfg, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False ): super().__init__() self.embed_dim = cfg.decoder.embed_dim self.dropout_module = FairseqDropout( cfg.dropout, module_name=self.__class__.__name__ ) self.quant_noise = cfg.quant_noise.pq self.quant_noise_block_size = cfg.quant_noise.pq_block_size self.cross_self_attention = cfg.cross_self_attention self.self_attn = self.build_self_attention( self.embed_dim, cfg, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) self.attn_ln = ( LayerNorm(self.embed_dim) if utils.safe_getattr(cfg, "scale_attn", False) else None ) self.nh = self.self_attn.num_heads self.head_dim = self.self_attn.head_dim scale_heads = utils.safe_getattr(cfg, "scale_heads", False) self.c_attn = ( nn.Parameter(torch.ones((self.nh,)), requires_grad=True) if scale_heads else None ) self.activation_fn = utils.get_activation_fn(activation=cfg.activation_fn) activation_dropout_p = cfg.activation_dropout if activation_dropout_p == 0: # for backwards compatibility with models that use cfg.relu_dropout activation_dropout_p = cfg.relu_dropout or 0 self.activation_dropout_module = FairseqDropout( float(activation_dropout_p), module_name=self.__class__.__name__ ) self.normalize_before = cfg.decoder.normalize_before self.self_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) if no_encoder_attn: self.encoder_attn = None self.encoder_attn_layer_norm = None else: self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) self.ffn_layernorm = ( LayerNorm(cfg.decoder.ffn_embed_dim) if utils.safe_getattr(cfg, "scale_fc", False) else None 
) self.w_resid = ( nn.Parameter( torch.ones( self.embed_dim, ), requires_grad=True, ) if utils.safe_getattr(cfg, "scale_resids", False) else None ) self.fc1 = self.build_fc1( self.embed_dim, cfg.decoder.ffn_embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.fc2 = self.build_fc2( cfg.decoder.ffn_embed_dim, self.embed_dim, self.quant_noise, self.quant_noise_block_size, ) self.final_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) self.need_attn = True self.onnx_trace = False def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size): return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size) def build_self_attention( self, embed_dim, cfg, add_bias_kv=False, add_zero_attn=False ): return MultiheadAttention( embed_dim, cfg.decoder.attention_heads, dropout=cfg.attention_dropout, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, self_attention=not cfg.cross_self_attention, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, xformers_att_config=cfg.decoder.xformers_att_config, ) def build_encoder_attention(self, embed_dim, cfg): return MultiheadAttention( embed_dim, cfg.decoder.attention_heads, kdim=cfg.encoder.embed_dim, vdim=cfg.encoder.embed_dim, dropout=cfg.attention_dropout, encoder_decoder_attention=True, q_noise=self.quant_noise, qn_block_size=self.quant_noise_block_size, xformers_att_config=cfg.encoder.xformers_att_config, ) def prepare_for_onnx_export_(self): self.onnx_trace = True def residual_connection(self, x, residual): return residual + x def forward( self, x, encoder_out: Optional[torch.Tensor] = None, encoder_padding_mask: Optional[torch.Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, prev_self_attn_state: Optional[List[torch.Tensor]] = None, prev_attn_state: Optional[List[torch.Tensor]] = None, 
self_attn_mask: Optional[torch.Tensor] = None, self_attn_padding_mask: Optional[torch.Tensor] = None, need_attn: bool = False, need_head_weights: bool = False, ): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor, optional): binary ByteTensor of shape `(batch, src_len)` where padding elements are indicated by ``1``. need_attn (bool, optional): return attention weights need_head_weights (bool, optional): return attention weights for each head (default: return average over heads). Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ if need_head_weights: need_attn = True residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) if prev_self_attn_state is not None: prev_key, prev_value = prev_self_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_self_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] assert incremental_state is not None self.self_attn._set_input_buffer(incremental_state, saved_state) _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) if self.cross_self_attention and not ( incremental_state is not None and _self_attn_input_buffer is not None and "prev_key" in _self_attn_input_buffer ): if self_attn_mask is not None: assert encoder_out is not None self_attn_mask = torch.cat( (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 ) if self_attn_padding_mask is not None: if encoder_padding_mask is None: assert encoder_out is not None encoder_padding_mask = self_attn_padding_mask.new_zeros( encoder_out.size(1), encoder_out.size(0) ) self_attn_padding_mask = torch.cat( (encoder_padding_mask, self_attn_padding_mask), dim=1 ) assert encoder_out is not None y = torch.cat((encoder_out, x), dim=0) else: y = x x, attn = self.self_attn( query=x, key=y, value=y, key_padding_mask=self_attn_padding_mask, 
incremental_state=incremental_state, need_weights=False, attn_mask=self_attn_mask, ) if self.c_attn is not None: tgt_len, bsz = x.size(0), x.size(1) x = x.view(tgt_len, bsz, self.nh, self.head_dim) x = torch.einsum("tbhd,h->tbhd", x, self.c_attn) x = x.reshape(tgt_len, bsz, self.embed_dim) if self.attn_ln is not None: x = self.attn_ln(x) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.self_attn_layer_norm(x) if self.encoder_attn is not None and encoder_out is not None: residual = x if self.normalize_before: x = self.encoder_attn_layer_norm(x) if prev_attn_state is not None: prev_key, prev_value = prev_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_attn_state[2] assert incremental_state is not None self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.encoder_attn_layer_norm(x) residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) if self.ffn_layernorm is not None: x = self.ffn_layernorm(x) x = self.fc2(x) x = self.dropout_module(x) if self.w_resid is not None: residual = torch.mul(self.w_resid, residual) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) if self.onnx_trace and incremental_state is not None: saved_state = self.self_attn._get_input_buffer(incremental_state) assert saved_state is not None if self_attn_padding_mask is not None: 
self_attn_state = [ saved_state["prev_key"], saved_state["prev_value"], saved_state["prev_key_padding_mask"], ] else: self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] return x, attn, self_attn_state return x, attn, None def make_generation_fast_(self, need_attn: bool = False, **kwargs): self.need_attn = need_attn # backward compatible with the legacy argparse format class TransformerDecoderLayer(TransformerDecoderLayerBase): def __init__( self, args, no_encoder_attn=False, add_bias_kv=False, add_zero_attn=False ): super().__init__( TransformerConfig.from_namespace(args), no_encoder_attn=no_encoder_attn, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) self.args = args def build_self_attention( self, embed_dim, args, add_bias_kv=False, add_zero_attn=False ): return super().build_self_attention( embed_dim, TransformerConfig.from_namespace(args), add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ) def build_encoder_attention(self, embed_dim, args): return super().build_encoder_attention( embed_dim, TransformerConfig.from_namespace(args), ) ================================================ FILE: fairseq/modules/transformer_layer_aug.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from typing import Dict, List, Optional import torch from numpy.random import uniform from torch import Tensor from fairseq.modules import LayerNorm from fairseq.modules.transformer_layer import TransformerDecoderLayerBase class AugTransformerDecoderLayerBase(TransformerDecoderLayerBase): """Decoder layer block augmented with an additional cross-attention. This decoder block is processed with the sequence of the following sub-modules. 
self-attention -> cross-attention (first) -> cross-attention (second) -> FFN Args: cfg (argparse.Namespace): parsed command-line arguments encoder_attn_merge_type (str, optional): the way to combine outputs from two cross-attention modules. If "sequential" is set, two cross-attention modules are stacked sequentially. If "parallel" is set, they are processed in parallel and combined before feeding it to FFN (default: sequential). dropnet_ratio (float, optional): a probability to drop each cross-attention module during training (default: 0.0). """ def __init__( self, cfg, add_bias_kv=False, add_zero_attn=False, encoder_attn_merge_type="sequential", dropnet_ratio=0.0, ): super().__init__( cfg, no_encoder_attn=False, add_bias_kv=add_bias_kv, add_zero_attn=False, ) self.encoder_attn = self.build_encoder_attention(self.embed_dim, cfg) self.encoder_attn_layer_norm = LayerNorm(self.embed_dim, export=cfg.export) self.encoder_attn2 = self.build_encoder_attention(self.embed_dim, cfg) if encoder_attn_merge_type == "sequential": self.encoder_attn_layer_norm2 = LayerNorm(self.embed_dim, export=cfg.export) else: self.encoder_attn_layer_norm2 = None self.encoder_attn_merge_type = encoder_attn_merge_type self.dropnet_ratio = dropnet_ratio def forward( self, x, encoder_out: Optional[torch.Tensor] = None, encoder_padding_mask: Optional[torch.Tensor] = None, encoder_out_aug: Optional[torch.Tensor] = None, encoder_padding_mask2: Optional[torch.Tensor] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, prev_self_attn_state: Optional[List[torch.Tensor]] = None, prev_attn_state: Optional[List[torch.Tensor]] = None, self_attn_mask: Optional[torch.Tensor] = None, self_attn_padding_mask: Optional[torch.Tensor] = None, need_attn: bool = False, need_head_weights: bool = False, ): """ Args: x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)` encoder_padding_mask (ByteTensor, optional): binary ByteTensor of shape `(batch, src_len)` where padding 
elements are indicated by ``1``. need_attn (bool, optional): return attention weights need_head_weights (bool, optional): return attention weights for each head (default: return average over heads). Returns: encoded output of shape `(seq_len, batch, embed_dim)` """ if need_head_weights: need_attn = True residual = x if self.normalize_before: x = self.self_attn_layer_norm(x) if prev_self_attn_state is not None: prev_key, prev_value = prev_self_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_self_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_self_attn_state[2] assert incremental_state is not None self.self_attn._set_input_buffer(incremental_state, saved_state) _self_attn_input_buffer = self.self_attn._get_input_buffer(incremental_state) if self.cross_self_attention and not ( incremental_state is not None and _self_attn_input_buffer is not None and "prev_key" in _self_attn_input_buffer ): if self_attn_mask is not None: assert encoder_out is not None self_attn_mask = torch.cat( (x.new_zeros(x.size(0), encoder_out.size(0)), self_attn_mask), dim=1 ) if self_attn_padding_mask is not None: if encoder_padding_mask is None: assert encoder_out is not None encoder_padding_mask = self_attn_padding_mask.new_zeros( encoder_out.size(1), encoder_out.size(0) ) self_attn_padding_mask = torch.cat( (encoder_padding_mask, self_attn_padding_mask), dim=1 ) assert encoder_out is not None y = torch.cat((encoder_out, x), dim=0) else: y = x x, attn = self.self_attn( query=x, key=y, value=y, key_padding_mask=self_attn_padding_mask, incremental_state=incremental_state, need_weights=False, attn_mask=self_attn_mask, ) if self.c_attn is not None: tgt_len, bsz = x.size(0), x.size(1) x = x.view(tgt_len, bsz, self.nh, self.head_dim) x = torch.einsum("tbhd,h->tbhd", x, self.c_attn) x = x.reshape(tgt_len, bsz, self.embed_dim) if self.attn_ln is not None: x = self.attn_ln(x) x = self.dropout_module(x) x = 
self.residual_connection(x, residual) if not self.normalize_before: x = self.self_attn_layer_norm(x) assert encoder_out is not None assert encoder_out_aug is not None if self.encoder_attn_merge_type == "sequential": ratios = self.get_dropnet_ratio() # first encoder attention if ratios[0] > 0: residual = x if self.normalize_before: x = self.encoder_attn_layer_norm(x) if prev_attn_state is not None: prev_key, prev_value = prev_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_attn_state[2] assert incremental_state is not None self.encoder_attn._set_input_buffer(incremental_state, saved_state) x, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.encoder_attn_layer_norm(x) x = ratios[0] * x # second encoder attention if ratios[1] > 0: residual = x if self.normalize_before: x = self.encoder_attn_layer_norm2(x) if prev_attn_state is not None: prev_key, prev_value = prev_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_attn_state[2] assert incremental_state is not None self.encoder_attn2._set_input_buffer(incremental_state, saved_state) x, attn2 = self.encoder_attn2( query=x, key=encoder_out_aug, value=encoder_out_aug, key_padding_mask=encoder_padding_mask2, incremental_state=incremental_state, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x = self.dropout_module(x) x = self.residual_connection(x, residual) if not 
self.normalize_before: x = self.encoder_attn_layer_norm2(x) x = ratios[1] * x elif self.encoder_attn_merge_type == "parallel": residual = x if self.normalize_before: x = self.encoder_attn_layer_norm(x) if prev_attn_state is not None: prev_key, prev_value = prev_attn_state[:2] saved_state: Dict[str, Optional[Tensor]] = { "prev_key": prev_key, "prev_value": prev_value, } if len(prev_attn_state) >= 3: saved_state["prev_key_padding_mask"] = prev_attn_state[2] assert incremental_state is not None self.encoder_attn._set_input_buffer(incremental_state, saved_state) x1, attn = self.encoder_attn( query=x, key=encoder_out, value=encoder_out, key_padding_mask=encoder_padding_mask, incremental_state=incremental_state, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x2, attn2 = self.encoder_attn2( query=x, key=encoder_out_aug, value=encoder_out_aug, key_padding_mask=encoder_padding_mask2, incremental_state=incremental_state, static_kv=True, need_weights=need_attn or (not self.training and self.need_attn), need_head_weights=need_head_weights, ) x1 = self.dropout_module(x1) x2 = self.dropout_module(x2) ratios = self.get_dropnet_ratio() x = ratios[0] * x1 + ratios[1] * x2 x = self.residual_connection(x, residual) if not self.normalize_before: x = self.encoder_attn_layer_norm(x) else: raise NotImplementedError(self.encoder_attn_merge_type) residual = x if self.normalize_before: x = self.final_layer_norm(x) x = self.activation_fn(self.fc1(x)) x = self.activation_dropout_module(x) if self.ffn_layernorm is not None: x = self.ffn_layernorm(x) x = self.fc2(x) x = self.dropout_module(x) if self.w_resid is not None: residual = torch.mul(self.w_resid, residual) x = self.residual_connection(x, residual) if not self.normalize_before: x = self.final_layer_norm(x) if self.onnx_trace and incremental_state is not None: saved_state = self.self_attn._get_input_buffer(incremental_state) assert saved_state is not None if 
self_attn_padding_mask is not None: self_attn_state = [ saved_state["prev_key"], saved_state["prev_value"], saved_state["prev_key_padding_mask"], ] else: self_attn_state = [saved_state["prev_key"], saved_state["prev_value"]] return x, attn, attn2, self_attn_state return x, attn, attn2, None def get_dropnet_ratio(self): if self.encoder_attn_merge_type == "sequential": if self.dropnet_ratio > 0: frand = float(uniform(0, 1)) if frand < self.dropnet_ratio and self.training: return [2, 0] elif frand > 1 - self.dropnet_ratio and self.training: return [0, 2] else: return [1, 1] else: return [1, 1] elif self.encoder_attn_merge_type == "parallel": if self.dropnet_ratio > 0: frand = float(uniform(0, 1)) if frand < self.dropnet_ratio and self.training: return [1, 0] elif frand > 1 - self.dropnet_ratio and self.training: return [0, 1] else: return [0.5, 0.5] else: return [0.5, 0.5] ================================================ FILE: fairseq/modules/transformer_sentence_encoder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from typing import Optional, Tuple import torch import torch.nn as nn from fairseq.modules import ( FairseqDropout, LayerDropModuleList, LayerNorm, MultiheadAttention, PositionalEmbedding, TransformerSentenceEncoderLayer, ) from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ def init_bert_params(module): """ Initialize the weights specific to the BERT Model. This overrides the default initializations depending on the specified arguments. 1. If normal_init_linear_weights is set then weights of linear layer will be initialized using the normal distribution and bais will be set to the specified value. 2. If normal_init_embed_weights is set then weights of embedding layer will be initialized using the normal distribution. 3. 
If normal_init_proj_weights is set then weights of in_project_weight for MultiHeadAttention initialized using the normal distribution (to be validated). """ def normal_(data): # with FSDP, module params will be on CUDA, so we cast them back to CPU # so that the RNG is consistent with and without FSDP data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device)) if isinstance(module, nn.Linear): normal_(module.weight.data) if module.bias is not None: module.bias.data.zero_() if isinstance(module, nn.Embedding): normal_(module.weight.data) if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() if isinstance(module, MultiheadAttention): normal_(module.q_proj.weight.data) normal_(module.k_proj.weight.data) normal_(module.v_proj.weight.data) class TransformerSentenceEncoder(nn.Module): """ Implementation for a Bi-directional Transformer based Sentence Encoder used in BERT/XLM style pre-trained models. This first computes the token embedding using the token embedding matrix, position embeddings (if specified) and segment embeddings (if specified). After applying the specified number of TransformerEncoderLayers, it outputs all the internal states of the encoder as well as the final representation associated with the first token (usually CLS token). Input: - tokens: B x T matrix representing sentences - segment_labels: B x T matrix representing segment label for tokens Output: - a tuple of the following: - a list of internal model states used to compute the predictions where each tensor has shape T x B x C - sentence representation associated with first input token in format B x C. 
""" def __init__( self, padding_idx: int, vocab_size: int, num_encoder_layers: int = 6, embedding_dim: int = 768, ffn_embedding_dim: int = 3072, num_attention_heads: int = 8, dropout: float = 0.1, attention_dropout: float = 0.1, activation_dropout: float = 0.1, layerdrop: float = 0.0, max_seq_len: int = 256, num_segments: int = 2, use_position_embeddings: bool = True, offset_positions_by_padding: bool = True, encoder_normalize_before: bool = False, apply_bert_init: bool = False, activation_fn: str = "relu", learned_pos_embedding: bool = True, embed_scale: float = None, freeze_embeddings: bool = False, n_trans_layers_to_freeze: int = 0, export: bool = False, traceable: bool = False, q_noise: float = 0.0, qn_block_size: int = 8, ) -> None: super().__init__() self.padding_idx = padding_idx self.vocab_size = vocab_size self.dropout_module = FairseqDropout( dropout, module_name=self.__class__.__name__ ) self.layerdrop = layerdrop self.max_seq_len = max_seq_len self.embedding_dim = embedding_dim self.num_segments = num_segments self.use_position_embeddings = use_position_embeddings self.apply_bert_init = apply_bert_init self.learned_pos_embedding = learned_pos_embedding self.traceable = traceable self.embed_tokens = self.build_embedding( self.vocab_size, self.embedding_dim, self.padding_idx ) self.embed_scale = embed_scale if q_noise > 0: self.quant_noise = apply_quant_noise_( nn.Linear(self.embedding_dim, self.embedding_dim, bias=False), q_noise, qn_block_size, ) else: self.quant_noise = None self.segment_embeddings = ( nn.Embedding(self.num_segments, self.embedding_dim, padding_idx=None) if self.num_segments > 0 else None ) self.embed_positions = ( PositionalEmbedding( self.max_seq_len, self.embedding_dim, padding_idx=(self.padding_idx if offset_positions_by_padding else None), learned=self.learned_pos_embedding, ) if self.use_position_embeddings else None ) if encoder_normalize_before: self.emb_layer_norm = LayerNorm(self.embedding_dim, export=export) else: 
self.emb_layer_norm = None if self.layerdrop > 0.0: self.layers = LayerDropModuleList(p=self.layerdrop) else: self.layers = nn.ModuleList([]) self.layers.extend( [ self.build_transformer_sentence_encoder_layer( embedding_dim=self.embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=self.dropout_module.p, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, export=export, q_noise=q_noise, qn_block_size=qn_block_size, ) for _ in range(num_encoder_layers) ] ) # Apply initialization of model params after building the model if self.apply_bert_init: self.apply(init_bert_params) def freeze_module_params(m): if m is not None: for p in m.parameters(): p.requires_grad = False if freeze_embeddings: freeze_module_params(self.embed_tokens) freeze_module_params(self.segment_embeddings) freeze_module_params(self.embed_positions) freeze_module_params(self.emb_layer_norm) for layer in range(n_trans_layers_to_freeze): freeze_module_params(self.layers[layer]) def build_embedding(self, vocab_size, embedding_dim, padding_idx): return nn.Embedding(vocab_size, embedding_dim, padding_idx) def build_transformer_sentence_encoder_layer( self, embedding_dim, ffn_embedding_dim, num_attention_heads, dropout, attention_dropout, activation_dropout, activation_fn, export, q_noise, qn_block_size, ): return TransformerSentenceEncoderLayer( embedding_dim=embedding_dim, ffn_embedding_dim=ffn_embedding_dim, num_attention_heads=num_attention_heads, dropout=dropout, attention_dropout=attention_dropout, activation_dropout=activation_dropout, activation_fn=activation_fn, export=export, q_noise=q_noise, qn_block_size=qn_block_size, ) def forward( self, tokens: torch.Tensor, segment_labels: torch.Tensor = None, last_state_only: bool = False, positions: Optional[torch.Tensor] = None, token_embeddings: Optional[torch.Tensor] = None, attn_mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, 
torch.Tensor]: is_tpu = tokens.device.type == "xla" # compute padding mask. This is needed for multi-head attention padding_mask = tokens.eq(self.padding_idx) if not self.traceable and not is_tpu and not padding_mask.any(): padding_mask = None if token_embeddings is not None: x = token_embeddings else: x = self.embed_tokens(tokens) if self.embed_scale is not None: x = x * self.embed_scale if self.embed_positions is not None: x = x + self.embed_positions(tokens, positions=positions) if self.segment_embeddings is not None and segment_labels is not None: x = x + self.segment_embeddings(segment_labels) if self.quant_noise is not None: x = self.quant_noise(x) if self.emb_layer_norm is not None: x = self.emb_layer_norm(x) x = self.dropout_module(x) # account for padding while computing the representation if padding_mask is not None: x = x * (1 - padding_mask.unsqueeze(-1).type_as(x)) # B x T x C -> T x B x C x = x.transpose(0, 1) inner_states = [] if not last_state_only: inner_states.append(x) for layer in self.layers: x, _ = layer( x, self_attn_padding_mask=padding_mask, self_attn_mask=attn_mask ) if not last_state_only: inner_states.append(x) sentence_rep = x[0, :, :] if last_state_only: inner_states = [x] if self.traceable: return torch.stack(inner_states), sentence_rep else: return inner_states, sentence_rep ================================================ FILE: fairseq/modules/transformer_sentence_encoder_layer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class TransformerSentenceEncoderLayer(nn.Module):
    """
    Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
    models.

    The layer is post-norm: each sub-block (self-attention, then the
    position-wise feed-forward network) is followed by dropout, a residual
    add and a LayerNorm.
    """

    def __init__(
        self,
        embedding_dim: int = 768,
        ffn_embedding_dim: int = 3072,
        num_attention_heads: int = 8,
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        activation_fn: str = "relu",
        export: bool = False,
        q_noise: float = 0.0,
        qn_block_size: int = 8,
        init_fn: Optional[Callable] = None,
    ) -> None:
        """Build the attention, feed-forward and normalization sub-modules.

        Args:
            embedding_dim: model (input/output) dimension.
            ffn_embedding_dim: hidden dimension of the feed-forward network.
            num_attention_heads: number of self-attention heads.
            dropout: dropout applied after attention and after the FFN.
            attention_dropout: dropout passed to the attention module.
            activation_dropout: dropout applied after the FFN activation.
            activation_fn: activation name resolved via
                ``utils.get_activation_fn``.
            export: forwarded to ``LayerNorm``.
            q_noise: quantization-noise amount forwarded to the sub-modules.
            qn_block_size: quantization-noise block size.
            init_fn: optional zero-argument callable invoked before any
                sub-module is created.
        """
        super().__init__()

        if init_fn is not None:
            init_fn()

        # Initialize parameters
        self.embedding_dim = embedding_dim
        self.num_attention_heads = num_attention_heads
        self.attention_dropout = attention_dropout
        self.q_noise = q_noise
        self.qn_block_size = qn_block_size

        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
        self.activation_dropout_module = FairseqDropout(
            activation_dropout, module_name=self.__class__.__name__
        )

        # Initialize blocks
        self.activation_fn = utils.get_activation_fn(activation_fn)
        self.self_attn = self.build_self_attention(
            self.embedding_dim,
            num_attention_heads,
            dropout=attention_dropout,
            self_attention=True,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the self attention layer
        self.self_attn_layer_norm = LayerNorm(self.embedding_dim, export=export)

        self.fc1 = self.build_fc1(
            self.embedding_dim,
            ffn_embedding_dim,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )
        self.fc2 = self.build_fc2(
            ffn_embedding_dim,
            self.embedding_dim,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

        # layer norm associated with the position wise feed-forward NN
        self.final_layer_norm = LayerNorm(self.embedding_dim, export=export)

    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        """Create the first feed-forward projection, wrapped in quant noise."""
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_fc2(self, input_dim, output_dim, q_noise, qn_block_size):
        """Create the second feed-forward projection, wrapped in quant noise."""
        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)

    def build_self_attention(
        self,
        embed_dim,
        num_attention_heads,
        dropout,
        self_attention,
        q_noise,
        qn_block_size,
    ):
        """Create the attention module.

        ``self_attention`` is forwarded to ``MultiheadAttention`` instead of
        being silently replaced by ``True``; the constructor passes ``True``,
        so the default layer is unchanged.
        """
        return MultiheadAttention(
            embed_dim,
            num_attention_heads,
            dropout=dropout,
            self_attention=self_attention,
            q_noise=q_noise,
            qn_block_size=qn_block_size,
        )

    def forward(
        self,
        x: torch.Tensor,
        self_attn_mask: Optional[torch.Tensor] = None,
        self_attn_padding_mask: Optional[torch.Tensor] = None,
    ):
        """
        Run self-attention and the feed-forward network, each followed by
        dropout, a residual connection and LayerNorm (post-norm).

        Args:
            x: input passed as query, key and value to the attention module.
            self_attn_mask: optional mask forwarded as ``attn_mask``.
            self_attn_padding_mask: optional mask forwarded as
                ``key_padding_mask``.

        Returns:
            ``(x, attn)`` where ``attn`` is whatever the attention module
            returns as its second output when called with
            ``need_weights=False``.
        """
        residual = x
        x, attn = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            need_weights=False,
            attn_mask=self_attn_mask,
        )
        x = self.dropout_module(x)
        x = residual + x
        x = self.self_attn_layer_norm(x)

        residual = x
        x = self.activation_fn(self.fc1(x))
        x = self.activation_dropout_module(x)
        x = self.fc2(x)
        x = self.dropout_module(x)
        x = residual + x
        x = self.final_layer_norm(x)
        return x, attn
def unfold1d(x, kernel_size: int, padding_l: int, pad_value: float = 0):
    """Unfold T x B x C into T x B x C x K sliding windows along time.

    ``padding_l`` frames of ``pad_value`` are added before the sequence and
    ``kernel_size - 1 - padding_l`` after it, so the number of windows equals
    the original length T.
    """
    if kernel_size <= 1:
        # A window of size one is just a new trailing axis.
        return x.unsqueeze(3)

    seq_len, bsz, channels = x.size()
    padding_r = kernel_size - 1 - padding_l
    padded = F.pad(x, (0, 0, 0, 0, padding_l, padding_r), value=pad_value)

    # Elements between two consecutive time steps of the padded tensor; the
    # window axis re-uses this stride so that entry k reads time step t + k.
    time_stride = bsz * channels
    window_shape = (seq_len, bsz, channels, kernel_size)
    window_strides = (time_stride, channels, 1, time_stride)
    return padded.as_strided(window_shape, window_strides)
from __future__ import absolute_import, division, print_function, unicode_literals from collections.abc import Iterable from itertools import repeat import torch import torch.nn as nn def _pair(v): if isinstance(v, Iterable): assert len(v) == 2, "len(v) != 2" return v return tuple(repeat(v, 2)) def infer_conv_output_dim(conv_op, input_dim, sample_inchannel): sample_seq_len = 200 sample_bsz = 10 x = torch.randn(sample_bsz, sample_inchannel, sample_seq_len, input_dim) # N x C x H x W # N: sample_bsz, C: sample_inchannel, H: sample_seq_len, W: input_dim x = conv_op(x) # N x C x H x W x = x.transpose(1, 2) # N x H x C x W bsz, seq = x.size()[:2] per_channel_dim = x.size()[3] # bsz: N, seq: H, CxW the rest return x.contiguous().view(bsz, seq, -1).size(-1), per_channel_dim class VGGBlock(torch.nn.Module): """ VGG motibated cnn module https://arxiv.org/pdf/1409.1556.pdf Args: in_channels: (int) number of input channels (typically 1) out_channels: (int) number of output channels conv_kernel_size: convolution channels pooling_kernel_size: the size of the pooling window to take a max over num_conv_layers: (int) number of convolution layers input_dim: (int) input dimension conv_stride: the stride of the convolving kernel. Can be a single number or a tuple (sH, sW) Default: 1 padding: implicit paddings on both sides of the input. Can be a single number or a tuple (padH, padW). Default: None layer_norm: (bool) if layer norm is going to be applied. Default: False Shape: Input: BxCxTxfeat, i.e. (batch_size, input_size, timesteps, features) Output: BxCxTxfeat, i.e. 
(batch_size, input_size, timesteps, features) """ def __init__( self, in_channels, out_channels, conv_kernel_size, pooling_kernel_size, num_conv_layers, input_dim, conv_stride=1, padding=None, layer_norm=False, ): assert ( input_dim is not None ), "Need input_dim for LayerNorm and infer_conv_output_dim" super(VGGBlock, self).__init__() self.in_channels = in_channels self.out_channels = out_channels self.conv_kernel_size = _pair(conv_kernel_size) self.pooling_kernel_size = _pair(pooling_kernel_size) self.num_conv_layers = num_conv_layers self.padding = ( tuple(e // 2 for e in self.conv_kernel_size) if padding is None else _pair(padding) ) self.conv_stride = _pair(conv_stride) self.layers = nn.ModuleList() for layer in range(num_conv_layers): conv_op = nn.Conv2d( in_channels if layer == 0 else out_channels, out_channels, self.conv_kernel_size, stride=self.conv_stride, padding=self.padding, ) self.layers.append(conv_op) if layer_norm: conv_output_dim, per_channel_dim = infer_conv_output_dim( conv_op, input_dim, in_channels if layer == 0 else out_channels ) self.layers.append(nn.LayerNorm(per_channel_dim)) input_dim = per_channel_dim self.layers.append(nn.ReLU()) if self.pooling_kernel_size is not None: pool_op = nn.MaxPool2d(kernel_size=self.pooling_kernel_size, ceil_mode=True) self.layers.append(pool_op) self.total_output_dim, self.output_dim = infer_conv_output_dim( pool_op, input_dim, out_channels ) def forward(self, x): for i, _ in enumerate(self.layers): x = self.layers[i](x) return x ================================================ FILE: fairseq/nan_detector.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class NanDetector:
    """
    Detects the first NaN or Inf in forward and/or backward pass and logs, together with the module name

    Intended to be used as a context manager: hooks are registered on every
    sub-module at construction time and removed again on exit.
    """

    def __init__(self, model, forward=True, backward=True):
        """Register forward and/or backward hooks on every module of ``model``."""
        self.bhooks = []
        self.fhooks = []
        self.forward = forward
        self.backward = backward
        self.named_parameters = list(model.named_parameters())
        self.reset()

        for name, mod in model.named_modules():
            # Remember the qualified name on the module so hooks can report it.
            mod.__module_name = name
            self.add_hooks(mod)

    def __enter__(self):
        return self

    def _collect_grad_stats(self):
        """Return ``(norms, bad_gradients)`` for all parameters with a gradient.

        ``norms`` maps parameter name to the L2 norm of its gradient and
        ``bad_gradients`` maps the names whose gradient norm is NaN/Inf to the
        gradient tensor itself.
        """
        norms = {}
        bad_gradients = {}
        for name, param in self.named_parameters:
            if param.grad is None:
                continue
            grad_norm = torch.norm(param.grad.data.float(), p=2)
            # Record the *gradient* norm: this dump exists to debug gradients,
            # and the parameter norm says nothing about which one blew up.
            norms[name] = grad_norm.item()
            if torch.isnan(grad_norm).any() or torch.isinf(grad_norm).any():
                bad_gradients[name] = param.grad.data
        return norms, bad_gradients

    def __exit__(self, exc_type, exc_value, exc_traceback):
        # Dump out all model gnorms to enable better debugging
        norm, gradients = self._collect_grad_stats()
        if len(gradients) > 0:
            logger.info("Detected nan/inf grad norm, dumping norms...")
            logger.info(f"norms: {norm}")
            logger.info(f"gradients: {gradients}")

        self.close()

    def add_hooks(self, module):
        """Attach the enabled hook kinds to ``module`` and keep their handles."""
        if self.forward:
            self.fhooks.append(module.register_forward_hook(self.fhook_fn))
        if self.backward:
            self.bhooks.append(module.register_backward_hook(self.bhook_fn))

    def reset(self):
        """Re-arm reporting for both the forward and the backward pass."""
        self.has_printed_f = False
        self.has_printed_b = False

    def _detect(self, tensor, name, backward):
        """Return an error message if ``tensor`` contains NaN/Inf, else None."""
        err = None
        if (
            torch.is_floating_point(tensor)
            # single value tensors (like the loss) will not provide much info
            and tensor.numel() >= 2
        ):
            with torch.no_grad():
                if torch.isnan(tensor).any():
                    err = "NaN"
                elif torch.isinf(tensor).any():
                    err = "Inf"
        if err is not None:
            err = f"{err} detected in output of {name}, shape: {tensor.shape}, {'backward' if backward else 'forward'}"
        return err

    def _apply(self, module, inp, x, backward):
        """Check ``x`` (a tensor or nested dict/list/tuple of tensors) and log
        a warning for the first offending tensor."""
        if torch.is_tensor(x):
            if isinstance(inp, tuple) and len(inp) > 0:
                inp = inp[0]
            err = self._detect(x, module.__module_name, backward)
            if err is not None:
                if torch.is_tensor(inp) and not backward:
                    err += (
                        f" input max: {inp.max().item()}, input min: {inp.min().item()}"
                    )

                has_printed_attr = "has_printed_b" if backward else "has_printed_f"
                logger.warning(err)
                setattr(self, has_printed_attr, True)
        elif isinstance(x, dict):
            for v in x.values():
                self._apply(module, inp, v, backward)
        elif isinstance(x, (list, tuple)):
            for v in x:
                self._apply(module, inp, v, backward)

    def fhook_fn(self, module, inp, output):
        """Forward hook: inspect outputs until the first report was made."""
        if not self.has_printed_f:
            self._apply(module, inp, output, backward=False)

    def bhook_fn(self, module, inp, output):
        """Backward hook: inspect outputs until the first report was made."""
        if not self.has_printed_b:
            self._apply(module, inp, output, backward=True)

    def close(self):
        """Remove every hook registered by this detector."""
        for hook in self.fhooks + self.bhooks:
            hook.remove()
class NGramRepeatBlock(nn.Module):
    """Wrapper class for calling ngram_repeat_block cuda extension

    Falls back to a pure-Python implementation when the extension is not
    usable or ``use_extension`` is False.
    """

    def __init__(self, no_repeat_ngram_size: int, use_extension: bool = True):
        super().__init__()
        self.use_extension = is_cuda_extension_usable() if use_extension else False
        self.no_repeat_ngram_size = no_repeat_ngram_size

    def reset_parameters(self):
        pass

    @torch.jit.unused
    def call_cuda_extension(
        self,
        tokens,
        lprobs,
        bsz: int,
        beam_size: int,
        step: int,
    ):
        """Delegate to the compiled extension (note its argument order)."""
        return ngram_repeat_block_cuda.forward(
            tokens, lprobs, bsz, step, beam_size, self.no_repeat_ngram_size
        )

    def forward(
        self,
        tokens,
        lprobs,
        bsz: int,
        beam_size: int,
        step: int,
    ):
        """
        Args:
            tokens(Tensor): Input tokens(Bsz*beam, seq_len)
            lprobs(Tensor): likelihood probability,
            Expected to be updated in place.(Bsz*beam, vocab_size)
            bsz(int): batch size
            step(int): current step
            beam_size(int): beam size
            no_repeat_ngram_size(int): Ngram size
        """
        msg = f"expected {bsz *beam_size} got"
        assert tokens.size(0) == bsz * beam_size, f"{msg} {tokens.size(0)}"
        assert lprobs.size(0) == bsz * beam_size, f"{msg} {lprobs.size(0)}"
        if self.use_extension:
            return self.call_cuda_extension(tokens, lprobs, bsz, beam_size, step)

        else:
            return self._no_repeat_ngram(
                tokens,
                lprobs,
                bsz,
                beam_size,
                step,
            )

    def _no_repeat_ngram(self, tokens, lprobs, bsz: int, beam_size: int, step: int):
        """For each hypothesis generate a list of previous ngrams and set associated lprobs to -inf

        A token is banned when appending it would recreate an n-gram that
        already occurs in the hypothesis. ``lprobs`` is modified in place and
        also returned.
        """
        ngram_size = self.no_repeat_ngram_size
        if ngram_size <= 0:
            # Nothing to block; also avoids nonsensical negative slices below.
            return lprobs

        banned_tokens = [
            torch.jit.annotate(List[int], []) for bbsz_idx in range(bsz * beam_size)
        ]
        if step + 2 - ngram_size >= 0:
            cpu_tokens: List[List[int]] = tokens.cpu().tolist()
            check_start_pos = step + 2 - ngram_size
            prefix_len = ngram_size - 1
            for bbsz_idx in range(bsz * beam_size):
                hypo = cpu_tokens[bbsz_idx]
                # hypo[-0:] is the whole list, not an empty one, so unigram
                # blocking needs an explicit empty prefix to match anything.
                if prefix_len > 0:
                    ngram_to_check = hypo[-prefix_len:]
                else:
                    ngram_to_check = torch.jit.annotate(List[int], [])
                for i in range(check_start_pos):
                    if ngram_to_check == hypo[i : i + prefix_len]:
                        banned_tokens[bbsz_idx].append(hypo[i + prefix_len])
        for bbsz_idx in range(bsz * beam_size):
            if len(banned_tokens[bbsz_idx]) > 0:
                lprobs[bbsz_idx][
                    torch.tensor(banned_tokens[bbsz_idx], dtype=torch.int64)
                ] = torch.tensor(-math.inf).to(lprobs)
        return lprobs
@register_optimizer("adadelta")
class Adadelta(LegacyFairseqOptimizer):
    """fairseq wrapper that configures ``torch.optim.Adadelta`` from parsed args."""

    def __init__(self, args, params):
        super().__init__(args)
        config = self.optimizer_config
        self._optimizer = torch.optim.Adadelta(params, **config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        option_specs = (
            (('--adadelta-rho',),
             dict(type=float, default=0.9, metavar='RHO',
                  help='coefficient used for computing a running average of squared gradients')),
            (('--adadelta-eps',),
             dict(type=float, default=1e-6, metavar='EPS',
                  help='term added to the denominator to improve numerical stability')),
            (('--weight-decay', '--wd'),
             dict(default=0.0, type=float, metavar='WD',
                  help='weight decay')),
            (('--anneal-eps',),
             dict(action='store_true', help='flag to anneal eps')),
        )
        # fmt: on
        for flags, options in option_specs:
            parser.add_argument(*flags, **options)

    @property
    def optimizer_config(self):
        """
        Keyword arguments for the wrapped optimizer, taken from the current
        args. They override the optimizer args stored in checkpoints, which
        allows resuming training from a checkpoint with a different set of
        optimizer args, e.g., with a different learning rate.
        """
        args = self.args
        return dict(
            lr=args.lr[0],
            rho=args.adadelta_rho,
            eps=args.adadelta_eps,
            weight_decay=args.weight_decay,
        )

    @property
    def supports_flat_params(self):
        return True
@staticmethod
def add_args(parser):
    """Add optimizer-specific arguments to the parser.

    ``--adafactor-eps`` is kept as a string here; ``optimizer_config`` is
    responsible for turning it into a tuple.
    """
    # fmt: off
    parser.add_argument('--adafactor-eps', default='(1e-30, 1e-3)', metavar="E",
                        help='epsilons for Adafactor optimizer')
    parser.add_argument('--clip-threshold', type=float, default=1.0, metavar="C",
                        help='threshold for clipping update root mean square')
    parser.add_argument('--decay-rate', type=float, default=-0.8, metavar="D",
                        help='decay rate of the second moment estimator')
    parser.add_argument('--beta1', type=float, default=None, metavar="B",
                        help='beta for first moment estimator. Optional')
    parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                        help='weight decay')
    parser.add_argument('--scale-parameter', action='store_true',
                        help='scale learning rate by root mean square of parameter')
    # The two help fragments are concatenated by Python, so the separating
    # space has to be part of the first literal.
    parser.add_argument('--relative-step', action='store_true',
                        help='set learning rate to inverse square root of timestep, '
                             'otherwise use external learning rate')
    parser.add_argument('--warmup-init', action='store_true',
                        help='use relative step for warm-up learning rate schedule')
    # fmt: on
class Adafactor(torch.optim.Optimizer):
    """Implements Adafactor algorithm.

    This implementation is based on:
    `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost`
    (see https://arxiv.org/abs/1804.04235)

    Note that this optimizer internally adjusts the learning rate
    depending on the *scale_parameter*, *relative_step* and
    *warmup_init* options. To use a manual (external) learning rate
    schedule you should set `scale_parameter=False` and
    `relative_step=False`.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): external learning rate (default: None)
        eps (tuple[float, float]): regularization constants for square gradient
            and parameter scale respectively (default: (1e-30, 1e-3))
        clip_threshold (float): threshold of root mean square of
            final gradient update (default: 1.0)
        decay_rate (float): coefficient used to compute running averages of square
            gradient (default: -0.8)
        beta1 (float): coefficient used for computing running averages of gradient
            (default: None)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        scale_parameter (bool): if True, learning rate is scaled by root mean square of
            parameter (default: True)
        relative_step (bool): if True, time-dependent learning rate is computed
            instead of external learning rate (default: True)
        warmup_init (bool): time-dependent learning rate computation depends on
            whether warm-up initialization is being used (default: False)

    Raises:
        ValueError: if ``lr`` is given together with ``relative_step=True``, or
            if ``warmup_init`` is requested without ``relative_step``.
    """

    def __init__(
        self,
        params,
        lr=None,
        eps=(1e-30, 1e-3),
        clip_threshold=1.0,
        decay_rate=-0.8,
        beta1=None,
        weight_decay=0.0,
        scale_parameter=True,
        relative_step=True,
        warmup_init=False,
    ):
        if lr is not None and relative_step:
            raise ValueError("Cannot combine manual lr and relative_step options")
        if warmup_init and not relative_step:
            raise ValueError("warmup_init requires relative_step=True")

        defaults = dict(
            lr=lr,
            eps=eps,
            clip_threshold=clip_threshold,
            decay_rate=decay_rate,
            beta1=beta1,
            weight_decay=weight_decay,
            scale_parameter=scale_parameter,
            relative_step=relative_step,
            warmup_init=warmup_init,
        )
        super(Adafactor, self).__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return False

    def _get_lr(self, param_group, param_state):
        """Effective learning rate for one parameter.

        Starts from the group's external ``lr`` (or the time-dependent rate in
        relative-step mode) and, with ``scale_parameter``, multiplies it by the
        parameter RMS (bounded below by ``eps[1]``).
        """
        rel_step_sz = param_group["lr"]
        if param_group["relative_step"]:
            min_step = (
                1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2
            )
            rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"]))
        param_scale = 1.0
        if param_group["scale_parameter"]:
            param_scale = max(param_group["eps"][1], param_state["RMS"])
        return param_scale * rel_step_sz

    def _get_options(self, param_group, param_shape):
        """Return ``(factored, use_first_moment)`` for a parameter shape."""
        factored = len(param_shape) >= 2
        use_first_moment = param_group["beta1"] is not None
        return factored, use_first_moment

    def _rms(self, tensor):
        """Root mean square of all entries of ``tensor``."""
        return tensor.norm(2) / (tensor.numel() ** 0.5)

    def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col):
        """Rebuild the inverse-sqrt second-moment estimate from its row and
        column factors."""
        r_factor = (
            (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True))
            .rsqrt_()
            .unsqueeze(-1)
        )
        c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
        return torch.mul(r_factor, c_factor)

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or None when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad.data
                if grad.dtype in {torch.float16, torch.bfloat16}:
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError("Adafactor does not support sparse gradients.")

                state = self.state[p]
                grad_shape = grad.shape

                factored, use_first_moment = self._get_options(group, grad_shape)
                # State Initialization
                if len(state) == 0:
                    state["step"] = 0

                    if use_first_moment:
                        # Exponential moving average of gradient values
                        state["exp_avg"] = torch.zeros_like(grad)
                    if factored:
                        state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad)
                        state["exp_avg_sq_col"] = torch.zeros(
                            grad_shape[:-2] + grad_shape[-1:]
                        ).to(grad)
                    else:
                        state["exp_avg_sq"] = torch.zeros_like(grad)

                    state["RMS"] = 0
                else:
                    # Keep restored state on the gradient's device/dtype.
                    if use_first_moment:
                        state["exp_avg"] = state["exp_avg"].to(grad)
                    if factored:
                        state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad)
                        state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad)
                    else:
                        state["exp_avg_sq"] = state["exp_avg_sq"].to(grad)

                p_data_fp32 = p.data
                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p_data_fp32 = p_data_fp32.float()

                state["step"] += 1
                state["RMS"] = self._rms(p_data_fp32)

                # Keep the per-parameter rate local. Writing it back into
                # group["lr"] in external-lr mode would feed an already scaled
                # rate into the next parameter (and the next step), so the
                # parameter scale would compound.
                lr = self._get_lr(group, state)
                if group["relative_step"]:
                    # No external lr exists in this mode; expose the computed
                    # rate so it can still be read from the param group.
                    group["lr"] = lr

                beta2t = 1.0 - math.pow(state["step"], group["decay_rate"])
                update = (grad**2) + group["eps"][0]
                if factored:
                    exp_avg_sq_row = state["exp_avg_sq_row"]
                    exp_avg_sq_col = state["exp_avg_sq_col"]

                    exp_avg_sq_row.mul_(beta2t).add_(
                        update.mean(dim=-1), alpha=1.0 - beta2t
                    )
                    exp_avg_sq_col.mul_(beta2t).add_(
                        update.mean(dim=-2), alpha=1.0 - beta2t
                    )

                    # Approximation of exponential moving average of square of gradient
                    update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
                    update.mul_(grad)
                else:
                    exp_avg_sq = state["exp_avg_sq"]

                    exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t)
                    update = exp_avg_sq.rsqrt().mul_(grad)

                # Clip the update so its RMS does not exceed clip_threshold.
                update.div_(
                    (self._rms(update) / group["clip_threshold"]).clamp_(min=1.0)
                )
                update.mul_(lr)

                if use_first_moment:
                    exp_avg = state["exp_avg"]
                    exp_avg.mul_(group["beta1"]).add_(update, alpha=1 - group["beta1"])
                    update = exp_avg

                if group["weight_decay"] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group["weight_decay"] * lr)

                p_data_fp32.add_(-update)

                # Half-precision parameters were updated on an fp32 copy.
                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p.data.copy_(p_data_fp32)

        return loss
@dataclass
class FairseqAdamConfig(FairseqDataclass):
    """Configuration options of the ``adam`` optimizer (see ``FairseqAdam``)."""

    # Typed ``Any`` because the value may arrive as a string such as
    # "(0.9, 0.999)" or as a config container; ``FairseqAdam.optimizer_config``
    # handles both forms.
    adam_betas: Any = field(
        default=(0.9, 0.999), metadata={"help": "betas for Adam optimizer"}
    )
    adam_eps: float = field(
        default=1e-8, metadata={"help": "epsilon for Adam optimizer"}
    )
    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
    # When set, FairseqAdam does not select the fused implementation.
    use_old_adam: bool = field(
        default=False, metadata={"help": "Use fairseq.optim.adam.Adam"}
    )
    # FairseqAdam rejects this option unless the fused implementation is used.
    fp16_adam_stats: bool = field(
        default=False, metadata={"help": "use FP16 stats (with automatic scaling)"}
    )
    # TODO common vars below in parent
    # Both values are interpolated from other config sections.
    tpu: bool = II("common.tpu")
    lr: List[float] = II("optimization.lr")
class Adam(torch.optim.Optimizer):
    r"""Adam optimizer with decoupled weight decay.

    Adapted from torch.optim.Adam following
    `Fixed Weight Decay Regularization in Adam`
    (see https://arxiv.org/abs/1711.05101); the base algorithm comes from
    `Adam: A Method for Stochastic Optimization`_.

    Half-precision (fp16/bf16) parameters and gradients are processed on
    fp32 copies and the result is copied back into the parameter.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            running averages of gradient and its square (default: (0.9, 0.999))
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay coefficient (default: 0)
        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
            algorithm from the paper `On the Convergence of Adam and Beyond`_

    .. _Adam\: A Method for Stochastic Optimization:
        https://arxiv.org/abs/1412.6980
    .. _On the Convergence of Adam and Beyond:
        https://openreview.net/forum?id=ryQu7f-RZ
    """

    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        amsgrad=False,
    ):
        group_defaults = {
            "lr": lr,
            "betas": betas,
            "eps": eps,
            "weight_decay": weight_decay,
            "amsgrad": amsgrad,
        }
        super(Adam, self).__init__(params, group_defaults)

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The closure's return value, or None when no closure was passed.
        """
        loss = closure() if closure is not None else None
        half_dtypes = {torch.float16, torch.bfloat16}

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad.data
                if grad.dtype in half_dtypes:
                    grad = grad.float()
                if grad.is_sparse:
                    raise RuntimeError(
                        "Adam does not support sparse gradients, please consider SparseAdam instead"
                    )
                use_amsgrad = group.get("amsgrad", False)

                # Work on an fp32 copy when the parameter is half precision.
                param_is_half = p.data.dtype in half_dtypes
                p_data_fp32 = p.data.float() if param_is_half else p.data

                # Moment buffers tracked for this parameter, in creation order.
                moment_names = ["exp_avg", "exp_avg_sq"]
                if use_amsgrad:
                    moment_names.append("max_exp_avg_sq")

                state = self.state[p]
                if len(state) == 0:
                    # First visit: step counter plus zeroed moment buffers.
                    state["step"] = 0
                    for moment in moment_names:
                        state[moment] = torch.zeros_like(p_data_fp32)
                else:
                    # Align existing buffers with the working copy.
                    for moment in moment_names:
                        state[moment] = state[moment].to(p_data_fp32)

                first_moment = state["exp_avg"]
                second_moment = state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1
                step_count = state["step"]

                # Update the running averages of the gradient and its square.
                first_moment.mul_(beta1).add_(grad, alpha=1 - beta1)
                second_moment.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                if use_amsgrad:
                    # Normalize by the running maximum of the second moment.
                    running_max = state["max_exp_avg_sq"]
                    torch.max(running_max, second_moment, out=running_max)
                    denom_base = running_max
                else:
                    denom_base = second_moment
                denom = denom_base.sqrt().add_(group["eps"])

                step_size = (
                    group["lr"]
                    * math.sqrt(1 - beta2 ** step_count)
                    / (1 - beta1 ** step_count)
                )

                # Decoupled weight decay, applied before the Adam update.
                decay = group["weight_decay"]
                if decay != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-decay * group["lr"])

                p_data_fp32.addcdiv_(first_moment, denom, value=-step_size)

                if param_is_half:
                    p.data.copy_(p_data_fp32)

        return loss
class Adamax(torch.optim.Optimizer):
    """Implements Adamax algorithm (a variant of Adam based on infinity norm).

    It has been proposed in `Adam: A Method for Stochastic Optimization`__.

    Compared to the version in PyTorch, this version implements a fix for weight decay.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 2e-3)
        betas (Tuple[float, float], optional): coefficients used for computing
            the running average of the gradient and the exponentially weighted
            infinity norm
        eps (float, optional): term added to the denominator to improve
            numerical stability (default: 1e-8)
        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
        bias_correction (bool, optional): enable bias correction (default: True)

    Raises:
        ValueError: if ``lr``, ``eps`` or ``weight_decay`` is negative, or a
            beta lies outside ``[0, 1)``.

    __ https://arxiv.org/abs/1412.6980
    """

    def __init__(
        self,
        params,
        lr=2e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        bias_correction=True,
    ):
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))

        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            bias_correction=bias_correction,
        )
        super(Adamax, self).__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by *closure*, or ``None`` when no closure is given.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                # NOTE: for fp32 gradients ``.float()`` does not copy, so
                # ``grad`` aliases the storage of ``p.grad``; it must only be
                # read below, never modified in place.
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError("Adamax does not support sparse gradients")

                # fp16/bf16 weights are updated on an fp32 copy and written
                # back at the end; fp32 weights are updated in place.
                p_data_fp32 = p.data
                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p_data_fp32 = p_data_fp32.float()

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p_data_fp32)
                    state["exp_inf"] = torch.zeros_like(p_data_fp32)
                else:
                    state["exp_avg"] = state["exp_avg"].to(p_data_fp32)
                    state["exp_inf"] = state["exp_inf"].to(p_data_fp32)

                exp_avg, exp_inf = state["exp_avg"], state["exp_inf"]
                beta1, beta2 = group["betas"]
                eps = group["eps"]

                state["step"] += 1

                # Update biased first moment estimate.
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                # Update the exponentially weighted infinity norm. The
                # out-of-place ``abs()`` keeps the caller-visible gradient
                # untouched (``abs_()`` would overwrite ``p.grad``).
                torch.max(
                    exp_inf.mul_(beta2),
                    grad.abs(),
                    out=exp_inf,
                )

                step_size = group["lr"]
                if group["bias_correction"]:
                    bias_correction = 1 - beta1 ** state["step"]
                    step_size /= bias_correction

                # Weight decay is applied directly to the weights (scaled by
                # lr) rather than being folded into the gradient.
                if group["weight_decay"] != 0:
                    p_data_fp32.add_(
                        p_data_fp32, alpha=-group["weight_decay"] * group["lr"]
                    )

                p_data_fp32.addcdiv_(exp_avg, exp_inf.add(eps), value=-step_size)

                if p.data.dtype in {torch.float16, torch.bfloat16}:
                    p.data.copy_(p_data_fp32)

        return loss
class AMPOptimizer(optim.FairseqOptimizer):
    """
    Wrap an *optimizer* to support AMP (automatic mixed precision) training.

    Loss scaling is delegated to a ``torch.cuda.amp.GradScaler``; every other
    operation is forwarded to the wrapped ``fp32_optimizer``.
    """

    def __init__(self, cfg: DictConfig, params, fp32_optimizer, **kwargs):
        """
        Args:
            cfg (omegaconf.DictConfig): fairseq config; ``cfg.optimizer`` and
                ``cfg.common`` are read
            params (iterable): iterable of parameters to optimize (unused here,
                the wrapped optimizer already holds them)
            fp32_optimizer: the optimizer to wrap
        """
        super().__init__(cfg.optimizer)
        self.fp32_optimizer = fp32_optimizer
        amp_kwargs = {"init_scale": cfg.common.fp16_init_scale}
        # The scale window is the number of consecutive non-overflowing steps
        # after which GradScaler grows the scale, i.e. ``growth_interval``.
        scale_window = getattr(cfg.common, "amp_scale_window", None)
        if scale_window is not None:
            amp_kwargs["growth_interval"] = scale_window
        self._grad_scaler = torch.cuda.amp.GradScaler(**amp_kwargs)
        self.min_loss_scale = cfg.common.min_loss_scale

    @classmethod
    def build_optimizer(cls, cfg: DictConfig, params, **kwargs):
        """
        Args:
            cfg (omegaconf.DictConfig): fairseq args
            params (iterable): iterable of parameters to optimize
        """
        fp32_optimizer = optim.build_optimizer(cfg.optimizer, params)
        return cls(cfg, params, fp32_optimizer, **kwargs)

    def backward(self, loss):
        """Computes the sum of gradients of the given tensor w.r.t. graph leaves.

        Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this
        function additionally dynamically scales the loss to avoid gradient
        underflow.
        """
        self._grad_scaler.scale(loss).backward()

    def step(self):
        """Step the wrapped optimizer through the grad scaler, then update the scale."""
        # NOTE(review): clip_grad_norm() calls unscale_ with the inner torch
        # optimizer while this passes the fairseq wrapper -- confirm that
        # GradScaler's per-optimizer bookkeeping treats the two consistently.
        self.scaler.step(self.fp32_optimizer)
        self.scaler.update()

    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
        """Clips gradient norm.

        Gradients are unscaled first. When the resulting norm is not finite,
        the next loss scale is checked against ``min_loss_scale``.

        Raises:
            FloatingPointError: if the next loss scale would be at or below
                ``min_loss_scale``.
        """
        self.scaler.unscale_(self.optimizer)
        grad_norm = self.fp32_optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)
        if not torch.isfinite(grad_norm).all():
            new_loss_scale = self.next_loss_scale
            if new_loss_scale <= self.min_loss_scale:
                raise FloatingPointError(
                    (
                        "AMP: Minimum loss scale reached ({}). Your loss is probably exploding. "
                        "Try restarting training or use fp32. {}"
                    ).format(self.min_loss_scale, new_loss_scale)
                )
            else:
                logger.info(
                    f"AMP: overflow detected, setting scale to {new_loss_scale}"
                )
        return grad_norm

    @property
    def scaler(self):
        return self._grad_scaler

    @property
    def next_loss_scale(self):
        """Scale the grad scaler would use after one backoff."""
        return self.scaler.get_scale() * self.scaler.get_backoff_factor()

    @property
    def optimizer(self):
        return self.fp32_optimizer.optimizer

    @optimizer.setter
    def optimizer(self, optimizer):
        self.fp32_optimizer.optimizer = optimizer

    @property
    def lr_scheduler(self):
        return getattr(self.fp32_optimizer, "lr_scheduler", None)

    @property
    def optimizer_config(self):
        return self.fp32_optimizer.optimizer_config

    def get_lr(self):
        return self.fp32_optimizer.get_lr()

    def set_lr(self, lr):
        self.fp32_optimizer.set_lr(lr)

    def all_reduce_grads(self, module):
        self.fp32_optimizer.all_reduce_grads(module)

    @property
    def supports_flat_params(self):
        return self.fp32_optimizer.supports_flat_params
class FairseqBMUF(FairseqOptimizer):
    """
    Implements incremental block distributed data parallelism similar to
    https://ieeexplore.ieee.org/document/7472805

    Paper title: Scalable training of deep learning machines by incremental
    block training with intra-block parallel optimization and blockwise
    model-update filtering
    """

    def __init__(self, cfg: FairseqBMUFConfig, optimizer):
        super().__init__(cfg)
        self._optimizer = optimizer
        self._num_updates = 0
        self.sync_iter = cfg.global_sync_iter
        self.block_momentum = cfg.block_momentum
        self.block_lr = cfg.block_lr
        # Snapshot the current parameters as the "global" model.
        self._reset_local_data()
        self.warmup_iteration = cfg.warmup_iterations
        self.use_nbm = cfg.use_nbm
        self.initial_state = self._optimizer.state_dict()
        self.average_sync = self.cfg.average_sync
        self.world_size = self.cfg.distributed_world_size

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        gen_parser_from_dataclass(parser, FairseqBMUFConfig())

    @property
    def optimizer(self):
        return self._optimizer.optimizer

    @property
    def optimizer_config(self):
        return self._optimizer.optimizer_config

    def get_lr(self):
        return self._optimizer.get_lr()

    def set_lr(self, lr):
        self._optimizer.set_lr(lr)

    def state_dict(self):
        return self._optimizer.state_dict()

    def load_state_dict(self, state_dict, optimizer_overrides=None):
        self._optimizer.load_state_dict(state_dict, optimizer_overrides)
        # The freshly loaded state becomes the state restored after warmup.
        self.initial_state = self._optimizer.state_dict()

    def multiply_grads(self, c):
        """Multiplies grads by a constant *c*."""
        self._optimizer.multiply_grads(c)

    def clip_grad_norm(self, max_norm, aggregate_norm_fn=None):
        """Clips gradient norm."""
        return self._optimizer.clip_grad_norm(max_norm, aggregate_norm_fn)

    def average_params(self):
        self._optimizer.average_params()

    def _block_sync(self):
        """Merge the local models of all workers into a new global model."""
        if self.world_size <= 1:
            return
        use_momentum = self.block_momentum != 0

        # Step 1: difference between the last synced model and the local one.
        if use_momentum:
            self._calc_grad()

        # Step 2: average that difference (or the raw params) over all GPUs.
        self._avg_grad_from_all_gpus()

        # Step 3: apply block momentum and update the global model.
        if use_momentum:
            self._update_global_model()

        # Step 4: average the local optimizer params.
        if self.average_sync:
            self.average_params()

    def _is_warmup_end(self):
        """True exactly on the update that completes the warmup phase."""
        return self.get_num_updates() == self.warmup_iteration

    def _is_bmuf_iter(self):
        """True on post-warmup updates that fall on a sync boundary."""
        num_updates = self.get_num_updates()
        return (
            num_updates > self.warmup_iteration
            and num_updates % self.sync_iter == 0
        )

    def _warmup_sync(self, root_rank=0):
        """Broadcast the model of *root_rank* to all workers after warmup."""
        if self.world_size <= 1:
            return

        for param in self.params:
            dist.broadcast(param.data, src=root_rank)

        # Bring the local optimizer state in line with the broadcast model.
        if not self.average_sync:
            self._optimizer.load_state_dict(self.initial_state)
        else:
            self._optimizer.average_params()

        self._reset_local_data()

    def step(self, closure=None):
        """Performs a single optimization step."""
        self._optimizer.step(closure)
        self.set_num_updates(self.get_num_updates() + 1)
        if self._is_warmup_end():
            self._warmup_sync()
            return
        if self._is_bmuf_iter():
            self._block_sync()

    def zero_grad(self):
        """Clears the gradients of all optimized parameters."""
        self._optimizer.zero_grad()

    def get_num_updates(self):
        """Get the number of parameters updates."""
        return self._num_updates

    def set_num_updates(self, num_updates):
        """Set the number of parameters updates."""
        self._num_updates = num_updates

    @torch.no_grad()
    def _reset_local_data(self):
        """(Step-0) Reset momentum buffers and snapshot the current params."""
        snapshots = []
        momentum_buffers = []
        block_grads = []
        for p in self.params:
            data = p.data
            # Local copy of the global model, used to compute the block
            # gradient at the next synchronisation.
            snapshot = torch.zeros_like(data)
            snapshot.copy_(data)
            snapshots.append(snapshot)
            momentum_buffers.append(data.new_zeros(data.size()))
            block_grads.append(data.new_zeros(data.size()))
        self.global_params = snapshots
        self.smoothed_grads = momentum_buffers
        self.grads = block_grads

    @torch.no_grad()
    def _calc_grad(self):
        """Store (last synced params - current local params) in ``self.grads``."""
        paired = zip(self.params, self.global_params)
        for index, (local_param, synced_param) in enumerate(paired):
            self.grads[index] = synced_param - local_param.data

    def _avg_grad_from_all_gpus(self):
        """All-reduce the block gradients (or the params when momentum is 0)."""
        sync_raw_params = self.block_momentum == 0
        for index, param in enumerate(self.params):
            target = param.data if sync_raw_params else self.grads[index]
            # Pre-divide so that the SUM reduction yields the mean.
            target /= float(dist.get_world_size())
            dist.all_reduce(target, op=dist.ReduceOp.SUM)

    @torch.no_grad()
    def _update_global_model(self):
        """Apply block momentum to the averaged gradient and update the params."""
        # smoothed_grads is computed from synchronized gradients only, so all
        # processes hold the same value and the params stay in sync.
        rows = zip(self.params, self.global_params, self.smoothed_grads, self.grads)
        for index, (param, synced_param, momentum_buffer, block_grad) in enumerate(
            rows
        ):
            # smoothed_grad(t) = BM * smoothed_grad(t-1) + BM_lr * grad(t)
            momentum_buffer = (
                self.block_momentum * momentum_buffer + self.block_lr * block_grad
            )
            param.data.copy_(synced_param - momentum_buffer)
            # Nesterov momentum: take a partial step ahead before the next
            # block gradient is computed.
            if self.use_nbm:
                param.data.copy_(param.data - self.block_momentum * momentum_buffer)
            # Back up for the next synchronization.
            self.smoothed_grads[index] = momentum_buffer
            synced_param.copy_(param.data)
@register_optimizer("composite", dataclass=CompositeOptimizerConfig)
class FairseqCompositeOptimizer(FairseqOptimizer):
    """Optimizer that runs a separate optimizer (and optionally a separate lr
    scheduler) for each named group of parameters.

    Parameters choose their group through a ``param_group`` attribute
    (``"default"`` when absent) and may carry an ``optim_overrides`` dict.
    """

    # Class-level declarations are kept for backward compatibility; every
    # instance gets its own dicts in ``__init__`` so that two composite
    # optimizers never share (and overwrite) each other's state.
    optimizers: Dict[str, FairseqOptimizer] = {}
    lr_schedulers: Dict[str, FairseqLRScheduler] = {}
    lr_scheduler: FairseqLRScheduler = None
    _optimizer: torch.optim.Optimizer

    def __init__(self, cfg: CompositeOptimizerConfig, params):
        super().__init__(cfg)

        self.optimizers = {}
        self.lr_schedulers = {}

        assert (
            len(params) > 1
        ), "Composite optimizer only works when there are multiple parameter groups (try fp16_no_flatten_grads: true)"

        def dict_hash(dictionary: Dict[str, Any]) -> str:
            """Stable hash of a JSON-serialisable dict, used as a group key."""
            import hashlib
            import json

            dhash = hashlib.md5()
            encoded = json.dumps(dictionary, sort_keys=True).encode()
            dhash.update(encoded)
            return dhash.hexdigest()

        groupped_params = defaultdict(list)
        overrides = defaultdict(dict)
        if not cfg.dynamic_groups:
            # Static groups: the group key is the parameter's ``param_group``.
            for p in params:
                group = getattr(p, "param_group", "default")
                override_config = getattr(p, "optim_overrides", None)
                if override_config is not None and bool(override_config):
                    overrides[group] = override_config
                else:
                    assert (
                        override_config is None or override_config == overrides[group]
                    ), f"For group {group}, different overrides found {override_config} v/s {overrides[group]}"
                groupped_params[group].append(p)

            # All members of a group must agree with the first member's
            # overrides.
            for members in groupped_params.values():
                override_config = getattr(members[0], "optim_overrides", None)
                if override_config is not None:
                    for member in members[1:]:
                        assert override_config == getattr(
                            member, "optim_overrides", None
                        ), f" {str(override_config)} != {str(getattr(member, 'optim_overrides', None))}"
        else:
            # Dynamic groups: parameters with overrides are keyed by a hash of
            # those overrides (which also records the original group name).
            for p in params:
                group = getattr(p, "param_group", "default")
                override_config = getattr(p, "optim_overrides", None)
                if override_config is not None:
                    override_config["group_name"] = group
                    group_name = dict_hash(override_config)
                    overrides[group_name] = override_config
                else:
                    group_name = group
                groupped_params[group_name].append(p)

        self.optimizers_config = {}
        for group, group_params in groupped_params.items():
            # ``p_group`` is the configured group a dynamic group derives from.
            p_group = group
            if group in overrides and "group_name" in overrides[group]:
                p_group = overrides[group]["group_name"]
            if group in cfg.groups:
                group_cfg = cfg.groups[group]
                optimizer_config = copy.deepcopy(group_cfg.optimizer)
                scheduler_config = copy.deepcopy(group_cfg.lr_scheduler)
                explicit_group_present = True
            else:
                group_cfg = cfg.groups[p_group]
                optimizer_config = copy.deepcopy(group_cfg.optimizer)
                scheduler_config = copy.deepcopy(group_cfg.lr_scheduler)
                explicit_group_present = False

            if getattr(group_cfg, "lr_float", None) is not None:
                with open_dict(optimizer_config):
                    optimizer_config.lr = [group_cfg.lr_float]

            if group in overrides and "optimizer" in overrides[group]:
                optimizer_overrides = overrides[group]["optimizer"]
                with open_dict(optimizer_config):
                    if "lr_scale" in optimizer_overrides:
                        lr_scale = optimizer_overrides["lr_scale"]
                        optimizer_config.lr = [
                            lr * lr_scale for lr in optimizer_config.lr
                        ]

                        if explicit_group_present:
                            logger.info(
                                f"For group:{group}, config as well as override present for lr"
                            )

                    # Scale the configured weight decay. The guard must test
                    # for the ``weight_decay`` key that is actually read; the
                    # previous ``"optimizer_config"`` key never exists, so the
                    # override was silently ignored.
                    if (
                        "weight_decay_scale" in optimizer_overrides
                        and "weight_decay" in optimizer_config
                    ):
                        weight_decay_scale = optimizer_overrides["weight_decay_scale"]
                        optimizer_config.weight_decay = (
                            optimizer_config.weight_decay * weight_decay_scale
                        )
                        if explicit_group_present:
                            logger.info(
                                f"For group:{group}, config as well as override present for weight_decay"
                            )

            # A group may legitimately have no scheduler; only propagate the
            # lr when there is a scheduler config to write it into.
            if scheduler_config is not None:
                with open_dict(scheduler_config):
                    scheduler_config.lr = optimizer_config.lr
            self.optimizers[group] = _build_optimizer(optimizer_config, group_params)
            self.optimizers_config[group] = optimizer_config
            if scheduler_config is not None:
                self.lr_schedulers[group] = build_lr_scheduler(
                    scheduler_config, self.optimizers[group]
                )
        logger.info("Optimizers for different groups are as below")
        for group in self.optimizers_config.keys():
            logger.info(f"Group : {group}:{self.optimizers_config[group]}")
        if len(self.lr_schedulers) > 0:
            assert len(self.lr_schedulers) == len(self.optimizers), (
                f"Please provide an lr scheduler for each optimizer to use pass_through scheduler. "
                f"Optimizers: {self.optimizers}; Lr scheds: {self.lr_schedulers}"
            )
            self.lr_scheduler = CompositeLRScheduler(self.lr_schedulers)

        self._optimizer = CompositeOptimizer(self.optimizers)

    @property
    def supports_groups(self):
        return True

    @property
    def param_groups(self):
        """Yield the param groups of every wrapped optimizer in turn."""
        for opt in self.optimizers.values():
            for group in opt.param_groups:
                yield group

    def get_lr(self):
        """Return the current learning rate.

        The rate of the ``"default"`` group is reported when it exists,
        otherwise that of the first group.
        """
        k = (
            "default"
            if "default" in self.optimizers
            else next(iter(self.optimizers.keys()))
        )
        return self.optimizers[k].param_groups[0]["lr"]

    def state_dict(self):
        """Return the optimizer state dicts, keyed by group name."""
        return {k: s.state_dict() for k, s in self.optimizers.items()}

    def load_state_dict(self, state_dict, optimizer_overrides=None):
        """Load per-group optimizer state dicts.

        Args:
            state_dict (dict): group name -> optimizer state dict
            optimizer_overrides (dict, optional): group name -> overrides
                forwarded to that group's optimizer
        """
        for k, state in state_dict.items():
            if k not in self.optimizers:
                # skip extra keys like "loss_scale" added by fp16 optimizer
                continue

            overrides = (
                optimizer_overrides[k]
                if isinstance(optimizer_overrides, dict) and k in optimizer_overrides
                else None
            )
            self.optimizers[k].load_state_dict(state, optimizer_overrides=overrides)


class CompositeOptimizer(torch.optim.Optimizer):
    """Thin ``torch.optim.Optimizer`` facade over a dict of fairseq optimizers.

    ``torch.optim.Optimizer.__init__`` is intentionally not called: this
    object holds no parameters of its own and only fans calls out.
    """

    def __init__(self, optimizers: Dict[str, FairseqOptimizer]):
        self.optimizers = optimizers

    @property
    def supports_memory_efficient_fp16(self):
        return all(o.supports_memory_efficient_fp16 for o in self.optimizers.values())

    @property
    def supports_flat_params(self):
        return all(o.supports_flat_params for o in self.optimizers.values())

    def step(self, closure=None, groups=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
            groups (container, optional): when given, only optimizers whose
                group name is in *groups* are stepped.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for k, opt in self.optimizers.items():
            if groups is None or k in groups:
                opt.step()

        return loss

    def zero_grad(self):
        for opt in self.optimizers.values():
            opt.zero_grad()


class CompositeLRScheduler(FairseqLRScheduler):
    """Fan every scheduler call out to one lr scheduler per group."""

    def __init__(self, lr_schedulers):
        super().__init__(None, None)

        self.lr_schedulers = lr_schedulers

    def state_dict(self):
        """Return the LR scheduler state dict."""
        return {k: s.state_dict() for k, s in self.lr_schedulers.items()}

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        for k, state in state_dict.items():
            self.lr_schedulers[k].load_state_dict(state)

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        for s in self.lr_schedulers.values():
            s.step_begin_epoch(epoch)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        # Forward val_loss so that validation-driven schedulers see it.
        for s in self.lr_schedulers.values():
            s.step(epoch, val_loss)

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        return {k: s.step_update(num_updates) for k, s in self.lr_schedulers.items()}
import importlib from collections.abc import Collection from dataclasses import dataclass, field from typing import List import torch from fairseq.dataclass import FairseqDataclass from fairseq.optim import FairseqOptimizer, register_optimizer from omegaconf import II, DictConfig try: import deepspeed has_deepspeed = True except ImportError as e: has_deepspeed = False def _get_cpu_adam(): try: from deepspeed.ops.op_builder import CPUAdamBuilder return CPUAdamBuilder().load() except ImportError: # fbcode from deepspeed.ops.adam import DeepSpeedCPUAdam as ds_opt_adam return ds_opt_adam @dataclass class FairseqCPUAdamConfig(FairseqDataclass): adam_betas: str = field( default="(0.9, 0.999)", metadata={"help": "betas for Adam optimizer"} ) adam_eps: float = field( default=1e-8, metadata={"help": "epsilon for Adam optimizer"} ) weight_decay: float = field(default=0.0, metadata={"help": "weight decay"}) fp16_adam_stats: bool = field( default=False, metadata={"help": "use FP16 stats (with automatic scaling)"} ) # TODO common vars below in parent lr: List[float] = II("optimization.lr") @register_optimizer("cpu_adam", dataclass=FairseqCPUAdamConfig) class FairseqCPUAdam(FairseqOptimizer): """Adam optimizer for fairseq, optimized for CPU tensors. Important note: this optimizer corresponds to the "AdamW" variant of Adam in its weight decay behavior. As such, it is most closely analogous to torch.optim.AdamW from PyTorch. """ def __init__(self, cfg: DictConfig, params): super().__init__(cfg) self._optimizer = CPUAdam(params, **self.optimizer_config) @property def optimizer_config(self): """ Return a kwarg dictionary that will be used to override optimizer args stored in checkpoints. This allows us to load a checkpoint and resume training using a different set of optimizer args, e.g., with a different learning rate. 
class CPUAdam(torch.optim.Optimizer):
    """Adam optimizer whose update is performed by DeepSpeed's CPU Adam op.

    Parameters and gradients are converted to fp32 CPU tensors for the update
    and the result is copied back into the original parameter storage.

    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
            parameter groups
        lr (float, optional): learning rate (default: 1e-3)
        bias_correction (bool, optional): forwarded to the DeepSpeed update
            (default: True)
        betas (Tuple[float, float], optional): Adam beta coefficients
            (default: (0.9, 0.999))
        eps (float, optional): Adam epsilon (default: 1e-8)
        weight_decay (float, optional): weight decay (default: 0)
        use_fp16_stats (bool, optional): keep ``exp_avg`` / ``exp_avg_sq`` in
            fp16 together with a per-tensor scale (default: False)

    Raises:
        ImportError: if DeepSpeed is not installed.
    """

    # Class-wide counter; each instance takes the current value as its
    # ``opt_id`` and increments it, so every instance gets a distinct id.
    optimizer_id = 0

    def __init__(
        self,
        params,
        lr=1e-3,
        bias_correction=True,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        use_fp16_stats=False,
    ):
        defaults = {
            "lr": lr,
            "bias_correction": bias_correction,
            "betas": betas,
            "eps": eps,
            "weight_decay": weight_decay,
        }
        super().__init__(params, defaults)

        self.use_fp16_stats = use_fp16_stats
        # Largest finite fp16 value; used to rescale the stats into fp16 range.
        self.FLOAT16_MAX = 65504.0

        if not has_deepspeed:
            raise ImportError("Please install DeepSpeed: pip install deepspeed")

        self.opt_id = CPUAdam.optimizer_id
        CPUAdam.optimizer_id = CPUAdam.optimizer_id + 1

        self.ds_opt_adam = _get_cpu_adam()
        # NOTE(review): presumably selects AdamW-style (decoupled) weight
        # decay inside DeepSpeed -- confirm against the DeepSpeed op.
        adamw_mode = True
        self.ds_opt_adam.create_adam(
            self.opt_id, lr, betas[0], betas[1], eps, weight_decay, adamw_mode
        )

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss; it is run with gradients enabled.

        Returns:
            The value returned by *closure*, or ``None`` when no closure is given.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        # NOTE(review): presumably waits for pending GPU work on the grads
        # before they are copied to the CPU -- confirm. Called unconditionally.
        torch.cuda.synchronize()

        for group_id, group in enumerate(self.param_groups):
            for param_id, p in enumerate(group["params"]):
                if p.grad is None:
                    continue

                state = self.state[p]
                if len(state) == 0:
                    state["step"] = 0
                    dtype = torch.float16 if self.use_fp16_stats else p.data.dtype
                    # gradient momentums
                    state["exp_avg"] = torch.zeros_like(
                        p.data, dtype=dtype, device="cpu"
                    )
                    # gradient variances
                    state["exp_avg_sq"] = torch.zeros_like(
                        p.data, dtype=dtype, device="cpu"
                    )
                    if self.use_fp16_stats:
                        assert torch.is_floating_point(p.data)
                        # Per-tensor scales that map the fp16 stats back to
                        # their real magnitude.
                        state["exp_avg_scale"] = 1.0
                        state["exp_avg_sq_scale"] = 1.0

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                p_data_bak = p.data  # backup of the original data pointer

                # Temporarily point the parameter and its gradient at fp32 CPU
                # tensors for the update. p.data is restored below; p.grad.data
                # is not restored in this method.
                p.data = p.data.to(dtype=torch.float32, device="cpu")
                p.grad.data = p.grad.data.to(dtype=torch.float32, device="cpu")

                if self.use_fp16_stats:
                    # Work on fp32 copies of the stats at their real magnitude;
                    # state is rewritten from these copies after the update.
                    exp_avg = exp_avg.float() * state["exp_avg_scale"]
                    exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"]

                state["step"] += 1
                beta1, beta2 = group["betas"]

                self.ds_opt_adam.adam_update(
                    self.opt_id,
                    state["step"],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["weight_decay"],
                    group["bias_correction"],
                    p.data,
                    p.grad.data,
                    exp_avg,
                    exp_avg_sq,
                )

                # If the conversion above produced a new tensor, copy the
                # updated values into the original storage, then restore the
                # original data pointer.
                if p_data_bak.data_ptr() != p.data.data_ptr():
                    p_data_bak.copy_(p.data)
                    p.data = p_data_bak

                if self.use_fp16_stats:

                    def inf_norm(t):
                        return torch.norm(t, float("inf"))

                    # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py
                    # New scale = max-abs value / FLOAT16_MAX (plus a small
                    # constant), so the rescaled stats fit into fp16.
                    state["exp_avg_scale"], state["exp_avg_sq_scale"] = (
                        1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX,
                        1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX,
                    )
                    state["exp_avg"], state["exp_avg_sq"] = (
                        (exp_avg / state["exp_avg_scale"]).half(),
                        (exp_avg_sq / state["exp_avg_sq_scale"]).half(),
                    )

        return loss
class DynamicLossScaler(object):
    """Dynamic loss scale for mixed-precision training.

    The scale is multiplied by *scale_factor* whenever the number of
    iterations since the last overflow is a multiple of *scale_window*, and
    divided by *scale_factor* when an overflow is reported and the fraction
    of overflowing iterations since the last rescale reaches *tolerance*.

    Args:
        init_scale: initial loss scale.
        scale_factor: multiplicative step used to grow/shrink the scale.
        scale_window: growth period, in calls to :meth:`update`.
        tolerance: minimum overflow fraction that triggers a decrease.
        threshold: optional lower bound applied when decreasing the scale.
        min_loss_scale: reaching this value (or lower) raises
            ``FloatingPointError`` from :meth:`check_overflow`.
    """

    def __init__(
        self,
        init_scale=2.0**15,
        scale_factor=2.0,
        scale_window=2000,
        tolerance=0.0,
        threshold=None,
        min_loss_scale=1e-4,
    ):
        self.loss_scale = init_scale
        self.scale_factor = scale_factor
        self.scale_window = scale_window
        self.tolerance = tolerance
        self.threshold = threshold
        self._iter = 0
        self._last_overflow_iter = -1
        self._last_rescale_iter = -1
        self._overflows_since_rescale = 0
        self.min_loss_scale = min_loss_scale

    def scale(self, outputs):
        """Return *outputs* multiplied by the current loss scale."""
        return self.loss_scale * outputs

    def update(self):
        """Record a successful iteration, growing the scale when due."""
        if (self._iter - self._last_overflow_iter) % self.scale_window == 0:
            self.loss_scale *= self.scale_factor
            self._last_rescale_iter = self._iter
        self._iter += 1

    def _decrease_loss_scale(self):
        """Shrink the scale by *scale_factor*, not going below *threshold*."""
        self.loss_scale /= self.scale_factor
        if self.threshold is not None:
            self.loss_scale = max(self.loss_scale, self.threshold)

    def check_overflow(self, grad_norm):
        """Inspect *grad_norm* and react to inf/NaN.

        Returns None when *grad_norm* is finite.

        Raises:
            FloatingPointError: the decreased scale is at or below
                *min_loss_scale*; the previous scale is restored first.
            OverflowError: an overflow was detected and handled; the message
                carries the loss scale now in effect.
        """
        # detect inf and nan (NaN is the only value that differs from itself)
        if grad_norm == float("inf") or grad_norm != grad_norm:
            # overflow has occurred
            prev_scale = self.loss_scale
            iter_since_rescale = self._iter - self._last_rescale_iter

            self._last_overflow_iter = self._iter
            self._overflows_since_rescale += 1
            # iter_since_rescale is 0 when the previous call raised
            # FloatingPointError (that path records a rescale without
            # advancing _iter). Clamp to 1 so a caller that catches the error
            # and retries gets the same FloatingPointError rather than a
            # ZeroDivisionError.
            pct_overflow = self._overflows_since_rescale / float(
                max(iter_since_rescale, 1)
            )
            if pct_overflow >= self.tolerance:
                self._decrease_loss_scale()
                self._last_rescale_iter = self._iter
                self._overflows_since_rescale = 0

            if self.loss_scale <= self.min_loss_scale:
                # Use FloatingPointError as an uncommon error that parent
                # functions can safely catch to stop training.
                self.loss_scale = prev_scale
                raise FloatingPointError(
                    (
                        "Minimum loss scale reached ({}). Your loss is probably exploding. "
                        "Try lowering the learning rate, using gradient clipping or "
                        "increasing the batch size."
                    ).format(self.min_loss_scale)
                )

            self._iter += 1
            raise OverflowError("setting loss scale to: " + str(self.loss_scale))
""" raise NotImplementedError @property def params(self): """Return an iterable of the parameters held by the optimizer.""" for param_group in self.param_groups: for p in param_group["params"]: yield p @property def param_groups(self): return self.optimizer.param_groups def __getstate__(self): return self._optimizer.__getstate__() def get_lr(self): """Return the current learning rate.""" return self.param_groups[0]["lr"] def set_lr(self, lr): """Set the learning rate.""" for param_group in self.param_groups: param_group["lr"] = lr def state_dict(self): """Return the optimizer's state dict.""" return self.optimizer.state_dict() def load_state_dict(self, state_dict, optimizer_overrides=None): """Load an optimizer state dict. In general we should prefer the configuration of the existing optimizer instance (e.g., learning rate) over that found in the state_dict. This allows us to resume training from a checkpoint using a new set of optimizer args. """ self.optimizer.load_state_dict(state_dict) if optimizer_overrides is not None and len(optimizer_overrides) > 0: # override learning rate, momentum, etc. with latest values for group in self.param_groups: group.update(optimizer_overrides) def backward(self, loss): """Computes the sum of gradients of the given tensor w.r.t. 
def multiply_grads(self, c):
    """Multiplies grads by a constant *c*.

    Sparse gradients are scaled one by one; dense gradients are bucketed by
    device and dtype and each bucket is scaled with a single
    ``torch._foreach_mul_`` call. A tensor-valued *c* is moved to the device
    of the gradients it is applied to.
    """
    c_is_tensor = torch.is_tensor(c)

    def factor_on(device):
        # Python scalars are used as-is; tensors must live on `device`.
        return c.to(device) if c_is_tensor else c

    dense_grads = defaultdict(lambda: defaultdict(list))
    for param in self.params:
        grad = param.grad
        if grad is None:
            continue
        if grad.is_sparse:
            grad.data.mul_(factor_on(grad.device))
            continue
        dense_grads[grad.device][grad.dtype].append(grad.data)

    for device, grads_by_dtype in dense_grads.items():
        for bucket in grads_by_dtype.values():
            torch._foreach_mul_(bucket, factor_on(device))
collapsing of the model parameters/gradients into a single contiguous Tensor. """ if hasattr(self.optimizer, "supports_flat_params"): return self.optimizer.supports_flat_params return False def average_params(self): pass def broadcast_global_state_dict(self, state_dict): """ Broadcasts a global state dict to all ranks. Useful for optimizers that shard state between ranks. """ if hasattr(self.optimizer, "broadcast_global_state_dict"): return self.optimizer.broadcast_global_state_dict(state_dict) else: return state_dict class LegacyFairseqOptimizer(FairseqOptimizer): def __init__(self, args): self.args = args ================================================ FILE: fairseq/optim/fp16_optimizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from collections import defaultdict from itertools import chain import torch from omegaconf import DictConfig from fairseq import optim from .dynamic_loss_scaler import DynamicLossScaler class _FP16OptimizerMixin(object): def __init__(self, *args, **kwargs): # forward __init__ call to the next class in mro(method resolution order) super().__init__(*args, **kwargs) self._multiply_factor = 1.0 @property def has_flat_params(self): return torch.is_tensor(self.fp32_params) or ( isinstance(self.fp32_params, dict) and all(torch.is_tensor(t) for t in self.fp32_params.values()) ) @classmethod def build_fp32_params(cls, args, params, flatten=True): # create FP32 copy of parameters and grads if flatten: is_pipeline_parallel = getattr( args, "pipeline_model_parallel", False ) and getattr(args, "distributed_no_spawn", False) total_param_size = sum(p.data.numel() for p in params) devices = [torch.cuda.current_device()] if is_pipeline_parallel: devices = list(set(args.pipeline_devices)) fp32_params = {} for device in devices: if is_pipeline_parallel: device_param_size = 
sum( p.data.numel() for p in params if p.device.index == device ) device_params = [p for p in params if p.device.index == device] else: device_param_size = total_param_size device_params = params fp32_params[device] = ( device_params[0].new(0).float().new(device_param_size) ) offset = 0 for p in device_params: numel = p.data.numel() fp32_params[device][offset : offset + numel].copy_(p.data.view(-1)) offset += numel fp32_params[device] = torch.nn.Parameter(fp32_params[device]) fp32_params[device].grad = fp32_params[device].data.new( device_param_size ) return fp32_params else: fp32_params = [] for p in params: p32 = torch.nn.Parameter(p.data.float()) if hasattr(p, "expert"): p32.expert = True elif hasattr(p, "base_expert"): p32.base_expert = True p32.grad = torch.zeros_like(p32.data) if hasattr(p, "param_group"): p32.param_group = p.param_group if hasattr(p, "optim_overrides"): p32.optim_overrides = p.optim_overrides fp32_params.append(p32) return fp32_params def state_dict(self): """Return the optimizer's state dict.""" state_dict = self.fp32_optimizer.state_dict() if self.scaler is not None: state_dict["loss_scale"] = self.scaler.loss_scale return state_dict def load_state_dict(self, state_dict, optimizer_overrides=None): """Load an optimizer state dict. In general we should prefer the configuration of the existing optimizer instance (e.g., learning rate) over that found in the state_dict. This allows us to resume training from a checkpoint using a new set of optimizer args. """ if "loss_scale" in state_dict and self.scaler is not None: self.scaler.loss_scale = state_dict["loss_scale"] self.fp32_optimizer.load_state_dict(state_dict, optimizer_overrides) def backward(self, loss): """Computes the sum of gradients of the given tensor w.r.t. graph leaves. Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this function additionally dynamically scales the loss to avoid gradient underflow. 
""" if self.scaler is not None: loss = self.scaler.scale(loss) loss.backward() self._needs_sync = True def _sync_fp16_grads_to_fp32(self): if self._needs_sync: # copy FP16 grads to FP32 if self.has_flat_params: devices = list(self.fp32_params.keys()) device_params_dict = defaultdict(list) for p in self.fp16_params: if p.requires_grad: device_params_dict[p.device.index].append(p) for device in devices: device_params = device_params_dict[device] offset = 0 for p in device_params: grad_data = ( p.grad.data if p.grad is not None else p.data.new_zeros(p.data.shape) ) numel = grad_data.numel() self.fp32_params[device].grad.data[ offset : offset + numel ].copy_(grad_data.view(-1)) offset += numel else: for p, p32 in zip(self.fp16_params, self.fp32_params): if not p.requires_grad: continue if p.grad is not None: if p32.grad is None: p32.grad = p.grad.data.float() else: p32.grad.data.copy_(p.grad.data) else: p32.grad = torch.zeros_like(p.data, dtype=torch.float) self._needs_sync = False def _sync_fp32_params_to_fp16(self): # copy FP32 params back into FP16 model if self.has_flat_params: devices = list(self.fp32_params.keys()) device_params_dict = defaultdict(list) for p in self.fp16_params: device_params_dict[p.device.index].append(p) for device in devices: device_params = device_params_dict[device] offset = 0 for p in device_params: numel = p.data.numel() p.data.copy_( self.fp32_params[device] .data[offset : offset + numel] .view_as(p.data) ) offset += numel else: for p, p32 in zip(self.fp16_params, self.fp32_params): if not p.requires_grad: continue p.data.copy_(p32.data) def _unscale_grads(self): self._sync_fp16_grads_to_fp32() if ( # Skip the multiplication if it's a no-op (i.e., if _multiply_factor # is 1.0). At the same time, we want to avoid the device-to-host # transfer by comparing it to 1.0. Since _multiply_factor starts as # a Python float, we roughly assume that if it's a tensor then it's # probably not =1.0 anymore and we do the multiplication. 
Otherwise # we can safely check the value without a D2H transfer. torch.is_tensor(self._multiply_factor) or self._multiply_factor != 1.0 ): self.fp32_optimizer.multiply_grads(self._multiply_factor) self._multiply_factor = 1.0 def multiply_grads(self, c): """Multiplies grads by a constant ``c``.""" self._multiply_factor *= c def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): """Clips gradient norm and updates dynamic loss scaler.""" self._sync_fp16_grads_to_fp32() grad_norm = self._multiply_factor * self.fp32_optimizer.clip_grad_norm( 0, aggregate_norm_fn ) if torch.is_tensor(self._multiply_factor): self._multiply_factor = self._multiply_factor.to(grad_norm.device) if self.scaler is not None: if grad_norm > max_norm > 0.0: self._multiply_factor *= max_norm / grad_norm self.scaler.check_overflow(grad_norm) elif max_norm > 0.0: clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1) self._multiply_factor *= clip_coef return grad_norm def step(self, closure=None, groups=None): """Performs a single optimization step.""" self._sync_fp16_grads_to_fp32() if getattr(self, "supports_step_with_scale", False): self.fp32_optimizer.step( closure, scale=(1.0 / self._multiply_factor), groups=groups ) else: self._unscale_grads() self.fp32_optimizer.step(closure, groups=groups) if self.scaler is not None: self.scaler.update() self._sync_fp32_params_to_fp16() def zero_grad(self): """Clears the gradients of all optimized parameters.""" for p in self.fp16_params: p.grad = None if self.has_flat_params: if torch.is_tensor(self.fp32_params): self.fp32_params.grad.zero_() elif isinstance(self.fp32_params, dict): for fp32_params in self.fp32_params.values(): fp32_params.grad.zero_() else: raise RuntimeError("self.fp32_params must be a tensor or dict") else: for p32 in self.fp32_params: if p32.grad is not None: p32.grad.zero_() self._needs_sync = False if self.scaler is not None: self._multiply_factor = 1.0 / float(self.scaler.loss_scale) class FP16Optimizer(_FP16OptimizerMixin, 
def __init__(self, cfg: DictConfig, params, fp32_optimizer, fp32_params, **kwargs):
    """Wrap *fp32_optimizer* and set up dynamic loss scaling.

    Args:
        cfg: full config; ``cfg.optimizer`` is passed to the base class and
            ``cfg.common`` / ``cfg.optimization`` /
            ``cfg.distributed_training`` configure the loss scaler.
        params: the FP16 model parameters.
        fp32_optimizer: optimizer operating on *fp32_params*.
        fp32_params: FP32 copies of *params*.

    Raises:
        ValueError: if no ``fp16_scale_window`` is configured and
            ``update_freq`` has more than one entry.
    """
    super().__init__(cfg.optimizer)
    self.fp16_params = params
    self.fp32_optimizer = fp32_optimizer
    self.fp32_params = fp32_params

    if getattr(cfg.common, "fp16_scale_window", None) is None:
        if len(cfg.optimization.update_freq) > 1:
            raise ValueError(
                "--fp16-scale-window must be given explicitly when using a "
                "custom --update-freq schedule"
            )
        data_parallel_size = int(
            cfg.distributed_training.distributed_world_size
            / cfg.common.model_parallel_size
        )
        # int() truncates to 0 once data_parallel_size * update_freq exceeds
        # 2**14, and a zero window makes DynamicLossScaler.update() compute
        # `% 0`. Keep the derived window at least 1.
        scale_window = max(
            1,
            int(2**14 / data_parallel_size / cfg.optimization.update_freq[0]),
        )
    else:
        scale_window = cfg.common.fp16_scale_window

    if not getattr(cfg.common, "bf16", False):
        self.scaler = DynamicLossScaler(
            init_scale=cfg.common.fp16_init_scale,
            scale_window=scale_window,
            tolerance=cfg.common.fp16_scale_tolerance,
            threshold=cfg.common.threshold_loss_scale,
            min_loss_scale=cfg.common.min_loss_scale,
        )
    else:
        # disable loss scaling for bfloat16
        self.scaler = None
self.fp32_optimizer.optimizer @optimizer.setter def optimizer(self, optimizer): self.fp32_optimizer.optimizer = optimizer @property def lr_scheduler(self): return getattr(self.fp32_optimizer, "lr_scheduler", None) @property def optimizer_config(self): return self.fp32_optimizer.optimizer_config def get_lr(self): return self.fp32_optimizer.get_lr() def set_lr(self, lr): self.fp32_optimizer.set_lr(lr) def all_reduce_grads(self, module): self.fp32_optimizer.all_reduce_grads(module) @property def supports_flat_params(self): return self.fp32_optimizer.supports_flat_params class _MemoryEfficientFP16OptimizerMixin(object): def __init__(self, *args, **kwargs): # forward __init__ call to the next class in MRO (method resolution order) super().__init__(*args, **kwargs) self._multiply_factor = 1.0 @property def has_flat_params(self): return False def state_dict(self): """Return the optimizer's state dict.""" state_dict = self.wrapped_optimizer.state_dict() if self.scaler is not None: state_dict["loss_scale"] = self.scaler.loss_scale return state_dict def load_state_dict(self, state_dict, optimizer_overrides=None): """Load an optimizer state dict. In general we should prefer the configuration of the existing optimizer instance (e.g., learning rate) over that found in the state_dict. This allows us to resume training from a checkpoint using a new set of optimizer args. """ if "loss_scale" in state_dict and self.scaler is not None: self.scaler.loss_scale = state_dict["loss_scale"] self.wrapped_optimizer.load_state_dict(state_dict, optimizer_overrides) # Hack: PyTorch automatically casts the optimizer state to match the # type of the current parameters. But with --memory-efficient-fp16 the # params are FP16 while the optimizer state is FP32 and we don't want # to cast. A workaround is to manually copy back the original state # after the optimizer has been loaded. 
if not getattr(self.optimizer, "disable_mem_eff_fp16_loading_hack", False): groups = self.optimizer.param_groups saved_groups = state_dict["param_groups"] id_map = { old_id: p for old_id, p in zip( chain(*(g["params"] for g in saved_groups)), chain(*(g["params"] for g in groups)), ) } for k, v in state_dict["state"].items(): if k in id_map: param = id_map[k] self.optimizer.state[param] = v def backward(self, loss): """Computes the sum of gradients of the given tensor w.r.t. graph leaves. Compared to :func:`fairseq.optim.FairseqOptimizer.backward`, this function additionally dynamically scales the loss to avoid gradient underflow. """ if self.scaler is not None: loss = self.scaler.scale(loss) loss.backward() def _unscale_grads(self): if ( # Skip the multiplication if it's a no-op (i.e., if _multiply_factor # is 1.0). At the same time, we want to avoid the device-to-host # transfer by comparing it to 1.0. Since _multiply_factor starts as # a Python float, we roughly assume that if it's a tensor then it's # probably not =1.0 anymore and we do the multiplication. Otherwise # we can safely check the value without a D2H transfer. 
torch.is_tensor(self._multiply_factor) or self._multiply_factor != 1.0 ): self.wrapped_optimizer.multiply_grads(self._multiply_factor) self._multiply_factor = 1.0 def multiply_grads(self, c): """Multiplies grads by a constant *c*.""" self._multiply_factor *= c def clip_grad_norm(self, max_norm, aggregate_norm_fn=None): """Clips gradient norm and updates dynamic loss scaler.""" max_norm = float(max_norm) grad_norm = self._multiply_factor * self.wrapped_optimizer.clip_grad_norm( 0, aggregate_norm_fn ) if self.scaler is not None: grad_norm_cpu = float(grad_norm) if grad_norm_cpu > max_norm > 0.0: self._multiply_factor *= max_norm / grad_norm_cpu # detect overflow and adjust loss scale self.scaler.check_overflow(grad_norm_cpu) elif max_norm > 0.0: clip_coef = (max_norm / (grad_norm + 1e-6)).clamp_(max=1) self._multiply_factor *= clip_coef return grad_norm def step(self, closure=None, groups=None): """Performs a single optimization step.""" if getattr(self, "supports_step_with_scale", False): # NOTE(msb) optimizer divides by scale factor self.wrapped_optimizer.step( closure, scale=(1.0 / self._multiply_factor), groups=groups ) else: self._unscale_grads() self.wrapped_optimizer.step(closure, groups=groups) if self.scaler is not None: self.scaler.update() def zero_grad(self): """Clears the gradients of all optimized parameters.""" self.wrapped_optimizer.zero_grad() if self.scaler is not None: self._multiply_factor = 1.0 / float(self.scaler.loss_scale) else: self._multiply_factor = 1.0 @property def supports_flat_params(self): return self.wrapped_optimizer.supports_flat_params class MemoryEfficientFP16Optimizer( _MemoryEfficientFP16OptimizerMixin, optim.FairseqOptimizer ): """ Wrap an *optimizer* to support FP16 (mixed precision) training. Compared to :class:`fairseq.optim.FP16Optimizer`, this version does not maintain an FP32 copy of the model. We instead expect the optimizer to convert the gradients to FP32 internally and sync the results back to the FP16 model params. 
def __init__(
    self, cfg: DictConfig, params, optimizer, allow_unsupported=False, **kwargs
):
    """Wrap *optimizer* and set up dynamic loss scaling.

    Args:
        cfg: full config; ``cfg.optimizer`` (if present) is passed to the
            base class and ``cfg.common`` / ``cfg.optimization`` /
            ``cfg.distributed_training`` configure the loss scaler.
        params: model parameters (not stored by this constructor).
        optimizer: the optimizer to wrap.
        allow_unsupported: skip the ``supports_memory_efficient_fp16`` check.

    Raises:
        ValueError: if *optimizer* does not support memory-efficient FP16
            (and *allow_unsupported* is false), or if no
            ``fp16_scale_window`` is configured and ``update_freq`` has more
            than one entry.
    """
    if not allow_unsupported and not optimizer.supports_memory_efficient_fp16:
        raise ValueError(
            "Unsupported optimizer: {}".format(optimizer.__class__.__name__)
        )

    super().__init__(getattr(cfg, "optimizer", None))
    self.wrapped_optimizer = optimizer

    if getattr(cfg.common, "fp16_scale_window", None) is None:
        if len(cfg.optimization.update_freq) > 1:
            raise ValueError(
                "--fp16-scale-window must be given explicitly when using a "
                "custom --update-freq schedule"
            )
        data_parallel_size = int(
            cfg.distributed_training.distributed_world_size
            / cfg.common.model_parallel_size
        )
        # int() truncates to 0 once data_parallel_size * update_freq exceeds
        # 2**14, and a zero window makes DynamicLossScaler.update() compute
        # `% 0`. Keep the derived window at least 1.
        scale_window = max(
            1,
            int(2**14 / data_parallel_size / cfg.optimization.update_freq[0]),
        )
    else:
        scale_window = cfg.common.fp16_scale_window

    if not getattr(cfg.common, "bf16", False):
        self.scaler = DynamicLossScaler(
            init_scale=cfg.common.fp16_init_scale,
            scale_window=scale_window,
            tolerance=cfg.common.fp16_scale_tolerance,
            threshold=cfg.common.threshold_loss_scale,
            min_loss_scale=cfg.common.min_loss_scale,
        )
    else:
        # disable loss scaling for bfloat16
        self.scaler = None
def get_fused_adam_class():
    """
    Return the best available FusedAdam implementation, or None.

    The ``fused_adam_cuda`` extension (the "deprecated" apex interface, built
    with ``--deprecated_fused_adam``) is preferred because it is a bit faster
    than the main interface. If it cannot be imported, the newer apex
    interface is used, provided its multi-tensor applier is available.
    """
    global fused_adam_cuda
    import importlib

    try:
        fused_adam_cuda = importlib.import_module("fused_adam_cuda")
    except ImportError:
        pass
    else:
        return FusedAdamV1

    # fallback to the newer interface
    try:
        from apex.multi_tensor_apply import multi_tensor_applier
        from apex.optimizers import FusedAdam as _FusedAdam  # noqa
    except ImportError:
        return None

    return FusedAdamV2 if multi_tensor_applier.available else None
def step(self, closure=None, grads=None, scale=1.0, grad_norms=None):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): A closure that reevaluates the model
            and returns the loss.
        grads (list of tensors, optional): weight gradient to use for the
            optimizer update. May be a flat list or generator (treated as
            a single group) or a list of per-group lists. If gradients
            have type torch.half, parameters are expected to be in type
            torch.float. (default: None, i.e. use ``p.grad``)
        scale (float, optional): factor to divide gradient tensor values
            by before applying to weights. (default: 1)
        grad_norms (list, optional): one gradient norm per param group;
            only read for groups whose ``max_grad_norm`` is > 0.
            (default: None)

    Returns:
        The value returned by *closure*, or None.

    Raises:
        RuntimeError: if a gradient is sparse.
    """
    loss = None
    if closure is not None:
        loss = closure()

    # Normalise `grads` into one entry per param group.
    if grads is None:
        grads_group = [None] * len(self.param_groups)
    # backward compatibility
    # assuming a list/generator of parameter means single group
    elif isinstance(grads, types.GeneratorType):
        grads_group = [grads]
    elif type(grads[0]) != list:
        grads_group = [grads]
    else:
        grads_group = grads

    if grad_norms is None:
        grad_norms = [None] * len(self.param_groups)

    for group, grads_this_group, grad_norm in zip(
        self.param_groups, grads_group, grad_norms
    ):
        if grads_this_group is None:
            grads_this_group = [None] * len(group["params"])

        # compute combined scale factor for this group
        combined_scale = scale
        if group.get("max_grad_norm", 0) > 0:
            # norm is in fact norm*scale
            # NOTE(review): grad_norm is None here when grad_norms was not
            # supplied, which would fail on the division -- confirm callers
            # always pass grad_norms together with max_grad_norm > 0.
            clip = ((grad_norm / scale) + 1e-6) / group["max_grad_norm"]
            if clip > 1:
                combined_scale = clip * scale

        bias_correction = 1 if group.get("bias_correction", 1) else 0

        for p, grad in zip(group["params"], grads_this_group):
            # note: p.grad should not ever be set for correct
            # operation of mixed precision optimizer that sometimes
            # sends None gradients
            if p.grad is None and grad is None:
                continue
            if grad is None:
                grad = p.grad.data
            if grad.is_sparse:
                raise RuntimeError(
                    "FusedAdam does not support sparse gradients, "
                    "please consider SparseAdam instead"
                )

            if p.device.type == "cpu":
                # CPU params: update a temporary FP32 copy on the GPU and
                # copy the result back after the kernel call.
                p_data_fp32 = p.data.cuda(non_blocking=True).float()
                out_p = torch.tensor([], dtype=torch.float)
            else:
                p_data_fp32 = p.data.float()
                out_p = p.data

            state = self.state[p]

            # State initialization
            dtype = torch.float16 if self.use_fp16_stats else p_data_fp32.dtype
            if len(state) == 0:
                state["step"] = 0
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(p_data_fp32, dtype=dtype)
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(p_data_fp32, dtype=dtype)
                if self.use_fp16_stats:
                    state["exp_avg_scale"] = 1.0
                    state["exp_avg_sq_scale"] = 1.0
            else:
                # Existing state is moved to the working device/dtype.
                device = p_data_fp32.device
                state["exp_avg"] = state["exp_avg"].to(device, dtype)
                state["exp_avg_sq"] = state["exp_avg_sq"].to(device, dtype)

            exp_avg = state["exp_avg"]
            exp_avg_sq = state["exp_avg_sq"]
            if self.use_fp16_stats:
                assert exp_avg.dtype == torch.float16
                # Expand FP16 statistics to FP32 using the stored scale.
                exp_avg = exp_avg.float() * state["exp_avg_scale"]
                exp_avg_sq = exp_avg_sq.float() * state["exp_avg_sq_scale"]
            beta1, beta2 = group["betas"]

            # Fall back to a group-level step counter when the per-param
            # state has none.
            if "step" not in state:
                state["step"] = group["step"]

            state["step"] += 1

            # NOTE(review): presumably updates p_data_fp32, exp_avg and
            # exp_avg_sq in place and writes a copy of the new weights to
            # out_p -- confirm against the fused_adam_cuda extension.
            with torch.cuda.device(p_data_fp32.device):
                fused_adam_cuda.adam(
                    p_data_fp32,
                    out_p,
                    exp_avg,
                    exp_avg_sq,
                    grad,
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    combined_scale,
                    state["step"],
                    self.eps_mode,
                    bias_correction,
                    group["weight_decay"],
                )

            if p.device.type == "cpu":
                p.data.copy_(p_data_fp32, non_blocking=True)

            if self.use_fp16_stats:

                def inf_norm(t):
                    return torch.norm(t, float("inf"))

                # from github.com/openai/jukebox/blob/master/jukebox/utils/fp16.py
                # New scale = max-abs value / FLOAT16_MAX (plus 1e-8), so
                # the rescaled statistics fit in FP16 before .half().
                state["exp_avg_scale"], state["exp_avg_sq_scale"] = (
                    1e-8 + inf_norm(exp_avg) / self.FLOAT16_MAX,
                    1e-8 + inf_norm(exp_avg_sq) / self.FLOAT16_MAX,
                )
                state["exp_avg"], state["exp_avg_sq"] = (
                    (exp_avg / state["exp_avg_scale"]).half(),
                    (exp_avg_sq / state["exp_avg_sq_scale"]).half(),
                )

    return loss
def step(
    self,
    closure=None,
    grads=None,
    output_params=None,
    scale=None,
    grad_norms=None,
):
    """Performs a single optimization step.

    Args:
        closure (callable, optional): reevaluates the model and returns
            the loss.
        grads, output_params, scale, grad_norms: accepted but not read by
            this implementation.

    Returns:
        The value returned by *closure*, or None.

    Raises:
        RuntimeError: if a gradient is sparse or a parameter is neither
            fp16 nor fp32.
    """
    loss = None
    if closure is not None:
        loss = closure()

    for group in self.param_groups:
        bias_correction = 1 if group["bias_correction"] else 0
        beta1, beta2 = group["betas"]

        # assume same step across group now to simplify things
        # per parameter step can be easily support by making it tensor, or pass list into kernel
        if "step" in group:
            group["step"] += 1
        else:
            group["step"] = 1

        # create lists for multi-tensor apply
        g_16, p_16, orig_p_16, m_16, v_16 = [], [], [], [], []
        g_32, p_32, m_32, v_32 = [], [], [], []

        for p in group["params"]:
            if p.grad is None:
                continue
            if p.grad.data.is_sparse:
                raise RuntimeError(
                    "FusedAdam does not support sparse gradients, "
                    "please consider SparseAdam instead"
                )

            state = self.state[p]
            # State initialization
            if len(state) == 0:
                # Exponential moving average of gradient values
                state["exp_avg"] = torch.zeros_like(p.data, dtype=torch.float)
                # Exponential moving average of squared gradient values
                state["exp_avg_sq"] = torch.zeros_like(
                    p.data, dtype=torch.float
                )
            else:
                # Keep existing state in FP32 on the parameter's device.
                state["exp_avg"] = state["exp_avg"].to(
                    device=p.data.device, dtype=torch.float
                )
                state["exp_avg_sq"] = state["exp_avg_sq"].to(
                    device=p.data.device, dtype=torch.float
                )

            if p.dtype == torch.float16:
                # FP16 params are updated through FP32 copies; the original
                # tensor is kept so the result can be copied back below.
                g_16.append(p.grad.data.float())
                p_16.append(p.data.float())
                orig_p_16.append(p.data)
                m_16.append(state["exp_avg"])
                v_16.append(state["exp_avg_sq"])
            elif p.dtype == torch.float32:
                g_32.append(p.grad.data)
                p_32.append(p.data)
                m_32.append(state["exp_avg"])
                v_32.append(state["exp_avg_sq"])
            else:
                raise RuntimeError("FusedAdam only support fp16 and fp32.")

        # NOTE(review): `p` is whatever the loop above left behind (the last
        # param of the group); it is unbound if the first group is empty.
        # The inner copy-back loop also rebinds `p`.
        with torch.cuda.device(p.device):
            if len(g_16) > 0:
                multi_tensor_applier(
                    self.multi_tensor_adam,
                    self._dummy_overflow_buf,
                    [g_16, p_16, m_16, v_16],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    self.adam_w_mode,
                    bias_correction,
                    group["weight_decay"],
                )
                # Write the updated FP32 copies back into the FP16 params.
                for orig_p, p in zip(orig_p_16, p_16):
                    orig_p.copy_(p.data)
            if len(g_32) > 0:
                multi_tensor_applier(
                    self.multi_tensor_adam,
                    self._dummy_overflow_buf,
                    [g_32, p_32, m_32, v_32],
                    group["lr"],
                    beta1,
                    beta2,
                    group["eps"],
                    group["step"],
                    self.adam_w_mode,
                    bias_correction,
                    group["weight_decay"],
                )

    return loss
@register_optimizer("lamb")
class FairseqLAMB(LegacyFairseqOptimizer):
    """LAMB optimizer.

    Wraps ``apex.optimizers.FusedLAMB``; apex is imported lazily so that the
    module can be imported without it.
    """

    def __init__(self, args, params):
        """Build the wrapped ``FusedLAMB`` instance.

        Args:
            args: parsed command-line namespace (reads ``lr``, ``lamb_betas``,
                ``lamb_eps`` and ``weight_decay`` via :attr:`optimizer_config`).
            params: parameters handed to ``FusedLAMB``.

        Raises:
            ImportError: if apex cannot be imported.
        """
        super().__init__(args)
        # Only the import is guarded: an ImportError raised while constructing
        # the optimizer must not be misreported as "apex is not installed".
        try:
            from apex.optimizers import FusedLAMB
        except ImportError as err:
            raise ImportError("Please install apex to use LAMB optimizer") from err
        self._optimizer = FusedLAMB(params, **self.optimizer_config)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--lamb-betas', default='(0.9, 0.999)', metavar='B',
                            help='betas for LAMB optimizer')
        parser.add_argument('--lamb-eps', type=float, default=1e-8, metavar='D',
                            help='epsilon for LAMB optimizer')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        return {
            "lr": self.args.lr[0],
            # SECURITY NOTE: ``eval`` executes whatever string was passed as
            # ``--lamb-betas``. Kept for backward compatibility with existing
            # command lines; do not feed it untrusted input.
            "betas": eval(self.args.lamb_betas),
            "eps": self.args.lamb_eps,
            "weight_decay": self.args.weight_decay,
        }

    @property
    def supports_flat_params(self):
        """This optimizer does not support flattened parameters."""
        return False
# Create the "--lr-scheduler" registry; "fixed" is the default choice.
(
    build_lr_scheduler_,
    register_lr_scheduler,
    LR_SCHEDULER_REGISTRY,
    LR_SCHEDULER_DATACLASS_REGISTRY,
) = registry.setup_registry(
    "--lr-scheduler",
    base_class=FairseqLRScheduler,
    default="fixed",
)


def build_lr_scheduler(cfg: DictConfig, optimizer):
    """Build the LR scheduler described by ``cfg`` for ``optimizer``.

    Thin wrapper that forwards both arguments to the registry's builder.
    """
    scheduler = build_lr_scheduler_(cfg, optimizer)
    return scheduler


# automatically import any Python files in the optim/lr_scheduler/ directory
for file in sorted(os.listdir(os.path.dirname(__file__))):
    # Skip private modules (leading underscore) and anything that is not a
    # ``.py`` file.
    if file.startswith("_") or not file.endswith(".py"):
        continue
    file_name = file[: file.find(".py")]
    importlib.import_module("fairseq.optim.lr_scheduler." + file_name)
@dataclass
class CosineLRScheduleConfig(FairseqDataclass):
    """Configuration options for the ``cosine`` learning-rate scheduler."""

    warmup_updates: int = field(
        default=0,
        metadata={"help": "warmup the learning rate linearly for the first N updates"},
    )
    # A negative value means "unset"; the cosine scheduler then starts the
    # warmup from cfg.min_lr (not cfg.lr), so the help text says so.
    warmup_init_lr: float = field(
        default=-1,
        metadata={
            "help": "initial learning rate during warmup phase; default is cfg.min_lr"
        },
    )
    lr: List[float] = field(
        default=II("optimization.lr"),
        metadata={"help": "max learning rate, must be more than cfg.min_lr"},
    )
    min_lr: float = field(default=0.0, metadata={"help": "min learning rate"})
    t_mult: float = field(
        default=1.0, metadata={"help": "factor to grow the length of each period"}
    )
    lr_period_updates: float = field(
        default=-1, metadata={"help": "initial number of updates per period"}
    )
    lr_shrink: float = field(
        default=0.1, metadata={"help": "shrink factor for annealing"}
    )
    # This is not required, but is for convenience in inferring lr_period_updates
    max_update: int = II("optimization.max_update")
def __init__(self, cfg: CosineLRScheduleConfig, fairseq_optimizer):
    """Set up the cosine schedule and apply the initial learning rate.

    ``cfg.min_lr`` is clamped down to the max LR if it exceeds it, and an
    unset (negative) ``cfg.warmup_init_lr`` is replaced by ``cfg.min_lr``;
    both changes are written back into ``cfg``.

    Raises:
        ValueError: if more than one learning rate is configured.
    """
    super().__init__(cfg, fairseq_optimizer)

    lr_is_collection = isinstance(cfg.lr, Collection)
    if lr_is_collection and len(cfg.lr) > 1:
        raise ValueError(
            "Cannot use a fixed learning rate schedule with cosine."
            f" Consider --lr-scheduler=fixed instead. ({cfg.lr})"
        )

    if lr_is_collection:
        self.max_lr = cfg.lr[0]
    else:
        self.max_lr = cfg.lr

    # The minimum may never exceed the maximum.
    if cfg.min_lr > self.max_lr:
        cfg.min_lr = self.max_lr

    # A negative warmup start means "not given": start from the minimum LR.
    if cfg.warmup_init_lr < 0:
        cfg.warmup_init_lr = cfg.min_lr

    self.t_mult = cfg.t_mult

    self.period = cfg.lr_period_updates
    if self.period <= 0:
        # No explicit period: one period spans everything after the warmup.
        assert (
            cfg.max_update > 0
        ), "Either --max_update or --lr-period-updates must be set"
        self.period = cfg.max_update - cfg.warmup_updates

    # Per-update increment during the linear warmup (1 when there is none).
    self.lr_step = (
        (self.max_lr - cfg.warmup_init_lr) / cfg.warmup_updates
        if cfg.warmup_updates > 0
        else 1
    )

    self.warmup_updates = cfg.warmup_updates
    self.lr_shrink = cfg.lr_shrink

    # Start at the beginning of the warmup ramp.
    self.lr = cfg.warmup_init_lr
    self.optimizer.set_lr(self.lr)
class FairseqLRScheduler(object):
    """Base class for learning-rate schedulers.

    Holds the config, the wrapped optimizer and the lowest validation loss
    seen so far (``best``).
    """

    def __init__(self, cfg, optimizer):
        """Store ``cfg`` and ``optimizer``.

        Raises:
            ValueError: if ``optimizer`` is neither ``None`` nor a
                ``FairseqOptimizer``.
        """
        super().__init__()
        if optimizer is not None:
            if not isinstance(optimizer, FairseqOptimizer):
                raise ValueError("optimizer must be an instance of FairseqOptimizer")
        self.cfg = cfg
        self.optimizer = optimizer
        self.best = None

    @classmethod
    def add_args(cls, parser):
        """Add arguments to the parser for this LR scheduler."""
        dataclass_type = getattr(cls, "__dataclass", None)
        if dataclass_type is None:
            # No dataclass attached to this scheduler: nothing to add.
            return
        gen_parser_from_dataclass(parser, dataclass_type())

    def state_dict(self):
        """Return the LR scheduler state dict."""
        state = {"best": self.best}
        return state

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        self.best = state_dict["best"]

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        return None

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        if val_loss is None:
            return
        # Track the smallest validation loss observed so far.
        self.best = val_loss if self.best is None else min(self.best, val_loss)

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        current_lr = self.optimizer.get_lr()
        return current_lr
def get_next_lr(self, epoch):
    """Return the base learning rate to use for ``epoch``.

    Before ``cfg.force_anneal`` (or always, when it is ``None``) the value is
    taken from ``cfg.lr`` at index ``epoch - 1``, clamped to the last entry.
    From ``cfg.force_anneal`` onwards the last entry is multiplied by
    ``cfg.lr_shrink`` once more for every further epoch.
    """
    cfg = self.cfg
    schedule = cfg.lr
    anneal_from = cfg.force_anneal

    if anneal_from is not None and epoch >= anneal_from:
        # anneal based on lr_shrink
        return schedule[-1] * cfg.lr_shrink ** (epoch + 1 - anneal_from)

    # use fixed LR schedule
    return schedule[min(epoch - 1, len(schedule) - 1)]
@register_lr_scheduler("inverse_sqrt", dataclass=InverseSquareRootLRScheduleConfig)
class InverseSquareRootSchedule(FairseqLRScheduler):
    """Decay the LR based on the inverse square root of the update number.

    We also support a warmup phase where we linearly increase the learning rate
    from some initial learning rate (``--warmup-init-lr``) until the configured
    learning rate (``--lr``). Thereafter we decay proportional to the number of
    updates, with a decay factor set to align with the configured learning rate.

    During warmup::

      lrs = torch.linspace(cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates)
      lr = lrs[update_num]

    After warmup::

      decay_factor = cfg.lr * sqrt(cfg.warmup_updates)
      lr = decay_factor / sqrt(update_num)

    With ``cfg.warmup_updates == 0`` there is no warmup phase: the schedule
    starts at ``cfg.lr`` and decays as ``cfg.lr / sqrt(max(update_num, 1))``.
    """

    def __init__(self, cfg: InverseSquareRootLRScheduleConfig, optimizer):
        """Precompute warmup step and decay factor, then set the initial LR.

        Raises:
            ValueError: if more than one learning rate is configured.
        """
        super().__init__(cfg, optimizer)
        if isinstance(cfg.lr, Collection) and len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with inverse_sqrt."
                " Consider --lr-scheduler=fixed instead."
            )
        warmup_end_lr = cfg.lr[0] if isinstance(cfg.lr, Collection) else cfg.lr
        if cfg.warmup_init_lr < 0:
            cfg.warmup_init_lr = 0 if cfg.warmup_updates > 0 else warmup_end_lr

        # linearly warmup for the first cfg.warmup_updates
        # (guarded: without a warmup phase there is nothing to divide by)
        if cfg.warmup_updates > 0:
            self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates
        else:
            self.lr_step = 0.0

        # then, decay prop. to the inverse square root of the update number;
        # clamping to 1 keeps the factor non-zero when there is no warmup
        self.decay_factor = warmup_end_lr * max(cfg.warmup_updates, 1) ** 0.5

        # initial learning rate
        self.lr = cfg.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        if num_updates < self.cfg.warmup_updates:
            self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
        else:
            # max(..., 1) avoids 0 ** -0.5 at update 0 when warmup is disabled;
            # it is a no-op whenever a warmup phase exists.
            self.lr = self.decay_factor * max(num_updates, 1) ** -0.5
        self.optimizer.set_lr(self.lr)
        return self.lr
@register_lr_scheduler("manual")
class ManualSchedule(LegacyFairseqLRScheduler):
    """Decay the LR on a manual schedule.

    The schedule is given as two dict literals, ``--epoch2lr`` and
    ``--update2lr``, mapping epoch / update numbers to learning rates.
    """

    def __init__(self, args, optimizer):
        """Parse both schedules and set the initial learning rate.

        The initial LR is ``epoch2lr[1]`` if present, else ``update2lr[1]``
        if present, else ``args.lr[0]``.
        """
        super().__init__(args, optimizer)

        self.epoch2lr = self.parse_manuallr_args(args.epoch2lr)
        self.update2lr = self.parse_manuallr_args(args.update2lr)
        logger.info("@@@ ManualSchedule epoch2lr={}".format(self.epoch2lr))
        logger.info("@@@ ManualSchedule update2lr={}".format(self.update2lr))

        if 1 in self.epoch2lr:
            self.lr = self.epoch2lr[1]
        elif 1 in self.update2lr:
            self.lr = self.update2lr[1]
        else:
            self.lr = args.lr[0]
        self.optimizer.set_lr(self.lr)  # Set the beginning of the epoch.

    def parse_manuallr_args(self, lr_args_str):
        """Turn a dict literal into an ``{int: float}`` schedule.

        Keys may be a plain integer (``5`` or ``"5"``), a comma-separated
        list (``"1,3,5"``) or an inclusive range (``"2-6"``).

        Raises:
            ValueError: if the string does not evaluate to a dict.
        """
        lr_dict = ast.literal_eval(lr_args_str.replace(" ", ""))
        if not isinstance(lr_dict, dict):
            raise ValueError("epoch2lr/update2lr must evaluate to a dict")

        lr_args = {}
        logger.info("@@@ after parsing input dictionary lr_dict = {}".format(lr_dict))
        for key, val in lr_dict.items():
            if isinstance(key, int):
                # Unquoted keys such as {1: 0.1} arrive as ints; the string
                # tests below would raise TypeError on them.
                lr_args[key] = float(val)
            elif "," in key:
                for k in key.split(","):
                    lr_args[int(k)] = float(val)
            elif "-" in key:
                s = int(key.split("-")[0])
                e = int(key.split("-")[1])
                for k in range(s, e + 1, 1):
                    lr_args[k] = float(val)
            else:
                lr_args[int(key)] = float(val)

        return lr_args

    @staticmethod
    def add_args(parser):
        """Add arguments to the parser for this LR scheduler."""
        # fmt: off
        parser.add_argument(
            "--epoch2lr",
            type=str,
            metavar="DICT",
            default="{}",
            help="a dictionary used to set lr for each epoch manually",
        )
        parser.add_argument(
            "--update2lr",
            type=str,
            metavar="DICT",
            default="{}",
            help="a dictionary used to set lr for each update manually",
        )
        # fmt: on

    def state_dict(self):
        """Return the scheduler state (the current ``lr``)."""
        return {"lr": self.lr}

    def load_state_dict(self, state_dict):
        """Restore ``lr`` from ``state_dict`` when it is present."""
        if "lr" in state_dict:
            self.lr = state_dict["lr"]

    def get_next_lr(self, epoch):
        """Return the LR of the latest ``epoch2lr`` entry at or before ``epoch``.

        Falls back to the optimizer's current LR (with a warning) when no
        such entry exists.
        """
        manual_keys = [k for k in self.epoch2lr if k <= epoch]
        if manual_keys:
            manual_lr = self.epoch2lr[max(manual_keys)]
        else:
            logger.warning(
                "@@@ epoch={} does not exist in manual lr input. epoch2lr={}...".format(
                    epoch,
                    # preview at most the first 10 entries
                    list(self.epoch2lr.items())[:10],
                )
            )
            manual_lr = self.optimizer.get_lr()
        return manual_lr

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        self.lr = self.get_next_lr(epoch)
        self.optimizer.set_lr(self.lr)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        manual_keys = [k for k in self.update2lr if k <= num_updates]
        if manual_keys:
            manual_lr = self.update2lr[max(manual_keys)]
        else:
            logger.warning(
                "update={} does not exist in manual lr input update2lr={}...".format(
                    num_updates,
                    # preview at most the first 10 entries
                    list(self.update2lr.items())[:10],
                )
            )
            manual_lr = self.optimizer.get_lr()

        self.optimizer.set_lr(manual_lr)
        return self.optimizer.get_lr()
@register_lr_scheduler("pass_through", dataclass=PassThroughScheduleConfig)
class PassThroughScheduleSchedule(FairseqLRScheduler):
    """Delegate lr scheduling to the optimizer.

    Every call is forwarded to the ``lr_scheduler`` attribute of the wrapped
    optimizer.
    """

    def __init__(self, cfg: PassThroughScheduleConfig, optimizer):
        """Check that ``optimizer`` carries its own non-``None`` ``lr_scheduler``."""
        super().__init__(cfg, optimizer)
        inner = getattr(optimizer, "lr_scheduler", None)
        assert (
            inner is not None
        ), "Pass-through schedule can only be used with optimizers with their own schedulers"

    def state_dict(self):
        """Return the state dict of the optimizer's own scheduler."""
        delegate = self.optimizer.lr_scheduler
        return delegate.state_dict()

    def load_state_dict(self, state_dict):
        """Load ``state_dict`` into the optimizer's own scheduler."""
        delegate = self.optimizer.lr_scheduler
        delegate.load_state_dict(state_dict)

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        delegate = self.optimizer.lr_scheduler
        return delegate.step_begin_epoch(epoch)

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        delegate = self.optimizer.lr_scheduler
        return delegate.step_update(num_updates)
@register_lr_scheduler("polynomial_decay", dataclass=PolynomialDecayLRScheduleConfig)
class PolynomialDecayLRSchedule(FairseqLRScheduler):
    """Polynomially decay the LR towards ``end_learning_rate``.

    The rate ramps up linearly over the first ``warmup_updates`` updates,
    then falls with exponent ``power`` until ``total_num_update``, after
    which it stays at ``end_learning_rate``.
    """

    def __init__(self, cfg: PolynomialDecayLRScheduleConfig, optimizer):
        """Cache the schedule parameters and apply the initial learning rate."""
        super().__init__(cfg, optimizer)

        assert cfg.total_num_update > 0

        self.lr = cfg.lr[0]
        # Scale applied to the base LR while warming up (1 when no warmup).
        self.warmup_factor = 1.0 / cfg.warmup_updates if cfg.warmup_updates > 0 else 1
        self.end_learning_rate = cfg.end_learning_rate
        self.total_num_update = cfg.total_num_update
        self.power = cfg.power
        self.optimizer.set_lr(self.warmup_factor * self.lr)

    def get_next_lr(self, epoch):
        """Return the base LR for ``epoch``.

        From ``cfg.force_anneal`` onwards the optimizer's current LR is kept;
        otherwise ``cfg.lr`` is indexed by ``epoch`` (clamped to its last entry).
        """
        cfg = self.cfg
        if cfg.force_anneal is not None and epoch >= cfg.force_anneal:
            # past the forced-anneal epoch: keep whatever the optimizer has
            return self.optimizer.get_lr()
        # use fixed LR schedule
        schedule = cfg.lr
        return schedule[min(epoch, len(schedule) - 1)]

    def step_begin_epoch(self, epoch):
        """Update the learning rate at the beginning of the given epoch."""
        self.lr = self.get_next_lr(epoch)
        self.optimizer.set_lr(self.warmup_factor * self.lr)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        warmup = self.cfg.warmup_updates
        if warmup > 0 and num_updates <= warmup:
            # linear warmup phase
            self.warmup_factor = num_updates / float(warmup)
            lr = self.warmup_factor * self.lr
        elif num_updates >= self.total_num_update:
            # decay finished
            lr = self.end_learning_rate
        else:
            # fraction of the decay phase still ahead, raised to ``power``
            decay_span = self.total_num_update - warmup
            pct_remaining = 1 - (num_updates - warmup) / decay_span
            lr_range = self.lr - self.end_learning_rate
            lr = lr_range * pct_remaining ** (self.power) + self.end_learning_rate
        self.optimizer.set_lr(lr)
        return self.optimizer.get_lr()
@register_lr_scheduler(
    "reduce_lr_on_plateau", dataclass=ReduceLROnPlateauLRScheduleConfig
)
class ReduceLROnPlateauLRSchedule(FairseqLRScheduler):
    """
    Decay the LR by a factor every time the validation loss plateaus.
    Also comes with optional warmup phase, where we linearly increase
    the learning rate from some initial learning rate
    (``--warmup-init-lr``) until the configured learning rate
    (``--lr``). Thereafter the lr is adjusted according to original
    reduce_on_plateau scheme.

    During warmup::

      lrs = torch.linspace(
          cfg.warmup_init_lr, cfg.lr, cfg.warmup_updates
      )
      lr = lrs[update_num]
    """

    def __init__(self, cfg: ReduceLROnPlateauLRScheduleConfig, optimizer):
        """Create the wrapped torch scheduler and set the initial LR.

        Raises:
            ValueError: if more than one learning rate is configured.
        """
        super().__init__(cfg, optimizer)
        if len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with reduce_lr_on_plateau."
                " Consider --lr-scheduler=fixed instead."
            )

        plateau_mode = "max" if cfg.maximize_best_checkpoint_metric else "min"
        self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer.optimizer,
            patience=cfg.lr_patience,
            factor=cfg.lr_shrink,
            mode=plateau_mode,
            threshold=cfg.lr_threshold,
        )

        has_warmup = cfg.warmup_updates > 0
        warmup_end_lr = cfg.lr[0]

        # An unset (negative) warmup start becomes 0 with warmup, or
        # cfg.lr[0] without it.
        if cfg.warmup_init_lr < 0:
            cfg.warmup_init_lr = 0 if has_warmup else warmup_end_lr

        # Per-update increment of the linear warmup; only defined when a
        # warmup phase exists.
        if has_warmup:
            self.lr_step = (warmup_end_lr - cfg.warmup_init_lr) / cfg.warmup_updates

        # True from the start when there is no warmup; otherwise flipped by
        # step_update() once the warmup is over.
        self.warmup_end = not has_warmup

        # This self.lr is used only during init and/or the warmup period.
        if self.warmup_end:
            self.lr = warmup_end_lr
        else:
            self.lr = cfg.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def state_dict(self):
        """Return the LR scheduler state dict."""
        wrapped = self.lr_scheduler
        return {
            "best": wrapped.best,
            "last_epoch": wrapped.last_epoch,
        }

    def load_state_dict(self, state_dict):
        """Load an LR scheduler state dict."""
        wrapped = self.lr_scheduler
        wrapped.best = state_dict["best"]
        if "last_epoch" in state_dict:
            wrapped.last_epoch = state_dict["last_epoch"]

    def step(self, epoch, val_loss=None):
        """
        Update the learning rate at the end of the given epoch if warmup
        finishes otherwise no update of lr on epoch boundaries
        """
        if val_loss is None or self.warmup_end is not True:
            # Nothing to evaluate yet: just record the epoch.
            self.lr_scheduler.last_epoch = epoch
        else:
            self.lr_scheduler.step(val_loss)
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """
        Update the learning rate after each update."""
        warmup_updates = self.cfg.warmup_updates
        # Without a warmup phase there is nothing to do per update.
        if warmup_updates > 0:
            if num_updates <= warmup_updates:
                self.lr = self.cfg.warmup_init_lr + num_updates * self.lr_step
                self.optimizer.set_lr(self.lr)
            elif self.warmup_end is False:
                # First update past the warmup: hand over to the plateau logic.
                self.warmup_end = True
        return self.optimizer.get_lr()
@register_lr_scheduler("step", dataclass=StepLRScheduleConfig)
class StepLRSchedule(FairseqLRScheduler):
    """Decay learning rate every k updates by a fixed factor.

    After an optional linear warmup the LR is multiplied by ``lr_decay``
    once per ``lr_deacy_period`` updates and never drops below ``min_lr``.
    """

    def __init__(self, cfg: StepLRScheduleConfig, fairseq_optimizer):
        """Cache the schedule parameters, validate them and set the initial LR."""
        super().__init__(cfg, fairseq_optimizer)

        if isinstance(cfg.lr, Collection):
            self.max_lr = cfg.lr[0]
        else:
            self.max_lr = cfg.lr
        self.min_lr = cfg.min_lr
        self.lr_deacy_period = cfg.lr_deacy_period
        self.lr_decay = cfg.lr_decay
        self.warmup_updates = cfg.warmup_updates

        # A negative warmup start means "not given": start from min_lr.
        if cfg.warmup_init_lr >= 0:
            self.warmup_init_lr = cfg.warmup_init_lr
        else:
            self.warmup_init_lr = self.min_lr

        assert self.lr_deacy_period > 0
        assert self.lr_decay <= 1
        assert self.min_lr >= 0
        assert self.max_lr > self.min_lr

        if cfg.warmup_updates > 0:
            # linearly warmup for the first cfg.warmup_updates
            warmup_span = self.max_lr - self.warmup_init_lr
            self.warmup_lr_step = warmup_span / self.warmup_updates
        else:
            self.warmup_lr_step = 1

        # initial learning rate
        self.lr = self.warmup_init_lr
        self.optimizer.set_lr(self.lr)

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # Epoch boundaries leave the learning rate untouched.
        current_lr = self.optimizer.get_lr()
        return current_lr

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        warmup = self.cfg.warmup_updates
        if num_updates >= warmup:
            # Number of completed decay periods since the warmup ended.
            periods_done = (num_updates - warmup) // self.lr_deacy_period
            decayed = self.max_lr * self.lr_decay ** periods_done
            self.lr = max(decayed, self.min_lr)
        else:
            self.lr = self.warmup_init_lr + num_updates * self.warmup_lr_step
        self.optimizer.set_lr(self.lr)
        return self.lr
@register_lr_scheduler("tri_stage", dataclass=TriStageLRScheduleConfig)
class TriStageLRSchedule(FairseqLRScheduler):
    """Tri-stage learning rate scheduler.

    Implement the learning rate scheduler in https://arxiv.org/pdf/1904.08779.pdf

    Similar to the inverse_square_root scheduler, but tri_stage learning rate
    employs three stages of LR scheduling:

        - warmup stage, starting from `lr` * `init_lr_scale`, linearly
          increased to `lr` in `warmup_steps` iterations

        - hold stage, after `warmup_steps`, keep the LR as `lr` for `hold_steps`
          iterations

        - decay stage, after hold stage, decay LR exponentially to
          `lr` * `final_lr_scale` in `decay_steps`;
          after that LR is kept as `final_lr_scale` * `lr`

    During warmup::

      init_lr = cfg.init_lr_scale * cfg.lr
      lrs = torch.linspace(init_lr, cfg.lr, cfg.warmup_steps)
      lr = lrs[update_num]

    During hold::

      lr = cfg.lr

    During decay::

      decay_factor = - math.log(cfg.final_lr_scale) / cfg.decay_steps
      lr = cfg.lr * exp(- (update_num - warmup_steps - hold_steps) * decay_factor)

    After that::

      lr = cfg.lr * cfg.final_lr_scale
    """

    def __init__(self, cfg: TriStageLRScheduleConfig, optimizer):
        """Derive the stage lengths and rates, then set the initial LR.

        Raises:
            ValueError: if more than one learning rate is configured.
            AssertionError: if ``phase_ratio`` is given without a positive
                ``max_update`` or does not sum to 1, or if all stage lengths
                are zero.
        """
        super().__init__(cfg, optimizer)
        if len(cfg.lr) > 1:
            raise ValueError(
                "Cannot use a fixed learning rate schedule with tri-stage lr."
                " Consider --lr-scheduler=fixed instead."
            )

        # calculate LR at each point
        self.peak_lr = cfg.lr[0]
        self.init_lr = cfg.init_lr_scale * cfg.lr[0]
        self.final_lr = cfg.final_lr_scale * cfg.lr[0]

        if cfg.phase_ratio is not None:
            assert cfg.max_update > 0
            # Tolerant comparison: e.g. 0.1 + 0.7 + 0.2 is not exactly 1.0 in
            # floating point and must not be rejected.
            assert math.isclose(
                sum(cfg.phase_ratio), 1.0, rel_tol=1e-6
            ), "phase ratios must add up to 1"
            self.warmup_steps = int(cfg.max_update * cfg.phase_ratio[0])
            self.hold_steps = int(cfg.max_update * cfg.phase_ratio[1])
            self.decay_steps = int(cfg.max_update * cfg.phase_ratio[2])
        else:
            self.warmup_steps = cfg.warmup_steps
            self.hold_steps = cfg.hold_steps
            self.decay_steps = cfg.decay_steps

        assert (
            self.warmup_steps + self.hold_steps + self.decay_steps > 0
        ), "please specify steps or phase_ratio"

        self.warmup_rate = (
            (self.peak_lr - self.init_lr) / self.warmup_steps
            if self.warmup_steps != 0
            else 0
        )
        # Guarded like warmup_rate: decay_steps defaults to 0, and a schedule
        # with only warmup/hold stages must not divide by zero.
        self.decay_factor = (
            -math.log(cfg.final_lr_scale) / self.decay_steps
            if self.decay_steps != 0
            else 0
        )

        # initial learning rate
        self.lr = self.init_lr
        self.optimizer.set_lr(self.lr)

    def _decide_stage(self, update_step):
        """
        return stage, and the corresponding steps within the current stage
        """
        if update_step < self.warmup_steps:
            # warmup state
            return 0, update_step

        offset = self.warmup_steps

        if update_step < offset + self.hold_steps:
            # hold stage
            return 1, update_step - offset

        offset += self.hold_steps

        if update_step <= offset + self.decay_steps:
            # decay stage
            return 2, update_step - offset

        offset += self.decay_steps

        # still here ? constant lr stage
        return 3, update_step - offset

    def step(self, epoch, val_loss=None):
        """Update the learning rate at the end of the given epoch."""
        super().step(epoch, val_loss)
        # we don't change the learning rate at epoch boundaries
        return self.optimizer.get_lr()

    def step_update(self, num_updates):
        """Update the learning rate after each update."""
        stage, steps_in_stage = self._decide_stage(num_updates)
        if stage == 0:
            self.lr = self.init_lr + self.warmup_rate * steps_in_stage
        elif stage == 1:
            self.lr = self.peak_lr
        elif stage == 2:
            self.lr = self.peak_lr * math.exp(-self.decay_factor * steps_in_stage)
        elif stage == 3:
            self.lr = self.final_lr
        else:
            raise ValueError("Undefined stage")

        self.optimizer.set_lr(self.lr)

        return self.lr
See https://arxiv.org/pdf/1506.01186.pdf for details. """ def __init__(self, cfg: TriangularLRScheduleConfig, optimizer): super().__init__(cfg, optimizer) if len(cfg.lr) > 1: raise ValueError( "Cannot use a fixed learning rate schedule with triangular." " Consider --lr-scheduler=fixed instead." ) lr = cfg.lr[0] assert cfg.max_lr > lr, "max_lr must be more than lr" self.min_lr = lr self.max_lr = cfg.max_lr self.stepsize = cfg.lr_period_updates // 2 self.lr_shrink = cfg.lr_shrink self.shrink_min = cfg.shrink_min # initial learning rate self.lr = self.min_lr self.optimizer.set_lr(self.lr) def step(self, epoch, val_loss=None): """Update the learning rate at the end of the given epoch.""" super().step(epoch, val_loss) # we don't change the learning rate at epoch boundaries return self.optimizer.get_lr() def step_update(self, num_updates): """Update the learning rate after each update.""" cycle = math.floor(num_updates / (2 * self.stepsize)) lr_shrink = self.lr_shrink**cycle max_lr = self.max_lr * lr_shrink if self.shrink_min: min_lr = self.min_lr * lr_shrink else: min_lr = self.min_lr x = abs(num_updates / self.stepsize - 2 * (cycle + 1) + 1) self.lr = min_lr + (max_lr - min_lr) * max(0, (1 - x)) self.optimizer.set_lr(self.lr) return self.lr ================================================ FILE: fairseq/optim/nag.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from collections.abc import Collection from dataclasses import dataclass, field from typing import List import torch from fairseq.dataclass import FairseqDataclass from omegaconf import II, DictConfig from torch.optim.optimizer import Optimizer, required from . 
@dataclass
class FairseqNAGConfig(FairseqDataclass):
    # Hyper-parameters forwarded to the NAG optimizer below.
    momentum: float = field(default=0.99, metadata={"help": "momentum factor"})
    weight_decay: float = field(default=0.0, metadata={"help": "weight decay"})
    # TODO common vars in parent class
    lr: List[float] = II("optimization.lr")


@register_optimizer("nag", dataclass=FairseqNAGConfig)
class FairseqNAG(FairseqOptimizer):
    """Fairseq wrapper that builds a :class:`NAG` optimizer from the config."""

    def __init__(self, cfg: DictConfig, params):
        super().__init__(cfg)
        self._optimizer = NAG(params, **self.optimizer_config)

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        configured_lr = self.cfg.lr
        # the config may carry either a list of rates or a single scalar
        if isinstance(configured_lr, Collection):
            configured_lr = configured_lr[0]
        return {
            "lr": configured_lr,
            "momentum": self.cfg.momentum,
            "weight_decay": self.cfg.weight_decay,
        }


class NAG(Optimizer):
    """Nesterov accelerated gradient with learning-rate change correction."""

    def __init__(self, params, lr=required, momentum=0, weight_decay=0):
        # lr_old remembers the rate of the previous step so the momentum
        # buffer can be rescaled when the scheduler changes the rate.
        defaults = {
            "lr": lr,
            "lr_old": lr,
            "momentum": momentum,
            "weight_decay": weight_decay,
        }
        super().__init__(params, defaults)

    @property
    def supports_memory_efficient_fp16(self):
        return True

    @property
    def supports_flat_params(self):
        return True

    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        loss = closure() if closure is not None else None

        half_dtypes = {torch.float16, torch.bfloat16}
        for group in self.param_groups:
            decay = group["weight_decay"]
            mu = group["momentum"]
            step_size = group["lr"]
            prev_step_size = group.get("lr_old", step_size)
            if prev_step_size > 0:
                lr_ratio = step_size / prev_step_size
            else:
                lr_ratio = step_size

            for param in group["params"]:
                if param.grad is None:
                    continue

                # run the update in fp32 and copy back for half-precision params
                is_half = param.data.dtype in half_dtypes
                weights = param.data.float() if is_half else param.data
                grad = param.grad.data.float()

                state = self.state[param]
                if "momentum_buffer" in state:
                    state["momentum_buffer"] = state["momentum_buffer"].to(grad)
                else:
                    state["momentum_buffer"] = torch.zeros_like(grad)
                velocity = state["momentum_buffer"]

                if decay != 0:
                    weights.mul_(1 - step_size * decay)
                weights.add_(velocity, alpha=mu * mu * lr_ratio)
                weights.add_(grad, alpha=-(1 + mu) * step_size)

                velocity.mul_(mu * lr_ratio).add_(grad, alpha=-step_size)

                if is_half:
                    param.data.copy_(weights)

            group["lr_old"] = step_size

        return loss
@register_optimizer("sgd")
class SGD(LegacyFairseqOptimizer):
    """Fairseq wrapper around :class:`torch.optim.SGD`."""

    def __init__(self, args, params):
        super().__init__(args)
        sgd_kwargs = self.optimizer_config
        self._optimizer = torch.optim.SGD(params, **sgd_kwargs)

    @staticmethod
    def add_args(parser):
        """Add optimizer-specific arguments to the parser."""
        # fmt: off
        parser.add_argument('--momentum', default=0.0, type=float, metavar='M',
                            help='momentum factor')
        parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
                            help='weight decay')
        # fmt: on

    @property
    def optimizer_config(self):
        """
        Return a kwarg dictionary that will be used to override optimizer
        args stored in checkpoints. This allows us to load a checkpoint and
        resume training using a different set of optimizer args, e.g., with a
        different learning rate.
        """
        parsed = self.args
        return dict(
            lr=parsed.lr[0],
            momentum=parsed.momentum,
            weight_decay=parsed.weight_decay,
        )

    @property
    def supports_flat_params(self):
        return True
# fairscale is an optional dependency; remember whether it could be imported
try:
    from fairscale.optim import OSS

    _has_fairscale = True
except ImportError:
    _has_fairscale = False


def shard_(optimizer, group):
    """Replace ``optimizer.optimizer`` in place with a fairscale OSS wrapper.

    The wrapped optimizer is rebuilt from the existing optimizer's parameter
    groups, its class and ``optimizer.optimizer_config``.

    Raises:
        ImportError: if the fairscale package is not installed.
    """
    if not _has_fairscale:
        raise ImportError(
            "\n\nPlease install the fairscale package:" "\n\n pip install fairscale"
        )

    class FairseqOSS(OSS):
        @property
        def disable_mem_eff_fp16_loading_hack(self):
            return True

        def __getattr__(self, name):
            # only forward the "supports*" capability flags to the wrapped
            # optimizer; every other unknown attribute is an error
            forwardable = name.startswith("supports") and hasattr(self.optim, name)
            if not forwardable:
                raise AttributeError(
                    "'FairseqOSS' object has no attribute {0!r}".format(name)
                )
            return getattr(self.optim, name)

        def broadcast_global_state_dict(
            self, state_dict: Dict[str, Any]
        ) -> Dict[str, Any]:
            """
            Broadcasts the entire state_dict to all other ranks
            each rank is responsible to load their own partition of data
            """
            return utils.broadcast_object(state_dict, src_rank=0, group=self.group)

    inner = optimizer.optimizer
    inner_cls = type(inner)

    optimizer.optimizer = FairseqOSS(
        inner.param_groups, inner_cls, group=group, **optimizer.optimizer_config
    )
def get_preprocessing_parser(default_task="translation"):
    """Return the parser used by the preprocessing command."""
    parser = get_parser("Preprocessing", default_task)
    add_preprocess_args(parser)
    return parser


def get_training_parser(default_task="translation"):
    """Return the parser used for training, with all training arg groups."""
    parser = get_parser("Trainer", default_task)
    add_dataset_args(parser, train=True)
    add_distributed_training_args(parser)
    add_model_args(parser)
    add_optimization_args(parser)
    add_checkpoint_args(parser)
    add_ema_args(parser)
    return parser


def get_generation_parser(interactive=False, default_task="translation"):
    """Return the generation parser; adds interactive args when requested."""
    parser = get_parser("Generation", default_task)
    add_dataset_args(parser, gen=True)
    add_distributed_training_args(parser, default_world_size=1)
    add_generation_args(parser)
    add_checkpoint_args(parser)
    if interactive:
        add_interactive_args(parser)
    return parser


def get_speech_generation_parser(default_task="text_to_speech"):
    """Return the parser used for speech generation."""
    parser = get_parser("Speech Generation", default_task)
    add_dataset_args(parser, gen=True)
    add_distributed_training_args(parser, default_world_size=1)
    add_speech_generation_args(parser)
    return parser


def get_interactive_generation_parser(default_task="translation"):
    """Shortcut for ``get_generation_parser(interactive=True, ...)``."""
    return get_generation_parser(interactive=True, default_task=default_task)


def get_eval_lm_parser(default_task="language_modeling"):
    """Return the parser used for language model evaluation."""
    parser = get_parser("Evaluate Language Model", default_task)
    add_dataset_args(parser, gen=True)
    add_distributed_training_args(parser, default_world_size=1)
    add_eval_lm_args(parser)
    return parser


def get_validation_parser(default_task=None):
    """Return the parser used for validation."""
    parser = get_parser("Validation", default_task)
    add_dataset_args(parser, train=True)
    add_distributed_training_args(parser, default_world_size=1)
    group = parser.add_argument_group("Evaluation")
    gen_parser_from_dataclass(group, CommonEvalConfig())
    return parser


def parse_args_and_arch(
    parser: argparse.ArgumentParser,
    input_args: Optional[List[str]] = None,
    parse_known: bool = False,
    suppress_defaults: bool = False,
    modify_parser: Optional[Callable[[argparse.ArgumentParser], None]] = None,
):
    """
    Parse arguments in two passes so that model/task/registry-specific
    options can be added to *parser* before the final parse.

    Args:
        parser (ArgumentParser): the parser
        input_args (List[str]): strings to parse, defaults to sys.argv
        parse_known (bool): only parse known arguments, similar to
            `ArgumentParser.parse_known_args`
        suppress_defaults (bool): parse while ignoring all default values
        modify_parser (Optional[Callable[[ArgumentParser], None]]):
            function to modify the parser, e.g., to set default values

    Returns:
        The parsed namespace, or ``(namespace, extra)`` when *parse_known* is
        set. With *suppress_defaults* a plain namespace is always returned.

    Raises:
        RuntimeError: if the parser has an ``arch`` option whose value is not
            a registered architecture or model.
        ValueError: if both fp16 and TPU end up enabled.
    """
    if suppress_defaults:
        # Parse args without any default values. This requires us to parse
        # twice, once to identify all the necessary task/model args, and a second
        # time with all defaults set to None.
        args = parse_args_and_arch(
            parser,
            input_args=input_args,
            parse_known=parse_known,
            suppress_defaults=False,
        )
        suppressed_parser = argparse.ArgumentParser(add_help=False, parents=[parser])
        suppressed_parser.set_defaults(**{k: None for k in vars(args)})
        args = suppressed_parser.parse_args(input_args)
        return argparse.Namespace(
            **{k: v for k, v in vars(args).items() if v is not None}
        )

    from fairseq.models import ARCH_MODEL_REGISTRY, ARCH_CONFIG_REGISTRY, MODEL_REGISTRY

    # Before creating the true parser, we need to import optional user module
    # in order to eagerly import custom tasks, optimizers, architectures, etc.
    usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
    usr_parser.add_argument("--user-dir", default=None)
    usr_args, _ = usr_parser.parse_known_args(input_args)
    utils.import_user_module(usr_args)

    if modify_parser is not None:
        modify_parser(parser)

    # The parser doesn't know about model/criterion/optimizer-specific args, so
    # we parse twice. First we parse the model/criterion/optimizer, then we
    # parse a second time after adding the *-specific arguments.
    # If input_args is given, we will parse those args instead of sys.argv.
    args, _ = parser.parse_known_args(input_args)

    # Add model-specific args to parser.
    if hasattr(args, "arch"):
        model_specific_group = parser.add_argument_group(
            "Model-specific configuration",
            # Only include attributes which are explicitly given as command-line
            # arguments or which have default values.
            argument_default=argparse.SUPPRESS,
        )
        if args.arch in ARCH_MODEL_REGISTRY:
            ARCH_MODEL_REGISTRY[args.arch].add_args(model_specific_group)
        elif args.arch in MODEL_REGISTRY:
            MODEL_REGISTRY[args.arch].add_args(model_specific_group)
        else:
            # --arch has no default, so this is also reached when the option
            # was simply omitted; say so instead of raising an empty error.
            raise RuntimeError(
                "unknown or missing model architecture (--arch): {!r}".format(
                    args.arch
                )
            )

    if hasattr(args, "task"):
        from fairseq.tasks import TASK_REGISTRY

        TASK_REGISTRY[args.task].add_args(parser)
    if getattr(args, "use_bmuf", False):
        # hack to support extra args for block distributed data parallelism
        from fairseq.optim.bmuf import FairseqBMUF

        FairseqBMUF.add_args(parser)

    # Add *-specific args to parser.
    from fairseq.registry import REGISTRIES

    for registry_name, REGISTRY in REGISTRIES.items():
        choice = getattr(args, registry_name, None)
        if choice is not None:
            cls = REGISTRY["registry"][choice]
            if hasattr(cls, "add_args"):
                cls.add_args(parser)
            elif hasattr(cls, "__dataclass"):
                gen_parser_from_dataclass(parser, cls.__dataclass())

    # Modify the parser a second time, since defaults may have been reset
    if modify_parser is not None:
        modify_parser(parser)

    # Parse a second time.
    if parse_known:
        args, extra = parser.parse_known_args(input_args)
    else:
        args = parser.parse_args(input_args)
        extra = None
    # Post-process args.
    if (
        hasattr(args, "batch_size_valid") and args.batch_size_valid is None
    ) or not hasattr(args, "batch_size_valid"):
        args.batch_size_valid = args.batch_size
    if hasattr(args, "max_tokens_valid") and args.max_tokens_valid is None:
        args.max_tokens_valid = args.max_tokens
    if getattr(args, "memory_efficient_fp16", False):
        args.fp16 = True
    if getattr(args, "memory_efficient_bf16", False):
        args.bf16 = True
    args.tpu = getattr(args, "tpu", False)
    args.bf16 = getattr(args, "bf16", False)
    # NOTE(review): bf16 unconditionally switches on the TPU flag here;
    # confirm this is still intended for bf16 on non-TPU devices.
    if args.bf16:
        args.tpu = True
    if args.tpu and args.fp16:
        raise ValueError("Cannot combine --fp16 and --tpu, use --bf16 on TPUs")

    if getattr(args, "seed", None) is None:
        args.seed = 1  # default seed for training
        args.no_seed_provided = True
    else:
        args.no_seed_provided = False

    if getattr(args, "update_epoch_batch_itr", None) is None:
        if hasattr(args, "grouped_shuffling"):
            args.update_epoch_batch_itr = args.grouped_shuffling
        else:
            args.grouped_shuffling = False
            args.update_epoch_batch_itr = False

    # Apply architecture configuration.
    if hasattr(args, "arch") and args.arch in ARCH_CONFIG_REGISTRY:
        ARCH_CONFIG_REGISTRY[args.arch](args)

    if parse_known:
        return args, extra
    else:
        return args


def get_parser(desc, default_task="translation"):
    """Build the base parser shared by all commands.

    Adds the common options, one ``--<registry>`` option per registered
    registry and the ``--task`` option. *desc* is accepted for the callers'
    benefit but is not used when constructing the parser.
    """
    # Before creating the true parser, we need to import optional user module
    # in order to eagerly import custom tasks, optimizers, architectures, etc.
    usr_parser = argparse.ArgumentParser(add_help=False, allow_abbrev=False)
    usr_parser.add_argument("--user-dir", default=None)
    usr_args, _ = usr_parser.parse_known_args()
    utils.import_user_module(usr_args)

    parser = argparse.ArgumentParser(allow_abbrev=False)
    gen_parser_from_dataclass(parser, CommonConfig())

    from fairseq.registry import REGISTRIES

    for registry_name, REGISTRY in REGISTRIES.items():
        parser.add_argument(
            "--" + registry_name.replace("_", "-"),
            default=REGISTRY["default"],
            choices=REGISTRY["registry"].keys(),
        )

    # Task definitions can be found under fairseq/tasks/
    from fairseq.tasks import TASK_REGISTRY

    parser.add_argument(
        "--task",
        metavar="TASK",
        default=default_task,
        choices=TASK_REGISTRY.keys(),
        help="task",
    )
    # fmt: on
    return parser


def add_preprocess_args(parser):
    """Add the preprocessing options to *parser* and return it."""
    group = parser.add_argument_group("Preprocessing")
    # fmt: off
    group.add_argument("-s", "--source-lang", default=None, metavar="SRC",
                       help="source language")
    group.add_argument("-t", "--target-lang", default=None, metavar="TARGET",
                       help="target language")
    group.add_argument("--trainpref", metavar="FP", default=None,
                       help="train file prefix (also used to build dictionaries)")
    group.add_argument("--validpref", metavar="FP", default=None,
                       help="comma separated, valid file prefixes "
                            "(words missing from train set are replaced with <unk>)")
    group.add_argument("--testpref", metavar="FP", default=None,
                       help="comma separated, test file prefixes "
                            "(words missing from train set are replaced with <unk>)")
    group.add_argument("--align-suffix", metavar="FP", default=None,
                       help="alignment file suffix")
    group.add_argument("--destdir", metavar="DIR", default="data-bin",
                       help="destination dir")
    group.add_argument("--thresholdtgt", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--thresholdsrc", metavar="N", default=0, type=int,
                       help="map words appearing less than threshold times to unknown")
    group.add_argument("--tgtdict", metavar="FP",
                       help="reuse given target dictionary")
    group.add_argument("--srcdict", metavar="FP",
                       help="reuse given source dictionary")
    group.add_argument("--nwordstgt", metavar="N", default=-1, type=int,
                       help="number of target words to retain")
    group.add_argument("--nwordssrc", metavar="N", default=-1, type=int,
                       help="number of source words to retain")
    group.add_argument("--alignfile", metavar="ALIGN", default=None,
                       help="an alignment file (optional)")
    # registered on the parser itself rather than on the group
    parser.add_argument('--dataset-impl', metavar='FORMAT', default='mmap',
                        choices=get_available_dataset_impl(),
                        help='output dataset implementation')
    group.add_argument("--joined-dictionary", action="store_true",
                       help="Generate joined dictionary")
    group.add_argument("--only-source", action="store_true",
                       help="Only process the source language")
    group.add_argument("--padding-factor", metavar="N", default=8, type=int,
                       help="Pad dictionary size to be multiple of N")
    group.add_argument("--workers", metavar="N", default=1, type=int,
                       help="number of parallel workers")
    group.add_argument("--dict-only", action='store_true',
                       help="if true, only builds a dictionary and then exits")
    # fmt: on
    return parser


def add_dataset_args(parser, train=False, gen=False):
    """Add the dataset options; *train* and *gen* are currently unused."""
    group = parser.add_argument_group("dataset_data_loading")
    gen_parser_from_dataclass(group, DatasetConfig())
    # fmt: on
    return group


def add_distributed_training_args(parser, default_world_size=None):
    """Add the distributed training options.

    When *default_world_size* is None it defaults to the number of visible
    CUDA devices, with a minimum of 1.
    """
    group = parser.add_argument_group("distributed_training")
    if default_world_size is None:
        default_world_size = max(1, torch.cuda.device_count())
    gen_parser_from_dataclass(
        group, DistributedTrainingConfig(distributed_world_size=default_world_size)
    )
    return group


def add_optimization_args(parser):
    """Add the optimization options and return the argument group."""
    group = parser.add_argument_group("optimization")
    # fmt: off
    gen_parser_from_dataclass(group, OptimizationConfig())
    # fmt: on
    return group


def add_checkpoint_args(parser):
    """Add the checkpoint options and return the argument group."""
    group = parser.add_argument_group("checkpoint")
    # fmt: off
    gen_parser_from_dataclass(group, CheckpointConfig())
    # fmt: on
    return group


def add_common_eval_args(group):
    """Add the options shared by all evaluation commands to *group*."""
    gen_parser_from_dataclass(group, CommonEvalConfig())


def add_eval_lm_args(parser):
    """Add the language model evaluation options."""
    group = parser.add_argument_group("LM Evaluation")
    add_common_eval_args(group)
    gen_parser_from_dataclass(group, EvalLMConfig())


def add_generation_args(parser):
    """Add the generation options and return the argument group."""
    group = parser.add_argument_group("Generation")
    add_common_eval_args(group)
    gen_parser_from_dataclass(group, GenerationConfig())
    return group


def add_speech_generation_args(parser):
    """Add the speech generation options and return the argument group."""
    group = parser.add_argument_group("Speech Generation")
    add_common_eval_args(group)  # NOTE: remove_bpe is not needed
    # fmt: off
    group.add_argument('--eos_prob_threshold', default=0.5, type=float,
                       help='terminate when eos probability exceeds this')
    # fmt: on
    return group


def add_interactive_args(parser):
    """Add the interactive generation options."""
    group = parser.add_argument_group("Interactive")
    gen_parser_from_dataclass(group, InteractiveConfig())


def add_model_args(parser):
    """Add the ``--arch`` option and return the argument group."""
    group = parser.add_argument_group("Model configuration")
    # fmt: off

    # Model definitions can be found under fairseq/models/
    #
    # The model architecture can be specified in several ways.
    # In increasing order of priority:
    # 1) model defaults (lowest priority)
    # 2) --arch argument
    # 3) --encoder/decoder-* arguments (highest priority)
    from fairseq.models import ARCH_MODEL_REGISTRY
    group.add_argument('--arch', '-a', metavar='ARCH',
                       choices=ARCH_MODEL_REGISTRY.keys(),
                       help='model architecture')
    # fmt: on
    return group


def get_args(
    data: Union[str, Path],
    task: str = "translation",
    arch: str = "transformer",
    **overrides
):
    """Build training args programmatically.

    Parses ``[data, "--task", task, "--arch", arch]`` with the training
    parser, then sets every entry of *overrides* as an attribute on the
    resulting namespace.
    """
    parser = get_training_parser(task)
    args = parse_args_and_arch(parser, [str(data), "--task", task, "--arch", arch])

    for k, v in overrides.items():
        setattr(args, k, v)

    return args


def add_ema_args(parser):
    """Add the EMA options."""
    group = parser.add_argument_group("EMA configuration")
    gen_parser_from_dataclass(group, EMAConfig())
import multiprocessing
import os
import pdb
import sys

__all__ = ["set_trace"]

# One shared stdin handle and lock, created at import time and used by every
# debugger instance in this module.
_stdin = [None]
_stdin_lock = multiprocessing.Lock()
try:
    _stdin_fd = sys.stdin.fileno()
except Exception:
    # stdin may be replaced by an object without a real file descriptor
    _stdin_fd = None


class MultiprocessingPdb(pdb.Pdb):
    """A Pdb wrapper that works in a multiprocessing environment.

    Usage: `from fairseq import pdb; pdb.set_trace()`
    """

    def __init__(self):
        super().__init__(nosigint=True)

    def _cmdloop(self):
        """Run the command loop on the shared stdin while holding the lock."""
        previous_stdin = sys.stdin
        with _stdin_lock:
            try:
                if _stdin_fd is not None:
                    # lazily reopen the descriptor captured at import time
                    _stdin[0] = _stdin[0] or os.fdopen(_stdin_fd)
                    sys.stdin = _stdin[0]
                self.cmdloop()
            finally:
                sys.stdin = previous_stdin


def set_trace():
    """Start the multiprocessing-aware debugger in the caller's frame."""
    caller_frame = sys._getframe().f_back
    debugger = MultiprocessingPdb()
    debugger.set_trace(caller_frame)
logger = logging.getLogger(__name__)


def quantize_model_scalar(model, model_cfg: DictConfig):
    """Apply scalar quantization noise to *model* if the config asks for it.

    Reads ``quant_noise_scalar`` from *model_cfg* (missing or falsy values
    count as 0) and returns the same model object.
    """
    quant_noise_scalar = getattr(model_cfg, "quant_noise_scalar", 0) or 0
    if quant_noise_scalar > 0:
        # quantize_model edits the model in place
        scalar.quantize_model_(model, p=quant_noise_scalar, bits=8, update_step=1000)
    return model


class Quantizer(object):
    """Drive iterative product quantization (PQ) on an epoch or update schedule."""

    def __init__(self, config_path, max_epoch, max_update):
        """Load the quantization config and derive the quantization schedule.

        Args:
            config_path: path of a YAML config; falsy to use the defaults.
            max_epoch: total number of training epochs (<= 0 if unused).
            max_update: total number of training updates (<= 0 if unused).

        Raises:
            ImportError: if PyYAML is not installed.
            AssertionError: if the chosen limit is not divisible by the number
                of quantization iterations, or if not exactly one of
                *max_epoch* / *max_update* is positive.
        """
        try:
            import yaml
        except ImportError as err:
            # the importable module is "yaml" but the package is "pyyaml"
            raise ImportError(
                "Please install yaml with: pip install pyyaml"
            ) from err

        # parse config
        if config_path:
            with open(config_path) as config_file:
                config = quantization_options.parse_config_yaml(
                    yaml.safe_load(config_file)
                )
        else:
            config = quantization_options.parse_config_yaml({})

        self.n_centroids_config = config["n_centroids"]
        self.block_sizes_config = config["block_sizes"]
        self.layers_to_quantize = config["layers_to_quantize"]

        # We assume that training will run for a fixed number of epochs
        # (or updates) and that we should train for equal durations
        # between iterations of PQ.
        num_iterations = len(self.layers_to_quantize)
        if max_epoch > 0:
            assert max_epoch % num_iterations == 0, (
                "for iterative PQ, --max-epoch (={}) must be evenly divisible by "
                "len(layers_to_quantize) (={})".format(max_epoch, num_iterations)
            )
            self.epoch_schedule = max_epoch // num_iterations
        else:
            self.epoch_schedule = None
        if max_update > 0:
            assert max_update % num_iterations == 0, (
                "for iterative PQ, --max-update (={}) must be evenly divisible by "
                "len(layers_to_quantize) (={})".format(max_update, num_iterations)
            )
            self.update_schedule = max_update // num_iterations
        else:
            self.update_schedule = None
        # XOR: fails both when the two limits are given and when neither is
        assert (self.epoch_schedule is not None) ^ (
            self.update_schedule is not None
        ), "for iterative PQ, must specify exactly one of --max-update or --max-epoch"

        # 0 is a special value for quantization step, which will force
        # the first call to begin_epoch() to call step()
        self.quantization_step = 0

    def set_trainer(self, trainer):
        """Attach the trainer and start tracking the size of its model."""
        self.trainer = trainer
        self.size_tracker = pq.SizeTracker(self.trainer.get_model())

    def step(self):
        """Move to the next stage of quantization."""
        if self.quantization_step >= len(self.layers_to_quantize):
            # Maybe we just finished the last training step or we loaded
            # a checkpoint for an iterative PQ model which previously
            # finished training. Either way, don't quantize again.
            return

        logger.info(
            "quantizing model (step={}; layers_to_quantize[step]={})".format(
                self.quantization_step, self.layers_to_quantize[self.quantization_step]
            )
        )
        quantized_layers = pq.quantize_model_(
            self.trainer.get_model(),
            self.size_tracker,
            self.layers_to_quantize,
            self.block_sizes_config,
            self.n_centroids_config,
            step=self.quantization_step,
        )
        logger.info("quantized layers: {}".format(quantized_layers))
        logger.info(self.size_tracker)

        self.quantization_step += 1

        # reinitialize the Trainer since model parameters have changed
        self.trainer.reinitialize()

    def begin_epoch(self, epoch):
        """Called at the beginning of each epoch (epochs start at 1)."""
        if (
            (
                self.epoch_schedule is not None
                and epoch > 0
                and (epoch - 1) % self.epoch_schedule == 0
            )
            # we always step once in the beginning, even if using
            # update-based quantization
            or self.quantization_step == 0
        ):
            self.step()

    def step_update(self, num_updates):
        """Called at the end of each step."""
        if (
            self.update_schedule is not None
            and num_updates > 0
            and num_updates % self.update_schedule == 0
        ):
            self.step()

    def state_dict(self):
        """Return the config and schedule state needed to resume training."""
        return {
            "n_centroids_config": self.n_centroids_config,
            "block_sizes_config": self.block_sizes_config,
            "layers_to_quantize": self.layers_to_quantize,
            "epoch_schedule": self.epoch_schedule,
            "update_schedule": self.update_schedule,
            "quantization_step": self.quantization_step,
        }

    def load_state_dict(self, state_dict):
        """Restore the fields written by :meth:`state_dict`."""
        self.n_centroids_config = state_dict["n_centroids_config"]
        self.block_sizes_config = state_dict["block_sizes_config"]
        self.layers_to_quantize = state_dict["layers_to_quantize"]
        self.epoch_schedule = state_dict["epoch_schedule"]
        self.update_schedule = state_dict["update_schedule"]
        self.quantization_step = state_dict["quantization_step"]
REGISTRIES = {}


def setup_registry(registry_name: str, base_class=None, default=None, required=False):
    """Create a named registry and return its build/register helpers.

    *registry_name* must start with ``--``; the prefix is stripped and dashes
    become underscores. Returns ``(build_x, register_x, REGISTRY,
    DATACLASS_REGISTRY)``, or ``None`` if a registry with that name was
    already set up.
    """
    assert registry_name.startswith("--")
    registry_name = registry_name[2:].replace("-", "_")

    REGISTRY = {}
    # NOTE(review): nothing in this function ever adds to this set, so the
    # duplicate-class-name check below cannot fire — confirm whether intended.
    REGISTRY_CLASS_NAMES = set()
    DATACLASS_REGISTRY = {}

    # maintain a registry of all registries
    if registry_name in REGISTRIES:
        return  # registry already exists
    REGISTRIES[registry_name] = dict(
        registry=REGISTRY,
        default=default,
        dataclass_registry=DATACLASS_REGISTRY,
    )

    builder_attr = "build_" + registry_name

    def build_x(cfg: Union[DictConfig, str, Namespace], *extra_args, **extra_kwargs):
        """Instantiate the registered class selected by *cfg*."""
        if isinstance(cfg, DictConfig):
            choice = cfg._name

            if choice and choice in DATACLASS_REGISTRY:
                from_checkpoint = extra_kwargs.get("from_checkpoint", False)
                config_cls = DATACLASS_REGISTRY[choice]
                cfg = merge_with_parent(
                    config_cls(), cfg, remove_missing=from_checkpoint
                )
        elif isinstance(cfg, str):
            choice = cfg
            if choice in DATACLASS_REGISTRY:
                cfg = DATACLASS_REGISTRY[choice]()
        else:
            # legacy argparse namespace: the choice is stored under the
            # registry's own name
            choice = getattr(cfg, registry_name, None)
            if choice in DATACLASS_REGISTRY:
                cfg = DATACLASS_REGISTRY[choice].from_namespace(cfg)

        if choice is None:
            if not required:
                return None
            raise ValueError("{} is required!".format(registry_name))

        cls = REGISTRY[choice]
        # prefer a dedicated build_<registry> factory, else call the class
        builder = getattr(cls, builder_attr, cls)

        # from_checkpoint is consumed here and never forwarded to the builder
        extra_kwargs.pop("from_checkpoint", None)

        return builder(cfg, *extra_args, **extra_kwargs)

    def register_x(name, dataclass=None):
        """Return a class decorator that registers the class under *name*."""

        def register_x_cls(cls):
            if name in REGISTRY:
                raise ValueError(
                    "Cannot register duplicate {} ({})".format(registry_name, name)
                )
            if cls.__name__ in REGISTRY_CLASS_NAMES:
                raise ValueError(
                    "Cannot register {} with duplicate class name ({})".format(
                        registry_name, cls.__name__
                    )
                )
            if base_class is not None and not issubclass(cls, base_class):
                raise ValueError(
                    "{} must extend {}".format(cls.__name__, base_class.__name__)
                )

            if dataclass is not None and not issubclass(dataclass, FairseqDataclass):
                raise ValueError(
                    "Dataclass {} must extend FairseqDataclass".format(dataclass)
                )

            cls.__dataclass = dataclass
            if cls.__dataclass is not None:
                DATACLASS_REGISTRY[name] = cls.__dataclass

                # also expose the config node to hydra's config store
                node = dataclass()
                node._name = name
                ConfigStore.instance().store(
                    name=name, group=registry_name, node=node, provider="fairseq"
                )

            REGISTRY[name] = cls

            return cls

        return register_x_cls

    return build_x, register_x, REGISTRY, DATACLASS_REGISTRY
class BaseScorer(ABC):
    """Base class for scorers that accumulate reference/prediction pairs."""

    def __init__(self, cfg):
        self.cfg = cfg
        self.ref, self.pred = [], []

    def add_string(self, ref, pred):
        """Record one reference string and its matching prediction."""
        self.ref.append(ref)
        self.pred.append(pred)

    @abstractmethod
    def score(self) -> float:
        """Return the score over everything added so far."""

    @abstractmethod
    def result_string(self) -> str:
        """Return a printable summary of the score."""


_build_scorer, register_scorer, SCORER_REGISTRY, _ = registry.setup_registry(
    "--scoring", default="bleu"
)


def build_scorer(choice, tgt_dict):
    """Build the scorer selected by *choice* (a name or a DictConfig).

    "bleu" is special-cased because its config is built from the special
    symbol indices of *tgt_dict*; every other choice goes through the
    registry.
    """
    if isinstance(choice, DictConfig):
        _choice = choice._name
    else:
        _choice = choice

    if _choice != "bleu":
        return _build_scorer(choice)

    from fairseq.scoring import bleu

    bleu_cfg = bleu.BleuConfig(
        pad=tgt_dict.pad(), eos=tgt_dict.eos(), unk=tgt_dict.unk()
    )
    return bleu.Scorer(bleu_cfg)


# automatically import any Python files in the current directory
for file in sorted(os.listdir(os.path.dirname(__file__))):
    if not file.endswith(".py") or file.startswith("_"):
        continue
    module = file[: file.find(".py")]
    importlib.import_module("fairseq.scoring." + module)
@register_scorer("bert_score", dataclass=BertScoreScorerConfig)
class BertScoreScorer(BaseScorer):
    """Scorer that delegates to the optional ``bert_score`` package.

    The package is imported lazily in the constructor so that it is only
    required when this scorer is actually used.
    """

    def __init__(self, cfg):
        super(BertScoreScorer, self).__init__(cfg)
        try:
            import bert_score as _bert_score
        except ImportError:
            raise ImportError("Please install BERTScore: pip install bert-score")

        self.cfg = cfg
        # Per-sentence scores from the most recent score() call (None until then).
        self.scores = None
        self._bert_score = _bert_score

    def add_string(self, ref, pred):
        """Record one reference/prediction pair."""
        self.ref.append(ref)
        self.pred.append(pred)

    def score(self, order=4):
        """Score all accumulated pairs and return the mean of the result.

        Only the third value returned by ``bert_score.score`` is kept (F1 in
        the bert-score API -- confirm against the installed version). ``order``
        is accepted for signature parity with other scorers and is unused.
        """
        language = self.cfg.bert_score_lang
        _first, _second, third = self._bert_score.score(
            self.pred, self.ref, lang=language
        )
        self.scores = third
        self.scores = self.scores.numpy()
        return np.mean(self.scores)

    def result_string(self, order=4):
        """Return the mean score formatted with four decimals."""
        value = self.score()
        return f"BERTScore: {value:.4f}"
class BleuStat(ctypes.Structure):
    """ctypes structure holding the running BLEU sufficient statistics.

    Fields, all ``size_t``: the reference and prediction lengths, followed by
    the (match, count) pair for each n-gram order from 1 to 4. Presumably this
    mirrors the struct used by the native BLEU helper -- the field order must
    therefore not be changed.
    """

    _fields_ = [("reflen", ctypes.c_size_t), ("predlen", ctypes.c_size_t)] + [
        (f"{kind}{n}", ctypes.c_size_t)
        for n in range(1, 5)
        for kind in ("match", "count")
    ]
def score(self, order=4):
    """Return the BLEU score on a 0-100 scale.

    The score is the brevity penalty times the geometric mean of the first
    ``order`` n-gram precisions. A precision of zero contributes ``-inf`` to
    the log-sum, which makes the whole score 0.
    """
    log_precisions = [
        math.log(p) if p > 0 else float("-Inf") for p in self.precision()[:order]
    ]
    log_sum = sum(log_precisions)
    return self.brevity() * math.exp(log_sum / order) * 100
@register_scorer("chrf", dataclass=ChrFScorerConfig)
class ChrFScorer(BaseScorer):
    """chrF scorer backed by ``sacrebleu.corpus_chrf``.

    sacrebleu is imported lazily in the constructor so that it is only
    required when this scorer is actually used.
    """

    def __init__(self, args):
        super(ChrFScorer, self).__init__(args)
        import sacrebleu

        self.sacrebleu = sacrebleu

    def add_string(self, ref, pred):
        """Record one reference/prediction pair."""
        self.ref.append(ref)
        self.pred.append(pred)

    def _score(self, order=4):
        """Return the sacrebleu result object for all accumulated pairs.

        Raises:
            NotImplementedError: if ``order`` is anything other than 4.
        """
        if order != 4:
            raise NotImplementedError
        return self.sacrebleu.corpus_chrf(self.pred, [self.ref])

    def score(self, order=4):
        """Return the numeric chrF score.

        The previous implementation read ``.score`` from the *formatted
        string* returned by ``result_string`` and therefore always raised
        ``AttributeError``; the value is now taken from the result object.
        """
        return self._score(order).score

    def result_string(self, order=4):
        """Return sacrebleu's formatted chrF result line."""
        return self._score(order).format()
class EvaluationTokenizer(object):
    """Evaluation-time tokenizer built on the tokenizers shipped with sacreBLEU
    (https://github.com/mjpost/sacrebleu).

    After sacreBLEU tokenization it can optionally strip punctuation, split
    the text into characters and lowercase it (applied in that order).

    Args:
        tokenizer_type (str): name of the sacreBLEU tokenizer to apply.
        lowercase (bool): lowercase the text.
        punctuation_removal (bool): drop tokens made up solely of characters
            whose Unicode category starts with "P".
        character_tokenization (bool): split the text into characters.
    """

    SPACE = " "
    # U+2581; stands in for a real space when splitting into characters.
    SPACE_ESCAPE = "\u2581"
    if SACREBLEU_V2_ABOVE:
        _ALL_TOKENIZER_TYPES = sb.BLEU.TOKENIZERS
    else:
        _ALL_TOKENIZER_TYPES = ["none", "13a", "intl", "zh", "ja-mecab"]
    ALL_TOKENIZER_TYPES = ChoiceEnum(_ALL_TOKENIZER_TYPES)

    def __init__(
        self,
        tokenizer_type: str = "13a",
        lowercase: bool = False,
        punctuation_removal: bool = False,
        character_tokenization: bool = False,
    ):
        assert (
            tokenizer_type in self._ALL_TOKENIZER_TYPES
        ), f"{tokenizer_type}, {self._ALL_TOKENIZER_TYPES}"
        self.character_tokenization = character_tokenization
        self.punctuation_removal = punctuation_removal
        self.lowercase = lowercase
        # The way to obtain a tokenizer callable differs between sacreBLEU
        # major versions.
        if SACREBLEU_V2_ABOVE:
            self.tokenizer = sb.BLEU(tokenize=str(tokenizer_type)).tokenizer
        else:
            self.tokenizer = sb.tokenizers.TOKENIZERS[tokenizer_type]()

    @classmethod
    def remove_punctuation(cls, sent: str):
        """Drop every space-separated token that consists only of punctuation."""

        def punctuation_only(token):
            return all(unicodedata.category(ch)[0] == "P" for ch in token)

        kept = [tok for tok in sent.split(cls.SPACE) if not punctuation_only(tok)]
        return cls.SPACE.join(kept)

    def tokenize(self, sent: str):
        """Tokenize *sent* and apply the configured post-processing steps."""
        text = self.tokenizer(sent)
        if self.punctuation_removal:
            text = self.remove_punctuation(text)
        if self.character_tokenization:
            # Escape existing spaces first so they survive as visible symbols
            # once every character is separated by a space.
            escaped = text.replace(self.SPACE, self.SPACE_ESCAPE)
            text = self.SPACE.join(escaped)
        return text.lower() if self.lowercase else text
@register_scorer("wer", dataclass=WerScorerConfig)
class WerScorer(BaseScorer):
    """Word error rate scorer.

    Accumulates the edit distance between tokenized references and
    predictions and reports ``100 * distance / reference length``.
    """

    def __init__(self, cfg):
        super().__init__(cfg)
        self.reset()
        try:
            import editdistance as ed
        except ImportError:
            raise ImportError("Please install editdistance to use WER scorer")
        self.ed = ed
        options = self.cfg
        self.tokenizer = EvaluationTokenizer(
            tokenizer_type=options.wer_tokenizer,
            lowercase=options.wer_lowercase,
            punctuation_removal=options.wer_remove_punct,
            character_tokenization=options.wer_char_level,
        )

    def reset(self):
        """Zero the accumulated edit distance and reference length."""
        self.distance = 0
        self.ref_length = 0

    def add_string(self, ref, pred):
        """Tokenize one pair and add its edit distance to the running totals."""
        ref_words = self.tokenizer.tokenize(ref).split()
        hyp_words = self.tokenizer.tokenize(pred).split()
        self.distance += self.ed.eval(ref_words, hyp_words)
        self.ref_length += len(ref_words)

    def result_string(self):
        """Return the WER formatted with two decimals."""
        value = self.score()
        return f"WER: {value:.2f}"

    def score(self):
        """Return the WER in percent, or 0 when no reference tokens were seen."""
        if self.ref_length > 0:
            return 100.0 * self.distance / self.ref_length
        return 0
class Search(nn.Module):
    """Base class for the search strategies defined in this module.

    Subclasses implement :meth:`step`; the constraint hooks are no-ops here
    and are overridden by strategies that support constrained decoding.
    """

    def __init__(self, tgt_dict):
        super(Search, self).__init__()
        # Special-symbol indices and vocabulary size come from the dictionary.
        self.pad = tgt_dict.pad()
        self.unk = tgt_dict.unk()
        self.eos = tgt_dict.eos()
        self.vocab_size = len(tgt_dict)
        # Capability flags; subclasses flip them as needed.
        self.stop_on_max_len = False
        self.supports_constraints = False
        # Placeholder until set_src_lengths() is called.
        self.src_lengths = torch.tensor(-1)

    def step(
        self, step, lprobs, scores, prev_output_tokens=None, original_batch_idxs=None
    ):
        """Take a single search step.

        Args:
            step: the current search step, starting at 0
            lprobs: (bsz x input_beam_size x vocab_size)
                the model's log-probabilities over the vocabulary at the
                current step
            scores: (bsz x input_beam_size x step)
                the historical model scores of each hypothesis up to this
                point
            prev_output_tokens: (bsz x step)
                the previously generated output tokens
            original_batch_idxs: (bsz)
                the batch indices, in the range [0, bsz); useful when the
                batch has been re-ordered and the original indices are needed

        Return: A tuple of (scores, indices, beams) where:
            scores: (bsz x output_beam_size)
                the scores of the chosen elements; output_beam_size can be
                larger than input_beam_size, e.g., we may return
                2*input_beam_size to account for EOS
            indices: (bsz x output_beam_size)
                the indices of the chosen elements
            beams: (bsz x output_beam_size)
                the hypothesis ids of the chosen elements, in the range
                [0, input_beam_size)

        Raises:
            NotImplementedError: always; subclasses must override this.
        """
        raise NotImplementedError

    @torch.jit.export
    def set_src_lengths(self, src_lengths):
        """Store the source lengths for use by length-aware strategies."""
        self.src_lengths = src_lengths

    @torch.jit.export
    def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
        """Initialize constraint states for constrained decoding (if supported).

        The base implementation does nothing.

        Args:
            batch_constraints: (torch.Tensor, optional)
                the list of constraints, in packed form
            beam_size: (int)
                the beam size
        """
        return None

    def prune_sentences(self, batch_idxs: Tensor):
        """Remove constraint states for completed sentences (if supported).

        This is called from sequence_generator._generate() when sentences are
        deleted from the batch. The base implementation does nothing.

        Args:
            batch_idxs: Indices of *sentences* whose constraint state should
                be *kept*.
        """
        return None

    def update_constraints(self, active_hypos: Tensor):
        """Update the constraint states to follow the retained beam items.

        This is called at each time step of sequence_generator._generate()
        when the set of 2 * {beam_size} candidate hypotheses is reduced to the
        beam size. The base implementation does nothing.

        Args:
            active_hypos: (batch size, beam size)
                list of integers denoting, for each sentence, which beam
                candidate items should be kept.
        """
        return None
@torch.jit.export
def apply_mask(self, x, prev_output_tokens, original_batch_idxs):
    """Build an additive mask that only admits the allowed next tokens.

    For every hypothesis row, ``self.prefix_allowed_tokens_fn`` is called with
    the original batch index of that row and its previously generated tokens;
    the positions it returns are set to 0 and all others stay at ``-inf``.

    Args:
        x: tensor indexed as ``[row, :, token]``; ``step`` passes log-probs
            viewed as (bsz * beam_size, 1, vocab_size). Only its shape, dtype
            and device are used.
        prev_output_tokens: one row of previously generated tokens per
            hypothesis row of ``x``.
        original_batch_idxs: one original batch index per sentence.

    Returns:
        A tensor shaped like ``x`` holding 0 for allowed tokens and ``-inf``
        everywhere else.
    """
    num_sentences = original_batch_idxs.shape[0]
    beam_size = x.shape[0] // num_sentences
    # Repeat each sentence's batch index once per beam entry so that it lines
    # up with the flattened hypothesis rows.
    row_batch_ids = (
        original_batch_idxs.unsqueeze(-1).repeat((1, beam_size)).flatten().tolist()
    )

    mask = torch.full_like(x, -math.inf)
    row = 0
    for prefix, batch_id in zip(prev_output_tokens, row_batch_ids):
        allowed = self.prefix_allowed_tokens_fn(batch_id, prefix)
        mask[row, :, allowed] = 0
        row += 1
    return mask
class LexicallyConstrainedBeamSearch(Search):
    """Implements lexically constrained beam search as described in

        Fast Lexically Constrained Decoding with Dynamic Beam
        Allocation for Neural Machine Translation.  Post & Vilar,
        NAACL 2018.  https://www.aclweb.org/anthology/N18-1119/

    and

        Improved Lexically Constrained Decoding for Translation and
        Monolingual Rewriting. Hu et al, NAACL
        2019. https://www.aclweb.org/anthology/N19-1090/

    This is accomplished by maintaining, for each beam hypothesis, a
    ConstraintState object (imported from
    fairseq.token_generation_constraints) that tracks which constraints
    have been generated and using this information to shape the beam
    for each input sentence.
    """

    def __init__(self, tgt_dict, representation):
        """
        Args:
            tgt_dict: target dictionary (forwarded to ``Search``).
            representation: "ordered" or "unordered"; selects the
                constraint state class used by ``init_constraints``.
        """
        super().__init__(tgt_dict)
        self.representation = representation
        self.vocab_size = len(tgt_dict)
        # Number of candidates returned per sentence; recomputed in step().
        self.num_cands = 0
        self.supports_constraints = True

    @torch.jit.export
    def init_constraints(self, batch_constraints: Optional[Tensor], beam_size: int):
        """Create the initial constraint state for every sentence in the batch.

        ``self.constraint_states[sent]`` becomes a list of ``beam_size``
        references to one freshly created state object.
        """
        # NOTE(review): batch_constraints is annotated Optional but is
        # iterated unconditionally -- presumably callers never pass None here.
        self.constraint_states = []
        for constraint_tensor in batch_constraints:
            # NOTE(review): any other representation value leaves
            # constraint_state unbound (or stale from the previous iteration).
            if self.representation == "ordered":
                constraint_state = OrderedConstraintState.create(constraint_tensor)
            elif self.representation == "unordered":
                constraint_state = UnorderedConstraintState.create(constraint_tensor)

            self.constraint_states.append([constraint_state for i in range(beam_size)])

    @torch.jit.export
    def prune_sentences(self, batch_idxs: Tensor):
        """Keep only the constraint states of the sentences in *batch_idxs*."""
        self.constraint_states = [
            self.constraint_states[i] for i in batch_idxs.tolist()
        ]

    @torch.jit.export
    def update_constraints(self, active_hypos: Tensor):
        """Re-select each sentence's states to follow the retained hypotheses.

        Args:
            active_hypos: for each sentence (first dimension), the candidate
                positions whose states should be kept, in order.
        """
        if self.constraint_states:
            batch_size = active_hypos.size(0)
            for sentid in range(batch_size):
                self.constraint_states[sentid] = [
                    self.constraint_states[sentid][i] for i in active_hypos[sentid]
                ]

    @torch.jit.export
    def step(
        self,
        step: int,
        lprobs: Tensor,
        scores: Optional[Tensor],
        prev_output_tokens: Optional[Tensor] = None,
        original_batch_idxs: Optional[Tensor] = None,
    ):
        """
        A constrained step builds a large candidates list from the following:
        - the top 2 * {beam_size} items over the whole beam
        - for each item in the beam
          - the top {each_k} (default 1)
          - all next constraints
        We then compute the constrained state of each beam item, and assign
        stripe codes: 0 to the best in each bank, 1 to the 2nd-best, and so
        on. We then sort by (stripe, score), and truncate the list at
        2 * beam size.

        Args:
            step: the decoder step
            lprobs: (batch size, beam size, target vocab)
                the target-vocab distributions for each item in the beam.
                Modified in place when EOS is blocked for unfinished
                hypotheses (STEP 0).
            scores: historical scores; required when step > 0.
            prev_output_tokens: unused here.
            original_batch_idxs: unused here.
        Returns: A tuple of (scores, indices, beams) where:
            scores: (batch, output beam size)
                the scores of the chosen elements
            indices: (batch, output beam size)
                the target vocab indices of the chosen elements
            beams: (batch, output beam size)
                the 0-indexed hypothesis ids of the chosen elements
        The new constraint states are not returned; they are written back to
        ``self.constraint_states``.
        """
        each_k = 1
        device = lprobs.device

        batch_size, beam_size, vocab_size = lprobs.size()

        self.num_cands = min(
            # Just take the k-best. We'll get another k from the 1-best from each
            # row, plus more from the constraints
            beam_size * 2,
            lprobs.view(batch_size, -1).size(1) - 1,  # -1 so we never select pad
        )

        # STEP 0: Preliminary. Prevent EOS for unfinished hyps across all batch items
        constraint_states = self.constraint_states
        if constraint_states and step > 0:
            # Flat (sentence * beam_size + beam) row indices of hypotheses
            # whose constraints are not yet all satisfied.
            not_finished_indices = []
            for sentno, sent_constraints in enumerate(constraint_states):
                for beamno, state in enumerate(sent_constraints):
                    index = sentno * beam_size + beamno
                    if not state.finished:
                        not_finished_indices.append(index)
            not_finished_indices = torch.tensor(not_finished_indices)
            if not_finished_indices.numel() > 0:
                # Writes through a view, so the caller's lprobs is changed.
                lprobs.view(batch_size * beam_size, -1)[
                    not_finished_indices, self.eos
                ] = -math.inf

        if step == 0:
            # at the first step all hypotheses are equally likely, so use
            # only the first beam entry for each batch item
            lprobs = lprobs[:, ::beam_size, :].contiguous()
        else:
            # make probs contain cumulative scores for each hypothesis
            assert scores is not None
            lprobs = lprobs + scores[:, :, step - 1].unsqueeze(-1)

        top_prediction = torch.topk(
            lprobs.view(batch_size, -1),
            self.num_cands,
        )
        scores_buf, indices_buf = top_prediction
        # Project back into relative indices and beams
        beams_buf = indices_buf // vocab_size
        indices_buf = indices_buf.fmod(vocab_size)

        # Short circuit if there are no constraints in this batch
        if not constraint_states:
            return scores_buf, indices_buf, beams_buf

        # STEP 1: get top-1 from each hypothesis across all sentences in the batch
        if step > 0:
            top_scores, top_indices = torch.topk(
                lprobs.view(batch_size * beam_size, -1),
                k=each_k,
                dim=1,
            )
            top_scores = top_scores.view(batch_size, -1)
            top_indices = top_indices.view(batch_size, -1)
            scores_buf = torch.cat((scores_buf, top_scores), dim=1)
            indices_buf = torch.cat((indices_buf, top_indices), dim=1)
            new_beams = torch.arange(0, beam_size, device=device).repeat(batch_size, 1)
            beams_buf = torch.cat((beams_buf, new_beams), dim=1)

        # Now, process sentences in the batch one by one.
        # NOTE(review): these buffers are 2 * beam_size wide while
        # step_sentence truncates to self.num_cands; the row assignments below
        # presumably rely on the two being equal -- confirm for tiny vocabularies.
        new_scores_buf = torch.zeros((batch_size, 2 * beam_size), device=device)
        new_indices_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long()
        new_beams_buf = torch.zeros((batch_size, 2 * beam_size), device=device).long()
        for sentno, states in enumerate(constraint_states):
            scores, indices, beams, new_states = self.step_sentence(
                step,
                sentno,
                lprobs[sentno],
                constraint_states[sentno],
                beams_buf[sentno].clone(),
                indices_buf[sentno].clone(),
                scores_buf[sentno].clone(),
            )
            new_scores_buf[sentno] = scores
            new_indices_buf[sentno] = indices
            new_beams_buf[sentno] = beams
            self.constraint_states[sentno] = new_states

        return new_scores_buf, new_indices_buf, new_beams_buf

    @torch.jit.export
    def step_sentence(
        self,
        step: int,
        sentno: int,
        lprobs: Tensor,
        constraint_states: List[List[ConstraintState]],
        beams_buf: Tensor,
        indices_buf: Tensor,
        scores_buf: Tensor,
    ):
        """Does per-sentence processing. Adds all constraints for each
        hypothesis to the list of candidates; then removes duplicates,
        sorts, and dynamically stripes across the banks. All tensor inputs
        are collapsed to those pertaining to a single input sentence.

        Returns:
            A tuple (scores, indices, beams, constraint_states). The three
            tensors are truncated to ``self.num_cands`` entries; the state
            list is returned in the same sorted order but is not truncated.
        """
        # NOTE(review): step() passes a single sentence's list of states, so
        # the List[List[...]] annotation above looks one level too deep.
        device = lprobs.device

        # STEP 2: Add all constraints for each beam item
        for beamno, state in enumerate(constraint_states):
            next_tokens = torch.tensor(list(state.next_tokens()), device=device).long()
            if next_tokens.numel() != 0:
                indices_buf = torch.cat((indices_buf, next_tokens))
                next_beams = (
                    torch.tensor(beamno, device=device)
                    .repeat(next_tokens.size(0))
                    .long()
                )
                beams_buf = torch.cat((beams_buf, next_beams))
                next_values = lprobs[beamno].take(next_tokens.view(-1))
                scores_buf = torch.cat((scores_buf, next_values))

            # At the 0th time step, there is just one beam item
            if step == 0:
                break

        # STEP 3: Compute the "bank" for each candidate. This is the
        # number of constraints it's generated. We need this so that
        # we can do round-robin allocation of the beam across these
        # banks. If C is the number of constraints, we select the best
        # item in bank C, then the best in bank C-1, etc, followed by
        # the 2nd-best in bank C, the 2nd-best in bank C-1, etc, and so
        # on, until the maximum beam size. We accomplish this by
        # creating a sort key and striping across the banks.

        # Compute the new states for all candidates
        cands_size = indices_buf.size(0)
        constraint_states = [
            constraint_states[beams_buf[i]].advance(indices_buf[i])
            for i in range(cands_size)
        ]

        banks = torch.tensor([state.bank for state in constraint_states], device=device)

        # STEP 4: Sort
        # `state` here is the loop variable left over from STEP 2 (the
        # comprehension above does not rebind it).
        num_constraint_tokens = len(state.tokens)

        # Sort by keys (bank, score) (i.e., sort banks together, and scores
        # within banks). AFAIK pytorch doesn't support either stable sort or
        # multi-key sorting, so we have to hack this.
        # Each constraint token still missing shifts the key by MAX_SCORE, so
        # with a descending sort fuller banks come first.
        MAX_SCORE = -100
        sort_key = (num_constraint_tokens - banks) * MAX_SCORE + scores_buf
        sort_values, sort_indices = sort_key.sort(dim=0, descending=True)
        scores_buf = scores_buf[sort_indices]
        indices_buf = indices_buf[sort_indices]
        beams_buf = beams_buf[sort_indices]
        banks = banks[sort_indices]

        # Sort the constraints to follow suit
        constraint_states = [constraint_states[i] for i in sort_indices]

        # STEP 5: Remove duplicates. The topk calls (overall and
        # per-row) plus the per-row generation of constraints will
        # produce duplicates. Here we remove them.

        def roll(t):
            """Rolls a 1d tensor right by 1 (the last element moves to the front).

            [0, 1, 2, 3, 4] becomes [4, 0, 1, 2, 3]
            """
            return torch.cat((t[-1].unsqueeze(0), t[0:-1]), dim=0)

        # We map candidates (beam, token_id) to a single dimension.
        # This is then shifted by 1. We can then easily identify
        # duplicates and create a mask that identifies unique
        # extensions.
        uniques_mask = beams_buf * (self.vocab_size + 1) + indices_buf
        uniques_mask = roll(uniques_mask) != uniques_mask

        # Use the mask to pare down the data structures
        scores_buf = torch.masked_select(scores_buf, uniques_mask)
        indices_buf = torch.masked_select(indices_buf, uniques_mask)
        beams_buf = torch.masked_select(beams_buf, uniques_mask)
        banks = torch.masked_select(banks, uniques_mask)
        # Drop the matching entries from the Python list of states as well.
        # NOTE(review): position 0 is compared against the last element by
        # roll() but is never popped here -- confirm the first and last
        # candidates can never coincide.
        i = 1
        for mask in uniques_mask[1:]:
            if not mask:
                constraint_states.pop(i)
            i += mask

        # STEP 6: Assign IDs round-robin across banks, sort, and
        # truncate. Now that the candidates are sorted by (bank,
        # score) and uniqed, we dynamically allocate the {beam_size}
        # beam by striping across the candidates. These stripes will
        # be used as sort keys to do round-robin selection. This is
        # accomplished in a single pass with offsets. Sorting by
        # highest-banks (furthest-along hypotheses) first ensures
        # progress through the constraints.
        #
        # e.g., BANKS: 3 3 3 2 2 2 2 1 1 1 0 0
        # OLD STRIPES: 0 1 2 0 1 2 3 0 1 2 0 1
        # NEW STRIPES: 0 1+4 2+8 0+1 1+5 2+9 3+11 0+2 1+6 2+10 0+3 1+7
        #            = 0 5 10 1 6 11 13 2 7 12 3 8
        #
        # Sorting by this then gives the following banks:
        #
        #             3 2 1 0 3 2 1 0 3 2 1 2
        #
        # We'll take the top {beam_size} of these.
        stripe_offsets = [offset * (len(banks) + 1) for offset in range(len(banks) + 1)]
        stripes = torch.zeros_like(banks)
        # Starts at -1 so that the first candidate (which always matches
        # cur_bank) is counted as position 0 within its bank.
        cur_bank_count = -1
        cur_bank = banks[0]
        for i, bank in enumerate(banks):
            if bank != cur_bank:
                cur_bank_count = 0
                cur_bank = bank
            else:
                cur_bank_count += 1
            stripes[i] = num_constraint_tokens - bank + stripe_offsets[cur_bank_count]

        # STEP 7: Sort by the stripes values
        sort_values, sort_indices = stripes.sort(dim=0)
        scores_buf = scores_buf[sort_indices]
        indices_buf = indices_buf[sort_indices]
        beams_buf = beams_buf[sort_indices]
        constraint_states = [constraint_states[i] for i in sort_indices]

        # STEP 8: Truncate to the candidates size!
        scores_buf = scores_buf[: self.num_cands]
        indices_buf = indices_buf[: self.num_cands]
        beams_buf = beams_buf[: self.num_cands]

        return scores_buf, indices_buf, beams_buf, constraint_states
def __init__(
    self,
    tgt_dict,
    num_groups,
    diversity_strength,
    diversity_discount=1.0,
    candidate_multiple=1,
):
    """
    Args:
        tgt_dict: target dictionary (also used for the inner beam search).
        num_groups: number of diversity groups; ``step`` requires the beam
            size to be divisible by it.
        diversity_strength: weight of the diversity penalty.
        diversity_discount: per-timestep discount applied to the
            accumulated group overlap.
        candidate_multiple: forwarded to ``BeamSearch.step`` for each group.
    """
    super().__init__(tgt_dict)
    # Plain beam search that step() runs once per group.
    self.beam = BeamSearch(tgt_dict)
    self.num_groups = num_groups
    self.candidate_multiple = candidate_multiple
    self.diversity_discount = diversity_discount
    # Stored negated: step() passes it as the ``alpha`` of torch.add, so the
    # penalty ends up being subtracted from the log-probabilities.
    self.diversity_strength = -diversity_strength

    # Float tensor tracking the overlap between groups. Every token that two
    # groups share at the same step counts as one, and the counts are
    # discounted by `diversity_discount` at each following timestep. Starts
    # empty; once initialized its dimension is
    # batch_size * num_groups * num_groups.
    self.group_overlap = torch.empty(0)
class Sampling(Search):
    """Search strategy that samples the next token instead of ranking candidates.

    Supports unrestricted sampling, top-k sampling (``sampling_topk > 0``) and
    top-p / nucleus sampling (``sampling_topp > 0``). When both are positive,
    top-p takes precedence (it is checked first in :meth:`step`).
    """

    sampling_topk: int
    sampling_topp: float

    def __init__(self, tgt_dict, sampling_topk=-1, sampling_topp=-1.0):
        """Store the sampling configuration.

        Args:
            tgt_dict: target dictionary, forwarded to the base class.
            sampling_topk: sample only from the k most likely tokens when > 0.
            sampling_topp: sample only from the smallest set of tokens whose
                cumulative probability exceeds p when > 0.
        """
        super().__init__(tgt_dict)
        self.sampling_topk = sampling_topk
        self.sampling_topp = sampling_topp

    def _sample_topp(self, lprobs):
        """Sample among the smallest set of elements whose cumulative probability mass exceeds p.

        See `"The Curious Case of Neural Text Degeneration"
        (Holtzman et al., 2019) <https://arxiv.org/abs/1904.09751>`_.

        Note:
            *lprobs* is exponentiated in place, so the caller's tensor holds
            probabilities (not log-probabilities) afterwards.

        Args:
            lprobs: (bsz x input_beam_size x vocab_size)
                the model's log-probabilities over the vocabulary at the current step

        Return: A tuple of (trimmed_probs, truncated_indices) where:
            trimmed_probs: (bsz x input_beam_size x ?)
                the model's probabilities over the elements selected to sample from. The
                width of the third dimension is determined by top-P.
            truncated_indices: (bsz x input_beam_size x ?)
                the indices of the chosen elements.
        """
        probs = lprobs.exp_()

        # sort the last dimension (vocab dimension) in descending order
        sorted_probs, sorted_indices = probs.sort(descending=True)

        # compute a mask to indicate the words to be included in the top-P set.
        cumsum_probs = sorted_probs.cumsum(dim=2)
        mask = cumsum_probs.lt(self.sampling_topp)

        # note that mask was computed by 'lt'. One more word needs to be included
        # so that the cumulative probability mass can exceed p.
        cumsum_mask = mask.cumsum(dim=2)
        last_included = cumsum_mask[:, :, -1:]
        last_included.clamp_(0, mask.size()[2] - 1)
        mask = mask.scatter_(2, last_included, 1)

        # truncate unnecessary dims.
        max_dim = last_included.max()
        truncated_mask = mask[:, :, : max_dim + 1]
        truncated_probs = sorted_probs[:, :, : max_dim + 1]
        truncated_indices = sorted_indices[:, :, : max_dim + 1]

        # trim the words that are not in top-P by setting their probabilities
        # to 0, so that they would not be sampled later.
        trim_mask = ~truncated_mask
        trimmed_probs = truncated_probs.masked_fill_(trim_mask, 0)
        return trimmed_probs, truncated_indices

    @torch.jit.export
    def step(
        self,
        step: int,
        lprobs,
        scores,
        prev_output_tokens: Optional[Tensor] = None,
        original_batch_idxs: Optional[Tensor] = None,
    ):
        """Sample one token per hypothesis.

        Args:
            step: current decoding step.
            lprobs: (bsz x beam_size x vocab_size) log-probabilities; modified
                in place (exponentiated) by this method.
            scores: cumulative scores of the previous steps; column
                ``step - 1`` is read when ``step > 0``.
            prev_output_tokens: unused here.
            original_batch_idxs: unused here.

        Returns:
            A tuple ``(scores_buf, indices_buf, beams_buf)``, each of shape
            (bsz x beam_size): the (cumulative, for ``step > 0``) log-score of
            each sample, the sampled token index, and the beam each sample
            extends (all zeros at step 0, ``0..beam_size-1`` afterwards).
        """
        bsz, beam_size, vocab_size = lprobs.size()

        if step == 0:
            # at the first step all hypotheses are equally likely, so use
            # only the first beam
            lprobs = lprobs[:, ::beam_size, :].contiguous()

        if self.sampling_topp > 0:
            # only sample from the smallest set of words whose cumulative probability mass exceeds p
            probs, top_indices = self._sample_topp(lprobs)
        elif self.sampling_topk > 0:
            # only sample from top-k candidates
            lprobs, top_indices = lprobs.topk(self.sampling_topk)
            probs = lprobs.exp_()
        else:
            probs = lprobs.exp_()

            # dummy data to be consistent with true branch for type check
            top_indices = torch.empty(0).to(probs)
        # sample
        if step == 0:
            indices_buf = torch.multinomial(
                probs.view(bsz, -1),
                beam_size,
                replacement=True,
            ).view(bsz, beam_size)
        else:
            indices_buf = torch.multinomial(
                probs.view(bsz * beam_size, -1),
                1,
                replacement=True,
            ).view(bsz, beam_size)

        if step == 0:
            # expand to beam size
            probs = probs.expand(bsz, beam_size, -1)

        # gather scores
        scores_buf = torch.gather(probs, dim=2, index=indices_buf.unsqueeze(-1))
        scores_buf = scores_buf.log_().view(bsz, -1)

        # remap indices if using top-k or top-P sampling
        if self.sampling_topk > 0 or self.sampling_topp > 0:
            indices_buf = torch.gather(
                top_indices.expand(bsz, beam_size, -1),
                dim=2,
                index=indices_buf.unsqueeze(-1),
            ).squeeze(2)

        if step == 0:
            beams_buf = indices_buf.new_zeros(bsz, beam_size)
        else:
            beams_buf = torch.arange(0, beam_size).to(indices_buf).repeat(bsz, 1)
            # make scores cumulative
            scores_buf.add_(
                torch.gather(scores[:, :, step - 1], dim=1, index=beams_buf)
            )

        return scores_buf, indices_buf, beams_buf


class DiverseSiblingsSearch(Search):
    """
    Beam search with diverse siblings.

    See "A Simple, Fast Diverse Decoding Algorithm for Neural Generation" for details.
    https://arxiv.org/abs/1611.08562

    1/ Calculate hypotheses for each beam
    2/ Intra-sibling ordering
    3/ Rewrite scores
    4/ Choose top K hypotheses

    if diversity_rate == 0 is equivalent to BeamSearch
    """

    def __init__(self, tgt_dict, diversity_rate):
        """Store the diversity rate and create the beam search used at step 0.

        Args:
            tgt_dict: target dictionary, forwarded to the base class and to
                the inner ``BeamSearch``.
            diversity_rate: penalty subtracted per sibling rank (the r-th best
                continuation of a beam is penalised by ``r * diversity_rate``).
        """
        super().__init__(tgt_dict)
        self.diversity_rate = diversity_rate
        self.beam = BeamSearch(tgt_dict)

    def step(
        self,
        step: int,
        lprobs,
        scores,
        prev_output_tokens: Optional[Tensor] = None,
        original_batch_idxs: Optional[Tensor] = None,
    ):
        """Select the next candidates with a rank penalty among siblings.

        Args:
            step: current decoding step. Step 0 is delegated unchanged to the
                inner ``BeamSearch``.
            lprobs: (bsz x beam_size x vocab_size) log-probabilities. For
                ``step > 0`` the previous cumulative score is added in place.
            scores: cumulative scores of the previous steps; column
                ``step - 1`` is read when ``step > 0``.
            prev_output_tokens: unused here.
            original_batch_idxs: unused here.

        Returns:
            A tuple ``(final_scores, final_indices, final_beams)``, each of
            shape (bsz x k): the penalised scores, the token index of each
            candidate and the beam it extends.
        """
        bsz, beam_size, vocab_size = lprobs.size()

        # The first step has no siblings to diversify; nothing else is needed.
        if step == 0:
            return self.beam.step(step, lprobs, scores)

        k = min(
            # Take the best 2 x beam_size predictions. We'll choose the first
            # beam_size of these which don't predict eos to continue with.
            beam_size * 2,
            beam_size * vocab_size - 1,  # -1 so we never select pad
        )

        # Make the scores cumulative (in place, as callers hand over a view).
        lprobs.add_(scores[:, :, step - 1].unsqueeze(-1))

        # 1/ Calculate hypotheses for each beam
        # 2/ Intra-sibling ordering comes for free: topk returns them sorted.
        # One batched topk over the vocab axis yields (bsz x beam_size x k);
        # the indices are already plain vocabulary indices.
        sibling_scores, sibling_tokens = torch.topk(lprobs, k, dim=-1)

        # 3/ Rewrite scores: rank r (1-based) is penalised by r * diversity_rate
        sibling_penalty = torch.arange(1, k + 1).to(lprobs) * self.diversity_rate
        sibling_scores = sibling_scores - sibling_penalty

        # 4/ Choose top K hypotheses among all beam_size * k siblings
        final_scores, flat_choice = torch.topk(sibling_scores.reshape(bsz, -1), k)

        # Position p in the flattened (beam_size * k) axis belongs to beam p // k.
        final_beams = torch.div(flat_choice, k, rounding_mode="trunc")
        # Map flattened positions back to vocabulary indices for all sentences at once.
        final_indices = torch.gather(sibling_tokens.reshape(bsz, -1), 1, flat_choice)

        return final_scores, final_indices, final_beams
class SequenceGenerator(nn.Module):
    def __init__(
        self,
        models,
        tgt_dict,
        beam_size=1,
        max_len_a=0,
        max_len_b=200,
        max_len=0,
        min_len=1,
        normalize_scores=True,
        len_penalty=1.0,
        unk_penalty=0.0,
        temperature=1.0,
        match_source_len=False,
        no_repeat_ngram_size=0,
        search_strategy=None,
        eos=None,
        symbols_to_strip_from_output=None,
        lm_model=None,
        lm_weight=1.0,
        tokens_to_suppress=(),
    ):
        """Generates translations of a given source sentence.

        Args:
            models (List[~fairseq.models.FairseqModel]): ensemble of models,
                currently support fairseq.models.TransformerModel for scripting.
                An already-built ``EnsembleModel`` is used as is.
            tgt_dict: target dictionary; provides pad/unk/eos indices and the
                vocabulary size
            beam_size (int, optional): beam width (default: 1); clamped to
                ``len(tgt_dict) - 1`` because pad is never selected
            max_len_a/b (int, optional): generate sequences of maximum length
                ax + b, where x is the source length
            max_len (int, optional): the maximum length of the generated output
                (not including end-of-sentence); when 0, the ensemble's
                ``max_decoder_positions()`` is used
            min_len (int, optional): the minimum length of the generated output
                (not including end-of-sentence)
            normalize_scores (bool, optional): normalize scores by the length
                of the output (default: True)
            len_penalty (float, optional): length penalty, where <1.0 favors
                shorter, >1.0 favors longer sentences (default: 1.0)
            unk_penalty (float, optional): unknown word penalty, where <0
                produces more unks, >0 produces fewer (default: 0.0)
            temperature (float, optional): temperature, where values
                >1.0 produce more uniform samples and values <1.0 produce
                sharper samples (default: 1.0)
            match_source_len (bool, optional): outputs should match the source
                length (default: False)
            no_repeat_ngram_size (int, optional): when > 0, an
                ``NGramRepeatBlock`` of that size is applied (default: 0)
            search_strategy (optional): search object to use; a plain
                ``search.BeamSearch`` is created when None
            eos (int, optional): end-of-sentence index overriding
                ``tgt_dict.eos()``
            symbols_to_strip_from_output (set, optional): extra symbol indices
                stored together with eos in ``symbols_to_strip_from_output``
            lm_model (optional): language model put in eval mode and stored as
                ``self.lm_model``
            lm_weight (float, optional): stored as ``self.lm_weight``
                (default: 1.0)
            tokens_to_suppress (iterable of str, optional): token strings
                looked up in *tgt_dict*; their indices are stored in
                ``self.token_indices_to_suppress``. Each must be in the
                dictionary (i.e. must not map to unk).
        """
        super().__init__()
        if isinstance(models, EnsembleModel):
            self.model = models
        else:
            self.model = EnsembleModel(models)
        self.tgt_dict = tgt_dict
        self.pad = tgt_dict.pad()
        self.unk = tgt_dict.unk()
        self.eos = tgt_dict.eos() if eos is None else eos
        self.symbols_to_strip_from_output = (
            symbols_to_strip_from_output.union({self.eos})
            if symbols_to_strip_from_output is not None
            else {self.eos}
        )

        self.token_indices_to_suppress: Optional[Tensor] = None
        token_indices_to_suppress = []
        for token_string in tokens_to_suppress:
            token_index = tgt_dict.index(token_string)
            assert (
                token_index != self.unk
            ), f"token to suppress is not in the dictionary: {token_string!r}"
            token_indices_to_suppress.append(token_index)
        if token_indices_to_suppress:
            # Build the index tensor as int64 directly; going through a float
            # tensor first would round large indices.
            self.token_indices_to_suppress = torch.tensor(
                token_indices_to_suppress, dtype=torch.long
            )

        self.vocab_size = len(tgt_dict)
        # the max beam size is the dictionary size - 1, since we never select pad
        self.beam_size = min(beam_size, self.vocab_size - 1)
        self.model.set_decoder_beam_size(self.beam_size)
        self.max_len_a = max_len_a
        self.max_len_b = max_len_b
        self.min_len = min_len
        self.max_len = max_len or self.model.max_decoder_positions()

        self.normalize_scores = normalize_scores
        self.len_penalty = len_penalty
        self.unk_penalty = unk_penalty
        self.temperature = temperature
        self.match_source_len = match_source_len

        if no_repeat_ngram_size > 0:
            self.repeat_ngram_blocker = NGramRepeatBlock(no_repeat_ngram_size)
        else:
            self.repeat_ngram_blocker = None

        assert temperature > 0, "--temperature must be greater than 0"

        self.search = (
            search.BeamSearch(tgt_dict) if search_strategy is None else search_strategy
        )
        # We only need to set src_lengths in LengthConstrainedBeamSearch.
        # As a module attribute, setting it would break in multithread
        # settings when the model is shared.
        self.should_set_src_lengths = (
            hasattr(self.search, "needs_src_lengths") and self.search.needs_src_lengths
        )

        self.model.eval()

        self.lm_model = lm_model
        self.lm_weight = lm_weight
        if self.lm_model is not None:
            self.lm_model.eval()

    def cuda(self):
        """Move the wrapped ensemble to CUDA and return ``self``."""
        self.model.cuda()
        return self

    @torch.no_grad()
    def forward(
        self,
        sample: Dict[str, Dict[str, Tensor]],
        prefix_tokens: Optional[Tensor] = None,
        bos_token: Optional[int] = None,
    ):
        """Generate a batch of translations.

        Args:
            sample (dict): batch
            prefix_tokens (torch.LongTensor, optional): force decoder to begin
                with these tokens
            bos_token (int, optional): beginning of sentence token
                (default: self.eos)
        """
        return self._generate(sample, prefix_tokens, bos_token=bos_token)

    # TODO(myleott): unused, deprecate after pytorch-translate migration
    def generate_batched_itr(self, data_itr, beam_size=None, cuda=False, timer=None):
        """Iterate over a batched dataset and yield individual translations.

        Batches without a ``"net_input"`` entry are skipped.

        Args:
            data_itr: iterable of batches (dicts with ``"net_input"``, ``"id"``
                and optionally ``"target"``)
            beam_size: unused; kept for backward compatibility
            cuda (bool, optional): use GPU for generation
            timer (StopwatchMeter, optional): time generations

        Yields:
            tuples ``(id, src, ref, hypos)`` per sentence, where *src* and
            *ref* have padding stripped and *ref* is None when the batch has
            no target.
        """
        for sample in data_itr:
            s = utils.move_to_cuda(sample) if cuda else sample
            if "net_input" not in s:
                continue
            net_input = s["net_input"]
            # model.forward normally channels prev_output_tokens into the decoder
            # separately, but SequenceGenerator directly calls model.encoder
            encoder_input = {
                k: v for k, v in net_input.items() if k != "prev_output_tokens"
            }
            if timer is not None:
                timer.start()
            with torch.no_grad():
                # `generate` takes (models, sample); the sample must carry the
                # encoder input under "net_input".
                hypos = self.generate(self.model.models, {"net_input": encoder_input})
            if timer is not None:
                timer.stop(sum(len(h[0]["tokens"]) for h in hypos))
            target = s.get("target")
            for i, sample_id in enumerate(s["id"].data):
                # remove padding
                src = utils.strip_pad(net_input["src_tokens"].data[i, :], self.pad)
                ref = (
                    utils.strip_pad(target.data[i, :], self.pad)
                    if target is not None
                    else None
                )
                yield sample_id, src, ref, hypos[i]

    @torch.no_grad()
    def generate(
        self, models, sample: Dict[str, Dict[str, Tensor]], **kwargs
    ) -> List[List[Dict[str, Tensor]]]:
        """Generate translations. Match the api of other fairseq generators.

        Args:
            models (List[~fairseq.models.FairseqModel]): ensemble of models;
                not used here, the ensemble given at construction time is used
            sample (dict): batch
            prefix_tokens (torch.LongTensor, optional): force decoder to begin
                with these tokens
            constraints (torch.LongTensor, optional): force decoder to include
                the list of constraints
            bos_token (int, optional): beginning of sentence token
                (default: self.eos)
        """
        return self._generate(sample, **kwargs)
audio features) bsz, src_len = src_tokens.size()[:2] beam_size = self.beam_size if constraints is not None and not self.search.supports_constraints: raise NotImplementedError( "Target-side constraints were provided, but search method doesn't support them" ) # Initialize constraints, when active self.search.init_constraints(constraints, beam_size) max_len: int = -1 if self.match_source_len: max_len = src_lengths.max().item() else: max_len = min( int(self.max_len_a * src_len + self.max_len_b), self.max_len - 1, ) assert ( self.min_len <= max_len ), "min_len cannot be larger than max_len, please adjust these!" # compute the encoder output for each beam with torch.autograd.profiler.record_function("EnsembleModel: forward_encoder"): encoder_outs = self.model.forward_encoder(net_input) # placeholder of indices for bsz * beam_size to hold tokens and accumulative scores new_order = torch.arange(bsz).view(-1, 1).repeat(1, beam_size).view(-1) new_order = new_order.to(src_tokens.device).long() encoder_outs = self.model.reorder_encoder_out(encoder_outs, new_order) # ensure encoder_outs is a List. assert encoder_outs is not None # initialize buffers scores = ( torch.zeros(bsz * beam_size, max_len + 1).to(src_tokens).float() ) # +1 for eos; pad is never chosen for scoring tokens = ( torch.zeros(bsz * beam_size, max_len + 2) .to(src_tokens) .long() .fill_(self.pad) ) # +2 for eos and pad tokens[:, 0] = self.eos if bos_token is None else bos_token attn: Optional[Tensor] = None # A list that indicates candidates that should be ignored. # For example, suppose we're sampling and have already finalized 2/5 # samples. Then cands_to_ignore would mark 2 positions as being ignored, # so that we only finalize the remaining 3 samples. 
cands_to_ignore = ( torch.zeros(bsz, beam_size).to(src_tokens).eq(-1) ) # forward and backward-compatible False mask # list of completed sentences finalized = torch.jit.annotate( List[List[Dict[str, Tensor]]], [torch.jit.annotate(List[Dict[str, Tensor]], []) for i in range(bsz)], ) # contains lists of dictionaries of infomation about the hypothesis being finalized at each step # a boolean array indicating if the sentence at the index is finished or not finished = [False for i in range(bsz)] num_remaining_sent = bsz # number of sentences remaining # number of candidate hypos per step cand_size = 2 * beam_size # 2 x beam size in case half are EOS # offset arrays for converting between different indexing schemes bbsz_offsets = ( (torch.arange(0, bsz) * beam_size) .unsqueeze(1) .type_as(tokens) .to(src_tokens.device) ) cand_offsets = torch.arange(0, cand_size).type_as(tokens).to(src_tokens.device) reorder_state: Optional[Tensor] = None batch_idxs: Optional[Tensor] = None original_batch_idxs: Optional[Tensor] = None if "id" in sample and isinstance(sample["id"], Tensor): original_batch_idxs = sample["id"] else: original_batch_idxs = torch.arange(0, bsz).type_as(tokens) for step in range(max_len + 1): # one extra step for EOS marker # reorder decoder internal states based on the prev choice of beams if reorder_state is not None: if batch_idxs is not None: # update beam indices to take into account removed sentences corr = batch_idxs - torch.arange(batch_idxs.numel()).type_as( batch_idxs ) reorder_state.view(-1, beam_size).add_( corr.unsqueeze(-1) * beam_size ) original_batch_idxs = original_batch_idxs[batch_idxs] self.model.reorder_incremental_state(incremental_states, reorder_state) encoder_outs = self.model.reorder_encoder_out( encoder_outs, reorder_state ) with torch.autograd.profiler.record_function( "EnsembleModel: forward_decoder" ): lprobs, avg_attn_scores = self.model.forward_decoder( tokens[:, : step + 1], encoder_outs, incremental_states, self.temperature, ) if 
self.lm_model is not None: lm_out = self.lm_model(tokens[:, : step + 1]) probs = self.lm_model.get_normalized_probs( lm_out, log_probs=True, sample=None ) probs = probs[:, -1, :] * self.lm_weight lprobs += probs lprobs[lprobs != lprobs] = torch.tensor(-math.inf).to(lprobs) lprobs[:, self.pad] = -math.inf # never select pad lprobs[:, self.unk] -= self.unk_penalty # apply unk penalty # handle max length constraint if step >= max_len: lprobs[:, : self.eos] = -math.inf lprobs[:, self.eos + 1 :] = -math.inf # handle prefix tokens (possibly with different lengths) if ( prefix_tokens is not None and step < prefix_tokens.size(1) and step < max_len ): lprobs, tokens, scores = self._prefix_tokens( step, lprobs, scores, tokens, prefix_tokens, beam_size ) else: if step < self.min_len: # minimum length constraint (does not apply if using prefix_tokens) lprobs[:, self.eos] = -math.inf if self.token_indices_to_suppress is not None: lprobs[:, self.token_indices_to_suppress] = -math.inf # Record attention scores, only support avg_attn_scores is a Tensor if avg_attn_scores is not None: if attn is None: attn = torch.empty( bsz * beam_size, avg_attn_scores.size(1), max_len + 2 ).to(scores) attn[:, :, step + 1].copy_(avg_attn_scores) scores = scores.type_as(lprobs) eos_bbsz_idx = torch.empty(0).to( tokens ) # indices of hypothesis ending with eos (finished sentences) eos_scores = torch.empty(0).to( scores ) # scores of hypothesis ending with eos (finished sentences) if self.should_set_src_lengths: self.search.set_src_lengths(src_lengths) if self.repeat_ngram_blocker is not None: lprobs = self.repeat_ngram_blocker(tokens, lprobs, bsz, beam_size, step) # Shape: (batch, cand_size) cand_scores, cand_indices, cand_beams = self.search.step( step, lprobs.view(bsz, -1, self.vocab_size), scores.view(bsz, beam_size, -1)[:, :, :step], tokens[:, : step + 1], original_batch_idxs, ) # cand_bbsz_idx contains beam indices for the top candidate # hypotheses, with a range of values: [0, bsz*beam_size), 
# and dimensions: [bsz, cand_size] cand_bbsz_idx = cand_beams.add(bbsz_offsets) # finalize hypotheses that end in eos # Shape of eos_mask: (batch size, beam size) eos_mask = cand_indices.eq(self.eos) & cand_scores.ne(-math.inf) eos_mask[:, :beam_size][cands_to_ignore] = torch.tensor(0).to(eos_mask) # only consider eos when it's among the top beam_size indices # Now we know what beam item(s) to finish # Shape: 1d list of absolute-numbered eos_bbsz_idx = torch.masked_select( cand_bbsz_idx[:, :beam_size], mask=eos_mask[:, :beam_size] ) finalized_sents: List[int] = [] if eos_bbsz_idx.numel() > 0: eos_scores = torch.masked_select( cand_scores[:, :beam_size], mask=eos_mask[:, :beam_size] ) finalized_sents = self.finalize_hypos( step, eos_bbsz_idx, eos_scores, tokens, scores, finalized, finished, beam_size, attn, src_lengths, max_len, ) num_remaining_sent -= len(finalized_sents) assert num_remaining_sent >= 0 if num_remaining_sent == 0: break if self.search.stop_on_max_len and step >= max_len: break assert step < max_len, f"{step} < {max_len}" # Remove finalized sentences (ones for which {beam_size} # finished hypotheses have been generated) from the batch. 
if len(finalized_sents) > 0: new_bsz = bsz - len(finalized_sents) # construct batch_idxs which holds indices of batches to keep for the next pass batch_mask = torch.ones( bsz, dtype=torch.bool, device=cand_indices.device ) batch_mask[finalized_sents] = False # TODO replace `nonzero(as_tuple=False)` after TorchScript supports it batch_idxs = torch.arange( bsz, device=cand_indices.device ).masked_select(batch_mask) # Choose the subset of the hypothesized constraints that will continue self.search.prune_sentences(batch_idxs) eos_mask = eos_mask[batch_idxs] cand_beams = cand_beams[batch_idxs] bbsz_offsets.resize_(new_bsz, 1) cand_bbsz_idx = cand_beams.add(bbsz_offsets) cand_scores = cand_scores[batch_idxs] cand_indices = cand_indices[batch_idxs] if prefix_tokens is not None: prefix_tokens = prefix_tokens[batch_idxs] src_lengths = src_lengths[batch_idxs] cands_to_ignore = cands_to_ignore[batch_idxs] scores = scores.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) tokens = tokens.view(bsz, -1)[batch_idxs].view(new_bsz * beam_size, -1) if attn is not None: attn = attn.view(bsz, -1)[batch_idxs].view( new_bsz * beam_size, attn.size(1), -1 ) bsz = new_bsz else: batch_idxs = None # Set active_mask so that values > cand_size indicate eos hypos # and values < cand_size indicate candidate active hypos. # After, the min values per row are the top candidate active hypos # Rewrite the operator since the element wise or is not supported in torchscript. eos_mask[:, :beam_size] = ~((~cands_to_ignore) & (~eos_mask[:, :beam_size])) active_mask = torch.add( eos_mask.type_as(cand_offsets) * cand_size, cand_offsets[: eos_mask.size(1)], ) # get the top beam_size active hypotheses, which are just # the hypos with the smallest values in active_mask. # {active_hypos} indicates which {beam_size} hypotheses # from the list of {2 * beam_size} candidates were # selected. 
Shapes: (batch size, beam size) new_cands_to_ignore, active_hypos = torch.topk( active_mask, k=beam_size, dim=1, largest=False ) # update cands_to_ignore to ignore any finalized hypos. cands_to_ignore = new_cands_to_ignore.ge(cand_size)[:, :beam_size] # Make sure there is at least one active item for each sentence in the batch. assert (~cands_to_ignore).any(dim=1).all() # update cands_to_ignore to ignore any finalized hypos # {active_bbsz_idx} denotes which beam number is continued for each new hypothesis (a beam # can be selected more than once). active_bbsz_idx = torch.gather(cand_bbsz_idx, dim=1, index=active_hypos) active_scores = torch.gather(cand_scores, dim=1, index=active_hypos) active_bbsz_idx = active_bbsz_idx.view(-1) active_scores = active_scores.view(-1) # copy tokens and scores for active hypotheses # Set the tokens for each beam (can select the same row more than once) tokens[:, : step + 1] = torch.index_select( tokens[:, : step + 1], dim=0, index=active_bbsz_idx ) # Select the next token for each of them tokens.view(bsz, beam_size, -1)[:, :, step + 1] = torch.gather( cand_indices, dim=1, index=active_hypos ) if step > 0: scores[:, :step] = torch.index_select( scores[:, :step], dim=0, index=active_bbsz_idx ) scores.view(bsz, beam_size, -1)[:, :, step] = torch.gather( cand_scores, dim=1, index=active_hypos ) # Update constraints based on which candidates were selected for the next beam self.search.update_constraints(active_hypos) # copy attention for active hypotheses if attn is not None: attn[:, :, : step + 2] = torch.index_select( attn[:, :, : step + 2], dim=0, index=active_bbsz_idx ) # reorder incremental state in decoder reorder_state = active_bbsz_idx # sort by score descending for sent in range(len(finalized)): scores = torch.tensor( [float(elem["score"].item()) for elem in finalized[sent]] ) _, sorted_scores_indices = torch.sort(scores, descending=True) finalized[sent] = [finalized[sent][ssi] for ssi in sorted_scores_indices] finalized[sent] = 
torch.jit.annotate( List[Dict[str, Tensor]], finalized[sent] ) return finalized def _prefix_tokens( self, step: int, lprobs, scores, tokens, prefix_tokens, beam_size: int ): """Handle prefix tokens""" prefix_toks = prefix_tokens[:, step].unsqueeze(-1).repeat(1, beam_size).view(-1) prefix_lprobs = lprobs.gather(-1, prefix_toks.unsqueeze(-1)) prefix_mask = prefix_toks.ne(self.pad) lprobs[prefix_mask] = torch.tensor(-math.inf).to(lprobs) lprobs[prefix_mask] = lprobs[prefix_mask].scatter( -1, prefix_toks[prefix_mask].unsqueeze(-1), prefix_lprobs[prefix_mask] ) # if prefix includes eos, then we should make sure tokens and # scores are the same across all beams eos_mask = prefix_toks.eq(self.eos) if eos_mask.any(): # validate that the first beam matches the prefix first_beam = tokens[eos_mask].view(-1, beam_size, tokens.size(-1))[ :, 0, 1 : step + 1 ] eos_mask_batch_dim = eos_mask.view(-1, beam_size)[:, 0] target_prefix = prefix_tokens[eos_mask_batch_dim][:, :step] assert (first_beam == target_prefix).all() # copy tokens, scores and lprobs from the first beam to all beams tokens = self.replicate_first_beam(tokens, eos_mask_batch_dim, beam_size) scores = self.replicate_first_beam(scores, eos_mask_batch_dim, beam_size) lprobs = self.replicate_first_beam(lprobs, eos_mask_batch_dim, beam_size) return lprobs, tokens, scores def replicate_first_beam(self, tensor, mask, beam_size: int): tensor = tensor.view(-1, beam_size, tensor.size(-1)) tensor[mask] = tensor[mask][:, :1, :] return tensor.view(-1, tensor.size(-1)) def finalize_hypos( self, step: int, bbsz_idx, eos_scores, tokens, scores, finalized: List[List[Dict[str, Tensor]]], finished: List[bool], beam_size: int, attn: Optional[Tensor], src_lengths, max_len: int, ): """Finalize hypothesis, store finalized information in `finalized`, and change `finished` accordingly. A sentence is finalized when {beam_size} finished items have been collected for it. Returns number of sentences (not beam items) being finalized. 
These will be removed from the batch and not processed further. Args: bbsz_idx (Tensor): """ assert bbsz_idx.numel() == eos_scores.numel() # clone relevant token and attention tensors. # tokens is (batch * beam, max_len). So the index_select # gets the newly EOS rows, then selects cols 1..{step + 2} tokens_clone = tokens.index_select(0, bbsz_idx)[ :, 1 : step + 2 ] # skip the first index, which is EOS tokens_clone[:, step] = self.eos attn_clone = ( attn.index_select(0, bbsz_idx)[:, :, 1 : step + 2] if attn is not None else None ) # compute scores per token position pos_scores = scores.index_select(0, bbsz_idx)[:, : step + 1] pos_scores[:, step] = eos_scores # convert from cumulative to per-position scores pos_scores[:, 1:] = pos_scores[:, 1:] - pos_scores[:, :-1] # normalize sentence-level scores if self.normalize_scores: eos_scores /= (step + 1) ** self.len_penalty # cum_unfin records which sentences in the batch are finished. # It helps match indexing between (a) the original sentences # in the batch and (b) the current, possibly-reduced set of # sentences. 
cum_unfin: List[int] = [] prev = 0 for f in finished: if f: prev += 1 else: cum_unfin.append(prev) cum_fin_tensor = torch.tensor(cum_unfin, dtype=torch.int).to(bbsz_idx) unfin_idx = torch.div(bbsz_idx, beam_size, rounding_mode="trunc") sent = unfin_idx + torch.index_select(cum_fin_tensor, 0, unfin_idx) # Create a set of "{sent}{unfin_idx}", where # "unfin_idx" is the index in the current (possibly reduced) # list of sentences, and "sent" is the index in the original, # unreduced batch # For every finished beam item # sentence index in the current (possibly reduced) batch seen = (sent << 32) + unfin_idx unique_seen: List[int] = torch.unique(seen).tolist() if self.match_source_len: condition = step > torch.index_select(src_lengths, 0, unfin_idx) eos_scores = torch.where(condition, torch.tensor(-math.inf), eos_scores) sent_list: List[int] = sent.tolist() for i in range(bbsz_idx.size()[0]): # An input sentence (among those in a batch) is finished when # beam_size hypotheses have been collected for it if len(finalized[sent_list[i]]) < beam_size: if attn_clone is not None: # remove padding tokens from attn scores hypo_attn = attn_clone[i] else: hypo_attn = torch.empty(0) finalized[sent_list[i]].append( { "tokens": tokens_clone[i], "score": eos_scores[i], "attention": hypo_attn, # src_len x tgt_len "alignment": torch.empty(0), "positional_scores": pos_scores[i], } ) newly_finished: List[int] = [] for unique_s in unique_seen: # check termination conditions for this sentence unique_sent: int = unique_s >> 32 unique_unfin_idx: int = unique_s - (unique_sent << 32) if not finished[unique_sent] and self.is_finished( step, unique_unfin_idx, max_len, len(finalized[unique_sent]), beam_size ): finished[unique_sent] = True newly_finished.append(unique_unfin_idx) return newly_finished def is_finished( self, step: int, unfin_idx: int, max_len: int, finalized_sent_len: int, beam_size: int, ): """ Check whether decoding for a sentence is finished, which occurs when the list of finalized 
class EnsembleModel(nn.Module):
    """A wrapper around an ensemble of models."""

    def __init__(self, models):
        """Wrap *models* in a ``ModuleList`` and record ensemble-wide properties.

        Args:
            models: sequence of models forming the ensemble; the first one is
                also kept as ``single_model`` and is the one inspected by
                :meth:`has_encoder`.
        """
        super().__init__()
        self.models_size = len(models)
        # method '__len__' is not supported in ModuleList for torch script
        self.single_model = models[0]
        self.models = nn.ModuleList(models)

        # Incremental decoding is only used when every model has a decoder
        # that is a FairseqIncrementalDecoder.
        self.has_incremental: bool = False
        if all(
            hasattr(m, "decoder") and isinstance(m.decoder, FairseqIncrementalDecoder)
            for m in models
        ):
            self.has_incremental = True

    def forward(self):
        """No-op; the ensemble is driven through the explicit methods below."""
        pass

    def has_encoder(self):
        """Return whether the first model of the ensemble has an ``encoder`` attribute."""
        return hasattr(self.single_model, "encoder")

    def has_incremental_states(self):
        """Return whether all models support incremental decoding (see ``__init__``)."""
        return self.has_incremental

    def max_decoder_positions(self):
        """Return the smallest ``max_decoder_positions()`` over the ensemble.

        Models without that method are ignored; ``sys.maxsize`` is the result
        when no model provides it.
        """
        return min(
            [
                m.max_decoder_positions()
                for m in self.models
                if hasattr(m, "max_decoder_positions")
            ]
            + [sys.maxsize]
        )

    def set_decoder_beam_size(self, beam_size):
        """Set beam size for efficient beamable enc-dec attention."""
        # Only forwarded for beam_size > 1 and to models exposing `set_beam_size`.
        if beam_size > 1:
            for model in self.models:
                if hasattr(model, "set_beam_size"):
                    model.set_beam_size(beam_size)

    @torch.jit.export
    def forward_encoder(self, net_input: Dict[str, Tensor]):
        """Run every model's encoder on *net_input*.

        Returns:
            A list with one encoder output per model, or None when the
            ensemble has no encoder.
        """
        if not self.has_encoder():
            return None
        return [model.encoder.forward_torchscript(net_input) for model in self.models]

    @torch.jit.export
    def forward_decoder(
        self,
        tokens,
        encoder_outs: List[Dict[str, List[Tensor]]],
        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
        temperature: float = 1.0,
    ):
        """Decode one step with every model and average the results.

        Args:
            tokens: tokens generated so far, passed to each decoder.
            encoder_outs: one encoder output per model (indexed only when the
                ensemble has an encoder).
            incremental_states: one incremental state per model (used only
                when incremental decoding is supported).
            temperature: the last-position decoder output is divided by this
                value (in place) before normalization.

        Returns:
            A tuple ``(log_probs, attn)``: the log-probabilities for the last
            position, averaged in probability space over the ensemble, and the
            averaged last-position attention (None when no model returned
            attention). With a single model its outputs are returned directly.
        """
        log_probs = []
        avg_attn: Optional[Tensor] = None
        encoder_out: Optional[Dict[str, List[Tensor]]] = None
        for i, model in enumerate(self.models):
            if self.has_encoder():
                encoder_out = encoder_outs[i]
            # decode each model
            if self.has_incremental_states():
                decoder_out = model.decoder.forward(
                    tokens,
                    encoder_out=encoder_out,
                    incremental_state=incremental_states[i],
                )
            else:
                if hasattr(model, "decoder"):
                    decoder_out = model.decoder.forward(tokens, encoder_out=encoder_out)
                else:
                    decoder_out = model.forward(tokens)

            # The second decoder output, when present, is either the attention
            # tensor itself or a dict holding it (tensor or list) under "attn".
            attn: Optional[Tensor] = None
            decoder_len = len(decoder_out)
            if decoder_len > 1 and decoder_out[1] is not None:
                if isinstance(decoder_out[1], Tensor):
                    attn = decoder_out[1]
                else:
                    attn_holder = decoder_out[1]["attn"]
                    if isinstance(attn_holder, Tensor):
                        attn = attn_holder
                    elif attn_holder is not None:
                        attn = attn_holder[0]
                if attn is not None:
                    # keep only the attention of the last position
                    attn = attn[:, -1, :]

            # Only the last position is normalized; temperature is applied in place.
            decoder_out_tuple = (
                decoder_out[0][:, -1:, :].div_(temperature),
                None if decoder_len <= 1 else decoder_out[1],
            )
            probs = model.get_normalized_probs(
                decoder_out_tuple, log_probs=True, sample=None
            )
            probs = probs[:, -1, :]
            # Single-model shortcut: nothing to average.
            if self.models_size == 1:
                return probs, attn

            log_probs.append(probs)
            if attn is not None:
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)

        # log of the mean probability: logsumexp over models minus log(#models)
        avg_probs = torch.logsumexp(torch.stack(log_probs, dim=0), dim=0) - math.log(
            self.models_size
        )

        if avg_attn is not None:
            avg_attn.div_(self.models_size)
        return avg_probs, avg_attn

    @torch.jit.export
    def reorder_encoder_out(
        self, encoder_outs: Optional[List[Dict[str, List[Tensor]]]], new_order
    ):
        """
        Reorder encoder output according to *new_order*.

        Args:
            encoder_out: output from the ``forward()`` method
            new_order (LongTensor): desired order

        Returns:
            *encoder_out* rearranged according to *new_order*
        """
        new_outs: List[Dict[str, List[Tensor]]] = []
        # Without an encoder there is nothing to reorder; return the empty list.
        if not self.has_encoder():
            return new_outs
        for i, model in enumerate(self.models):
            assert encoder_outs is not None
            new_outs.append(
                model.encoder.reorder_encoder_out(encoder_outs[i], new_order)
            )
        return new_outs

    @torch.jit.export
    def reorder_incremental_state(
        self,
        incremental_states: List[Dict[str, Dict[str, Optional[Tensor]]]],
        new_order,
    ):
        """Reorder each model's incremental state according to *new_order*.

        Does nothing when the ensemble does not support incremental decoding.
        """
        if not self.has_incremental_states():
            return
        for i, model in enumerate(self.models):
            model.decoder.reorder_incremental_state_scripting(
                incremental_states[i], new_order
            )
@torch.no_grad()
def generate(self, models, sample, **kwargs):
    """Generate hypotheses and attach an alignment to each of them.

    Args:
        models: unused here; the ensemble wrapped in ``self.model`` is used.
        sample (dict): batch; ``sample["net_input"]["src_tokens"]`` is read
            directly, the rest is consumed by ``_generate`` and
            ``_prepare_batch_for_alignment``.
        **kwargs: forwarded to ``_generate``.

    Returns:
        The ``finalized`` structure returned by ``_generate`` with an
        ``"alignment"`` entry added to every hypothesis dict.
    """
    finalized = super()._generate(sample, **kwargs)

    src_tokens = sample["net_input"]["src_tokens"]
    bsz = src_tokens.shape[0]
    beam_size = self.beam_size
    (
        src_tokens,
        src_lengths,
        prev_output_tokens,
        tgt_tokens,
    ) = self._prepare_batch_for_alignment(sample, finalized)
    if any(getattr(m, "full_context_alignment", False) for m in self.model.models):
        # Teacher-forced pass over the generated hypotheses.
        attn = self.model.forward_align(src_tokens, src_lengths, prev_output_tokens)
    else:
        # Reuse the attention recorded during search, one entry per
        # (sentence, beam) pair in flattened order.
        attn = [
            finalized[i // beam_size][i % beam_size]["attention"].transpose(1, 0)
            for i in range(bsz * beam_size)
        ]

    # Compare the device *type*: a torch.device never equals a plain str,
    # so the previous `device != "cpu"` test was always true.
    if src_tokens.device.type != "cpu":
        src_tokens = src_tokens.to("cpu")
        tgt_tokens = tgt_tokens.to("cpu")
        attn = [i.to("cpu") for i in attn]

    # Process the attn matrix to extract hard alignments.
    for i in range(bsz * beam_size):
        alignment = self.extract_alignment(
            attn[i], src_tokens[i], tgt_tokens[i], self.pad, self.eos
        )
        finalized[i // beam_size][i % beam_size]["alignment"] = alignment
    return finalized
class SequenceScorer(object):
    """Scores the target for a given source sentence."""

    def __init__(
        self,
        tgt_dict,
        softmax_batch=None,
        compute_alignment=False,
        eos=None,
        symbols_to_strip_from_output=None,
    ):
        """Configure the scorer.

        Args:
            tgt_dict: object providing ``pad()`` and ``eos()``.
            softmax_batch (int, optional): maximum number of target positions
                normalized at once; falsy values mean "no limit"
                (``sys.maxsize``).
            compute_alignment (bool): also extract hard alignments when the
                models return attention.
            eos (optional): overrides ``tgt_dict.eos()`` when not ``None``.
            symbols_to_strip_from_output (set, optional): extra symbols; the
                EOS symbol is always added to the stored set.
        """
        self.pad = tgt_dict.pad()
        self.eos = tgt_dict.eos() if eos is None else eos
        self.softmax_batch = softmax_batch or sys.maxsize
        assert self.softmax_batch > 0
        self.compute_alignment = compute_alignment
        self.symbols_to_strip_from_output = (
            symbols_to_strip_from_output.union({self.eos})
            if symbols_to_strip_from_output is not None
            else {self.eos}
        )

    @torch.no_grad()
    def generate(self, models, sample, **kwargs):
        """Score a batch of translations.

        Args:
            models: sized collection of models; each is put in eval mode and
                called with ``**sample["net_input"]``.
            sample (dict): must contain ``"net_input"`` and ``"target"``;
                ``"start_indices"`` is optional (defaults to all zeros).
            **kwargs: accepted for interface compatibility and ignored.

        Returns:
            A list with one single-element list per sentence. Each element is
            a dict with the keys ``tokens``, ``score``, ``attention``,
            ``alignment`` and ``positional_scores``.
        """
        net_input = sample["net_input"]

        def batch_for_softmax(dec_out, target):
            # assumes decoder_out[0] is the only thing needed (may not be correct for future models!)
            first, rest = dec_out[0], dec_out[1:]
            bsz, tsz, dim = first.shape
            if bsz * tsz < self.softmax_batch:
                # Small enough: normalize everything in one go.
                yield dec_out, target, True
            else:
                # Flatten to 1 x (bsz * tsz) x dim and hand out slices of at
                # most `softmax_batch` positions.
                flat = first.contiguous().view(1, -1, dim)
                flat_tgt = target.contiguous().view(flat.shape[:-1])
                s = 0
                while s < flat.size(1):
                    e = s + self.softmax_batch
                    yield (flat[:, s:e],) + rest, flat_tgt[:, s:e], False
                    s = e

        def gather_target_probs(probs, target):
            # Pick, for every position, the probability of the target token.
            probs = probs.gather(
                dim=2,
                index=target.unsqueeze(-1),
            )
            return probs

        orig_target = sample["target"]

        # compute scores for each model in the ensemble
        avg_probs = None
        avg_attn = None
        for model in models:
            model.eval()
            decoder_out = model(**net_input)
            attn = decoder_out[1] if len(decoder_out) > 1 else None
            if isinstance(attn, dict):
                attn = attn.get("attn", None)

            batched = batch_for_softmax(decoder_out, orig_target)
            probs, idx = None, 0
            try:
                for bd, tgt, is_single in batched:
                    # get_normalized_probs receives the sample, so expose
                    # the chunk's targets through it for this call.
                    sample["target"] = tgt
                    # With one model, log-probs are taken directly; with an
                    # ensemble, probabilities are averaged and logged below.
                    curr_prob = model.get_normalized_probs(
                        bd, log_probs=len(models) == 1, sample=sample
                    ).data
                    if is_single:
                        probs = gather_target_probs(curr_prob, orig_target)
                    else:
                        if probs is None:
                            probs = curr_prob.new(orig_target.numel())
                        step = curr_prob.size(0) * curr_prob.size(1)
                        end = step + idx
                        tgt_probs = gather_target_probs(
                            curr_prob.view(tgt.shape + (curr_prob.size(-1),)), tgt
                        )
                        probs[idx:end] = tgt_probs.view(-1)
                        idx = end
            finally:
                # Never leave the caller's sample pointing at a chunk, even
                # if a model raises part-way through.
                sample["target"] = orig_target

            probs = probs.view(sample["target"].shape)

            if avg_probs is None:
                avg_probs = probs
            else:
                avg_probs.add_(probs)
            if attn is not None:
                if torch.is_tensor(attn):
                    attn = attn.data
                else:
                    attn = attn[0]
                if avg_attn is None:
                    avg_attn = attn
                else:
                    avg_attn.add_(attn)
        if len(models) > 1:
            # Ensemble: mean probability, then back to log space.
            avg_probs.div_(len(models))
            avg_probs.log_()
            if avg_attn is not None:
                avg_attn.div_(len(models))

        bsz = avg_probs.size(0)
        hypos = []
        start_idxs = sample["start_indices"] if "start_indices" in sample else [0] * bsz
        for i in range(bsz):
            # remove padding from ref
            ref = (
                utils.strip_pad(sample["target"][i, start_idxs[i] :], self.pad)
                if sample["target"] is not None
                else None
            )
            tgt_len = ref.numel()
            avg_probs_i = avg_probs[i][start_idxs[i] : start_idxs[i] + tgt_len]
            # Length-normalized score: mean per-token score.
            score_i = avg_probs_i.sum() / tgt_len
            if avg_attn is not None:
                avg_attn_i = avg_attn[i]
                if self.compute_alignment:
                    alignment = utils.extract_hard_alignment(
                        avg_attn_i,
                        sample["net_input"]["src_tokens"][i],
                        sample["target"][i],
                        self.pad,
                        self.eos,
                    )
                else:
                    alignment = None
            else:
                avg_attn_i = alignment = None
            hypos.append(
                [
                    {
                        "tokens": ref,
                        "score": score_i,
                        "attention": avg_attn_i,
                        "alignment": alignment,
                        "positional_scores": avg_probs_i,
                    }
                ]
            )
        return hypos
class SpeechGenerator(object):
    """Base class holding a model, a vocoder and optional global CMVN stats."""

    def __init__(self, model, vocoder, data_cfg: S2TDataConfig):
        """Store the collaborators and load the CMVN statistics, if any.

        ``data_cfg.global_cmvn_stats_npz`` is read; when it is not ``None``
        the file is loaded with ``np.load`` and kept in ``self.gcmvn_stats``.
        """
        self.model = model
        self.vocoder = vocoder
        npz_path = data_cfg.global_cmvn_stats_npz
        self.gcmvn_stats = None if npz_path is None else np.load(npz_path)

    def gcmvn_denormalize(self, x):
        """Undo global CMVN on *x*: ``x * std + mean``.

        *x* must be 3-dimensional (B x T x C per the original comment) and
        its last dimension must match the length of the stored ``mean`` and
        ``std``. Returns *x* untouched when no statistics were loaded.
        """
        stats = self.gcmvn_stats
        if stats is None:
            return x
        # `.to(x)` matches the dtype and device of the input.
        mean, std = (torch.from_numpy(stats[key]).to(x) for key in ("mean", "std"))
        assert x.dim() == 3 and mean.shape[0] == std.shape[0] == x.shape[2]
        scaled = x * std.view(1, 1, -1).expand_as(x)
        return scaled + mean.view(1, 1, -1).expand_as(scaled)

    def get_waveform(self, feat):
        """Run the vocoder on *feat* (T x C -> T); ``None`` without a vocoder."""
        if self.vocoder is None:
            return None
        return self.vocoder(feat).squeeze(0)
def __init__(
    self,
    models,
    args,
    vocoder,
    data_cfg,
    tgt_dict_mt,
    max_iter: int = 6000,
    eos_prob_threshold: float = 0.5,
    eos_mt=None,
    symbols_to_strip_from_output=None,
):
    """Set up the speech generator and its text beam-search generator.

    ``models[0]`` is handed to the base class. All of *models* go to the
    text ``SequenceGenerator``, whose search options are read from *args*
    with the defaults listed below.
    """
    super().__init__(models[0], vocoder, data_cfg)
    self.max_iter = max_iter
    self.eos_prob_threshold = eos_prob_threshold

    self.tgt_dict_mt = tgt_dict_mt
    self.eos_mt = eos_mt

    # Imported lazily, exactly where the original imported them.
    from examples.speech_to_speech.unity.sequence_generator import SequenceGenerator
    from fairseq import search

    def opt(name, default):
        # Read an optional generation setting from `args`.
        return getattr(args, name, default)

    generator_options = {
        "beam_size": max(1, opt("beam", 5)),
        "max_len_a": opt("max_len_a", 0),
        "max_len_b": opt("max_len_b", 200),
        "min_len": opt("min_len", 1),
        "normalize_scores": (not opt("unnormalized", False)),
        "len_penalty": opt("lenpen", 1),
        "unk_penalty": opt("unkpen", 0),
        "temperature": opt("temperature", 1.0),
        "match_source_len": opt("match_source_len", False),
        "no_repeat_ngram_size": opt("no_repeat_ngram_size", 0),
        "search_strategy": search.BeamSearch(tgt_dict_mt),
        "eos": eos_mt,
        "symbols_to_strip_from_output": symbols_to_strip_from_output,
    }
    self.text_generator = SequenceGenerator(
        models, tgt_dict_mt, **generator_options
    )
MT decoder finalized_mt = self.text_generator.generate_decoder( [encoder_out], src_tokens, src_lengths, sample, prefix_tokens, constraints, bos_token, aux_task_name=model.mt_task_name, ) # extract decoder output corresponding to the best hypothesis max_tgt_len = max([len(hypo[0]["tokens"]) for hypo in finalized_mt]) prev_output_tokens_mt = ( src_tokens.new_zeros(src_tokens.shape[0], max_tgt_len) .fill_(mt_decoder.padding_idx) .int() ) # B x T for i, hypo in enumerate(finalized_mt): i_beam = 0 tmp = hypo[i_beam]["tokens"].int() # hyp + eos prev_output_tokens_mt[i, 0] = self.text_generator.eos if tmp[-1] == self.text_generator.eos: tmp = tmp[:-1] prev_output_tokens_mt[i, 1 : len(tmp) + 1] = tmp text = "".join([self.tgt_dict_mt[c] for c in tmp]) text = text.replace("_", " ") text = text.replace("▁", " ") text = text.replace("<unk>", " ") text = text.replace("<s>", "") text = text.replace("</s>", "") if len(text) > 0 and text[0] == " ": text = text[1:] sample_id = sample["id"].tolist()[i] print("{} (None-{})".format(text, sample_id)) mt_decoder_out = mt_decoder( prev_output_tokens_mt, encoder_out=encoder_out, features_only=True, ) x = mt_decoder_out[0].transpose(0, 1) mt_decoder_padding_mask = None if prev_output_tokens_mt.eq(mt_decoder.padding_idx).any(): mt_decoder_padding_mask = prev_output_tokens_mt.eq(mt_decoder.padding_idx) # 2. TTS encoder if getattr(model, "synthesizer_encoder", None) is not None: synthesizer_encoder_out = model.synthesizer_encoder( x, mt_decoder_padding_mask, ) else: synthesizer_encoder_out = { "encoder_out": [x], # T x B x C "encoder_padding_mask": [mt_decoder_padding_mask] if mt_decoder_padding_mask is not None else [], # B x T "encoder_embedding": [], "encoder_states": [], "src_tokens": [], "src_lengths": [], } # 3. 
TTS decoder incremental_state = {} feat, attn, eos_prob = [], [], [] finished = src_tokens.new_zeros((bsz,)).bool() out_lens = src_lengths.new_zeros((bsz,)).long().fill_(self.max_iter) prev_feat_out = encoder_out["encoder_out"][0].new_zeros(bsz, 1, out_dim) for step in range(self.max_iter): cur_out_lens = out_lens.clone() cur_out_lens.masked_fill_(cur_out_lens.eq(self.max_iter), step + 1) _, cur_eos_out, cur_extra = model.forward_decoder( prev_feat_out, encoder_out=synthesizer_encoder_out, incremental_state=incremental_state, target_lengths=cur_out_lens, speaker=sample["speaker"], **kwargs, ) cur_eos_prob = torch.sigmoid(cur_eos_out).squeeze(2) feat.append(cur_extra["feature_out"]) attn.append(cur_extra["attn"]) eos_prob.append(cur_eos_prob) cur_finished = cur_eos_prob.squeeze(1) > self.eos_prob_threshold out_lens.masked_fill_((~finished) & cur_finished, step + 1) finished = finished | cur_finished if finished.sum().item() == bsz: break prev_feat_out = cur_extra["feature_out"] feat = torch.cat(feat, dim=1) feat = model.decoder.postnet(feat) + feat eos_prob = torch.cat(eos_prob, dim=1) attn = torch.cat(attn, dim=2) alignment = attn.max(dim=1)[1] feat = feat.reshape(bsz, -1, raw_dim) feat = self.gcmvn_denormalize(feat) eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1) attn = attn.repeat_interleave(n_frames_per_step, dim=2) alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) out_lens = out_lens * n_frames_per_step finalized = [ { "feature": feat[b, :out_len], "eos_prob": eos_prob[b, :out_len], "attn": attn[b, :, :out_len], "alignment": alignment[b, :out_len], "waveform": self.get_waveform(feat[b, :out_len]), } for b, out_len in zip(range(bsz), out_lens) ] if has_targ: assert sample["target"].size(-1) == out_dim tgt_feats = sample["target"].view(bsz, -1, raw_dim) tgt_feats = self.gcmvn_denormalize(tgt_feats) tgt_lens = sample["target_lengths"] * n_frames_per_step for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)): 
class NonAutoregressiveSpeechGenerator(SpeechGenerator):
    """Speech generator for models that emit all frames in one forward pass."""

    @torch.no_grad()
    def generate(self, model, sample, has_targ=False, **kwargs):
        """Synthesize features and waveforms for every item of *sample*.

        Returns a list with one dict per batch item holding ``feature``,
        ``waveform`` and ``attn``. With ``has_targ`` the entries also get
        ``targ_feature`` and ``targ_waveform``, built from
        ``sample["target"]``.
        """
        model.eval()

        net_input = sample["net_input"]
        # Unpacking into two names requires src_tokens to be 2-dimensional.
        bsz, _ = net_input["src_tokens"].size()
        frames_per_step = model.encoder.n_frames_per_step
        raw_dim = model.encoder.out_dim // frames_per_step

        model_out = model(
            src_tokens=net_input["src_tokens"],
            src_lengths=net_input["src_lengths"],
            prev_output_tokens=net_input["prev_output_tokens"],
            incremental_state=None,
            target_lengths=sample["target_lengths"],
            speaker=sample["speaker"],
        )
        feat, feat_post, out_lens, log_dur_out, _, _ = model_out
        # Prefer the second (post) feature output whenever the model returns one.
        feat = feat if feat_post is None else feat_post

        feat = self.gcmvn_denormalize(feat.view(bsz, -1, raw_dim))

        # Durations: exp(log_dur) - 1, rounded, never negative.
        dur_out = torch.clamp(torch.round(torch.exp(log_dur_out) - 1).long(), min=0)

        def get_dur_plot_data(durations):
            # Repeat the 1-based position `pos + 1` once per unit of duration.
            return [
                pos + 1
                for pos, dur in enumerate(durations)
                for _ in range(dur.item())
            ]

        def clipped(b, n_frames):
            # A single all-zero frame stands in for an empty output.
            if n_frames > 0:
                return feat[b, :n_frames]
            return feat.new_zeros([1, raw_dim])

        out_lens = out_lens * frames_per_step
        finalized = []
        for b, n_frames in zip(range(bsz), out_lens):
            finalized.append(
                {
                    "feature": clipped(b, n_frames),
                    "waveform": self.get_waveform(clipped(b, n_frames)),
                    "attn": feat.new_tensor(get_dur_plot_data(dur_out[b])),
                }
            )

        if not has_targ:
            return finalized

        tgt_feats = self.gcmvn_denormalize(sample["target"].view(bsz, -1, raw_dim))
        tgt_lens = sample["target_lengths"] * frames_per_step
        for b, (tgt_feat, tgt_len) in enumerate(zip(tgt_feats, tgt_lens)):
            entry = finalized[b]
            entry["targ_feature"] = tgt_feat[:tgt_len]
            entry["targ_waveform"] = self.get_waveform(tgt_feat[:tgt_len])
        return finalized
tgt_lens = sample["target_lengths"] n_frames_per_step = model.decoder.n_frames_per_step raw_dim = model.decoder.out_dim // n_frames_per_step bsz = src_tokens.shape[0] feat, eos_prob, extra = model( src_tokens, src_lens, prev_out_tokens, incremental_state=None, target_lengths=tgt_lens, speaker=sample["speaker"], ) attn = extra["attn"] # B x T_s x T_t alignment = attn.max(dim=1)[1] feat = feat.reshape(bsz, -1, raw_dim) feat = self.gcmvn_denormalize(feat) eos_prob = eos_prob.repeat_interleave(n_frames_per_step, dim=1) attn = attn.repeat_interleave(n_frames_per_step, dim=2) alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) tgt_lens = sample["target_lengths"] * n_frames_per_step finalized = [ { "feature": feat[b, :tgt_len], "eos_prob": eos_prob[b, :tgt_len], "attn": attn[b, :, :tgt_len], "alignment": alignment[b, :tgt_len], "waveform": self.get_waveform(feat[b, :tgt_len]), } for b, tgt_len in zip(range(bsz), tgt_lens) ] if has_targ: tgt_feats = sample["target"].view(bsz, -1, raw_dim) tgt_feats = self.gcmvn_denormalize(tgt_feats) for b, (f, l) in enumerate(zip(tgt_feats, tgt_lens)): finalized[b]["targ_feature"] = f[:l] finalized[b]["targ_waveform"] = self.get_waveform(f[:l]) return finalized ================================================ FILE: fairseq/tasks/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def setup_task(cfg: FairseqDataclass, **kwargs):
    """Resolve the task class for *cfg* and delegate to its ``setup_task``.

    Two config styles are handled:

    * legacy: ``cfg.task`` is a string naming the task; if a dataclass is
      registered for it, *cfg* is converted with ``from_namespace``;
    * dataclass: ``cfg._name`` names the task; *cfg* is merged into a fresh
      instance of the registered dataclass.

    ``kwargs`` are forwarded to the task's ``setup_task``. A truthy
    ``from_checkpoint`` keyword is passed as ``remove_missing`` to the merge.
    """
    task = None
    legacy_name = getattr(cfg, "task", None)

    if isinstance(legacy_name, str):
        # legacy tasks
        task = TASK_REGISTRY[legacy_name]
        if legacy_name in TASK_DATACLASS_REGISTRY:
            cfg = TASK_DATACLASS_REGISTRY[legacy_name].from_namespace(cfg)
    else:
        hydra_name = getattr(cfg, "_name", None)
        if hydra_name and hydra_name in TASK_DATACLASS_REGISTRY:
            dataclass_cls = TASK_DATACLASS_REGISTRY[hydra_name]
            cfg = merge_with_parent(
                dataclass_cls(),
                cfg,
                remove_missing=kwargs.get("from_checkpoint", False),
            )
            task = TASK_REGISTRY[hydra_name]

    assert (
        task is not None
    ), f"Could not infer task type from {cfg}. Available argparse tasks: {TASK_REGISTRY.keys()}. Available hydra tasks: {TASK_DATACLASS_REGISTRY.keys()}"

    return task.setup_task(cfg, **kwargs)
def import_tasks(tasks_dir, namespace):
    """Import every task module or package found directly in *tasks_dir*.

    Entries whose name starts with ``_`` or ``.`` are ignored, as is anything
    that is neither a ``.py`` file nor a directory. Each remaining entry is
    imported as ``<namespace>.<name>``. When that name is a registered task,
    an argparse parser describing it is published as the module global
    ``<name>_parser`` (used by the sphinx docs).
    """
    for entry in os.listdir(tasks_dir):
        if entry.startswith(("_", ".")):
            continue
        is_py_file = entry.endswith(".py")
        if not is_py_file and not os.path.isdir(os.path.join(tasks_dir, entry)):
            continue

        task_name = entry[: entry.find(".py")] if is_py_file else entry
        importlib.import_module(namespace + "." + task_name)

        # expose `task_parser` for sphinx
        if task_name not in TASK_REGISTRY:
            continue
        parser = argparse.ArgumentParser(add_help=False)
        name_group = parser.add_argument_group("Task name")
        # fmt: off
        name_group.add_argument('--task', metavar=task_name,
                                help='Enable this task with: ``--task=' + task_name + '``')
        # fmt: on
        extra_group = parser.add_argument_group("Additional command-line arguments")
        TASK_REGISTRY[task_name].add_args(extra_group)
        globals()[task_name + "_parser"] = parser
@register_task("audio_classification", dataclass=AudioClassificationConfig)
class AudioClassificationTask(AudioPretrainingTask):
    """Task for audio classification tasks.

    On top of the datasets loaded by the parent task, every split is wrapped
    in an ``AddTargetDataset`` carrying the labels. Train and validation
    steps additionally record the number of correct predictions, from which
    an ``accuracy`` metric is derived.
    """

    cfg: AudioClassificationConfig

    def __init__(
        self,
        cfg: AudioClassificationConfig,
    ):
        super().__init__(cfg)
        self.state.add_factory("target_dictionary", self.load_target_dictionary)
        # Use the module logger, like the rest of this class, rather than
        # the root logger.
        logger.info(f"=== Number of labels = {len(self.target_dictionary)}")

    def load_target_dictionary(self):
        """Load ``dict.<labels>.txt`` and return it, or ``None`` without labels.

        The dictionary is looked up in ``cfg.data`` unless
        ``cfg.target_dictionary`` is set, which overrides the directory.
        """
        if self.cfg.labels:
            target_dictionary = self.cfg.data
            if self.cfg.target_dictionary:  # override dict
                target_dictionary = self.cfg.target_dictionary
            dict_path = os.path.join(target_dictionary, f"dict.{self.cfg.labels}.txt")
            logger.info("Using dict_path : {}".format(dict_path))
            return Dictionary.load(dict_path, add_special_symbols=False)
        return None

    def _add_targets(self, dataset, label_path, text_compression_level):
        """Wrap *dataset* in an ``AddTargetDataset`` with labels from *label_path*.

        One label is read per line; lines whose index is in the dataset's
        ``skipped_indices`` (if it has any) are dropped so that labels stay
        aligned with the remaining examples.
        """
        skipped_indices = getattr(dataset, "skipped_indices", set())
        text_compressor = TextCompressor(level=text_compression_level)
        with open(label_path, "r") as f:
            labels = [
                text_compressor.compress(l)
                for i, l in enumerate(f)
                if i not in skipped_indices
            ]

        assert len(labels) == len(dataset), (
            f"labels length ({len(labels)}) and dataset length "
            f"({len(dataset)}) do not match"
        )

        process_label = LabelEncoder(self.target_dictionary)
        return AddTargetDataset(
            dataset,
            labels,
            pad=self.target_dictionary.pad(),
            eos=self.target_dictionary.eos(),
            batch_targets=True,
            process_label=process_label,
            label_len_fn=label_len_fn,
            add_to_input=False,
            # text_compression_level=text_compression_level,
        )

    def load_dataset(
        self, split: str, task_cfg: AudioClassificationConfig = None, **kwargs
    ):
        """Load *split* through the parent task and attach its labels.

        Without ``multi_corpus_keys`` the labels come from
        ``<data>/<split>.<labels>``. Otherwise *split* is a comma-separated
        list of ``key:file_name`` pairs; each corpus gets its own label file
        and the corpora are combined in a ``MultiCorpusDataset`` (or used
        directly when there is only one).
        """
        super().load_dataset(split, task_cfg, **kwargs)

        task_cfg = task_cfg or self.cfg
        assert task_cfg.labels is not None
        text_compression_level = getattr(
            TextCompressionLevel, str(self.cfg.text_compression_level)
        )
        data_path = self.cfg.data
        if task_cfg.multi_corpus_keys is None:
            label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
            self.datasets[split] = self._add_targets(
                self.datasets[split], label_path, text_compression_level
            )
            return

        target_dataset_map = OrderedDict()

        multi_corpus_keys = [k.strip() for k in task_cfg.multi_corpus_keys.split(",")]
        corpus_idx_map = {k: idx for idx, k in enumerate(multi_corpus_keys)}

        data_keys = [k.split(":") for k in split.split(",")]

        multi_corpus_sampling_weights = [
            float(val.strip())
            for val in task_cfg.multi_corpus_sampling_weights.split(",")
        ]
        data_weights = []
        for key, file_name in data_keys:
            k = key.strip()
            label_path = os.path.join(
                data_path, f"{file_name.strip()}.{task_cfg.labels}"
            )
            target_dataset_map[k] = self._add_targets(
                self.dataset_map[split][k], label_path, text_compression_level
            )
            # Sampling weight of this corpus, looked up by its position in
            # `multi_corpus_keys`.
            data_weights.append(multi_corpus_sampling_weights[corpus_idx_map[k]])

        if len(target_dataset_map) == 1:
            self.datasets[split] = list(target_dataset_map.values())[0]
        else:
            self.datasets[split] = MultiCorpusDataset(
                target_dataset_map,
                distribution=data_weights,
                seed=0,
                sort_indices=True,
            )

    @property
    def source_dictionary(self):
        """This task has no source dictionary; always ``None``."""
        return None

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` holding the labels."""
        return self.state.target_dictionary

    def train_step(self, sample, model, *args, **kwargs):
        """Run the parent train step, then record correct/total counts.

        ``sample["target"]`` is cast to ``torch.long`` in place first.
        """
        sample["target"] = sample["target"].to(dtype=torch.long)
        loss, sample_size, logging_output = super().train_step(
            sample, model, *args, **kwargs
        )
        self._log_metrics(sample, model, logging_output)
        return loss, sample_size, logging_output

    def valid_step(self, sample, model, criterion):
        """Run the parent validation step, then record correct/total counts.

        ``sample["target"]`` is cast to ``torch.long`` in place first.
        """
        sample["target"] = sample["target"].to(dtype=torch.long)
        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
        self._log_metrics(sample, model, logging_output)
        return loss, sample_size, logging_output

    def _log_metrics(self, sample, model, logging_output):
        """Store ``_correct`` and ``_total`` for *sample* in *logging_output*."""
        # Named `stats` so the imported `metrics` module is not shadowed.
        stats = self._inference_with_metrics(
            sample,
            model,
        )
        # Disabled per-batch metrics, kept for reference:
        # logging_output["_precision"] = metrics["precision"]
        # logging_output["_recall"] = metrics["recall"]
        # logging_output["_f1"] = metrics["f1"]
        # logging_output["_eer"] = metrics["eer"]
        # logging_output["_accuracy"] = metrics["accuracy"]
        logging_output["_correct"] = stats["correct"]
        logging_output["_total"] = stats["total"]

    def _inference_with_metrics(self, sample, model):
        """Run *model* on *sample* without gradients and count correct labels.

        The prediction is the argmax over dim 1 of the log-probabilities and
        the reference is column 0 of ``sample["target"]``.

        Returns:
            dict with ``"correct"`` (number of matches) and ``"total"``
            (number of targets).
        """

        def _compute_eer(target_list, lprobs):
            # Currently unused: belongs to the disabled EER metric.
            # from scipy.optimize import brentq
            # from scipy.interpolate import interp1d

            y_one_hot = np.eye(len(self.state.target_dictionary))[target_list]
            fpr, tpr, thresholds = sklearn_metrics.roc_curve(
                y_one_hot.ravel(), lprobs.ravel()
            )
            # Revisit the interpolation approach.
            # eer = brentq(lambda x: 1.0 - x - interp1d(fpr, tpr)(x), 0.0, 1.0)

            fnr = 1 - tpr
            eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]

            return eer

        with torch.no_grad():
            net_output = model(**sample["net_input"])
            lprobs = (
                model.get_normalized_probs(net_output, log_probs=True).cpu().detach()
            )
            target_list = sample["target"][:, 0].detach().cpu()
            predicted_list = torch.argmax(lprobs, 1).detach().cpu()  # B,C->B

            stats = {
                "correct": torch.sum(target_list == predicted_list).item(),
                "total": len(target_list),
            }
            return stats

    def reduce_metrics(self, logging_outputs, criterion):
        """Aggregate ``_correct``/``_total`` and register derived accuracy.

        ``accuracy`` is only registered when the summed total is positive; it
        evaluates to ``nan`` if the meter's total is not positive at read
        time.
        """
        super().reduce_metrics(logging_outputs, criterion)

        zero = torch.scalar_tensor(0.0)
        correct, total = 0, 0
        for log in logging_outputs:
            correct += log.get("_correct", zero)
            total += log.get("_total", zero)
        metrics.log_scalar("_correct", correct)
        metrics.log_scalar("_total", total)

        if total > 0:

            def _fn_accuracy(meters):
                if meters["_total"].sum > 0:
                    return utils.item(meters["_correct"].sum / meters["_total"].sum)
                return float("nan")

            metrics.log_derived("accuracy", _fn_accuracy)

        # Disabled averaged metrics, kept for reference:
        # prec_sum, recall_sum, f1_sum, acc_sum, eer_sum = 0.0, 0.0, 0.0, 0.0, 0.0
        # for log in logging_outputs:
        #     prec_sum += log.get("_precision", zero).item()
        #     recall_sum += log.get("_recall", zero).item()
        #     f1_sum += log.get("_f1", zero).item()
        #     acc_sum += log.get("_accuracy", zero).item()
        #     eer_sum += log.get("_eer", zero).item()
        #
        # metrics.log_scalar("avg_precision", prec_sum / len(logging_outputs))
        # metrics.log_scalar("avg_recall", recall_sum / len(logging_outputs))
        # metrics.log_scalar("avg_f1", f1_sum / len(logging_outputs))
        # metrics.log_scalar("avg_accuracy", acc_sum / len(logging_outputs))
        # metrics.log_scalar("avg_eer", eer_sum / len(logging_outputs))
class LabelEncoder(object):
    """Callable that maps a raw label string to dictionary indices."""

    def __init__(self, dictionary):
        self.dictionary = dictionary

    def __call__(self, label):
        # Labels are encoded as-is: no EOS is appended and unseen symbols
        # are not added to the dictionary.
        encode = self.dictionary.encode_line
        return encode(label, append_eos=False, add_if_not_exist=False)


def label_len_fn(label):
    """Return the number of single-space-separated fields in ``label``."""
    # Splitting on " " yields one more piece than there are spaces.
    return label.count(" ") + 1
@dataclass
class AudioFinetuningConfig(AudioPretrainingConfig):
    # Options for reporting WER metrics during validation. Only applicable to
    # Seq2Seq models during fine-tuning
    eval_wer: bool = field(
        default=False, metadata={"help": "compute WER for Seq2Seq models"}
    )
    eval_wer_config: GenerationConfig = field(
        default_factory=lambda: GenerationConfig(),
        metadata={"help": "beam search config for evaluating wer during training"},
    )
    eval_wer_tokenizer: Any = field(
        default=None,
        metadata={"help": "tokenizer config for evaluating wer during training"},
    )
    eval_wer_post_process: str = field(
        default="letter",
        metadata={
            "help": "remove BPE tokens before scoring (can be sentencepiece, letter, and more)"
        },
    )
    eval_bleu: bool = field(
        default=False, metadata={"help": "evaluation with BLEU scores"}
    )
    eval_bleu_detok: Optional[str] = field(
        default=None,
        metadata={
            "help": "detokenize before computing BLEU (e.g., 'moses'); "
            "required if using --eval-bleu; use 'space' to disable "
            "detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: str = field(
        default="{}", metadata={"help": "args for building the tokenizer, if needed"}
    )
    eval_tokenized_bleu: bool = field(
        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
    )
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None, metadata={"help": "remove BPE before computing BLEU"}
    )
    eval_bleu_args: str = field(
        default="{}",
        metadata={
            "help": "generation args for BLEU scoring, e.g., "
            '\'{"beam": 4, "lenpen": 0.6}\''
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )
    autoregressive: bool = field(
        default=False,
        metadata={
            "help": "required for autoregressive decoders (like seq2seq models); "
            "adds 'prev_output_tokens' to input and appends eos to target"
        },
    )
    rebuild_batches: bool = True
    target_dictionary: Optional[str] = field(
        default=None,
        metadata={
            "help": "override default dictionary location"
        }
    )


@register_task("audio_finetuning", dataclass=AudioFinetuningConfig)
class AudioFinetuningTask(AudioPretrainingTask):
    """Audio task that attaches label targets to the datasets built by the
    pre-training task and can report WER/BLEU during validation."""

    cfg: AudioFinetuningConfig

    def __init__(
        self,
        cfg: AudioFinetuningConfig,
    ):
        super().__init__(cfg)
        self.blank_symbol = "<s>"

        # Loaded lazily through the task state container.
        self.state.add_factory("target_dictionary", self.load_target_dictionary)

    def load_target_dictionary(self):
        """Load ``dict.<labels>.txt``; return ``None`` when no labels are set.

        The dictionary is read from ``cfg.target_dictionary`` when given,
        otherwise from ``cfg.data``.
        """
        if self.cfg.labels:
            target_dictionary = self.cfg.data
            if self.cfg.target_dictionary:  # override dict
                target_dictionary = self.cfg.target_dictionary
            dict_path = os.path.join(target_dictionary, f"dict.{self.cfg.labels}.txt")
            logger.info('Using dict_path : {}'.format(dict_path))
            return Dictionary.load(dict_path)
        return None

    def _add_targets(self, dataset, label_path, task_cfg, text_compression_level):
        """Wrap ``dataset`` in an AddTargetDataset using labels from ``label_path``.

        Label lines whose index is in the dataset's ``skipped_indices`` are
        dropped so labels stay aligned with the retained audio samples.
        """
        skipped_indices = getattr(dataset, "skipped_indices", set())
        text_compressor = TextCompressor(level=text_compression_level)
        with open(label_path, "r") as f:
            labels = [
                text_compressor.compress(line)
                for i, line in enumerate(f)
                if i not in skipped_indices
            ]

        assert len(labels) == len(dataset), (
            f"labels length ({len(labels)}) and dataset length "
            f"({len(dataset)}) do not match"
        )

        process_label = LabelEncoder(self.target_dictionary)

        return AddTargetDataset(
            dataset,
            labels,
            pad=self.target_dictionary.pad(),
            eos=self.target_dictionary.eos(),
            batch_targets=True,
            process_label=process_label,
            label_len_fn=label_len_fn,
            add_to_input=task_cfg.get("autoregressive", False),
            text_compression_level=text_compression_level,
        )

    def load_dataset(
        self, split: str, task_cfg: AudioFinetuningConfig = None, **kwargs
    ):
        """Load the audio split via the parent task, then attach label targets.

        Args:
            split (str): split name; in multi-corpus mode a comma-separated
                list of ``key:file_name`` pairs.
            task_cfg: optional config overriding ``self.cfg``.
        """
        super().load_dataset(split, task_cfg, **kwargs)

        task_cfg = task_cfg or self.cfg
        assert task_cfg.labels is not None
        text_compression_level = getattr(
            TextCompressionLevel, str(self.cfg.text_compression_level)
        )
        data_path = self.cfg.data

        if task_cfg.multi_corpus_keys is None:
            label_path = os.path.join(data_path, f"{split}.{task_cfg.labels}")
            self.datasets[split] = self._add_targets(
                self.datasets[split], label_path, task_cfg, text_compression_level
            )
            return

        target_dataset_map = OrderedDict()

        multi_corpus_keys = [k.strip() for k in task_cfg.multi_corpus_keys.split(",")]
        corpus_idx_map = {k: idx for idx, k in enumerate(multi_corpus_keys)}

        data_keys = [k.split(":") for k in split.split(",")]

        multi_corpus_sampling_weights = [
            float(val.strip())
            for val in task_cfg.multi_corpus_sampling_weights.split(",")
        ]
        data_weights = []
        for key, file_name in data_keys:
            k = key.strip()
            label_path = os.path.join(
                data_path, f"{file_name.strip()}.{task_cfg.labels}"
            )
            target_dataset_map[k] = self._add_targets(
                self.dataset_map[split][k],
                label_path,
                task_cfg,
                text_compression_level,
            )
            data_weights.append(multi_corpus_sampling_weights[corpus_idx_map[k]])

        if len(target_dataset_map) == 1:
            self.datasets[split] = list(target_dataset_map.values())[0]
        else:
            self.datasets[split] = MultiCorpusDataset(
                target_dataset_map,
                distribution=data_weights,
                seed=0,
                sort_indices=True,
            )

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.state.target_dictionary

    def valid_step(self, sample, model, criterion):
        """Run validation and, for autoregressive models, add WER/BLEU stats."""
        loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
        if self.cfg.eval_wer and self.cfg.autoregressive:
            wer_stats = self._inference_with_wer(self.sequence_generator, sample, model)
            logging_output["_num_char_errors"] = wer_stats["num_char_errors"]
            logging_output["_num_chars"] = wer_stats["num_chars"]
            logging_output["_num_word_errors"] = wer_stats["num_word_errors"]
            logging_output["_num_words"] = wer_stats["num_words"]
        if self.cfg.eval_bleu and self.cfg.autoregressive:
            bleu = self._inference_with_bleu(self.sequence_generator, sample, model)
            logging_output["_bleu_sys_len"] = bleu.sys_len
            logging_output["_bleu_ref_len"] = bleu.ref_len
            # we split counts into separate entries so that they can be
            # summed efficiently across workers using fast-stat-sync
            assert len(bleu.counts) == 4
            for i in range(4):
                logging_output[f"_bleu_counts_{i}"] = bleu.counts[i]
                logging_output[f"_bleu_totals_{i}"] = bleu.totals[i]
        return loss, sample_size, logging_output

    def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False):
        """Build the model and the generator/tokenizer used for WER/BLEU."""
        model = super().build_model(model_cfg, from_checkpoint)

        if self.cfg.eval_wer and self.cfg.autoregressive:
            self.sequence_generator = self.build_generator(
                [model],
                self.cfg.eval_wer_config,
            )
            if self.cfg.eval_wer_tokenizer:
                self.tokenizer = encoders.build_tokenizer(self.cfg.eval_wer_tokenizer)
            else:
                self.tokenizer = None
        if self.cfg.eval_bleu and self.cfg.autoregressive:
            assert self.cfg.eval_bleu_detok is not None, (
                "--eval-bleu-detok is required if using --eval-bleu; "
                "try --eval-bleu-detok=moses (or --eval-bleu-detok=space "
                "to disable detokenization, e.g., when using sentencepiece)"
            )
            detok_args = json.loads(self.cfg.eval_bleu_detok_args)
            self.tokenizer = encoders.build_tokenizer(
                Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_args)
            )
            gen_args = json.loads(self.cfg.eval_bleu_args)
            gen_args = Namespace(**gen_args)
            self.sequence_generator = self.build_generator([model], gen_args)

        return model

    def _inference_with_wer(self, generator, sample, model):
        """Generate hypotheses and count character/word edit errors."""
        import editdistance

        def decode(toks):
            s = self.target_dictionary.string(
                toks.int().cpu(),
                self.cfg.eval_wer_post_process,
                escape_unk=True,
            )
            if self.tokenizer:
                s = self.tokenizer.decode(s)
            return s

        num_word_errors, num_char_errors = 0, 0
        num_chars, num_words = 0, 0
        gen_out = self.inference_step(generator, [model], sample, None)
        for i, hypos in enumerate(gen_out):
            hyp = decode(hypos[0]["tokens"])
            ref = decode(
                utils.strip_pad(sample["target"][i], self.target_dictionary.pad()),
            )
            num_char_errors += editdistance.eval(hyp, ref)
            num_chars += len(ref)
            hyp_words = hyp.split()
            ref_words = ref.split()
            num_word_errors += editdistance.eval(hyp_words, ref_words)
            num_words += len(ref_words)

        return {
            "num_char_errors": num_char_errors,
            "num_chars": num_chars,
            "num_word_errors": num_word_errors,
            "num_words": num_words,
        }

    def _inference_with_bleu(self, generator, sample, model):
        """Generate hypotheses and return sacrebleu corpus statistics."""
        import sacrebleu

        def decode(toks, is_ref):
            s = self.target_dictionary.string(
                toks.int().cpu(),
                self.cfg.eval_bleu_remove_bpe,
                # The default unknown string in fairseq is `<unk>`, but
                # this is tokenized by sacrebleu as `< unk >`, inflating
                # BLEU scores. Instead, we use a somewhat more verbose
                # alternative that is unlikely to appear in the real
                # reference, but doesn't get split into multiple tokens.
                unk_string=("UNKNOWNTOKENINREF" if is_ref else "UNKNOWNTOKENINHYP"),
            )
            if self.tokenizer:
                s = self.tokenizer.decode(s)
            return s

        gen_out = self.inference_step(generator, [model], sample)
        hyps, refs = [], []
        for i, hypos in enumerate(gen_out):
            hyps.append(decode(hypos[0]["tokens"], is_ref=False))
            refs.append(
                decode(
                    utils.strip_pad(sample["target"][i], self.target_dictionary.pad()),
                    is_ref=True,  # don't count <unk> as matches to the hypo
                )
            )

        if self.cfg.eval_bleu_print_samples:
            logger.info("H-{} {}".format(sample["id"][0], hyps[0]))
            logger.info("T-{} {}".format(sample["id"][0], refs[0]))

        eval_tokenization = "none" if self.cfg.eval_tokenized_bleu else "13a"
        return sacrebleu.corpus_bleu(hyps, [refs], tokenize=eval_tokenization)

    def reduce_metrics(self, logging_outputs, criterion):
        """Sum WER/BLEU statistics over workers and register derived scores."""
        super().reduce_metrics(logging_outputs, criterion)

        if self.cfg.eval_wer:
            zero = torch.scalar_tensor(0.0)
            num_char_errors = sum(
                log.get("_num_char_errors", zero) for log in logging_outputs
            )
            num_chars = sum(log.get("_num_chars", zero) for log in logging_outputs)
            num_word_errors = sum(
                log.get("_num_word_errors", zero) for log in logging_outputs
            )
            num_words = sum(log.get("_num_words", zero) for log in logging_outputs)
            metrics.log_scalar("_num_char_errors", num_char_errors)
            metrics.log_scalar("_num_chars", num_chars)
            metrics.log_scalar("_num_word_errors", num_word_errors)
            metrics.log_scalar("_num_words", num_words)
            if num_chars > 0:
                metrics.log_derived(
                    "uer",
                    lambda meters: meters["_num_char_errors"].sum
                    * 100.0
                    / meters["_num_chars"].sum
                    if meters["_num_chars"].sum > 0
                    else float("nan"),
                )
            if num_words > 0:
                metrics.log_derived(
                    "wer",
                    lambda meters: meters["_num_word_errors"].sum
                    * 100.0
                    / meters["_num_words"].sum
                    if meters["_num_words"].sum > 0
                    else float("nan"),
                )
        if self.cfg.eval_bleu:
            len_keys = ["_bleu_sys_len", "_bleu_ref_len"]
            count_keys = [f"_bleu_counts_{i}" for i in range(4)]
            total_keys = [f"_bleu_totals_{i}" for i in range(4)]
            for k in len_keys + count_keys + total_keys:
                metrics.log_scalar(k, sum(log.get(k, 0) for log in logging_outputs))

            import sacrebleu

            metrics.log_derived(
                "bleu",
                lambda meters: sacrebleu.compute_bleu(
                    correct=[meters[k].sum for k in count_keys],
                    total=[meters[k].sum for k in total_keys],
                    sys_len=meters["_bleu_sys_len"].sum,
                    ref_len=meters["_bleu_ref_len"].sum,
                    smooth_method="exp",
                ).score,
            )
@dataclass
class AudioMaskingConfig:
    feature_encoder_spec: str = II("model.modalities.audio.feature_encoder_spec")
    mask_prob: float = II("model.modalities.audio.mask_prob")
    mask_prob_adjust: float = II("model.modalities.audio.mask_prob_adjust")
    mask_length: int = II("model.modalities.audio.mask_length")
    inverse_mask: bool = II("model.modalities.audio.inverse_mask")
    mask_dropout: float = II("model.modalities.audio.mask_dropout")
    clone_batch: int = II("model.clone_batch")
    expand_adjacent: bool = False
    non_overlapping: bool = False


@dataclass
class AudioPretrainingConfig(FairseqDataclass):
    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
    labels: Optional[str] = field(
        default=None,
        metadata={"help": "extension of the label file to load, used for fine-tuning"},
    )
    multi_corpus_keys: Optional[str] = field(
        default=None,
        metadata={"help": "Comma separated names for loading multi corpus datasets"})
    multi_corpus_sampling_weights: Optional[str] = field(
        default=None,
        metadata={"help": "Comma separated string of sampling weights corresponding to the multi_corpus_keys"})
    binarized_dataset: bool = field(
        default=False,
        metadata={
            "help": "if true, loads binarized dataset (useful for very large datasets). "
            "See examples/wav2vec/scripts/binarize_manifest.sh"
        },
    )
    sample_rate: int = field(
        default=16_000,
        metadata={
            "help": "target sample rate. audio files will be up/down sampled to this rate"
        },
    )
    normalize: bool = field(
        default=False,
        metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
    )
    enable_padding: bool = field(
        default=False, metadata={"help": "pad shorter samples instead of cropping"}
    )
    max_sample_size: Optional[int] = field(
        default=None, metadata={"help": "max sample size to crop to for batching"}
    )
    min_sample_size: Optional[int] = field(
        default=None, metadata={"help": "min sample size to skip small examples"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={"help": "number of buckets"},
    )
    tpu: bool = II("common.tpu")
    text_compression_level: ChoiceEnum([x.name for x in TextCompressionLevel]) = field(
        default="none",
        metadata={
            "help": "compression level for texts (e.g. audio filenames, "
            "target texts): none/low/high (default: none). "
        },
    )

    rebuild_batches: bool = True
    precompute_mask_config: Optional[AudioMaskingConfig] = None

    post_save_script: Optional[str] = None

    subsample: float = 1
    seed: int = II("common.seed")


@register_task("audio_pretraining", dataclass=AudioPretrainingConfig)
class AudioPretrainingTask(FairseqTask):
    """Task that builds raw-audio datasets from manifests or binarized data."""

    cfg: AudioPretrainingConfig

    @classmethod
    def setup_task(cls, cfg: AudioPretrainingConfig, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            cfg (AudioPretrainingConfig): configuration of this task
        """

        return cls(cfg)

    def load_dataset(self, split: str, task_cfg: FairseqDataclass = None, **kwargs):
        """Build the audio dataset for ``split`` and store it in ``self.datasets``.

        Args:
            split (str): split name; in multi-corpus mode a comma-separated
                list of ``key:file_name`` pairs.
            task_cfg: optional config overriding ``self.cfg``.
        """
        data_path = self.cfg.data
        task_cfg = task_cfg or self.cfg

        # upgrade old task
        if isinstance(task_cfg, Namespace):
            if not hasattr(task_cfg, "autoregressive"):
                task_cfg.autoregressive = not task_cfg.criterion == "ctc"

        text_compression_level = getattr(
            TextCompressionLevel, str(self.cfg.text_compression_level)
        )

        compute_mask = getattr(task_cfg, "precompute_mask_config", None) is not None
        mask_args = {}
        if compute_mask:
            mask_args = task_cfg.precompute_mask_config

        # Arguments shared by every dataset flavour built below.
        dataset_kwargs = dict(
            sample_rate=task_cfg.get("sample_rate", self.cfg.sample_rate),
            max_sample_size=self.cfg.max_sample_size,
            min_sample_size=self.cfg.min_sample_size,
            pad=task_cfg.labels is not None or task_cfg.enable_padding,
            normalize=task_cfg.normalize,
            num_buckets=self.cfg.num_batch_buckets or int(self.cfg.tpu),
            compute_mask=compute_mask,
            **mask_args,
        )

        if getattr(task_cfg, "binarized_dataset", False):
            self.datasets[split] = BinarizedAudioDataset(
                data_path,
                split=split,
                **dataset_kwargs,
            )
        elif task_cfg.multi_corpus_keys is None:
            manifest_path = os.path.join(data_path, "{}.tsv".format(split))

            self.datasets[split] = FileAudioDataset(
                manifest_path=manifest_path,
                text_compression_level=text_compression_level,
                **dataset_kwargs,
            )
        else:
            dataset_map = OrderedDict()
            self.dataset_map = {}

            multi_corpus_keys = [
                k.strip() for k in task_cfg.multi_corpus_keys.split(",")
            ]
            corpus_idx_map = {k: idx for idx, k in enumerate(multi_corpus_keys)}
            data_keys = [k.split(":") for k in split.split(",")]

            multi_corpus_sampling_weights = [
                float(val.strip())
                for val in task_cfg.multi_corpus_sampling_weights.split(",")
            ]
            data_weights = []

            for key, file_name in data_keys:
                k = key.strip()
                manifest_path = os.path.join(
                    data_path, "{}.tsv".format(file_name.strip())
                )

                dataset_map[k] = FileAudioDataset(
                    manifest_path=manifest_path,
                    text_compression_level=text_compression_level,
                    corpus_key=corpus_idx_map[k],
                    **dataset_kwargs,
                )

                data_weights.append(multi_corpus_sampling_weights[corpus_idx_map[k]])

            # Kept per split so subclasses can wrap the individual corpora.
            self.dataset_map[split] = dataset_map

            if len(dataset_map) == 1:
                self.datasets[split] = list(dataset_map.values())[0]
            else:
                self.datasets[split] = MultiCorpusDataset(
                    dataset_map,
                    distribution=data_weights,
                    seed=0,
                    sort_indices=True,
                )

        if getattr(task_cfg, "subsample", 1) < 1:
            self.datasets[split] = SubsampleDataset(
                self.datasets[split],
                task_cfg.subsample,
                shuffle=True,
                seed=task_cfg.seed,
            )

        if self.cfg.tpu and task_cfg.inferred_w2v_config.mask_channel_prob == 0.0:
            logger.info(
                "Pretraining on TPUs may suffer convergence "
                "issues when training with `mask_channel_prob` value of "
                "0. You may want to set this to a low value close to 0."
            )

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return sys.maxsize, sys.maxsize

    def build_model(self, model_cfg: FairseqDataclass, from_checkpoint=False):
        """Build the model and copy its ``w2v_args`` back into ``model_cfg``."""
        model = super().build_model(model_cfg, from_checkpoint)

        actualized_cfg = getattr(model, "cfg", None)
        if actualized_cfg is not None:
            # if "w2v_args" in actualized_cfg:
            if hasattr(actualized_cfg, "w2v_args"):
                model_cfg.w2v_args = actualized_cfg.w2v_args

        return model

    def post_save(self, cp_path, num_updates):
        """Copy the saved checkpoint and run ``cfg.post_save_script`` on the copy.

        Does nothing when no script is configured.

        Args:
            cp_path (str): path of the checkpoint that was just written.
            num_updates (int): update count, used to name the copy.

        Raises:
            AssertionError: if the checkpoint could not be copied.
        """
        if self.cfg.post_save_script is None:
            return

        logger.info(f"launching {self.cfg.post_save_script}")
        import shlex
        import subprocess

        from fairseq.file_io import PathManager

        cp_dir = os.path.dirname(cp_path)
        eval_cp_path = os.path.join(cp_dir, f"checkpoint_eval_{num_updates}.pt")
        logger.info(f"{cp_path} {eval_cp_path} {cp_dir}")

        # The copy must run outside an ``assert``: assert statements are
        # removed under ``python -O``, which would silently skip the copy.
        copied = PathManager.copy(cp_path, eval_cp_path, overwrite=True)
        if not copied:
            raise AssertionError(f"Failed to copy {cp_path} to {eval_cp_path}")

        # Tokenize only the configured command; the checkpoint path is passed
        # as a single argument so whitespace in it cannot split it.
        command = shlex.split(self.cfg.post_save_script) + [eval_cp_path]
        subprocess.call(command)
@register_task("cross_lingual_lm")
class CrossLingualLMTask(LegacyFairseqTask):
    """
    Task for training cross-lingual language models.

    For more details look at: https://arxiv.org/pdf/1901.07291.pdf

    Args:
        dictionary (Dictionary): the dictionary for the input of the task
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "data",
            # Implicit concatenation keeps source indentation out of the
            # help text (a backslash continuation would embed it).
            help="colon separated path to data directories list, "
            "will be iterated upon during epochs in round-robin manner",
        )
        parser.add_argument(
            "--tokens-per-sample",
            default=512,
            type=int,
            help="max number of total tokens over all segments" " per sample",
        )
        parser.add_argument(
            "--monolingual-langs",
            default="en",
            type=str,
            help="comma separated list of languages for which we"
            " want to train XLM on",
        )
        parser.add_argument(
            "--shuffle",
            action="store_true",
            help="shuffle each monolingual dataset while" " training",
        )

    def __init__(self, args, dictionary):
        super().__init__(args)
        self.dictionary = dictionary
        self.seed = args.seed
        self.distributed_world_size = args.distributed_world_size
        self.langs2id = self._lang_to_id(args.monolingual_langs)

    def _lang_to_id(self, languages: str):
        """
        Build a map from languages to ids. These ids are used as segment labels
        for cross-lingual LM training.

        Ids follow the position of each language in the comma-separated list.
        """
        return {
            lang.strip(): idx for idx, lang in enumerate(languages.split(","))
        }

    @classmethod
    def load_dictionary(cls, filename):
        """Load a :class:`MaskedLMDictionary` from ``filename``."""
        return MaskedLMDictionary.load(filename)

    @classmethod
    def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
    ):
        """Build a :class:`MaskedLMDictionary` from the given text files."""
        d = MaskedLMDictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenizer.tokenize_line, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d

    @property
    def target_dictionary(self):
        return self.dictionary

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task."""
        dictionary = MaskedLMDictionary.load(os.path.join(args.data, "dict.txt"))
        logger.info("dictionary: {} types".format(len(dictionary)))
        return cls(args, dictionary)

    def _load_single_lang_dataset(self, split, epoch):
        """Load all shards of one language split as token blocks.

        Shards are named ``split``, ``split1``, ``split2``, ...; loading
        stops at the first missing shard after the first one.

        Returns:
            tuple: ``(dataset, sizes)``.

        Raises:
            FileNotFoundError: if the first shard does not exist.
        """
        loaded_datasets = []

        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        # Data directories are visited round-robin across epochs.
        data_path = paths[(epoch - 1) % len(paths)]

        for k in itertools.count():
            split_k = split + (str(k) if k > 0 else "")
            path = os.path.join(data_path, split_k)

            ds = data_utils.load_indexed_dataset(
                path, self.dictionary, self.args.dataset_impl
            )
            if ds is None:
                if k > 0:
                    break
                raise FileNotFoundError(
                    "Dataset not found: {} ({})".format(split, data_path)
                )

            # Since we append each block with the classification_token,
            # we need to effectively create blocks of length
            # tokens_per_sample-1
            loaded_datasets.append(
                TokenBlockDataset(
                    ds,
                    ds.sizes,
                    self.args.tokens_per_sample - 1,
                    pad=self.dictionary.pad(),
                    eos=self.dictionary.eos(),
                )
            )

            logger.info(
                "{} {} {} examples".format(data_path, split_k, len(loaded_datasets[-1]))
            )

        if len(loaded_datasets) == 1:
            dataset = loaded_datasets[0]
            sizes = dataset.sizes
        else:
            dataset = ConcatDataset(loaded_datasets)
            sizes = np.concatenate([ds.sizes for ds in loaded_datasets])

        return dataset, sizes

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        dataset_map = OrderedDict()

        for lang in self.langs2id.keys():
            # Datasets are expected to be in "split.lang" format (Eg: train.en)
            language_split = "{}.{}".format(split, lang)

            block_dataset, sizes = self._load_single_lang_dataset(
                split=language_split, epoch=epoch
            )

            dataset_map[lang] = MaskedLMDataset(
                dataset=block_dataset,
                sizes=sizes,
                vocab=self.dictionary,
                pad_idx=self.dictionary.pad(),
                mask_idx=self.dictionary.mask(),
                classif_token_idx=self.dictionary.eos(),
                sep_token_idx=self.dictionary.eos(),
                shuffle=getattr(self.args, "shuffle", False),
                has_pairs=False,
                segment_id=self.langs2id[lang],
                seed=self.seed,
            )

        self.datasets[split] = MultiCorpusSampledDataset(dataset_map)

        # Use the same round-robin index as the loader: indexing with
        # ``epoch - 1`` directly fails once epoch exceeds the directory count.
        paths = utils.split_paths(self.args.data)
        logger.info(
            "{} {} {} examples".format(
                paths[(epoch - 1) % len(paths)],
                split,
                len(self.datasets[split]),
            )
        )
SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
MASK_LENGTH_CHOICES = ChoiceEnum(["subword", "word", "span-poisson"])


@dataclass
class DenoisingConfig(FairseqDataclass):
    data: str = field(
        default=MISSING,
        metadata={"help": "path to data directory"},
    )
    bpe: Optional[str] = field(
        default=None,
        metadata={"help": "TODO"},
    )
    tokens_per_sample: int = field(
        default=512,
        metadata={
            "help": "max number of total tokens over all segments "
            "per sample for dataset"
        },
    )
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="complete_doc",
        metadata={
            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    # NOTE(review): the original help strings of replace_length,
    # poisson_lambda, shuffle_instance and permute_sentences described each
    # other's fields; they are reassigned here. Defaults are unchanged.
    replace_length: int = field(
        default=0,
        metadata={
            "help": "when masking N tokens, replace with 0, 1, or N tokens (use -1 for N)"
        },
    )
    mask: float = field(
        default=0.0,
        metadata={"help": "fraction of words/subwords that will be masked"},
    )
    mask_random: float = field(
        default=0.0,
        metadata={"help": "instead of using [MASK], use random token this often"},
    )
    insert: float = field(
        default=0.0,
        metadata={"help": "insert this percentage of additional random tokens"},
    )
    permute: float = field(
        default=0.0,
        metadata={"help": "take this proportion of subwords and permute them"},
    )
    rotate: float = field(
        default=0.5,
        metadata={"help": "rotate this proportion of inputs"},
    )
    poisson_lambda: float = field(
        default=3.0,
        metadata={
            "help": "lambda of the Poisson distribution used when "
            "mask_length is 'span-poisson'"
        },
    )
    # Passed as ``shuffle=`` to the dataset; presumably a boolean flag
    # despite the float type -- TODO confirm before changing the type.
    shuffle_instance: float = field(
        default=0.0,
        metadata={"help": "shuffle instances in the dataset (non-zero enables)"},
    )
    mask_length: MASK_LENGTH_CHOICES = field(
        default="subword",
        metadata={"help": "mask length to choose"},
    )
    # A proportion, so typed float; the previous int type rejected
    # fractional values. The default stays non-positive.
    permute_sentences: float = field(
        default=-1,
        metadata={
            "help": "randomly shuffle sentences for this proportion of inputs"
        },
    )
    seed: int = II("common.seed")
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    max_source_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the source sequence"},
    )
    max_target_positions: int = field(
        default=1024,
        metadata={"help": "max number of tokens in the target sequence"},
    )
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
@register_task("denoising", dataclass=DenoisingConfig)
class DenoisingTask(FairseqTask):
    """
    Sequence-to-sequence denoising task (e.g. BART).
    """

    cfg: DenoisingConfig

    def __init__(self, cfg, dictionary):
        super().__init__(cfg)
        self.dictionary = dictionary

        # Register the mask symbol and remember its index.
        self.mask_idx = self.dictionary.add_symbol("<mask>")

    @classmethod
    def setup_task(cls, cfg: DenoisingConfig, **kwargs):
        """Load the dictionary from the first data directory and build the task."""
        data_dirs = utils.split_paths(cfg.data)
        assert len(data_dirs) > 0
        dict_file = os.path.join(data_dirs[0], "dict.txt")
        dictionary = Dictionary.load(dict_file)
        logger.info("dictionary: {} types".format(len(dictionary)))
        if not hasattr(cfg, "shuffle_instance"):
            cfg.shuffle_instance = False
        return cls(cfg, dictionary)

    def _load_dataset_split(self, split, epoch, combine):
        """Load ``split`` as token blocks wrapped in bos/eos tokens.

        Raises:
            FileNotFoundError: if the indexed dataset cannot be loaded.
        """
        data_dirs = utils.split_paths(self.cfg.data)
        assert len(data_dirs) > 0
        # Data directories rotate with the epoch number.
        data_dir = data_dirs[(epoch - 1) % len(data_dirs)]
        split_path = os.path.join(data_dir, split)

        raw = data_utils.load_indexed_dataset(
            split_path,
            self.dictionary,
            self.cfg.dataset_impl,
            combine=combine,
        )
        if raw is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        stripped = StripTokenDataset(raw, self.dictionary.eos())

        shortened = maybe_shorten_dataset(
            stripped,
            split,
            self.cfg.shorten_data_split_list,
            self.cfg.shorten_method,
            self.cfg.tokens_per_sample,
            self.cfg.seed,
        )

        # Contiguous blocks; two positions are reserved for <s> and </s>.
        blocks = TokenBlockDataset(
            shortened,
            shortened.sizes,
            self.cfg.tokens_per_sample - 2,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.cfg.sample_break_mode,
            document_sep_len=0,
        )
        logger.info("loaded {} blocks from: {}".format(len(blocks), split_path))

        # <s> in front (equiv. to [CLS] in BERT), </s> at the end.
        with_bos = PrependTokenDataset(blocks, self.source_dictionary.bos())
        return AppendTokenDataset(with_bos, self.source_dictionary.eos())

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        dataset = self._load_dataset_split(split, epoch, combine)

        if self.cfg.mask_length != "subword":
            mask_whole_words = get_whole_word_mask(
                self.cfg.bpe, self.source_dictionary
            )
        else:
            mask_whole_words = None

        noise_options = dict(
            shuffle=self.cfg.shuffle_instance,
            seed=self.cfg.seed,
            mask=self.cfg.mask,
            mask_random=self.cfg.mask_random,
            insert=self.cfg.insert,
            rotate=self.cfg.rotate,
            permute_sentences=self.cfg.permute_sentences,
            bpe=self.cfg.bpe,
            replace_length=self.cfg.replace_length,
            mask_length=self.cfg.mask_length,
            poisson_lambda=self.cfg.poisson_lambda,
        )
        self.datasets[split] = DenoisingDataset(
            dataset,
            dataset.sizes,
            self.dictionary,
            self.mask_idx,
            mask_whole_words,
            **noise_options,
        )
        num_samples = len(self.datasets[split])
        logger.info(
            "Split: {0}, Loaded {1} samples of denoising_dataset".format(
                split,
                num_samples,
            )
        )

    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
        """
        Generate batches for inference. We assume that the input begins with a
        bos symbol (`<s>`) and ends with an eos symbol (`</s>`).
        """
        pad_idx = self.source_dictionary.pad()
        eos_idx = self.source_dictionary.eos()
        blocks = TokenBlockDataset(
            src_tokens,
            src_lengths,
            block_size=self.cfg.tokens_per_sample - 2,  # for <s> and </s>
            pad=pad_idx,
            eos=eos_idx,
            break_mode=self.cfg.sample_break_mode,
            document_sep_len=0,
        )
        # Decoder input: strip eos tokens, then put one eos in front.
        without_eos = StripTokenDataset(blocks, eos_idx)
        prev_output_tokens = PrependTokenDataset(without_eos, eos_idx)
        padded_src = PadDataset(blocks, pad_idx=pad_idx, left_pad=False)
        net_input = {
            "src_tokens": padded_src,
            "src_lengths": NumelDataset(padded_src, reduce=False),
            "prev_output_tokens": PadDataset(
                prev_output_tokens, pad_idx=pad_idx, left_pad=False
            ),
        }
        return NestedDictionaryDataset(
            {
                "id": IdDataset(),
                "net_input": net_input,
                "target": padded_src,
            },
            sizes=[np.array(src_lengths)],
        )

    def max_positions(self):
        """Return the max sentence length allowed by the task."""
        return (self.cfg.max_source_positions, self.cfg.max_target_positions)

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary`."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Return the target :class:`~fairseq.data.Dictionary`."""
        return self.dictionary
class StatefulContainer(object):
    """
    Lazily populated attribute store for checkpointable task state.

    Values live in ``_state``. A value that is not present yet is created on
    first attribute access by calling the factory registered under the same
    name, and is then cached in ``_state``.
    """

    def __init__(self):
        self._state = dict()
        self._factories = dict()

    def add_factory(self, name, factory: Callable[[], Any]):
        """Register a zero-argument *factory* that builds attribute *name*."""
        self._factories[name] = factory

    def merge_state_dict(self, state_dict: Dict[str, Any]):
        """Overlay *state_dict* onto the stored state (existing keys are replaced)."""
        self._state.update(state_dict)

    @property
    def state_dict(self) -> Dict[str, Any]:
        """Return the live state mapping (not a copy)."""
        return self._state

    def __getattr__(self, name):
        """
        Resolve *name* from the stored state, building it on first use.

        Raises:
            AttributeError: if *name* is neither stored nor has a factory
        """
        # __getattr__ only runs when normal lookup fails. If the backing
        # dicts themselves are missing (instance created without __init__,
        # e.g. while copy/pickle reconstructs it), touching self._state
        # below would re-enter this method forever. Fail fast instead.
        if name in ("_state", "_factories"):
            raise AttributeError(name)

        if name not in self._state and name in self._factories:
            self._state[name] = self._factories[name]()

        if name in self._state:
            return self._state[name]

        raise AttributeError(f"Task state has no factory for attribute {name}")
class FairseqTask(object):
    """
    Tasks store dictionaries and provide helpers for loading/iterating over
    Datasets, initializing the Model/Criterion and calculating the loss.

    Tasks have limited statefulness. In particular, state that needs to be
    saved to/loaded from checkpoints needs to be stored in the `self.state`
    :class:`StatefulContainer` object. For example::

        self.state.add_factory("dictionary", self.load_dictionary)
        print(self.state.dictionary)  # calls self.load_dictionary()

    This is necessary so that when loading checkpoints, we can properly
    recreate the task state after initializing the task instance.
    """

    @classmethod
    def add_args(cls, parser):
        """Add task-specific arguments to the parser."""
        dc = getattr(cls, "__dataclass", None)
        if dc is not None:
            gen_parser_from_dataclass(parser, dc())

    @staticmethod
    def logging_outputs_can_be_summed(criterion) -> bool:
        """
        Whether the logging outputs returned by `train_step` and `valid_step` can
        be summed across workers prior to calling `aggregate_logging_outputs`.
        Setting this to True will improve distributed training speed.
        """
        return criterion.logging_outputs_can_be_summed()

    def __init__(self, cfg: FairseqDataclass, **kwargs):
        """Store *cfg* and create empty dataset, iterator-cache and state holders."""
        self.cfg = cfg
        self.datasets = dict()
        self.dataset_to_epoch_iter = dict()
        self.state = StatefulContainer()

    @classmethod
    def load_dictionary(cls, filename):
        """Load the dictionary from the filename

        Args:
            filename (str): the filename
        """
        return Dictionary.load(filename)

    @classmethod
    def build_dictionary(
        cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
    ):
        """Build the dictionary

        Args:
            filenames (list): list of filenames
            workers (int): number of concurrent workers
            threshold (int): defines the minimum word count
            nwords (int): defines the total number of words in the final dictionary,
                including special symbols
            padding_factor (int): can be used to pad the dictionary size to be a
                multiple of 8, which is important on some hardware (e.g., Nvidia
                Tensor Cores).
        """
        d = Dictionary()
        for filename in filenames:
            Dictionary.add_file_to_dictionary(
                filename, d, tokenizer.tokenize_line, workers
            )
        d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
        return d

    @classmethod
    def setup_task(cls, cfg: DictConfig, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            cfg (omegaconf.DictConfig): parsed command-line arguments
        """
        return cls(cfg, **kwargs)

    def has_sharded_data(self, split):
        """
        Return True when the configured ``data`` value contains ``os.pathsep``.

        *split* is accepted for interface compatibility and not used here.
        """
        # ``data`` may be absent or explicitly None; both mean "not sharded"
        # rather than a TypeError from the ``in`` test.
        data = getattr(self.cfg, "data", None) or ""
        return os.pathsep in data

    def load_dataset(
        self,
        split: str,
        combine: bool = False,
        task_cfg: FairseqDataclass = None,
        **kwargs,
    ):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
            combine (bool): combines a split segmented into pieces into one dataset
            task_cfg (FairseqDataclass): optional task configuration stored in the checkpoint that can be used
                                         to load datasets
        """
        raise NotImplementedError

    def dataset(self, split):
        """
        Return a loaded dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)

        Returns:
            a :class:`~fairseq.data.FairseqDataset` corresponding to *split*

        Raises:
            KeyError: if *split* has not been loaded
            TypeError: if the loaded object is not a ``FairseqDataset``
        """
        from fairseq.data import FairseqDataset

        if split not in self.datasets:
            raise KeyError("Dataset not loaded: " + split)
        if not isinstance(self.datasets[split], FairseqDataset):
            raise TypeError("Datasets are expected to be of type FairseqDataset")
        return self.datasets[split]

    def filter_indices_by_size(
        self, indices, dataset, max_positions=None, ignore_invalid_inputs=False
    ):
        """
        Filter examples that are too large

        Args:
            indices (np.array): original array of sample indices
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
        Returns:
            np.array: array of filtered sample indices

        Raises:
            Exception: if any sample is filtered out and
                *ignore_invalid_inputs* is False
        """
        indices, ignored = dataset.filter_indices_by_size(indices, max_positions)
        if len(ignored) > 0:
            if not ignore_invalid_inputs:
                raise Exception(
                    (
                        "Size of sample #{} is invalid (={}) since max_positions={}, "
                        "skip this example with --skip-invalid-size-inputs-valid-test"
                    ).format(ignored[0], dataset.size(ignored[0]), max_positions)
                )
            logger.warning(
                (
                    "{:,} samples have invalid sizes and will be skipped, "
                    "max_positions={}, first few sample ids={}"
                ).format(len(ignored), max_positions, ignored[:10])
            )
        return indices

    def can_reuse_epoch_itr(self, dataset):
        """Return the dataset's ``can_reuse_epoch_itr_across_epochs`` flag (False if absent)."""
        # We can reuse the epoch iterator across epochs as long as the dataset
        # hasn't disabled it. We default to ``False`` here, although in practice
        # this will be ``True`` for most datasets that inherit from
        # ``FairseqDataset`` due to the base implementation there.
        return getattr(dataset, "can_reuse_epoch_itr_across_epochs", False)

    def get_batch_iterator(
        self,
        dataset,
        max_tokens=None,
        max_sentences=None,
        max_positions=None,
        ignore_invalid_inputs=False,
        required_batch_size_multiple=1,
        seed=1,
        num_shards=1,
        shard_id=0,
        num_workers=0,
        epoch=1,
        data_buffer_size=0,
        disable_iterator_cache=False,
        skip_remainder_batch=False,
        grouped_shuffling=False,
        update_epoch_batch_itr=False,
    ):
        """
        Get an iterator that yields batches of data from the given dataset.

        Args:
            dataset (~fairseq.data.FairseqDataset): dataset to batch
            max_tokens (int, optional): max number of tokens in each batch
                (default: None).
            max_sentences (int, optional): max number of sentences in each
                batch (default: None).
            max_positions (optional): max sentence length supported by the
                model (default: None).
            ignore_invalid_inputs (bool, optional): don't raise Exception for
                sentences that are too long (default: False).
            required_batch_size_multiple (int, optional): require batch size to
                be a multiple of N (default: 1).
            seed (int, optional): seed for random number generator for
                reproducibility (default: 1).
            num_shards (int, optional): shard the data iterator into N
                shards (default: 1).
            shard_id (int, optional): which shard of the data iterator to
                return (default: 0).
            num_workers (int, optional): how many subprocesses to use for data
                loading. 0 means the data will be loaded in the main process
                (default: 0).
            epoch (int, optional): the epoch to start the iterator from
                (default: 1).
            data_buffer_size (int, optional): number of batches to
                preload (default: 0).
            disable_iterator_cache (bool, optional): don't cache the
                EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`)
                (default: False).
            skip_remainder_batch (bool, optional): if set, discard the last
                batch in each training epoch, as the last batch is often smaller than
                    local_batch_size * distributed_world_size (default: ``False``).
            grouped_shuffling (bool, optional): group batches with each groups
                containing num_shards batches and shuffle groups. Reduces difference
                between sequence lengths among workers for batches sorted by length.
            update_epoch_batch_itr (bool, optional): if true then do not use the
                cached batch iterator for the epoch (default: False).

        Returns:
            ~fairseq.iterators.EpochBatchIterator: a batched iterator over the
                given dataset split
        """
        can_reuse_epoch_itr = (
            not disable_iterator_cache
            and not update_epoch_batch_itr
            and self.can_reuse_epoch_itr(dataset)
        )
        logger.info(f"can_reuse_epoch_itr = {can_reuse_epoch_itr}")
        if can_reuse_epoch_itr and dataset in self.dataset_to_epoch_iter:
            logger.debug("reusing EpochBatchIterator for epoch {}".format(epoch))
            return self.dataset_to_epoch_iter[dataset]

        assert isinstance(dataset, FairseqDataset)

        # initialize the dataset with the correct starting epoch
        dataset.set_epoch(epoch)

        def make_batches(dataset, epoch):
            logger.info(f"creating new batches for epoch {epoch}")

            # get indices ordered by example size
            with data_utils.numpy_seed(seed + epoch):
                indices = dataset.ordered_indices()

            # filter examples that are too large
            if max_positions is not None:
                indices = self.filter_indices_by_size(
                    indices, dataset, max_positions, ignore_invalid_inputs
                )

            # create mini-batches with given size constraints
            batches = dataset.batch_by_size(
                indices,
                max_tokens=max_tokens,
                max_sentences=max_sentences,
                required_batch_size_multiple=required_batch_size_multiple,
            )
            return batches

        reuse_dataloader = getattr(self.cfg, "reuse_dataloader", True)
        persistent_workers = getattr(self.cfg, "persistent_workers", True)
        rebuild_batches = getattr(self.cfg, "rebuild_batches", False)
        logger.info(f"reuse_dataloader = {reuse_dataloader}")
        logger.info(f"rebuild_batches = {rebuild_batches}")

        if rebuild_batches:
            logger.info("batches will be rebuilt for each epoch")
            # hand the callable over so the iterator can re-batch per epoch
            batch_sampler = make_batches
        else:
            batch_sampler = make_batches(dataset, epoch)

        # return a reusable, sharded iterator
        epoch_iter = iterators.EpochBatchIterator(
            dataset=dataset,
            collate_fn=dataset.collater,
            batch_sampler=batch_sampler,
            seed=seed,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            epoch=epoch,
            buffer_size=data_buffer_size,
            skip_remainder_batch=skip_remainder_batch,
            grouped_shuffling=grouped_shuffling,
            reuse_dataloader=reuse_dataloader,
            persistent_workers=persistent_workers,
        )

        if can_reuse_epoch_itr:
            self.dataset_to_epoch_iter[dataset] = epoch_iter

        return epoch_iter

    def build_model(self, cfg: FairseqDataclass, from_checkpoint=False):
        """
        Build the :class:`~fairseq.models.BaseFairseqModel` instance for this
        task.

        Args:
            cfg (FairseqDataclass): configuration object
            from_checkpoint (bool): forwarded to ``models.build_model``

        Returns:
            a :class:`~fairseq.models.BaseFairseqModel` instance
        """
        from fairseq import models, quantization_utils

        model = models.build_model(cfg, self, from_checkpoint)
        model = quantization_utils.quantize_model_scalar(model, cfg)
        return model

    def build_criterion(self, cfg: DictConfig, from_checkpoint=False):
        """
        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
        this task.

        Args:
            cfg (omegaconf.DictConfig): configuration object
            from_checkpoint (bool): forwarded to ``criterions.build_criterion``

        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions

        return criterions.build_criterion(cfg, self, from_checkpoint=from_checkpoint)

    def build_generator(
        self,
        models,
        args,
        seq_gen_cls=None,
        extra_gen_cls_kwargs=None,
        prefix_allowed_tokens_fn=None,
    ):
        """
        Build a :class:`~fairseq.SequenceGenerator` instance for this
        task.

        Args:
            models (List[~fairseq.models.FairseqModel]): ensemble of models
            args (fairseq.dataclass.configs.GenerationConfig):
                configuration object (dataclass) for generation
            seq_gen_cls: generator class to instantiate. When None,
                ``SequenceGeneratorWithAlignment`` is used if
                ``args.print_alignment`` is set, otherwise ``SequenceGenerator``.
            extra_gen_cls_kwargs (Dict[str, Any]): extra options to pass
                through to SequenceGenerator. The mapping is copied, never
                modified in place.
            prefix_allowed_tokens_fn (Callable[[int, torch.Tensor], List[int]]):
                If provided, this function constrains the beam search to
                allowed tokens only at each step. The provided function
                should take 2 arguments: the batch ID (`batch_id: int`)
                and a unidimensional tensor of token ids (`inputs_ids:
                torch.Tensor`). It has to return a `List[int]` with the
                allowed tokens for the next generation step conditioned
                on the previously generated tokens (`inputs_ids`) and
                the batch ID (`batch_id`). This argument is useful for
                constrained generation conditioned on the prefix, as
                described in "Autoregressive Entity Retrieval"
                (https://arxiv.org/abs/2010.00904) and
                https://github.com/facebookresearch/GENRE.

        Raises:
            ValueError: if more than one of the mutually exclusive search
                options is enabled
        """
        if getattr(args, "score_reference", False):
            from fairseq.sequence_scorer import SequenceScorer

            return SequenceScorer(
                self.target_dictionary,
                compute_alignment=getattr(args, "print_alignment", False),
            )

        from fairseq.sequence_generator import (
            SequenceGenerator,
            SequenceGeneratorWithAlignment,
        )

        # Choose search strategy. Defaults to Beam Search.
        sampling = getattr(args, "sampling", False)
        sampling_topk = getattr(args, "sampling_topk", -1)
        sampling_topp = getattr(args, "sampling_topp", -1.0)
        diverse_beam_groups = getattr(args, "diverse_beam_groups", -1)
        diverse_beam_strength = getattr(args, "diverse_beam_strength", 0.5)
        match_source_len = getattr(args, "match_source_len", False)
        diversity_rate = getattr(args, "diversity_rate", -1)
        constrained = getattr(args, "constraints", False)
        if prefix_allowed_tokens_fn is None:
            prefix_allowed_tokens_fn = getattr(args, "prefix_allowed_tokens_fn", None)
        if (
            sum(
                int(cond)
                for cond in [
                    sampling,
                    diverse_beam_groups > 0,
                    match_source_len,
                    diversity_rate > 0,
                ]
            )
            > 1
        ):
            raise ValueError("Provided Search parameters are mutually exclusive.")
        assert sampling_topk < 0 or sampling, "--sampling-topk requires --sampling"
        assert sampling_topp < 0 or sampling, "--sampling-topp requires --sampling"

        if sampling:
            search_strategy = search.Sampling(
                self.target_dictionary, sampling_topk, sampling_topp
            )
        elif diverse_beam_groups > 0:
            search_strategy = search.DiverseBeamSearch(
                self.target_dictionary, diverse_beam_groups, diverse_beam_strength
            )
        elif match_source_len:
            # this is useful for tagging applications where the output
            # length should match the input length, so we hardcode the
            # length constraints for simplicity
            search_strategy = search.LengthConstrainedBeamSearch(
                self.target_dictionary,
                min_len_a=1,
                min_len_b=0,
                max_len_a=1,
                max_len_b=0,
            )
        # NOTE(review): the exclusivity check above tests ``diversity_rate > 0``
        # while this branch tests ``> -1``; kept as-is, confirm which is intended.
        elif diversity_rate > -1:
            search_strategy = search.DiverseSiblingsSearch(
                self.target_dictionary, diversity_rate
            )
        elif constrained:
            search_strategy = search.LexicallyConstrainedBeamSearch(
                self.target_dictionary, args.constraints
            )
        elif prefix_allowed_tokens_fn:
            search_strategy = search.PrefixConstrainedBeamSearch(
                self.target_dictionary, prefix_allowed_tokens_fn
            )
        else:
            search_strategy = search.BeamSearch(self.target_dictionary)

        # Work on a copy: "print_alignment" may be added below and the
        # caller's dict must not be modified behind its back.
        extra_gen_cls_kwargs = dict(extra_gen_cls_kwargs or {})
        if seq_gen_cls is None:
            if getattr(args, "print_alignment", False):
                seq_gen_cls = SequenceGeneratorWithAlignment
                extra_gen_cls_kwargs["print_alignment"] = args.print_alignment
            else:
                seq_gen_cls = SequenceGenerator

        return seq_gen_cls(
            models,
            self.target_dictionary,
            beam_size=getattr(args, "beam", 5),
            max_len_a=getattr(args, "max_len_a", 0),
            max_len_b=getattr(args, "max_len_b", 200),
            min_len=getattr(args, "min_len", 1),
            normalize_scores=(not getattr(args, "unnormalized", False)),
            len_penalty=getattr(args, "lenpen", 1),
            unk_penalty=getattr(args, "unkpen", 0),
            temperature=getattr(args, "temperature", 1.0),
            match_source_len=getattr(args, "match_source_len", False),
            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
            search_strategy=search_strategy,
            **extra_gen_cls_kwargs,
        )

    def train_step(
        self, sample, model, criterion, optimizer, update_num, ignore_grad=False
    ):
        """
        Do forward and backward, and return the loss as computed by *criterion*
        for the given *model* and *sample*.

        Args:
            sample (dict): the mini-batch. The format is defined by the
                :class:`~fairseq.data.FairseqDataset`.
            model (~fairseq.models.BaseFairseqModel): the model
            criterion (~fairseq.criterions.FairseqCriterion): the criterion
            optimizer (~fairseq.optim.FairseqOptimizer): the optimizer
            update_num (int): the current update
            ignore_grad (bool): multiply loss by 0 if this is set to True

        Returns:
            tuple:
                - the loss
                - the sample size, which is used as the denominator for the
                  gradient
                - logging outputs to display while training
        """
        model.train()
        model.set_num_updates(update_num)
        with torch.autograd.profiler.record_function("forward"):
            # autocast is enabled only when the optimizer is an AMPOptimizer
            with torch.cuda.amp.autocast(enabled=(isinstance(optimizer, AMPOptimizer))):
                loss, sample_size, logging_output = criterion(model, sample)
        if ignore_grad:
            loss *= 0
        with torch.autograd.profiler.record_function("backward"):
            optimizer.backward(loss)
        return loss, sample_size, logging_output

    def valid_step(self, sample, model, criterion):
        """
        Put *model* in eval mode and evaluate *criterion* on *sample* with
        gradients disabled.

        Returns:
            tuple: the loss, the sample size and the logging outputs
        """
        model.eval()
        with torch.no_grad():
            loss, sample_size, logging_output = criterion(model, sample)
        return loss, sample_size, logging_output

    def optimizer_step(self, optimizer, model, update_num):
        """Call ``optimizer.step()``; *model* and *update_num* are unused here."""
        optimizer.step()

    def build_dataset_for_inference(
        self, src_tokens: List[torch.Tensor], src_lengths: List[int], **kwargs
    ) -> torch.utils.data.Dataset:
        """Build a dataset from raw inputs for inference; must be overridden."""
        raise NotImplementedError

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Run ``generator.generate`` on *sample* with gradients disabled."""
        with torch.no_grad():
            return generator.generate(
                models, sample, prefix_tokens=prefix_tokens, constraints=constraints
            )

    def begin_epoch(self, epoch, model):
        """Hook function called before the start of each epoch."""
        pass

    def begin_valid_epoch(self, epoch, model):
        """Hook function called before the start of each validation epoch."""
        pass

    def aggregate_logging_outputs(self, logging_outputs, criterion):
        """[deprecated] Aggregate logging outputs from data parallel training."""
        utils.deprecation_warning(
            "The aggregate_logging_outputs API is deprecated. "
            "Please use the reduce_metrics API instead."
        )
        with metrics.aggregate() as agg:
            self.reduce_metrics(logging_outputs, criterion)
            return agg.get_smoothed_values()

    def reduce_metrics(self, logging_outputs, criterion):
        """Aggregate logging outputs from data parallel training."""
        # backward compatibility for tasks that override aggregate_logging_outputs
        base_func = FairseqTask.aggregate_logging_outputs
        self_func = getattr(self, "aggregate_logging_outputs").__func__
        if self_func is not base_func:
            utils.deprecation_warning(
                "Tasks should implement the reduce_metrics API. "
                "Falling back to deprecated aggregate_logging_outputs API."
            )
            agg_logging_outputs = self.aggregate_logging_outputs(
                logging_outputs, criterion
            )
            for k, v in agg_logging_outputs.items():
                metrics.log_scalar(k, v)
            return

        if not any("ntokens" in log for log in logging_outputs):
            warnings.warn(
                "ntokens not found in Criterion logging outputs, cannot log wpb or wps"
            )
        else:
            ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
            metrics.log_scalar("wpb", ntokens, priority=180, round=1)
            metrics.log_speed("wps", ntokens, priority=90, round=1)

        if not any("nsentences" in log for log in logging_outputs):
            warnings.warn(
                "nsentences not found in Criterion logging outputs, cannot log bsz"
            )
        else:
            nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
            metrics.log_scalar("bsz", nsentences, priority=190, round=1)

        criterion.__class__.reduce_metrics(logging_outputs)

    def state_dict(self):
        """Return the task state mapping, or an empty dict if ``self.state`` is None."""
        if self.state is not None:
            return self.state.state_dict
        return {}

    def load_state_dict(self, state_dict: Dict[str, Any]):
        """Merge *state_dict* into ``self.state`` (no-op if ``self.state`` is None)."""
        if self.state is not None:
            self.state.merge_state_dict(state_dict)

    def max_positions(self):
        """Return the max input length allowed by the task."""
        return None

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None

    @property
    def target_dictionary(self):
        """Return the target :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None

    def build_tokenizer(self, args):
        """Build the pre-tokenizer for this task."""
        return encoders.build_tokenizer(args)

    def build_bpe(self, args):
        """Build the tokenizer for this task."""
        return encoders.build_bpe(args)

    def get_interactive_tokens_and_lengths(self, lines, encode_fn):
        """
        Encode each line with *encode_fn* and the source dictionary.

        Returns:
            tuple: the list of encoded token tensors and the list of their
            element counts
        """
        tokens = [
            self.source_dictionary.encode_line(
                encode_fn(src_str), add_if_not_exist=False
            ).long()
            for src_str in lines
        ]
        lengths = [t.numel() for t in tokens]
        return tokens, lengths
class LegacyFairseqTask(FairseqTask):
    """
    Variant of :class:`FairseqTask` configured from an ``argparse.Namespace``.

    The namespace is kept on ``self.args``; the base class is initialised
    with ``cfg=None``.
    """

    def __init__(self, args: Namespace):
        super().__init__(None)
        self.args = args
        self.datasets = {}
        self.dataset_to_epoch_iter = {}

    @classmethod
    def setup_task(cls, args: Namespace, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        return cls(args, **kwargs)

    def has_sharded_data(self, split):
        """
        Return True when ``args.data`` contains ``os.pathsep``.

        *split* is accepted for interface compatibility and not used here.
        """
        # ``data`` may be absent or explicitly None; both mean "not sharded"
        # rather than a TypeError from the ``in`` test.
        data = getattr(self.args, "data", None) or ""
        return os.pathsep in data

    def build_model(self, args: Namespace, from_checkpoint=False):
        """
        Build the :class:`~fairseq.models.BaseFairseqModel` instance for this
        task.

        Args:
            args (argparse.Namespace): parsed command-line arguments
            from_checkpoint (bool): forwarded to ``models.build_model``

        Returns:
            a :class:`~fairseq.models.BaseFairseqModel` instance
        """
        from fairseq import models, quantization_utils

        model = models.build_model(args, self, from_checkpoint)
        model = quantization_utils.quantize_model_scalar(model, args)
        return model

    def build_criterion(self, args: Namespace):
        """
        Build the :class:`~fairseq.criterions.FairseqCriterion` instance for
        this task.

        Args:
            args (argparse.Namespace): parsed command-line arguments

        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions

        return criterions.build_criterion(args, self)
@register_task("frm_text_to_speech")
class FrmTextToSpeechTask(TextToSpeechTask):
    """Text-to-speech task whose splits are built by ``FrmTextToSpeechDatasetCreator``."""

    @staticmethod
    def add_args(parser):
        """Register the parent task's options, then the options added by this task."""
        TextToSpeechTask.add_args(parser)
        # (flag, add_argument keyword arguments), in registration order
        extra_options = (
            ("--do_chunk", {"action": "store_true", "help": "train on chunks"}),
            ("--chunk_bound", {"default": -1, "type": int}),
            ("--chunk_init", {"default": 50, "type": int}),
            ("--chunk_incr", {"default": 5, "type": int}),
            ("--add_eos", {"action": "store_true"}),
            ("--dedup", {"action": "store_true"}),
            ("--ref_fpu", {"default": -1, "type": float}),
        )
        for flag, options in extra_options:
            parser.add_argument(flag, **options)

    def load_dataset(self, split, **unused_kwargs):
        """
        Build the dataset for *split* from its TSV manifest and store it in
        ``self.datasets[split]``.

        Args:
            split (str): split name; names starting with ``"train"`` are
                flagged as training splits
            **unused_kwargs: accepted and ignored
        """
        args = self.args
        training = split.startswith("train")
        pre_tokenizer = self.build_tokenizer(args)
        bpe_tokenizer = self.build_bpe(args)
        # command-line options forwarded verbatim under the same keyword name
        forwarded = (
            "do_chunk",
            "chunk_bound",
            "chunk_init",
            "chunk_incr",
            "add_eos",
            "dedup",
            "ref_fpu",
        )
        self.datasets[split] = FrmTextToSpeechDatasetCreator.from_tsv(
            args.data,
            self.data_cfg,
            split,
            self.src_dict,
            pre_tokenizer,
            bpe_tokenizer,
            is_train_split=training,
            n_frames_per_step=args.n_frames_per_step,
            speaker_to_id=self.speaker_to_id,
            **{option: getattr(args, option) for option in forwarded},
        )
class LabelEncoder(object):
    """Callable that encodes a label string with a fixed dictionary."""

    def __init__(self, dictionary: Dictionary) -> None:
        self.dictionary = dictionary

    def __call__(self, label: str) -> List[str]:
        """
        Encode *label* without appending eos and without adding unseen
        symbols to the dictionary.

        NOTE(review): the value returned is whatever ``encode_line`` yields;
        the ``List[str]`` annotation is inherited as-is -- confirm.
        """
        encode_options = {"append_eos": False, "add_if_not_exist": False}
        return self.dictionary.encode_line(label, **encode_options)
@dataclass
class HubertPretrainingConfig(FairseqDataclass):
    """Options of the ``hubert_pretraining`` task (see each field's help text)."""

    data: str = field(default=MISSING, metadata={"help": "path to data directory"})
    fine_tuning: bool = field(
        default=False, metadata={"help": "set to true if fine-tuning Hubert"}
    )
    labels: List[str] = field(
        default_factory=lambda: ["ltr"],
        metadata={
            "help": (
                "extension of the label files to load, frame-level labels for"
                " pre-training, and sequence-level label for fine-tuning"
            )
        },
    )
    label_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "if set, looks for labels in this directory instead",
        },
    )
    label_rate: float = field(
        default=-1.0,
        metadata={"help": "label frame rate. -1.0 for sequence label"},
    )
    sample_rate: int = field(
        default=16_000,
        metadata={
            "help": "target sample rate. audio files will be up/down "
            "sampled to this rate"
        },
    )
    normalize: bool = field(
        default=False,
        metadata={"help": "if set, normalizes input to have 0 mean and unit variance"},
    )
    enable_padding: bool = field(
        default=False,
        metadata={"help": "pad shorter samples instead of cropping"},
    )
    max_keep_size: Optional[int] = field(
        default=None,
        metadata={"help": "exclude sample longer than this"},
    )
    max_sample_size: Optional[int] = field(
        default=None,
        metadata={"help": "max sample size to crop to for batching"},
    )
    min_sample_size: Optional[int] = field(
        default=None,
        metadata={"help": "min sample size to crop to for batching"},
    )
    single_target: Optional[bool] = field(
        default=False,
        metadata={
            "help": "if set, AddTargetDatasets outputs same keys "
            "as AddTargetDataset"
        },
    )
    random_crop: Optional[bool] = field(
        default=True,
        metadata={"help": "always crop from the beginning if false"},
    )
    pad_audio: Optional[bool] = field(
        default=False,
        metadata={"help": "pad audio to the longest one in the batch if true"},
    )


@register_task("hubert_pretraining", dataclass=HubertPretrainingConfig)
class HubertPretrainingTask(FairseqTask):
    """
    Task that builds :class:`HubertDataset` splits together with their label
    dictionaries.

    When fine-tuning, a single dictionary is registered as the
    ``target_dictionary`` state entry; otherwise a list of dictionaries is
    registered as ``dictionaries``. Both are loaded lazily on first access.
    """

    cfg: HubertPretrainingConfig

    def __init__(
        self,
        cfg: HubertPretrainingConfig,
    ) -> None:
        super().__init__(cfg)

        logger.info(f"current directory is {os.getcwd()}")
        logger.info(f"HubertPretrainingTask Config {cfg}")

        self.cfg = cfg
        self.fine_tuning = cfg.fine_tuning

        # Same loader for both modes; only the state key it fills differs.
        if cfg.fine_tuning:
            self.state.add_factory("target_dictionary", self.load_dictionaries)
        else:
            self.state.add_factory("dictionaries", self.load_dictionaries)

        self.blank_symbol = "<s>"

    @property
    def source_dictionary(self) -> Optional[Dictionary]:
        """Always None for this task."""
        return None

    @property
    def target_dictionary(self) -> Optional[Dictionary]:
        """Return the ``target_dictionary`` state entry (registered when fine-tuning)."""
        return self.state.target_dictionary

    @property
    def dictionaries(self) -> List[Dictionary]:
        """Return the ``dictionaries`` state entry (registered when not fine-tuning)."""
        return self.state.dictionaries

    @classmethod
    def setup_task(
        cls, cfg: HubertPretrainingConfig, **kwargs
    ) -> "HubertPretrainingTask":
        """Instantiate the task from *cfg*; extra keyword arguments are ignored."""
        return cls(cfg)

    def load_dictionaries(self):
        """
        Load ``dict.<label>.txt`` from the label directory for every
        configured label.

        Returns:
            the first dictionary when fine-tuning, otherwise the full list
        """
        label_dir = self.get_label_dir()
        dictionaries = [
            Dictionary.load(f"{label_dir}/dict.{label}.txt")
            for label in self.cfg.labels
        ]
        return dictionaries[0] if self.cfg.fine_tuning else dictionaries

    def get_label_dir(self) -> str:
        """Return ``cfg.label_dir``, falling back to ``cfg.data`` when it is None."""
        if self.cfg.label_dir is None:
            return self.cfg.data
        return self.cfg.label_dir

    def load_dataset(self, split: str, **kwargs) -> None:
        """
        Build the :class:`HubertDataset` for *split* and store it in
        ``self.datasets[split]``.

        The manifest is ``<cfg.data>/<split>.tsv``; label files are
        ``<label dir>/<split>.<label>`` for each configured label.
        """
        manifest = f"{self.cfg.data}/{split}.tsv"
        dicts = [self.target_dictionary] if self.cfg.fine_tuning else self.dictionaries
        pad_list = [dictionary.pad() for dictionary in dicts]
        eos_list = [dictionary.eos() for dictionary in dicts]
        procs = [LabelEncoder(dictionary) for dictionary in dicts]
        label_dir = self.get_label_dir()
        paths = [f"{label_dir}/{split}.{label}" for label in self.cfg.labels]

        # hubert v1: pad_audio=True, random_crop=False;
        self.datasets[split] = HubertDataset(
            manifest,
            sample_rate=self.cfg.sample_rate,
            label_paths=paths,
            label_rates=self.cfg.label_rate,
            pad_list=pad_list,
            eos_list=eos_list,
            label_processors=procs,
            max_keep_sample_size=self.cfg.max_keep_size,
            min_keep_sample_size=self.cfg.min_sample_size,
            max_sample_size=self.cfg.max_sample_size,
            pad_audio=self.cfg.pad_audio,
            normalize=self.cfg.normalize,
            store_labels=False,
            random_crop=self.cfg.random_crop,
            single_target=self.cfg.single_target,
        )

    def max_positions(self) -> Tuple[int, int]:
        """Return ``(sys.maxsize, sys.maxsize)``, i.e. no length limit from the task."""
        return (sys.maxsize, sys.maxsize)

    def filter_indices_by_size(self, indices: np.array, *args, **kwargs) -> np.array:
        """Return *indices* unchanged; this task does no size filtering here."""
        return indices
import logging
import os
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
import torch
from fairseq import utils
from fairseq.data import (
    AppendTokenDataset,
    Dictionary,
    IdDataset,
    LMContextWindowDataset,
    MonolingualDataset,
    NestedDictionaryDataset,
    NumelDataset,
    PadDataset,
    PrependTokenDataset,
    StripTokenDataset,
    TokenBlockDataset,
    TruncatedDictionary,
    data_utils,
)
from fairseq.data.indexed_dataset import get_available_dataset_impl
from fairseq.data.shorten_dataset import maybe_shorten_dataset
from fairseq.dataclass import ChoiceEnum, FairseqDataclass
from fairseq.tasks import LegacyFairseqTask, register_task
from omegaconf import II


# Allowed values for the `sample_break_mode` / `shorten_method` options below.
SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"])
SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"])
logger = logging.getLogger(__name__)


# Options for the "language_modeling" task. Fields declared with II(...) are
# interpolated from the named entries of the top-level config.
@dataclass
class LanguageModelingConfig(FairseqDataclass):
    data: Optional[str] = field(
        default=None, metadata={"help": "path to data directory"}
    )
    sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field(
        default="none",
        metadata={
            "help": 'If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.'
        },
    )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    output_dictionary_size: int = field(
        default=-1, metadata={"help": "limit the size of output dictionary"}
    )
    self_target: bool = field(default=False, metadata={"help": "include self target"})
    future_target: bool = field(
        default=False, metadata={"help": "include future target"}
    )
    past_target: bool = field(default=False, metadata={"help": "include past target"})
    add_bos_token: bool = field(
        default=False, metadata={"help": "prepend beginning of sentence token (<s>)"}
    )
    max_target_positions: Optional[int] = field(
        default=None, metadata={"help": "max number of tokens in the target sequence"}
    )
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    pad_to_fixed_length: Optional[bool] = field(
        default=False,
        metadata={"help": "pad to fixed length"},
    )
    pad_to_fixed_bsz: Optional[bool] = field(
        default=False,
        metadata={"help": "boolean to pad to fixed batch size"},
    )

    # TODO common vars below add to parent
    seed: int = II("common.seed")
    batch_size: Optional[int] = II("dataset.batch_size")
    batch_size_valid: Optional[int] = II("dataset.batch_size_valid")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")
    use_plasma_view: bool = II("common.use_plasma_view")
    plasma_path: str = II("common.plasma_path")


@register_task("language_modeling", dataclass=LanguageModelingConfig)
class LanguageModelingTask(LegacyFairseqTask):
    """
    Train a language model.

    Args:
        dictionary (~fairseq.data.Dictionary): the dictionary for the input of
            the language model
        output_dictionary (~fairseq.data.Dictionary): the dictionary for the
            output of the language model. In most cases it will be the same as
            *dictionary*, but could possibly be a more limited version of the
            dictionary (if ``--output-dictionary-size`` is used).
        targets (List[str]): list of the target types that the language model
            should predict.  Can be one of "self", "future", and "past".
            Defaults to "future".

    .. note::

        The language modeling task is compatible with :mod:`fairseq-train`,
        :mod:`fairseq-generate`, :mod:`fairseq-interactive` and
        :mod:`fairseq-eval-lm`.

    The language modeling task provides the following additional command-line
    arguments:

    .. argparse::
        :ref: fairseq.tasks.language_modeling_parser
        :prog:
    """

    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
        """Store the dictionaries and the list of prediction targets.

        ``output_dictionary`` falls back to ``dictionary`` when falsy, and
        ``targets`` falls back to ``["future"]`` when ``None``.
        """
        super().__init__(args)
        self.dictionary = dictionary
        self.output_dictionary = output_dictionary or dictionary

        if targets is None:
            targets = ["future"]
        self.targets = targets

    @classmethod
    def setup_dictionary(cls, args, **kwargs):
        """Load ``dict.txt`` from the first data path, if ``args.data`` is set.

        Returns:
            tuple: ``(dictionary, output_dictionary)``. Both are ``None`` when
            ``args.data`` is empty. The output dictionary is wrapped in a
            :class:`TruncatedDictionary` when
            ``args.output_dictionary_size >= 0``.
        """
        dictionary = None
        output_dictionary = None
        if args.data:
            paths = utils.split_paths(args.data)
            assert len(paths) > 0
            dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
            logger.info("dictionary: {} types".format(len(dictionary)))
            output_dictionary = dictionary
            if args.output_dictionary_size >= 0:
                output_dictionary = TruncatedDictionary(
                    dictionary, args.output_dictionary_size
                )
        return (dictionary, output_dictionary)

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)

        # upgrade old checkpoints
        if getattr(args, "exclude_self_target", False):
            args.self_target = False

        # Collect the requested targets in a fixed order: self, future, past.
        targets = []
        if getattr(args, "self_target", False):
            targets.append("self")
        if getattr(args, "future_target", False):
            targets.append("future")
        if getattr(args, "past_target", False):
            targets.append("past")
        if len(targets) == 0:
            # standard language modeling
            targets = ["future"]

        return cls(args, dictionary, output_dictionary, targets=targets)

    def build_model(self, args, from_checkpoint=False):
        """Build the model and check that it supports every configured target.

        Raises:
            ValueError: if a target in ``self.targets`` is missing from
                ``model.supported_targets``.
        """
        model = super().build_model(args, from_checkpoint)
        for target in self.targets:
            if target not in model.supported_targets:
                raise ValueError(
                    "Unsupported language modeling target: {}".format(target)
                )

        return model

    def load_dataset(
        self, split: str, epoch=1, combine=False, **kwargs
    ) -> None:
        """Load a given dataset split.

        The resulting :class:`MonolingualDataset` is stored in
        ``self.datasets[split]``; nothing is returned.

        Args:
            split (str): name of the split (e.g., train, valid, valid1, test)
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0

        # Data paths are cycled through round-robin by epoch (epoch is 1-based).
        data_path = paths[(epoch - 1) % len(paths)]
        split_path = os.path.join(data_path, split)

        # each process has its own copy of the raw data (likely to be an np.memmap)
        dataset = data_utils.load_indexed_dataset(
            split_path, self.dictionary, self.args.dataset_impl, combine=combine
        )
        if dataset is None:
            raise FileNotFoundError(f"Dataset not found: {split} ({split_path})")

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            self.args.tokens_per_sample,
            self.args.seed,
        )
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            self.args.tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
            use_plasma_view=self.args.use_plasma_view,
            split_path=split_path,
            plasma_path=self.args.plasma_path,
        )

        add_eos_for_other_targets = (
            self.args.sample_break_mode is not None
            and self.args.sample_break_mode != "none"
        )
        fixed_pad_length = None
        if self.args.pad_to_fixed_length:
            fixed_pad_length = self.args.tokens_per_sample

        # Splits whose name contains "valid" use the validation batch size.
        pad_to_bsz = None
        if self.args.pad_to_fixed_bsz:
            pad_to_bsz = (
                self.args.batch_size_valid if "valid" in split else self.args.batch_size
            )

        self.datasets[split] = MonolingualDataset(
            dataset=dataset,
            sizes=dataset.sizes,
            src_vocab=self.dictionary,
            tgt_vocab=self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            add_bos_token=self.args.add_bos_token,
            fixed_pad_length=fixed_pad_length,
            pad_to_bsz=pad_to_bsz,
        )

    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
        """
        Generate batches for inference. We prepend an eos token to src_tokens
        (or bos if `--add-bos-token` is set) and we append a <pad> to target.
        This is convenient both for generation with a prefix and LM scoring.
        """
        dataset = StripTokenDataset(
            TokenBlockDataset(
                src_tokens,
                src_lengths,
                block_size=None,  # ignored for "eos" break mode
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            ),
            # remove eos from (end of) target sequence
            self.source_dictionary.eos(),
        )
        src_dataset = PrependTokenDataset(
            dataset,
            token=(
                self.source_dictionary.bos()
                if getattr(self.args, "add_bos_token", False)
                else self.source_dictionary.eos()
            ),
        )
        tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad())
        return NestedDictionaryDataset(
            {
                "id": IdDataset(),
                "net_input": {
                    "src_tokens": PadDataset(
                        src_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                    ),
                    "src_lengths": NumelDataset(src_dataset, reduce=False),
                },
                "target": PadDataset(
                    tgt_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False
                ),
            },
            sizes=[np.array(src_lengths)],
        )

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Run ``generator.generate`` under ``torch.no_grad()``.

        When ``prefix_tokens`` is not given and the sample has non-empty
        ``src_tokens``, those tokens are used as the prefix, with a leading
        bos/eos column stripped if every row starts with it.

        Raises:
            NotImplementedError: if ``constraints`` is not ``None``.
        """
        with torch.no_grad():
            # Generation will always be conditioned on bos_token
            if getattr(self.args, "add_bos_token", False):
                bos_token = self.source_dictionary.bos()
            else:
                bos_token = self.source_dictionary.eos()

            if constraints is not None:
                raise NotImplementedError(
                    "Constrained decoding with the language_modeling task is not supported"
                )

            # SequenceGenerator doesn't use src_tokens directly, we need to
            # pass the `prefix_tokens` argument instead
            if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
                prefix_tokens = sample["net_input"]["src_tokens"]
                if prefix_tokens[:, 0].eq(bos_token).all():
                    prefix_tokens = prefix_tokens[:, 1:]

            return generator.generate(
                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
            )

    def eval_lm_dataloader(
        self,
        dataset,
        max_tokens: Optional[int] = 36000,
        batch_size: Optional[int] = None,
        max_positions: Optional[int] = None,
        num_shards: int = 1,
        shard_id: int = 0,
        num_workers: int = 1,
        data_buffer_size: int = 10,
        # ensures that every evaluated token has access to a context of at least
        # this size, if possible
        context_window: int = 0,
    ):
        """Return an unshuffled epoch iterator over ``dataset`` for LM evaluation.

        When ``context_window > 0`` the dataset is first wrapped in an
        :class:`LMContextWindowDataset`.
        """
        if context_window > 0:
            dataset = LMContextWindowDataset(
                dataset=dataset,
                tokens_per_sample=self.args.tokens_per_sample,
                context_window=context_window,
                pad_idx=self.source_dictionary.pad(),
            )
        return self.get_batch_iterator(
            dataset=dataset,
            max_tokens=max_tokens,
            max_sentences=batch_size,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            data_buffer_size=data_buffer_size,
        ).next_epoch_itr(shuffle=False)

    @property
    def source_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.output_dictionary


================================================
FILE: fairseq/tasks/legacy_masked_lm.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import itertools import logging import os import numpy as np from fairseq import tokenizer, utils from fairseq.data import ConcatDataset, Dictionary, data_utils, indexed_dataset from fairseq.data.legacy.block_pair_dataset import BlockPairDataset from fairseq.data.legacy.masked_lm_dataset import MaskedLMDataset from fairseq.data.legacy.masked_lm_dictionary import BertDictionary from fairseq.tasks import LegacyFairseqTask, register_task logger = logging.getLogger(__name__) @register_task("legacy_masked_lm") class LegacyMaskedLMTask(LegacyFairseqTask): """ Task for training Masked LM (BERT) model. Args: dictionary (Dictionary): the dictionary for the input of the task """ @staticmethod def add_args(parser): """Add task-specific arguments to the parser.""" parser.add_argument( "data", help="colon separated path to data directories list, \ will be iterated upon during epochs in round-robin manner", ) parser.add_argument( "--tokens-per-sample", default=512, type=int, help="max number of total tokens over all segments" " per sample for BERT dataset", ) parser.add_argument( "--break-mode", default="doc", type=str, help="mode for breaking sentence" ) parser.add_argument("--shuffle-dataset", action="store_true", default=False) def __init__(self, args, dictionary): super().__init__(args) self.dictionary = dictionary self.seed = args.seed @classmethod def load_dictionary(cls, filename): return BertDictionary.load(filename) @classmethod def build_dictionary( cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8 ): d = BertDictionary() for filename in filenames: Dictionary.add_file_to_dictionary( filename, d, tokenizer.tokenize_line, workers ) d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor) return d @property def target_dictionary(self): return self.dictionary @classmethod def setup_task(cls, args, **kwargs): 
"""Setup the task.""" paths = utils.split_paths(args.data) assert len(paths) > 0 dictionary = BertDictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) return cls(args, dictionary) def load_dataset(self, split, epoch=1, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] paths = utils.split_paths(self.args.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] logger.info("data_path", data_path) for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") path = os.path.join(data_path, split_k) ds = indexed_dataset.make_dataset( path, impl=self.args.dataset_impl, fix_lua_indexing=True, dictionary=self.dictionary, ) if ds is None: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) with data_utils.numpy_seed(self.seed + k): loaded_datasets.append( BlockPairDataset( ds, self.dictionary, ds.sizes, self.args.tokens_per_sample, break_mode=self.args.break_mode, doc_break_size=1, ) ) logger.info( "{} {} {} examples".format(data_path, split_k, len(loaded_datasets[-1])) ) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = MaskedLMDataset( dataset=dataset, sizes=sizes, vocab=self.dictionary, pad_idx=self.dictionary.pad(), mask_idx=self.dictionary.mask(), classif_token_idx=self.dictionary.cls(), sep_token_idx=self.dictionary.sep(), shuffle=self.args.shuffle_dataset, seed=self.seed, ) ================================================ FILE: fairseq/tasks/masked_lm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import os from dataclasses import dataclass, field import numpy as np from omegaconf import II, MISSING, OmegaConf from fairseq import utils from fairseq.data import ( Dictionary, IdDataset, MaskTokensDataset, NestedDictionaryDataset, NumelDataset, NumSamplesDataset, PrependTokenDataset, RightPadDataset, RightPaddingMaskDataset, SortDataset, TokenBlockDataset, data_utils, ) from fairseq.data.encoders.utils import get_whole_word_mask from fairseq.data.shorten_dataset import maybe_shorten_dataset from fairseq.dataclass import FairseqDataclass from fairseq.tasks import FairseqTask, register_task from .language_modeling import SAMPLE_BREAK_MODE_CHOICES, SHORTEN_METHOD_CHOICES logger = logging.getLogger(__name__) @dataclass class MaskedLMConfig(FairseqDataclass): data: str = field( default=MISSING, metadata={ "help": "colon separated path to data directories list, \ will be iterated upon during epochs in round-robin manner" }, ) sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( default="none", metadata={ "help": 'If omitted or "none", fills each sample with tokens-per-sample ' 'tokens. If set to "complete", splits samples only at the end ' "of sentence, but may include multiple sentences per sample. " '"complete_doc" is similar but respects doc boundaries. ' 'If set to "eos", includes only one sentence per sample.' 
}, ) tokens_per_sample: int = field( default=1024, metadata={"help": "max number of tokens per sample for LM dataset"}, ) mask_prob: float = field( default=0.15, metadata={"help": "probability of replacing a token with mask"}, ) leave_unmasked_prob: float = field( default=0.1, metadata={"help": "probability that a masked token is unmasked"}, ) random_token_prob: float = field( default=0.1, metadata={"help": "probability of replacing a token with a random token"}, ) freq_weighted_replacement: bool = field( default=False, metadata={"help": "sample random replacement words based on word frequencies"}, ) mask_whole_words: bool = field( default=False, metadata={"help": "mask whole words; you may also want to set --bpe"}, ) mask_multiple_length: int = field( default=1, metadata={"help": "repeat the mask indices multiple times"}, ) mask_stdev: float = field( default=0.0, metadata={"help": "stdev of the mask length"}, ) shorten_method: SHORTEN_METHOD_CHOICES = field( default="none", metadata={ "help": "if not none, shorten sequences that exceed --tokens-per-sample" }, ) shorten_data_split_list: str = field( default="", metadata={ "help": "comma-separated list of dataset splits to apply shortening to, " 'e.g., "train,valid" (default: all dataset splits)' }, ) seed: int = II("common.seed") include_target_tokens: bool = field( default=False, metadata={ "help": "include target tokens in model input. this is used for data2vec" }, ) include_index: bool = field( default=True, metadata={"help": "include index in model input. 
this is used for data2vec"}, ) skip_masking: bool = field( default=False, metadata={"help": "skip masking at dataset"}, ) # subsample_train: float = field( # default=1, # metadata={"help": "shorten training set for debugging"}, # ) d2v2_multi: bool = field( default=False, metadata={"help": "prepare dataset for data2vec_multi"}, ) @register_task("masked_lm", dataclass=MaskedLMConfig) class MaskedLMTask(FairseqTask): cfg: MaskedLMConfig """Task for training masked language models (e.g., BERT, RoBERTa).""" def __init__(self, cfg: MaskedLMConfig, dictionary=None): super().__init__(cfg) self.dictionary = dictionary or self.load_dict(cfg) # add mask token self.mask_idx = self.dictionary.add_symbol("<mask>") @classmethod def setup_task(cls, cfg: MaskedLMConfig, **kwargs): dictionary = cls.load_dict(cfg) return cls(cfg, dictionary) @classmethod def load_dict(cls, cfg): paths = utils.split_paths(cfg.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) return dictionary def _load_dataset_split(self, split, epoch, combine): paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, combine=combine, ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) dataset = maybe_shorten_dataset( dataset, split, self.cfg.shorten_data_split_list, self.cfg.shorten_method, self.cfg.tokens_per_sample, self.cfg.seed, ) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, self.cfg.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode=self.cfg.sample_break_mode, ) logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) # prepend beginning-of-sentence token 
(<s>, equiv. to [CLS] in BERT) return PrependTokenDataset(dataset, self.source_dictionary.bos()) def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ dataset = self._load_dataset_split(split, epoch, combine) # create masked input and targets mask_whole_words = ( get_whole_word_mask(self.args, self.source_dictionary) if self.cfg.mask_whole_words else None ) src_dataset, tgt_dataset = MaskTokensDataset.apply_mask( dataset, self.source_dictionary, pad_idx=self.source_dictionary.pad(), mask_idx=self.mask_idx, seed=self.cfg.seed, mask_prob=self.cfg.mask_prob, leave_unmasked_prob=self.cfg.leave_unmasked_prob, random_token_prob=self.cfg.random_token_prob, freq_weighted_replacement=self.cfg.freq_weighted_replacement, mask_whole_words=mask_whole_words, mask_multiple_length=self.cfg.mask_multiple_length, mask_stdev=self.cfg.mask_stdev, skip_masking=self.cfg.skip_masking, ) with data_utils.numpy_seed(self.cfg.seed): shuffle = np.random.permutation(len(src_dataset)) target_dataset = RightPadDataset( tgt_dataset, pad_idx=self.source_dictionary.pad(), ) if self.cfg.d2v2_multi: dataset = self._d2v2_multi_dataset(src_dataset) else: dataset = self._regular_dataset(src_dataset, target_dataset) self.datasets[split] = SortDataset( dataset, sort_order=[shuffle, src_dataset.sizes] ) def _regular_dataset(self, src_dataset, target_dataset): input_dict = { "src_tokens": RightPadDataset( src_dataset, pad_idx=self.source_dictionary.pad(), ), "src_lengths": NumelDataset(src_dataset, reduce=False), } if self.cfg.include_target_tokens: input_dict["target_tokens"] = target_dataset if self.cfg.include_index: input_dict["src_id"] = IdDataset() dataset = NestedDictionaryDataset( { "id": IdDataset(), "net_input": input_dict, "target": target_dataset, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_dataset, reduce=True), }, sizes=[src_dataset.sizes], ) return dataset def 
_d2v2_multi_dataset(self, src_dataset): input_dict = { "source": RightPadDataset( src_dataset, pad_idx=self.source_dictionary.pad(), ), "id": IdDataset(), "padding_mask": RightPaddingMaskDataset(src_dataset), } dataset = NestedDictionaryDataset( { "id": IdDataset(), "net_input": input_dict, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_dataset, reduce=True), }, sizes=[src_dataset.sizes], ) return dataset def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True): src_dataset = RightPadDataset( TokenBlockDataset( src_tokens, src_lengths, self.cfg.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode="eos", ), pad_idx=self.source_dictionary.pad(), ) src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos()) src_dataset = NestedDictionaryDataset( { "id": IdDataset(), "net_input": { "src_tokens": src_dataset, "src_lengths": NumelDataset(src_dataset, reduce=False), }, }, sizes=src_lengths, ) if sort: src_dataset = SortDataset(src_dataset, sort_order=[src_lengths]) return src_dataset @property def source_dictionary(self): return self.dictionary @property def target_dictionary(self): return self.dictionary def begin_epoch(self, epoch, model): model.set_epoch(epoch) def max_positions(self): return self.cfg.tokens_per_sample ================================================ FILE: fairseq/tasks/multilingual_denoising.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging import os from dataclasses import dataclass, field from typing import Optional import numpy as np from omegaconf import II from fairseq.data import ( AppendTokenDataset, ConcatDataset, DenoisingDataset, Dictionary, PrependTokenDataset, ResamplingDataset, SortDataset, TokenBlockDataset, data_utils, ) from fairseq.data.encoders.utils import get_whole_word_mask from fairseq.tasks import register_task from .denoising import DenoisingConfig, DenoisingTask logger = logging.getLogger(__name__) @dataclass class MultilingualDenoisingConfig(DenoisingConfig): multilang_sampling_alpha: float = field( default=1.0, metadata={"help": "smoothing alpha for sample ratios across multiple datasets"}, ) add_lang_token: bool = field( default=False, metadata={"help": ""}, ) langs: Optional[str] = field( default=None, metadata={"help": "language ids we are considering"}, ) no_whole_word_mask_langs: str = field( default="", metadata={ "help": "languages without spacing between words don't support whole word masking" }, ) train_subset: str = II("common.train_subset") valid_subset: str = II("common.valid_subset") @register_task("multilingual_denoising", dataclass=MultilingualDenoisingConfig) class MultilingualDenoisingTask(DenoisingTask): cfg: MultilingualDenoisingConfig @classmethod def setup_task(cls, cfg: MultilingualDenoisingConfig, **kwargs): """Setup the task.""" paths = cfg.data.split(":") assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) data_path = paths[0] if cfg.langs is None: languages = sorted( [ name for name in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, name)) ] ) else: languages = cfg.langs.split(",") if cfg.add_lang_token: for lang in languages: dictionary.add_symbol("[{}]".format(lang)) logger.info("dictionary: {} types".format(len(dictionary))) if not hasattr(cfg, "shuffle_instance"): cfg.shuffle_instance = False return cls(cfg, dictionary) def __init__(self, cfg: MultilingualDenoisingConfig, 
dictionary): super().__init__(cfg, dictionary) self.dictionary = dictionary # add mask token self.mask_idx = self.dictionary.add_symbol("<mask>") self.cfg = cfg def _get_sample_prob(self, dataset_lens): """ Get smoothed sampling probability by languages. This helps low resource languages by upsampling them. """ prob = dataset_lens / dataset_lens.sum() smoothed_prob = prob**self.cfg.multilang_sampling_alpha smoothed_prob = smoothed_prob / smoothed_prob.sum() return smoothed_prob def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = self.cfg.data.split(":") assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) if self.cfg.langs is None: languages = sorted( [ name for name in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, name)) ] ) else: languages = self.cfg.langs.split(",") for name in languages: p = os.path.join(data_path, name) assert os.path.exists(p), "data not found: {}".format(p) logger.info("Training on {0} languages: {1}".format(len(languages), languages)) logger.info( "Language to id mapping: ", {lang: id for id, lang in enumerate(languages)} ) mask_whole_words = get_whole_word_mask(self.cfg.bpe, self.dictionary) language_without_segmentations = self.cfg.no_whole_word_mask_langs.split(",") lang_datasets = [] for language in languages: split_path = os.path.join(data_path, language, split) dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, self.cfg.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) end_token = ( self.source_dictionary.index("[{}]".format(language)) if self.cfg.add_lang_token else self.source_dictionary.eos() ) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, self.cfg.tokens_per_sample - 2, # one less for 
<s> pad=self.source_dictionary.pad(), eos=end_token, break_mode=self.cfg.sample_break_mode, ) logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = AppendTokenDataset(dataset, end_token) lang_mask_whole_words = ( mask_whole_words if language not in language_without_segmentations else None ) lang_dataset = DenoisingDataset( dataset, dataset.sizes, self.dictionary, self.mask_idx, lang_mask_whole_words, shuffle=self.cfg.shuffle_instance, seed=self.cfg.seed, mask=self.cfg.mask, mask_random=self.cfg.mask_random, insert=self.cfg.insert, rotate=self.cfg.rotate, permute_sentences=self.cfg.permute_sentences, bpe=self.cfg.bpe, replace_length=self.cfg.replace_length, mask_length=self.cfg.mask_length, poisson_lambda=self.cfg.poisson_lambda, eos=None if not self.cfg.add_lang_token else self.source_dictionary.index("[{}]".format(language)), ) lang_datasets.append(lang_dataset) dataset_lengths = np.array( [len(d) for d in lang_datasets], dtype=float, ) logger.info( "loaded total {} blocks for all languages".format( int(dataset_lengths.sum()), ) ) if split == self.cfg.train_subset: # For train subset, additionally up or down sample languages. 
sample_probs = self._get_sample_prob(dataset_lengths) logger.info( "Sample probability by language: {}".format( { lang: "{0:.4f}".format(sample_probs[id]) for id, lang in enumerate(languages) } ) ) size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths logger.info( "Up/Down Sampling ratio by language: {}".format( { lang: "{0:.2f}".format(size_ratio[id]) for id, lang in enumerate(languages) } ) ) resampled_lang_datasets = [ ResamplingDataset( lang_datasets[i], size_ratio=size_ratio[i], seed=self.cfg.seed, epoch=epoch, replace=size_ratio[i] >= 1.0, ) for i, d in enumerate(lang_datasets) ] dataset = ConcatDataset( resampled_lang_datasets, ) else: dataset = ConcatDataset(lang_datasets) lang_splits = [split] for lang_id, lang_dataset in enumerate(lang_datasets): split_name = split + "_" + languages[lang_id] lang_splits.append(split_name) self.datasets[split_name] = lang_dataset if split in self.cfg.valid_subset: self.cfg.valid_subset = self.cfg.valid_subset.replace( split, ",".join(lang_splits) ) with data_utils.numpy_seed(self.cfg.seed + epoch): shuffle = np.random.permutation(len(dataset)) self.datasets[split] = SortDataset( dataset, sort_order=[ shuffle, dataset.sizes, ], ) ================================================ FILE: fairseq/tasks/multilingual_language_modeling.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def lang_token(lang):
    """Return the dictionary symbol for *lang*: the code wrapped in ``<`` and ``>``."""
    return "<{}>".format(lang)
}, ) tokens_per_sample: int = field( default=1024, metadata={"help": "max number of tokens per sample for LM dataset"}, ) output_dictionary_size: int = field( default=-1, metadata={"help": "limit the size of output dictionary"} ) self_target: bool = field(default=False, metadata={"help": "include self target"}) future_target: bool = field( default=False, metadata={"help": "include future target"} ) past_target: bool = field(default=False, metadata={"help": "include past target"}) add_bos_token: bool = field( default=False, metadata={"help": "prepend lang id token <dialect>"} ) max_source_positions: Optional[int] = field( default=None, metadata={"help": "max number of tokens in the source sequence"} ) max_target_positions: Optional[int] = field( default=None, metadata={"help": "max number of tokens in the target sequence"} ) pad_to_fixed_length: Optional[bool] = field( default=False, metadata={"help": "pad to fixed length"} ) pad_to_fixed_bsz: Optional[bool] = field( default=False, metadata={"help": "boolean to pad to fixed batch size"} ) multilang_sampling_alpha: Optional[float] = field( default=1.0, metadata={ "help": "smoothing alpha for sample rations across multiple datasets" }, ) shorten_method: SHORTEN_METHOD_CHOICES = field( default="none", metadata={ "help": "if not none, shorten sequences that exceed --tokens-per-sample" }, ) shorten_data_split_list: str = field( default="", metadata={ "help": "comma-separated list of dataset splits to apply shortening to, " 'e.g., "train,valid" (default: all dataset splits)' }, ) langs: str = field( default="", metadata={ "help": "comma-separated list of languages (default: all directories in data path)" }, ) baseline_model_langs: str = field( default="", metadata={ "help": "comma-separated list of languages in the baseline model (default: none)" }, ) # TODO: legacy parameter kept for compatibility baseline_model: str = field( default="", metadata={"help": "path to the baseline model (default: none)"}, ) 
@register_task(
    "multilingual_language_modeling", dataclass=MultilingualLanguageModelingConfig
)
class MultilingualLanguageModelingTask(LegacyFairseqTask):
    """
    Train a language model on several languages at once.

    Every sub-directory of the data path is treated as one language. When
    ``add_bos_token`` is set, a ``<lang>`` symbol is added to the dictionary for
    each language and used as the first token of that language's samples.

    Args:
        dictionary (~fairseq.data.Dictionary): the dictionary for the input of
            the language model
        output_dictionary (~fairseq.data.Dictionary): the dictionary for the
            output of the language model. In most cases it will be the same as
            *dictionary*, but could possibly be a more limited version of the
            dictionary (if ``--output-dictionary-size`` is used).
        targets (List[str]): list of the target types that the language model
            should predict.  Can be one of "self", "future", and "past".
            Defaults to "future".

    .. note::

        The language modeling task is compatible with :mod:`fairseq-train`,
        :mod:`fairseq-generate`, :mod:`fairseq-interactive` and
        :mod:`fairseq-eval-lm`.

    The language modeling task provides the following additional command-line
    arguments:

    .. argparse::
        :ref: fairseq.tasks.language_modeling_parser
        :prog:
    """

    def __init__(self, args, dictionary, output_dictionary=None, targets=None):
        super().__init__(args)
        self.dictionary = dictionary
        self.output_dictionary = output_dictionary or dictionary

        if targets is None:
            targets = ["future"]
        self.targets = targets

    @staticmethod
    def _get_langs(args, epoch=1):
        """Return ``(languages, data_path)`` for the given epoch.

        The data path is picked round-robin from the paths in ``args.data``;
        the languages are the sorted sub-directory names of that path,
        restricted to ``args.langs`` when it is non-empty.
        """
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]

        languages = sorted(
            name
            for name in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, name))
        )
        if args.langs:
            keep_langs = set(args.langs.split(","))
            languages = [lang for lang in languages if lang in keep_langs]
            # Same condition as comparing the two lengths, but the failure now
            # says which requested languages have no directory.
            missing = sorted(keep_langs.difference(languages))
            assert not missing, "languages not found under {}: {}".format(
                data_path, missing
            )
        return languages, data_path

    @classmethod
    def setup_dictionary(cls, args, **kwargs):
        """Load ``dict.txt`` from the first data path.

        Returns:
            ``(dictionary, output_dictionary)``; both are ``None`` when
            ``args.data`` is empty. Language tokens are added when
            ``args.add_bos_token`` is set, and the output dictionary is
            truncated when ``args.output_dictionary_size`` is non-negative.
        """
        dictionary = None
        output_dictionary = None
        if args.data:
            paths = utils.split_paths(args.data)
            assert len(paths) > 0
            dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
            if args.add_bos_token:
                languages, _ = cls._get_langs(args)
                logger.info("----------------")
                for lang in languages:
                    dictionary.add_symbol(lang_token(lang))
                    logger.info(f"add language token: {lang_token(lang)}")
                logger.info("----------------")

            logger.info("dictionary: {} types".format(len(dictionary)))
            output_dictionary = dictionary
            if args.output_dictionary_size >= 0:
                output_dictionary = TruncatedDictionary(
                    dictionary, args.output_dictionary_size
                )
        return (dictionary, output_dictionary)

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        dictionary, output_dictionary = cls.setup_dictionary(args, **kwargs)

        # upgrade old checkpoints
        if hasattr(args, "exclude_self_target"):
            args.self_target = not args.exclude_self_target

        targets = []
        if getattr(args, "self_target", False):
            targets.append("self")
        if getattr(args, "future_target", False):
            targets.append("future")
        if getattr(args, "past_target", False):
            targets.append("past")
        if len(targets) == 0:
            # standard language modeling
            targets = ["future"]

        return cls(args, dictionary, output_dictionary, targets=targets)

    def build_model(self, args, from_checkpoint=False):
        """Build the model and check that it supports every configured target.

        Raises:
            ValueError: if a target is not in ``model.supported_targets``.
        """
        model = super().build_model(args, from_checkpoint)
        for target in self.targets:
            if target not in model.supported_targets:
                raise ValueError(
                    f"Unsupported language modeling target: {target} not in {model.supported_targets}"
                )

        return model

    def _get_sample_prob(self, dataset_lens):
        """
        Get smoothed sampling probability by languages. This helps low resource
        languages by upsampling them.
        """
        prob = dataset_lens / dataset_lens.sum()
        smoothed_prob = prob**self.args.multilang_sampling_alpha
        smoothed_prob = smoothed_prob / smoothed_prob.sum()
        return smoothed_prob

    def _load_offline_shard_ratios(self):
        """Parse the ``lang<TAB>ratio`` file named by the config, if any.

        Returns:
            A ``{lang: float}`` dict, or ``None`` when no file is configured.
        """
        ratio_path = self.args.lang_to_offline_shard_ratio
        if ratio_path == "":
            return None

        assert os.path.exists(
            ratio_path
        ), "provided offline shard ratio file doesn't exist: {0}".format(ratio_path)
        ratios = {}
        with open(ratio_path) as fin:
            for line in fin:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line would otherwise break the
                    # two-field unpacking below.
                    continue
                lang, ratio = line.split("\t")
                ratios[lang] = float(ratio)

        logger.info("Found offline sharded ratio: %s", ratios)
        return ratios

    def _load_lang_dataset(
        self,
        split,
        data_path,
        language,
        combine,
        tokens_per_sample,
        fixed_pad_length,
        pad_to_bsz,
    ):
        """Load one language's split and wrap it as a ``MonolingualDataset``.

        Raises:
            FileNotFoundError: if no indexed dataset exists at the split path.
        """
        split_path = os.path.join(data_path, language, split)
        dataset = data_utils.load_indexed_dataset(
            split_path, self.dictionary, self.args.dataset_impl, combine=combine
        )
        if dataset is None:
            raise FileNotFoundError(
                "Dataset not found: {} ({})".format(split, split_path)
            )

        dataset = maybe_shorten_dataset(
            dataset,
            split,
            self.args.shorten_data_split_list,
            self.args.shorten_method,
            tokens_per_sample,
            self.args.seed,
        )
        dataset = TokenBlockDataset(
            dataset,
            dataset.sizes,
            tokens_per_sample,
            pad=self.dictionary.pad(),
            eos=self.dictionary.eos(),
            break_mode=self.args.sample_break_mode,
            include_targets=True,
        )

        add_eos_for_other_targets = (
            self.args.sample_break_mode is not None
            and self.args.sample_break_mode != "none"
        )
        src_lang_idx, tgt_lang_idx = None, None
        if self.args.add_bos_token:
            src_lang_idx = self.dictionary.index(lang_token(language))
            tgt_lang_idx = self.output_dictionary.index(lang_token(language))

        return MonolingualDataset(
            dataset=dataset,
            sizes=dataset.sizes,
            src_vocab=self.dictionary,
            tgt_vocab=self.output_dictionary,
            add_eos_for_other_targets=add_eos_for_other_targets,
            shuffle=True,
            targets=self.targets,
            fixed_pad_length=fixed_pad_length,
            pad_to_bsz=pad_to_bsz,
            add_bos_token=self.args.add_bos_token,
            src_lang_idx=src_lang_idx,
            tgt_lang_idx=tgt_lang_idx,
        )

    def _resample_lang_datasets(
        self, languages, lang_datasets, dataset_lengths, offline_shard_ratio, epoch
    ):
        """Up/down-sample each language to match the smoothed sampling probabilities.

        When offline shard ratios are given, each loaded length is multiplied
        by its language's ratio before the probabilities are computed.
        """
        if offline_shard_ratio is not None:
            for lang in languages:
                assert (
                    lang in offline_shard_ratio
                ), "Lang: {0} missing in offline shard ratio file: {1}".format(
                    lang,
                    self.args.lang_to_offline_shard_ratio,
                )
            multiplier = np.array([offline_shard_ratio[lang] for lang in languages])
            true_dataset_lengths = dataset_lengths * multiplier
        else:
            true_dataset_lengths = dataset_lengths

        sample_probs = self._get_sample_prob(true_dataset_lengths)
        logger.info(
            "Sample probability by language: %s",
            {
                lang: "{0:.4f}".format(prob)
                for lang, prob in zip(languages, sample_probs)
            },
        )

        size_ratio = (sample_probs * true_dataset_lengths.sum()) / dataset_lengths
        # TODO: add an option for shrinking all size ratios to below 1
        # if self.args.multilang_sampling_alpha != 1:
        #   size_ratio /= size_ratio.max()

        # Fix numeric errors in size ratio computation
        #   0.999999999999999999 -> 1
        #   1.000000000000000002 -> 1
        size_ratio = np.round(size_ratio, 8)

        logger.info(
            "Up/Down Sampling ratio by language: %s",
            {
                lang: "{0:.2f}".format(ratio)
                for lang, ratio in zip(languages, size_ratio)
            },
        )
        logger.info(
            "Actual dataset size by language: %s",
            {
                lang: "{0:.2f}".format(len(lang_dataset))
                for lang, lang_dataset in zip(languages, lang_datasets)
            },
        )

        resampled_lang_datasets = [
            ResamplingDataset(
                lang_dataset,
                size_ratio=ratio,
                seed=self.args.seed,
                epoch=epoch,
                replace=ratio > 1.0,
            )
            for lang_dataset, ratio in zip(lang_datasets, size_ratio)
        ]
        logger.info(
            "Resampled dataset size by language: %s",
            {
                lang: "{0:.2f}".format(len(resampled))
                for lang, resampled in zip(languages, resampled_lang_datasets)
            },
        )
        return resampled_lang_datasets

    @staticmethod
    def _expand_valid_subset(valid_subset, split, lang_splits):
        """Replace the exact entry *split* in a comma-separated list by *lang_splits*.

        Entries are compared whole (so ``valid`` does not match inside
        ``valid_en``) and duplicates are dropped, which keeps the result stable
        if the same split is loaded more than once.
        """
        expanded = []
        for name in valid_subset.split(","):
            for candidate in lang_splits if name == split else [name]:
                if candidate not in expanded:
                    expanded.append(candidate)
        return ",".join(expanded)

    def load_dataset(self, split: str, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        languages, data_path = MultilingualLanguageModelingTask._get_langs(
            self.args, epoch
        )
        lang_to_offline_shard_ratio = self._load_offline_shard_ratios()

        if split == self.args.train_subset:
            logger.info(
                "Training on {0} languages: {1}".format(len(languages), languages)
            )
        else:
            logger.info(
                "Evaluating on {0} languages: {1}".format(len(languages), languages)
            )

        # One position is reserved for the language token when it is prepended.
        tokens_per_sample = self.args.tokens_per_sample - int(self.args.add_bos_token)

        fixed_pad_length = None
        if self.args.pad_to_fixed_length:
            fixed_pad_length = self.args.tokens_per_sample

        pad_to_bsz = None
        if self.args.pad_to_fixed_bsz:
            pad_to_bsz = (
                self.args.batch_size_valid if "valid" in split else self.args.batch_size
            )

        lang_datasets = [
            self._load_lang_dataset(
                split,
                data_path,
                language,
                combine,
                tokens_per_sample,
                fixed_pad_length,
                pad_to_bsz,
            )
            for language in languages
        ]

        dataset_lengths = np.array(
            [len(d) for d in lang_datasets],
            dtype=float,
        )
        logger.info(
            "loaded total {} blocks for all languages".format(
                int(dataset_lengths.sum()),
            )
        )

        if split == self.args.train_subset:
            # For train subset, additionally up or down sample languages.
            dataset = ConcatDataset(
                self._resample_lang_datasets(
                    languages,
                    lang_datasets,
                    dataset_lengths,
                    lang_to_offline_shard_ratio,
                    epoch,
                )
            )
        else:
            dataset = ConcatDataset(lang_datasets)
            lang_splits = [split]
            for language, lang_dataset in zip(languages, lang_datasets):
                split_name = split + "_" + language
                lang_splits.append(split_name)
                self.datasets[split_name] = lang_dataset

            # [TODO]: This is hacky for now to print validation ppl for each
            # language individually. Maybe need task API changes to allow it
            # in more generic ways.
            if split in self.args.valid_subset.split(","):
                self.args.valid_subset = self._expand_valid_subset(
                    self.args.valid_subset, split, lang_splits
                )

        with data_utils.numpy_seed(self.args.seed + epoch):
            shuffle = np.random.permutation(len(dataset))

        self.datasets[split] = SortDataset(
            dataset,
            sort_order=[
                shuffle,
                dataset.sizes,
            ],
        )

    def _conditioning_token(self, language):
        """Return the token index that sequences of *language* start with.

        Without ``add_bos_token`` this is eos. Otherwise it is the language
        token, falling back to bos when the dictionary does not contain it
        (a missing symbol is looked up as the unk index, never as a falsy
        value, so the unk case has to be tested explicitly).
        """
        if not getattr(self.args, "add_bos_token", False):
            return self.source_dictionary.eos()
        lang_idx = self.dictionary.index(lang_token(language))
        if not lang_idx or lang_idx == self.dictionary.unk():
            return self.source_dictionary.bos()
        return lang_idx

    def build_dataset_for_inference(
        self, src_tokens, src_lengths, language="en_XX", **kwargs
    ):
        """
        Generate batches for inference. We prepend an eos token to src_tokens
        (or the language/bos token if `--add-bos-token` is set) and we append
        a <pad> to target. This is convenient both for generation with a
        prefix and LM scoring.
        """
        dataset = StripTokenDataset(
            TokenBlockDataset(
                src_tokens,
                src_lengths,
                block_size=None,  # ignored for "eos" break mode
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            ),
            # remove eos from (end of) target sequence
            self.source_dictionary.eos(),
        )

        src_dataset = PrependTokenDataset(
            dataset,
            token=self._conditioning_token(language),
        )

        # +1 accounts for the token prepended to source / appended to target.
        max_seq_len = max(src_lengths) + 1
        tgt_dataset = AppendTokenDataset(dataset, token=self.source_dictionary.pad())
        return NestedDictionaryDataset(
            {
                "id": IdDataset(),
                "net_input": {
                    "src_tokens": PadDataset(
                        src_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                        pad_length=max_seq_len,
                    ),
                    "src_lengths": NumelDataset(src_dataset, reduce=False),
                },
                "target": PadDataset(
                    tgt_dataset,
                    pad_idx=self.source_dictionary.pad(),
                    left_pad=False,
                    pad_length=max_seq_len,
                ),
            },
            sizes=[np.array(src_lengths)],
        )

    @torch.no_grad()
    def inference_step(
        self,
        generator,
        models,
        sample,
        language="en_XX",
        prefix_tokens=None,
        constraints=None,
    ):
        """Generate from *sample*, conditioning on the start token of *language*.

        Raises:
            NotImplementedError: if *constraints* is given.
        """
        # Generation will always be conditioned on bos_token
        bos_token = self._conditioning_token(language)

        if constraints is not None:
            raise NotImplementedError(
                "Constrained decoding with the language_modeling task is not supported"
            )

        # SequenceGenerator doesn't use src_tokens directly, we need to
        # pass the `prefix_tokens` argument instead
        if prefix_tokens is None and sample["net_input"]["src_tokens"].nelement():
            prefix_tokens = sample["net_input"]["src_tokens"]
            if prefix_tokens[:, 0].eq(bos_token).all():
                prefix_tokens = prefix_tokens[:, 1:]

        return generator.generate(
            models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
        )

    def eval_lm_dataloader(
        self,
        dataset,
        max_tokens: Optional[int] = 36000,
        batch_size: Optional[int] = None,
        max_positions: Optional[int] = None,
        num_shards: int = 1,
        shard_id: int = 0,
        num_workers: int = 1,
        data_buffer_size: int = 10,
        # ensures that every evaluated token has access to a context of at least
        # this size, if possible
        context_window: int = 0,
    ):
        """Return a batch iterator over *dataset* for LM evaluation.

        When ``context_window`` is positive the dataset is first wrapped in an
        ``LMContextWindowDataset``.
        """
        if context_window > 0:
            dataset = LMContextWindowDataset(
                dataset=dataset,
                tokens_per_sample=self.args.tokens_per_sample,
                context_window=context_window,
                pad_idx=self.source_dictionary.pad(),
            )
        return self.get_batch_iterator(
            dataset=dataset,
            max_tokens=max_tokens,
            max_sentences=batch_size,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            data_buffer_size=data_buffer_size,
        )

    @property
    def source_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.output_dictionary
@register_task("multilingual_masked_lm")
class MultiLingualMaskedLMTask(LegacyFairseqTask):
    """Task for training masked language models (e.g., BERT, RoBERTa).

    Every sub-directory of the data path is treated as one language; the
    training split is resampled per language using a smoothed distribution.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument(
            "data",
            help="colon separated path to data directories list, "
            "will be iterated upon during epochs in round-robin manner",
        )
        parser.add_argument(
            "--sample-break-mode",
            default="complete",
            choices=["none", "complete", "complete_doc", "eos"],
            help='If omitted or "none", fills each sample with tokens-per-sample '
            'tokens. If set to "complete", splits samples only at the end '
            "of sentence, but may include multiple sentences per sample. "
            '"complete_doc" is similar but respects doc boundaries. '
            'If set to "eos", includes only one sentence per sample.',
        )
        parser.add_argument(
            "--tokens-per-sample",
            default=512,
            type=int,
            help="max number of total tokens over all segments "
            "per sample for BERT dataset",
        )
        parser.add_argument(
            "--mask-prob",
            default=0.15,
            type=float,
            help="probability of replacing a token with mask",
        )
        parser.add_argument(
            "--leave-unmasked-prob",
            default=0.1,
            type=float,
            help="probability that a masked token is unmasked",
        )
        parser.add_argument(
            "--random-token-prob",
            default=0.1,
            type=float,
            help="probability of replacing a token with a random token",
        )
        parser.add_argument(
            "--freq-weighted-replacement",
            action="store_true",
            help="sample random replacement words based on word frequencies",
        )
        parser.add_argument(
            "--mask-whole-words",
            default=False,
            action="store_true",
            help="mask whole words; you may also want to set --bpe",
        )
        parser.add_argument(
            "--multilang-sampling-alpha",
            type=float,
            default=1.0,
            help="smoothing alpha for sample rations across multiple datasets",
        )

    def __init__(self, args, dictionary):
        super().__init__(args)
        self.dictionary = dictionary
        self.seed = args.seed

        # add mask token
        self.mask_idx = dictionary.add_symbol("<mask>")

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Load ``dict.txt`` from the first data path and build the task."""
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt"))
        logger.info("dictionary: {} types".format(len(dictionary)))
        return cls(args, dictionary)

    def _get_whole_word_mask(self):
        """Return a per-symbol "begins a word" tensor, or ``None``.

        ``None`` is returned when ``--mask-whole-words`` is off and also when
        no BPE could be built from the arguments.
        """
        if not self.args.mask_whole_words:
            return None

        bpe = encoders.build_bpe(self.args)
        if bpe is None:
            # Nothing to decide word boundaries with; behave as if the option
            # were off instead of failing on an unassigned local.
            return None

        def is_beginning_of_word(i):
            if i < self.source_dictionary.nspecial:
                # special elements are always considered beginnings
                return True
            tok = self.source_dictionary[i]
            if tok.startswith("madeupword"):
                return True
            try:
                return bpe.is_beginning_of_word(tok)
            except ValueError:
                return True

        return torch.ByteTensor(
            list(map(is_beginning_of_word, range(len(self.source_dictionary))))
        )

    def _get_sample_prob(self, dataset_lens):
        """
        Get smoothed sampling probability by languages. This helps low resource
        languages by upsampling them.
        """
        prob = dataset_lens / dataset_lens.sum()
        smoothed_prob = prob**self.args.multilang_sampling_alpha
        smoothed_prob = smoothed_prob / smoothed_prob.sum()
        return smoothed_prob

    @staticmethod
    def _expand_valid_subset(valid_subset, split, lang_splits):
        """Replace the exact entry *split* in a comma-separated list by *lang_splits*.

        Entries are compared whole (so ``valid`` does not match inside
        ``valid_en``) and duplicates are dropped, which keeps the result stable
        if the same split is loaded more than once.
        """
        expanded = []
        for name in valid_subset.split(","):
            for candidate in lang_splits if name == split else [name]:
                if candidate not in expanded:
                    expanded.append(candidate)
        return ",".join(expanded)

    def load_dataset(self, split, epoch=1, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)

        Raises:
            FileNotFoundError: if a language has no dataset for *split*.
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0
        data_path = paths[(epoch - 1) % len(paths)]

        languages = sorted(
            name
            for name in os.listdir(data_path)
            if os.path.isdir(os.path.join(data_path, name))
        )

        logger.info("Training on {0} languages: {1}".format(len(languages), languages))
        # The mapping is passed as a logging argument, so the message needs a
        # placeholder for it.
        logger.info(
            "Language to id mapping: %s",
            {lang: lang_id for lang_id, lang in enumerate(languages)},
        )

        mask_whole_words = self._get_whole_word_mask()
        lang_datasets = []
        for lang_id, language in enumerate(languages):
            split_path = os.path.join(data_path, language, split)

            dataset = data_utils.load_indexed_dataset(
                split_path,
                self.source_dictionary,
                self.args.dataset_impl,
                combine=combine,
            )
            if dataset is None:
                raise FileNotFoundError(
                    "Dataset not found: {} ({})".format(split, split_path)
                )

            # create continuous blocks of tokens
            dataset = TokenBlockDataset(
                dataset,
                dataset.sizes,
                self.args.tokens_per_sample - 1,  # one less for <s>
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode=self.args.sample_break_mode,
            )
            logger.info("loaded {} blocks from: {}".format(len(dataset), split_path))

            # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
            dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())

            src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
                dataset,
                self.source_dictionary,
                pad_idx=self.source_dictionary.pad(),
                mask_idx=self.mask_idx,
                seed=self.args.seed,
                mask_prob=self.args.mask_prob,
                leave_unmasked_prob=self.args.leave_unmasked_prob,
                random_token_prob=self.args.random_token_prob,
                freq_weighted_replacement=self.args.freq_weighted_replacement,
                mask_whole_words=mask_whole_words,
            )

            lang_dataset = NestedDictionaryDataset(
                {
                    "net_input": {
                        "src_tokens": PadDataset(
                            src_dataset,
                            pad_idx=self.source_dictionary.pad(),
                            left_pad=False,
                        ),
                        "src_lengths": NumelDataset(src_dataset, reduce=False),
                    },
                    "target": PadDataset(
                        tgt_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                    ),
                    "nsentences": NumSamplesDataset(),
                    "ntokens": NumelDataset(src_dataset, reduce=True),
                    "lang_id": RawLabelDataset([lang_id] * src_dataset.sizes.shape[0]),
                },
                sizes=[src_dataset.sizes],
            )
            lang_datasets.append(lang_dataset)

        dataset_lengths = np.array(
            [len(d) for d in lang_datasets],
            dtype=float,
        )
        logger.info(
            "loaded total {} blocks for all languages".format(
                int(dataset_lengths.sum()),
            )
        )
        if split == self.args.train_subset:
            # For train subset, additionally up or down sample languages.
            sample_probs = self._get_sample_prob(dataset_lengths)
            logger.info(
                "Sample probability by language: %s",
                {
                    lang: "{0:.4f}".format(prob)
                    for lang, prob in zip(languages, sample_probs)
                },
            )
            size_ratio = (sample_probs * dataset_lengths.sum()) / dataset_lengths
            logger.info(
                "Up/Down Sampling ratio by language: %s",
                {
                    lang: "{0:.2f}".format(ratio)
                    for lang, ratio in zip(languages, size_ratio)
                },
            )

            resampled_lang_datasets = [
                ResamplingDataset(
                    lang_dataset,
                    size_ratio=ratio,
                    seed=self.args.seed,
                    epoch=epoch,
                    replace=ratio >= 1.0,
                )
                for lang_dataset, ratio in zip(lang_datasets, size_ratio)
            ]
            dataset = ConcatDataset(resampled_lang_datasets)
        else:
            dataset = ConcatDataset(lang_datasets)
            lang_splits = [split]
            for language, lang_dataset in zip(languages, lang_datasets):
                split_name = split + "_" + language
                lang_splits.append(split_name)
                self.datasets[split_name] = lang_dataset

            # [TODO]: This is hacky for now to print validation ppl for each
            # language individually. Maybe need task API changes to allow it
            # in more generic ways.
            if split in self.args.valid_subset.split(","):
                self.args.valid_subset = self._expand_valid_subset(
                    self.args.valid_subset, split, lang_splits
                )

        with data_utils.numpy_seed(self.args.seed + epoch):
            shuffle = np.random.permutation(len(dataset))

        self.datasets[split] = SortDataset(
            dataset,
            sort_order=[
                shuffle,
                dataset.sizes,
            ],
        )

    def build_dataset_for_inference(self, src_tokens, src_lengths, sort=True):
        """Wrap raw token tensors as a padded, bos-prefixed inference dataset.

        When *sort* is true the result is additionally wrapped in a
        ``SortDataset`` ordered by *src_lengths*.
        """
        src_dataset = PadDataset(
            TokenBlockDataset(
                src_tokens,
                src_lengths,
                self.args.tokens_per_sample - 1,  # one less for <s>
                pad=self.source_dictionary.pad(),
                eos=self.source_dictionary.eos(),
                break_mode="eos",
            ),
            pad_idx=self.source_dictionary.pad(),
            left_pad=False,
        )
        src_dataset = PrependTokenDataset(src_dataset, self.source_dictionary.bos())
        src_dataset = NestedDictionaryDataset(
            {
                "id": IdDataset(),
                "net_input": {
                    "src_tokens": src_dataset,
                    "src_lengths": NumelDataset(src_dataset, reduce=False),
                },
            },
            sizes=src_lengths,
        )
        if sort:
            src_dataset = SortDataset(src_dataset, sort_order=[src_lengths])
        return src_dataset

    @property
    def source_dictionary(self):
        """The task dictionary (shared by source and target)."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """The task dictionary (shared by source and target)."""
        return self.dictionary
def _lang_token_index(dic: Dictionary, lang: str):
    """Look up the index of the language token for *lang* in *dic*.

    The lookup must not resolve to the dictionary's unknown-symbol index;
    if it does, the assertion below fails with a message naming *lang*.
    """
    token = _lang_token(lang)
    token_index = dic.index(token)
    assert (
        token_index != dic.unk_index
    ), "cannot find language token for lang {}".format(lang)
    return token_index
""" @staticmethod def add_args(parser): """Add task-specific arguments to the parser.""" # fmt: off parser.add_argument('data', metavar='DIR', help='path to data directory') parser.add_argument('--lang-pairs', default=None, metavar='PAIRS', help='comma-separated list of language pairs (in training order): en-de,en-fr,de-fr') parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language (only needed for inference)') parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language (only needed for inference)') parser.add_argument('--left-pad-source', default='True', type=str, metavar='BOOL', help='pad the source on the left (default: True)') parser.add_argument('--left-pad-target', default='False', type=str, metavar='BOOL', help='pad the target on the left (default: False)') try: parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', help='max number of tokens in the source sequence') parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', help='max number of tokens in the target sequence') except ArgumentError: # this might have already been defined. Once we transition this to hydra it should be fine to add it here. pass parser.add_argument('--upsample-primary', default=1, type=int, help='amount to upsample primary dataset') parser.add_argument('--encoder-langtok', default=None, type=str, choices=['src', 'tgt'], metavar='SRCTGT', help='replace beginning-of-sentence in source sentence with source or target ' 'language token. 
def __init__(self, args, dicts, training):
    """Store the per-language dictionaries and derive the language-pair lists.

    In training mode the pairs come from ``args.lang_pairs``; otherwise the
    single pair ``<source_lang>-<target_lang>`` is used.
    """
    super().__init__(args)
    self.dicts = dicts
    self.training = training

    inference_pair = "{}-{}".format(args.source_lang, args.target_lang)
    self.lang_pairs = args.lang_pairs if training else [inference_pair]

    # eval_lang_pairs is normally every pair; it exists as a separate
    # attribute so subclasses (multitask setups, or evaluation restricted to
    # some languages) can point it at a different subset.
    # model_lang_pairs is what models.build_model() uses to create the
    # encoder-decoder pairs, so subclasses may also build models for pairs
    # other than the input lang_pairs.
    # Both start out as the very same list object as lang_pairs.
    self.eval_lang_pairs = self.model_lang_pairs = self.lang_pairs

    self.langs = list(dicts.keys())
@classmethod
def prepare(cls, args, **kargs):
    """Normalize *args* and load one dictionary per language.

    Returns:
        ``(dicts, training)`` where *dicts* is an ``OrderedDict`` mapping each
        language found in ``args.lang_pairs`` (sorted) to its dictionary, and
        *training* is true only when neither ``args.source_lang`` nor
        ``args.target_lang`` is set.
    """
    cls.update_args(args)
    sorted_langs = sorted(
        {x for lang_pair in args.lang_pairs for x in lang_pair.split("-")}
    )
    training = args.source_lang is None and args.target_lang is None

    # The data paths do not depend on the language, so resolve them once.
    paths = utils.split_paths(args.data)
    assert len(paths) > 0
    add_lang_tokens = args.encoder_langtok is not None or args.decoder_langtok

    # load dictionaries
    dicts = OrderedDict()
    for lang in sorted_langs:
        dicts[lang] = cls.load_dictionary(
            os.path.join(paths[0], "dict.{}.txt".format(lang))
        )
        # All dictionaries must agree with the first one on the special
        # symbols' indices.
        reference = dicts[sorted_langs[0]]
        assert dicts[lang].pad() == reference.pad()
        assert dicts[lang].eos() == reference.eos()
        assert dicts[lang].unk() == reference.unk()
        if add_lang_tokens:
            for lang_to_add in sorted_langs:
                dicts[lang].add_symbol(_lang_token(lang_to_add))
        logger.info("[{}] dictionary: {} types".format(lang, len(dicts[lang])))
    return dicts, training
def load_dataset(self, split, epoch=1, **kwargs):
    """Load a dataset split."""
    all_paths = utils.split_paths(self.args.data)
    assert len(all_paths) > 0
    # The data paths are cycled through, one per epoch.
    data_path = all_paths[(epoch - 1) % len(all_paths)]

    per_pair = OrderedDict()
    for lang_pair in self.lang_pairs:
        src, tgt = lang_pair.split("-")
        src_dict = self.dicts[src]
        tgt_dict = self.dicts[tgt]
        raw_dataset = load_langpair_dataset(
            data_path,
            split,
            src,
            src_dict,
            tgt,
            tgt_dict,
            combine=True,
            dataset_impl=self.args.dataset_impl,
            upsample_primary=self.args.upsample_primary,
            left_pad_source=self.args.left_pad_source,
            left_pad_target=self.args.left_pad_target,
            max_source_positions=self.args.max_source_positions,
            max_target_positions=self.args.max_target_positions,
        )
        # Swap in the language tokens when the task is configured for them.
        per_pair[lang_pair] = self.alter_dataset_langtok(
            raw_dataset,
            src_eos=src_dict.eos(),
            src_lang=src,
            tgt_eos=tgt_dict.eos(),
            tgt_lang=tgt,
        )

    if self.training:
        eval_key = None
    else:
        eval_key = "%s-%s" % (self.args.source_lang, self.args.target_lang)

    self.datasets[split] = RoundRobinZipDatasets(per_pair, eval_key=eval_key)
def train_step(
    self, sample, model, criterion, optimizer, update_num, ignore_grad=False
):
    """Run one training step over every language pair present in ``sample``.

    Language pairs listed in ``self.model_lang_pairs`` are skipped when they
    are missing from ``sample``, map to ``None``, or map to an empty batch.
    The same guard is used by ``valid_step``; a missing key previously
    raised ``KeyError`` here.

    Args:
        sample: mapping from language pair to its mini-batch.
        model: model put into training mode and handed to
            ``_per_lang_pair_train_loss``.
        criterion: loss criterion forwarded to the per-pair step.
        optimizer: optimizer forwarded to the per-pair step.
        update_num: current update number, forwarded to the per-pair step.
        ignore_grad: forwarded to the per-pair step.

    Returns:
        Tuple ``(agg_loss, agg_sample_size, agg_logging_output)``: the summed
        loss values, the summed sample sizes, and a ``defaultdict`` holding
        each logging key both summed over pairs and under
        ``"<lang_pair>:<key>"``.
    """
    from collections import defaultdict

    model.train()

    agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, defaultdict(float)
    curr_lang_pairs = [
        lang_pair
        for lang_pair in self.model_lang_pairs
        if lang_pair in sample
        and sample[lang_pair] is not None
        and len(sample[lang_pair]) != 0
    ]
    last_idx = len(curr_lang_pairs) - 1

    for idx, lang_pair in enumerate(curr_lang_pairs):
        # Every pair but the last runs inside model.no_sync() when training
        # is distributed — presumably so gradients are synchronized only
        # once per step.
        if (
            self.args.distributed_world_size > 1
            and hasattr(model, "no_sync")
            and idx < last_idx
        ):
            sync_context = model.no_sync()
        else:
            sync_context = contextlib.ExitStack()  # dummy contextmanager

        with sync_context:
            loss, sample_size, logging_output = self._per_lang_pair_train_loss(
                lang_pair,
                model,
                update_num,
                criterion,
                sample,
                optimizer,
                ignore_grad,
            )
        agg_loss += loss.detach().item()
        # TODO make summing of the sample sizes configurable
        agg_sample_size += sample_size
        for k in logging_output:
            agg_logging_output[k] += logging_output[k]
            agg_logging_output[f"{lang_pair}:{k}"] += logging_output[k]
    return agg_loss, agg_sample_size, agg_logging_output
def max_positions(self):
    """Return the max sentence length allowed by the task."""
    limits = (self.args.max_source_positions, self.args.max_target_positions)

    # Nothing loaded yet: report the limits for the configured direction.
    if not self.datasets:
        pair = "%s-%s" % (self.args.source_lang, self.args.target_lang)
        return {pair: limits}

    # Otherwise report the same limits for every key of every loaded split.
    per_pair = OrderedDict()
    for split_dataset in self.datasets.values():
        for key in split_dataset.datasets.keys():
            per_pair[key] = limits
    return per_pair
class LabelEncoder:
    """Callable that encodes a label line with a fixed dictionary."""

    def __init__(self, dictionary: Dictionary) -> None:
        self.dictionary = dictionary

    def __call__(self, label: str) -> List[str]:
        """Encode ``label`` without appending EOS or adding new symbols."""
        encode = self.dictionary.encode_line
        return encode(label, append_eos=False, add_if_not_exist=False)
@register_task("multires_hubert_pretraining", dataclass=MultiresHubertPretrainingConfig)
class MultiresHubertPretrainingTask(FairseqTask):
    """
    Multiresolution HuBERT Pretraining Task.
    The task is based on `HubertPretrainingTask` but extended to multiresolution.
    """

    cfg: MultiresHubertPretrainingConfig

    def __init__(
        self,
        cfg: MultiresHubertPretrainingConfig,
    ) -> None:
        """Register the lazy dictionary factory that matches the mode.

        When fine-tuning, ``load_dictionaries`` backs ``target_dictionary``;
        otherwise it backs ``dictionaries``.
        """
        super().__init__(cfg)

        logger.info(f"current directory is {os.getcwd()}")
        logger.info(f"MultiresHubertPretrainingTask Config {cfg}")

        self.cfg = cfg
        self.fine_tuning = cfg.fine_tuning

        if cfg.fine_tuning:
            self.state.add_factory("target_dictionary", self.load_dictionaries)
            self.res_number = 1
        else:
            self.state.add_factory("dictionaries", self.load_dictionaries)

        self.blank_symbol = "<s>"

    @property
    def source_dictionary(self) -> Optional[Dictionary]:
        """This task exposes no source dictionary."""
        return None

    @property
    def target_dictionary(self) -> Optional[Dictionary]:
        """Dictionary stored in the task state under ``target_dictionary``."""
        return self.state.target_dictionary

    @property
    def dictionaries(self) -> List[Dictionary]:
        """Dictionaries stored in the task state under ``dictionaries``."""
        return self.state.dictionaries

    @classmethod
    def setup_task(
        cls, cfg: MultiresHubertPretrainingConfig, **kwargs
    ) -> "MultiresHubertPretrainingTask":
        """Build the task from its config; extra keyword arguments are ignored."""
        return cls(cfg)

    def load_dictionaries(self):
        """Load ``dict.<label>.txt`` for every configured label.

        An empty label yields ``None`` in its slot instead of a dictionary.

        Returns:
            The first entry when fine-tuning, otherwise the full list.
        """
        label_dir = self.get_label_dir()
        # NOTE(review): this stores the length of the label directory *path
        # string*, which looks unintended (possibly the number of labels was
        # meant). Left unchanged until the consumers of res_number are checked.
        self.res_number = len(label_dir)
        dictionaries = [
            # Compare by value: `is not ""` tests object identity.
            Dictionary.load(f"{label_dir}/dict.{label}.txt") if label != "" else None
            for label in self.cfg.labels
        ]
        return dictionaries[0] if self.cfg.fine_tuning else dictionaries

    def get_label_dir(self) -> str:
        """Return ``cfg.label_dir`` if set, else the data directory."""
        if self.cfg.label_dir is None:
            return self.cfg.data
        return self.cfg.label_dir

    def load_dataset(self, split: str, **kwargs) -> None:
        """Build the ``HubertDataset`` for ``split`` and store it in ``self.datasets``.

        The list of label rates starts with ``cfg.label_rate``. Each
        consecutive ``(upsample, downsample)`` pair read from the flat
        ``cfg.label_rate_ratios`` list appends
        ``previous * upsample // downsample``.
        """
        manifest = f"{self.cfg.data}/{split}.tsv"
        dicts = [self.target_dictionary] if self.cfg.fine_tuning else self.dictionaries
        # Slots without a dictionary keep None for their pad/eos index.
        pad_list = [(d.pad() if d is not None else None) for d in dicts]
        eos_list = [(d.eos() if d is not None else None) for d in dicts]
        procs = [LabelEncoder(d) for d in dicts]
        label_dir = self.get_label_dir()
        paths = [
            (f"{label_dir}/{split}.{label}" if label != "" else None)
            for label in self.cfg.labels
        ]

        base_rate = self.cfg.label_rate
        self.label_rates = [base_rate]
        label_rate_ratios = self.cfg.label_rate_ratios
        self.label_rate_ratios = []
        for i in range(len(label_rate_ratios) // 2):
            upsample_rate, downsample_rate = (
                label_rate_ratios[i * 2],
                label_rate_ratios[i * 2 + 1],
            )
            # parse label rate ratios
            self.label_rate_ratios.append((upsample_rate, downsample_rate))
            base_rate = base_rate * upsample_rate // downsample_rate
            self.label_rates.append(base_rate)

        # hubert v1: pad_audio=True, random_crop=False;
        self.datasets[split] = HubertDataset(
            manifest,
            sample_rate=self.cfg.sample_rate,
            label_paths=paths,
            label_rates=self.label_rates,
            pad_list=pad_list,
            eos_list=eos_list,
            label_processors=procs,
            max_keep_sample_size=self.cfg.max_keep_size,
            min_keep_sample_size=self.cfg.min_sample_size,
            max_sample_size=self.cfg.max_sample_size,
            pad_audio=self.cfg.pad_audio,
            normalize=self.cfg.normalize,
            store_labels=False,
            random_crop=self.cfg.random_crop,
        )

    def max_positions(self) -> Tuple[int, int]:
        """Return ``(sys.maxsize, sys.maxsize)``, i.e. no length limit."""
        return (sys.maxsize, sys.maxsize)

    def filter_indices_by_size(self, indices: np.array, *args, **kwargs) -> np.array:
        """Return ``indices`` unchanged; this task does no size filtering."""
        return indices
def label_len_fn(label):
    """Return the number of fields in ``label`` when split on single spaces."""
    # Splitting on " " always yields one more field than there are spaces.
    return label.count(" ") + 1
def load_dataset(self, split: str, task_cfg: NLUFinetuningConfig = None, **kwargs):
    """Load the parent dataset for ``split`` and attach its label targets.

    Labels are read line by line from ``<data>/<split>.<labels>``. Lines
    whose index appears in the dataset's ``skipped_indices`` (if it has that
    attribute) are dropped, and the rest are compressed before being wrapped
    in an ``AddTargetDataset``.
    """
    super().load_dataset(split, task_cfg, **kwargs)

    task_cfg = task_cfg or self.cfg
    assert task_cfg.labels is not None

    compression_name = str(self.cfg.text_compression_level)
    text_compression_level = getattr(TextCompressionLevel, compression_name)
    label_path = os.path.join(self.cfg.data, f"{split}.{task_cfg.labels}")
    skipped_indices = getattr(self.datasets[split], "skipped_indices", set())
    text_compressor = TextCompressor(level=text_compression_level)

    labels = []
    with open(label_path, "r") as label_file:
        for line_no, line in enumerate(label_file):
            if line_no in skipped_indices:
                continue
            labels.append(text_compressor.compress(line))

    assert len(labels) == len(self.datasets[split]), (
        f"labels length ({len(labels)}) and dataset length "
        f"({len(self.datasets[split])}) do not match"
    )

    target_dict = self.target_dictionary
    process_label = LabelEncoder(target_dict)
    pad_idx = target_dict.pad()
    eos_idx = target_dict.eos()

    self.datasets[split] = AddTargetDataset(
        self.datasets[split],
        labels,
        pad=pad_idx,
        eos=eos_idx,
        batch_targets=True,
        process_label=process_label,
        label_len_fn=label_len_fn,
        add_to_input=task_cfg.get("autoregressive", False),
        text_compression_level=text_compression_level,
    )
def valid_step(self, sample, model, criterion):
    """Run the parent validation step and add generation-based statistics.

    Each statistic group is only computed when its ``eval_*`` option and
    ``autoregressive`` are both set. Raw counts are stored in
    ``logging_output`` under underscore-prefixed keys.
    """
    loss, sample_size, logging_output = super().valid_step(sample, model, criterion)
    cfg = self.cfg

    if cfg.eval_wer_parse and cfg.autoregressive:
        parse_stats = self._inference_with_wer_parse(
            self.sequence_generator, sample, model
        )
        for name in (
            "num_char_errors",
            "num_chars",
            "num_word_errors",
            "num_words",
            "num_em_errors",
            "num_ems",
            "num_tree_errors",
            "num_trees",
        ):
            logging_output["_" + name] = parse_stats[name]

    if cfg.eval_wer and cfg.autoregressive:
        wer_stats = self._inference_with_wer(self.sequence_generator, sample, model)
        for name in ("num_char_errors", "num_chars", "num_word_errors", "num_words"):
            logging_output["_" + name] = wer_stats[name]

    if cfg.eval_bleu and cfg.autoregressive:
        bleu = self._inference_with_bleu(self.sequence_generator, sample, model)
        logging_output["_bleu_sys_len"] = bleu.sys_len
        logging_output["_bleu_ref_len"] = bleu.ref_len
        # we split counts into separate entries so that they can be
        # summed efficiently across workers using fast-stat-sync
        assert len(bleu.counts) == 4
        for order in range(4):
            logging_output[f"_bleu_counts_{order}"] = bleu.counts[order]
            logging_output[f"_bleu_totals_{order}"] = bleu.totals[order]

    return loss, sample_size, logging_output
def _inference_with_wer_parse(self, generator, sample, model):
    """Generate hypotheses for ``sample`` and count edit/parse errors.

    For every sentence this compares the top hypothesis with the padded-out
    target and accumulates:

    * character edit distance on the post-processed strings,
    * word edit distance on the ``decode``d strings,
    * exact-match errors (post-processed strings differ),
    * tree errors (the sequences of bracket tokens differ).

    Returns:
        dict with the keys ``num_char_errors``, ``num_chars``,
        ``num_word_errors``, ``num_words``, ``num_ems``, ``num_em_errors``,
        ``num_trees`` and ``num_tree_errors``.
    """
    import editdistance

    def decode(toks):
        s = self.target_dictionary.string(
            toks.int().cpu(),
            self.cfg.eval_wer_post_process,
            escape_unk=True,
        )
        if self.tokenizer:
            s = self.tokenizer.decode(s)
        return s

    def decode_to_list(toks):
        def token_string(i):
            if i == self.target_dictionary.unk():
                return self.target_dictionary.unk_string(False)
            return self.target_dictionary[i]

        return [token_string(i) for i in toks]

    def is_ont_token(token):
        # A token counts as an ontology/bracket token if it holds "[" or "]".
        return "[" in token or "]" in token

    def post_process(tokens):
        out = []
        for w in tokens:
            if w == self.target_dictionary.eos_word or w == "|":
                continue
            if w == "_":
                out.append(" ")
            else:
                out.append(w)
                # Bracket tokens are followed by a separating space.
                if is_ont_token(w):
                    out.append(" ")
        return out

    num_word_errors, num_char_errors = 0, 0
    num_chars, num_words = 0, 0
    num_em_errors, num_ems = 0, 0
    num_tree_errors, num_trees = 0, 0

    gen_out = self.inference_step(generator, [model], sample, None)
    for i, hypos in enumerate(gen_out):
        hyp_tokens = hypos[0]["tokens"]
        ref_tokens = utils.strip_pad(
            sample["target"][i], self.target_dictionary.pad()
        )

        hyp_list = post_process(decode_to_list(hyp_tokens))
        ref_list = post_process(decode_to_list(ref_tokens))
        hyp = "".join(hyp_list).strip()
        ref = "".join(ref_list).strip()

        num_chars += len(ref)
        num_char_errors += editdistance.eval(hyp, ref)

        # Word errors are measured on the regular decoded strings, not on
        # the post-processed ones.
        hyp_before = decode(hyp_tokens).split()
        ref_before = decode(ref_tokens).split()
        num_word_errors += editdistance.eval(hyp_before, ref_before)
        num_words += len(ref_before)

        hyp_tree = [word for word in hyp_list if is_ont_token(word)]
        ref_tree = [word for word in ref_list if is_ont_token(word)]

        if hyp != ref:
            num_em_errors += 1
        if hyp_tree != ref_tree:
            num_tree_errors += 1
        num_ems += 1
        num_trees += 1

    return {
        "num_char_errors": num_char_errors,
        "num_chars": num_chars,
        "num_word_errors": num_word_errors,
        "num_words": num_words,
        "num_ems": num_ems,
        "num_em_errors": num_em_errors,
        "num_trees": num_trees,
        "num_tree_errors": num_tree_errors,
    }
def reduce_metrics(self, logging_outputs, criterion):
    """Aggregate the raw validation counts and register derived metrics.

    After the parent reduction, the underscore-prefixed counts found in
    ``logging_outputs`` are summed and logged. Derived percentages
    (``uer``, ``wer``, ``em_error``, ``tree_error``) and ``bleu`` are
    registered according to the enabled ``eval_*`` options.
    """
    super().reduce_metrics(logging_outputs, criterion)

    def total(key, default):
        # Sum one logged entry over all workers' outputs.
        return sum(log.get(key, default) for log in logging_outputs)

    def log_error_rate(name, errors_key, count_key):
        # Percentage of errors over the count, NaN when the count is zero.
        metrics.log_derived(
            name,
            lambda meters: meters[errors_key].sum * 100.0 / meters[count_key].sum
            if meters[count_key].sum > 0
            else float("nan"),
        )

    if self.cfg.eval_wer or self.cfg.eval_wer_parse:
        zero = torch.scalar_tensor(0.0)
        num_char_errors = total("_num_char_errors", zero)
        num_chars = total("_num_chars", zero)
        num_word_errors = total("_num_word_errors", zero)
        num_words = total("_num_words", zero)
        metrics.log_scalar("_num_char_errors", num_char_errors)
        metrics.log_scalar("_num_chars", num_chars)
        metrics.log_scalar("_num_word_errors", num_word_errors)
        metrics.log_scalar("_num_words", num_words)
        if num_chars > 0:
            log_error_rate("uer", "_num_char_errors", "_num_chars")
        if num_words > 0:
            log_error_rate("wer", "_num_word_errors", "_num_words")

        if self.cfg.eval_wer_parse:
            num_em_errors = total("_num_em_errors", zero)
            num_ems = total("_num_ems", zero)
            metrics.log_scalar("_num_em_errors", num_em_errors)
            metrics.log_scalar("_num_ems", num_ems)
            num_tree_errors = total("_num_tree_errors", zero)
            num_trees = total("_num_trees", zero)
            metrics.log_scalar("_num_tree_errors", num_tree_errors)
            metrics.log_scalar("_num_trees", num_trees)
            if num_ems > 0:
                log_error_rate("em_error", "_num_em_errors", "_num_ems")
            if num_trees > 0:
                log_error_rate("tree_error", "_num_tree_errors", "_num_trees")

    if self.cfg.eval_bleu:
        len_keys = ["_bleu_sys_len", "_bleu_ref_len"]
        count_keys = [f"_bleu_counts_{i}" for i in range(4)]
        total_keys = [f"_bleu_totals_{i}" for i in range(4)]
        for k in len_keys + count_keys + total_keys:
            metrics.log_scalar(k, total(k, 0))

        import sacrebleu

        metrics.log_derived(
            "bleu",
            lambda meters: sacrebleu.compute_bleu(
                correct=[meters[k].sum for k in count_keys],
                total=[meters[k].sum for k in total_keys],
                sys_len=meters["_bleu_sys_len"].sum,
                ref_len=meters["_bleu_ref_len"].sum,
                smooth_method="exp",
            ).score,
        )
class PiecewiseLinearFn:
    """Piecewise linear function. Can be configured with a string.

    The function is described by ``(x, y)`` breakpoints sorted by ``x``.
    Between two breakpoints the value is linearly interpolated. Left of the
    first breakpoint the first ``y`` is returned, right of the last breakpoint
    the last ``y`` is returned.
    """

    def __init__(self, pieces: Sequence[Tuple[int, float]]):
        """Store the breakpoints.

        Args:
            pieces: non-empty sequence of ``(x, y)`` pairs in sorted order.
        """
        assert len(pieces) > 0, "PiecewiseLinearFn needs at least one piece"
        # Compare as lists so an already-sorted tuple of pieces is accepted too
        # (``sorted`` always returns a list).
        assert list(pieces) == sorted(
            pieces
        ), f"PiecewiseLinearFn configuration should be sorted, received: {pieces}"

        self.pieces = pieces

    def __call__(self, x: int) -> float:
        """Evaluate the function at ``x``."""
        # Before the first breakpoint the function is constant; without this
        # guard the fall-through below would wrongly return the *last* value.
        if x < self.pieces[0][0]:
            return self.pieces[0][1]

        for (x_a, y_a), (x_b, y_b) in zip(self.pieces, self.pieces[1:]):
            if x_a == x_b:
                # Zero-width segment (a step): skip it to avoid dividing by
                # zero; the following segment (or the final value) applies.
                continue
            if x_a <= x <= x_b:
                return y_a + (x - x_a) * (y_b - y_a) / (x_b - x_a)

        return self.pieces[-1][1]

    @staticmethod
    def from_string(configuration: str) -> "PiecewiseLinearFn":
        """
        Parse the configuration of lambda coefficient (for scheduling).
        x = "3"                  # lambda will be a constant equal to x
        x = "0:1,1000:0"         # lambda will start from 1 and linearly decrease
                                 # to 0 during the first 1000 iterations
        x = "0:0,1000:0,2000:1"  # lambda will be equal to 0 for the first 1000
                                 # iterations, then will linearly increase to 1 until iteration 2000

        A plain number (int or float) is also accepted and yields a constant.

        Raises:
            ValueError: if the configuration cannot be parsed.
        """
        if isinstance(configuration, (int, float)):
            return PiecewiseLinearFn([(0, float(configuration))])

        try:
            parts = configuration.split(",")
            if len(parts) == 1:
                return PiecewiseLinearFn([(0, float(configuration))])

            pairs = [part.split(":") for part in parts]
            pieces = [(int(step), float(value)) for step, value in pairs]
            return PiecewiseLinearFn(pieces)
        except (AssertionError, AttributeError, TypeError, ValueError) as err:
            # Surface every malformed input as one exception type and keep the
            # original cause for debugging.
            raise ValueError(
                f"Invalid PiecewiseLinearFn configuration: {configuration!r}"
            ) from err

    @staticmethod
    def one() -> "PiecewiseLinearFn":
        """Return the constant function equal to 1."""
        return PiecewiseLinearFn([(0, 1.0)])
parser.add_argument('--max-source-positions', default=1024, type=int, metavar='N', help='max number of tokens in the source sequence') parser.add_argument('--max-target-positions', default=1024, type=int, metavar='N', help='max number of tokens in the target sequence') except ArgumentError: # this might have already been defined. Once we transition this to hydra it should be fine to add it here. pass parser.add_argument('--truncate-source', action='store_true', default=False, help='truncate source to max-source-positions') parser.add_argument('--num-batch-buckets', default=0, type=int, metavar='N', help='if >0, then bucket source and target lengths into N ' 'buckets and pad accordingly; this is useful on TPUs ' 'to minimize the number of compilations') # Denoising args parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N', help='maximum word shuffle distance for denoising autoencoding data generation') parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N', help='word dropout probability for denoising autoencoding data generation') parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N', help='word blanking probability for denoising autoencoding data generation') # Backtranslation args parser.add_argument('--lambda-bt', default="1.0", type=str, metavar='N', help='back-translation weight') parser.add_argument('--lambda-dae', default="1.0", type=str, metavar='N', help='denoising auto-encoder weight') # Evaluation args parser.add_argument('--generate-one-by-one', action='store_true', help='generate one sentence at a time for backtranslation') parser.add_argument('--eval-bleu', action='store_true', help='evaluation with BLEU scores') parser.add_argument('--eval-bleu-detok', type=str, default="space", help='detokenize before computing BLEU (e.g., "moses"); ' 'required if using --eval-bleu; use "space" to ' 'disable detokenization; see fairseq.data.encoders ' 'for other options') 
parser.add_argument('--eval-bleu-detok-args', type=str, metavar='JSON', help='args for building the tokenizer, if needed') parser.add_argument('--eval-tokenized-bleu', action='store_true', default=False, help='compute tokenized BLEU instead of sacrebleu') parser.add_argument('--eval-bleu-remove-bpe', nargs='?', const='@@ ', default=None, help='remove BPE before computing BLEU') parser.add_argument('--eval-bleu-args', type=str, metavar='JSON', help='generation args for BLUE scoring, ' 'e.g., \'{"beam": 4, "lenpen": 0.6}\'') parser.add_argument('--eval-bleu-print-samples', action='store_true', help='print sample generations during validation') # fmt: on def __init__(self, args, common_dict, mono_langs, valid_lang_pairs): super().__init__(args, common_dict, common_dict) self.common_dict = common_dict self.mono_langs = mono_langs self.valid_lang_pairs = valid_lang_pairs self.SHOW_SAMPLES_INTERVAL = 1000 # Start by showing samples self._show_samples_ctr = self.SHOW_SAMPLES_INTERVAL self.SHOW_SAMPLES_NUMBER = 5 self.lambda_bt = PiecewiseLinearFn.from_string(args.lambda_bt) self.lambda_dae = PiecewiseLinearFn.from_string(args.lambda_dae) self.args = args self.data = utils.split_paths(self.args.data) if len(self.data) == 1: shards = list(Path(self.data[0]).glob("shard*")) if len(shards) > 0: # keep this as strings, since it can also be a manifold path old_data = self.data self.data = [str(shard) for shard in shards] logging.warning(f"Expanded data directory {old_data} to {self.data}") @classmethod def setup_task(cls, args, **kwargs): """Setup the task (e.g., load dictionaries). 
Args: args (argparse.Namespace): parsed command-line arguments """ args.left_pad_source = options.eval_bool(args.left_pad_source) args.left_pad_target = options.eval_bool(args.left_pad_target) paths = utils.split_paths(args.data) assert len(paths) > 0 assert args.mono_langs is not None mono_langs = args.mono_langs.split(",") valid_lang_pairs = args.valid_lang_pairs.split(",") # load dictionary dict_path = os.path.join(paths[0], "dict.txt") common_dict = cls.load_dictionary(dict_path) return cls(args, common_dict, mono_langs, valid_lang_pairs) def load_dataset(self, split, epoch=1, combine=False, **kwargs) -> FairseqDataset: """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ if split == "train": data_path = self.data[(epoch - 1) % len(self.data)] dataset = self.load_train_dataset(data_path) else: # valid/test should always be the same. dataset = self.load_translation_dataset(split, self.data[0]) self.datasets[split] = dataset return dataset def load_train_dataset(self, data_path: str) -> FairseqDataset: """The training dataset is made of backtranslation dataset and denoising dataset.""" data = [] for lang in self.mono_langs: train_path = os.path.join(data_path, lang, "train") # TODO: could we do the BT using denoise sample ? # this would half the data loading work data.append((f"{lang}-BT", self.load_bt_dataset(train_path, lang))) data.append( (f"{lang}-DENOISE", self.load_denoise_dataset(train_path, lang)) ) return RoundRobinZipDatasets(OrderedDict(data)) def _langpair_dataset( self, src: FairseqDataset, tgt: FairseqDataset ) -> LanguagePairDataset: return LanguagePairDataset( src, src.sizes, self.dictionary, tgt=tgt, tgt_sizes=tgt.sizes, tgt_dict=self.dictionary, left_pad_source=self.args.left_pad_source, left_pad_target=self.args.left_pad_target, # TODO: should we shuffle ? we are already sorting batch by sizes so ? 
# shuffle=True, ) def _prepend_lang_bos_to_target( self, dataset: LanguagePairDataset, lang: str ) -> LanguagePairDataset: bos = _lang_token_index(self.dictionary, lang) return TransformEosLangPairDataset( dataset, src_eos=self.dictionary.eos(), new_src_eos=self.dictionary.eos(), tgt_bos=self.dictionary.eos(), new_tgt_bos=bos, ) def load_bt_dataset(self, data_path: str, lang: str) -> FairseqDataset: """The BT dataset is generated with (tgt, tgt) pairs. The actual translation to a (generated_src, tgt) pair is done on the fly during training. """ mono_dataset = data_utils.load_indexed_dataset( data_path, self.common_dict, self.args.dataset_impl ) assert mono_dataset is not None, f"No dataset found for {lang}" mono_dataset_src = PrependTokenDataset( mono_dataset, _lang_token_index(self.dictionary, lang) ) mono_dataset_bt = self._langpair_dataset(mono_dataset_src, mono_dataset) logger.info( f"mono_lang = {lang} " f"lang token index = {_lang_token_index(self.dictionary, lang)} " f"lang token = {_lang_token(lang)}" ) mono_dataset_bt = self._prepend_lang_bos_to_target(mono_dataset_bt, lang) return mono_dataset_bt def load_denoise_dataset(self, data_path: str, lang: str) -> FairseqDataset: """Classic denoising dataset""" dataset = data_utils.load_indexed_dataset( data_path, self.common_dict, self.args.dataset_impl ) noisy_dataset = NoisingDataset( dataset, self.dictionary, seed=1, max_word_shuffle_distance=self.args.max_word_shuffle_distance, word_dropout_prob=self.args.word_dropout_prob, word_blanking_prob=self.args.word_blanking_prob, ) noisy_dataset = PrependTokenDataset( noisy_dataset, _lang_token_index(self.dictionary, lang) ) clean_dataset = data_utils.load_indexed_dataset( data_path, self.common_dict, self.args.dataset_impl ) denoising_dataset = self._langpair_dataset(noisy_dataset, clean_dataset) denoising_dataset = self._prepend_lang_bos_to_target(denoising_dataset, lang) return denoising_dataset def load_translation_dataset( self, split: str, data_path: str, 
combine: bool = False ): # only judging with one language pair for the moment, # since ConcatDataset doesn't work as expected assert len(self.valid_lang_pairs) == 1, "For now..." valid_lang_pair = self.valid_lang_pairs[0] src, tgt = valid_lang_pair.split("-") # use the same function than TranslationTask src_tgt_dt = load_langpair_dataset( data_path, split, src, self.common_dict, tgt, self.common_dict, combine=combine, dataset_impl=self.args.dataset_impl, upsample_primary=self.args.upsample_primary, left_pad_source=self.args.left_pad_source, left_pad_target=self.args.left_pad_target, max_source_positions=self.args.max_source_positions, max_target_positions=self.args.max_target_positions, load_alignments=self.args.load_alignments, truncate_source=self.args.truncate_source, num_buckets=self.args.num_batch_buckets, shuffle=(split != "test"), prepend_bos_src=_lang_token_index(self.dictionary, src), ) src_tgt_eos_dt = self._prepend_lang_bos_to_target(src_tgt_dt, tgt) src_tgt_eos_dt.args = self.args return src_tgt_eos_dt def build_dataset_for_inference(self, src_tokens, src_lengths, constraints=None): raise NotImplementedError def build_model(self, args, from_checkpoint=False): # torch.autograd.set_detect_anomaly(True) model = super().build_model(args, from_checkpoint) add_secial_tokens_to_dict_and_model(self.common_dict, model, self.mono_langs) self.sequence_generators = {} for mono_lang in self.mono_langs: self.sequence_generators[mono_lang] = SequenceGenerator( [model], tgt_dict=self.dictionary, beam_size=1, max_len_a=1.3, max_len_b=5, min_len=5, # keep 1 to be able to prepend bos max_len=model.max_decoder_positions() - 1, ) if getattr(args, "eval_bleu", False): assert getattr(args, "eval_bleu_detok", None) is not None, ( "--eval-bleu-detok is required if using --eval-bleu; " "try --eval-bleu-detok=moses (or --eval-bleu-detok=space " "to disable detokenization, e.g., when using sentencepiece)" ) detok_args = json.loads(getattr(args, "eval_bleu_detok_args", "{}") or 
"{}") self.tokenizer = encoders.build_tokenizer( Namespace( tokenizer=getattr(args, "eval_bleu_detok", None), **detok_args ) ) gen_args = json.loads(getattr(args, "eval_bleu_args", "{}") or "{}") self.bleu_sequence_generator = self.build_generator( [model], Namespace(**gen_args) ) return model def max_positions(self): """Return the max sentence length allowed by the task.""" return (self.args.max_source_positions, self.args.max_target_positions) @property def dictionary(self): """Return the source :class:`~fairseq.data.Dictionary`.""" return self.common_dict def display_samples_once_in_a_while(self, smp, mono_lang, other_lang): self._show_samples_ctr += 1 if self._show_samples_ctr < self.SHOW_SAMPLES_INTERVAL: return self._show_samples_ctr = 0 ln = smp["net_input"]["src_tokens"].shape[0] logger.info( f"(r:{self.args.distributed_rank}) : " f"{other_lang} ---> {mono_lang} " f"({other_lang} was generated by back-translation.) {ln} samples" ) for i in range(min(ln, self.SHOW_SAMPLES_NUMBER)): src_tokens = smp["net_input"]["src_tokens"][i] tgt_tokens = smp["target"][i] src_str = self.dictionary.string(src_tokens, "sentencepiece") tgt_str = self.dictionary.string(tgt_tokens, "sentencepiece") logger.info( f"\n{i}\t\t[{other_lang} generated] {src_str}\n" f"\t\t[{mono_lang} original ] {tgt_str}\n" f"\t\t[ src tokens] {src_tokens}\n" ) def backtranslate_sample(self, smp, orig_lang, other_lang) -> None: """ * WARNING: smp is modified in place. * At the start of this function, `smp` has the same input and target: |--------------------------------------------------------| | smp['net_input']['src_tokens'] | smp['target'] | | (from data) __en__ hello world | __en__ hello world | |--------------------------------------------------------| * We call generator.generate(smp, bos_token = token("ro")), and copy the result as input * At the end, `smp` has the translation to other language. 
|--------------------------------------------------------| | smp['net_input']['src_tokens'] | smp['target'] | | (generated) __ro__ salut lume | __en__ hello world | |--------------------------------------------------------| """ bos_token = _lang_token_index(self.dictionary, other_lang) generated = self.sequence_generators[orig_lang].generate( models=[], sample=smp, bos_token=bos_token ) max_lngth = max([gn[0]["tokens"].size(0) for gn in generated]) net_input = smp["net_input"] n_src_tokens = torch.empty( size=(len(generated), max_lngth + 1), dtype=net_input["src_tokens"].dtype ) n_src_lengths = torch.empty( len(generated), dtype=net_input["src_lengths"].dtype ) for i, gn in enumerate(generated): tokens = gn[0]["tokens"] tokens_size = tokens.size(0) padding_needed = max_lngth - tokens_size tokens = torch.cat([tokens.new([bos_token]), tokens]) tokens = F.pad(tokens, (0, padding_needed), value=self.dictionary.pad()) n_src_tokens[i] = tokens n_src_lengths[i] = tokens_size + 1 device = net_input["src_tokens"].device # This seems to be important del net_input["src_tokens"] del net_input["src_lengths"] net_input["src_tokens"] = n_src_tokens.to(device) net_input["src_lengths"] = n_src_lengths.to(device) def generate(self, smp, model): model.eval() orig_lang = ( self.dictionary[smp["net_input"]["src_tokens"][0][0]] .replace(" ", "") .replace("_", "") ) bos_token = smp["net_input"]["prev_output_tokens"][0][0] with torch.no_grad(): generated = self.sequence_generators[orig_lang].generate( models=[model], sample=smp, bos_token=bos_token ) return generated def get_other_lang(self, lang): # TODO: allow more complex mapping if lang != self.mono_langs[0]: return self.mono_langs[0] if len(self.mono_langs) == 2: return self.mono_langs[1] return self.mono_langs[np.random.randint(1, len(self.mono_langs))] def train_step( self, sample, model, criterion, optimizer, update_num, ignore_grad=False ): model.train() model.set_num_updates(update_num) agg_loss, agg_sample_size = 0.0, 0.0 
agg_logging_output: Dict[str, float] = defaultdict(float) dataset_keys = self.datasets["train"].datasets.keys() weights = { "BT": self.lambda_bt(update_num), "DENOISE": self.lambda_dae(update_num), } log_keys = {"BT": "bt_", "DENOISE": "dae_"} for dataset_key in dataset_keys: smp = sample[dataset_key] mono_lang, task_subtype = dataset_key.split("-") if weights[task_subtype] == 0: continue if task_subtype == "BT": with torch.autograd.profiler.record_function("backtranslation"): model.eval() # TODO: Could we translate to several language at once ? # this would allow to share encoder_out and maximize GPU usage. other_lang = self.get_other_lang(mono_lang) self.backtranslate_sample(smp, mono_lang, other_lang) self.display_samples_once_in_a_while(smp, mono_lang, other_lang) model.train() # Like in FairseqTask.train_step with torch.autograd.profiler.record_function("forward"): loss, sample_size, logging_output = criterion(model, smp) loss *= weights[task_subtype] if ignore_grad: loss *= 0 with torch.autograd.profiler.record_function("backward"): optimizer.backward(loss) agg_loss += loss.item() agg_sample_size += sample_size for k in logging_output: agg_logging_output[log_keys[task_subtype] + k] += logging_output[k] agg_logging_output[k] += logging_output[k] return agg_loss, agg_sample_size, agg_logging_output def get_bos_token_from_sample(self, sample): net_input = sample["net_input"] source_lang_token_id = torch.unique(net_input["src_tokens"][:, 0]).item() source_lang_token = self.dictionary[source_lang_token_id].replace("_", "") target_lang_token_id = _lang_token_index( self.dictionary, self.get_other_lang(source_lang_token) ) return target_lang_token_id def reduce_metrics(self, logging_outputs, criterion): super().reduce_metrics(logging_outputs, criterion) bt_sample_size = sum(x.get("bt_sample_size", 0) for x in logging_outputs) if bt_sample_size: bt_loss_sum = sum(x.get("bt_loss", 0) for x in logging_outputs) bt_loss_sum *= 1 / bt_sample_size / math.log(2) 
@torch.no_grad()
def extend_embedding(
    emb: nn.Module, new_vocab_size: int, copy_from_token_id: int
) -> None:
    """Grow the vocabulary dimension of an embedding-like module in place.

    Works on any module with a 2-D ``weight`` whose first dimension is the
    vocabulary (e.g. ``nn.Embedding`` or an output ``nn.Linear``). New rows
    are initialised with a copy of row ``copy_from_token_id``; a ``bias``, if
    present, is extended with zeros.

    Args:
        emb: module to extend.
        new_vocab_size: target vocabulary size, must not be smaller than the
            current one.
        copy_from_token_id: index of the existing row used to initialise the
            new rows.
    """
    old_emb_data = emb.weight.data
    (old_vocab_size, dim) = old_emb_data.shape
    assert new_vocab_size >= old_vocab_size

    if new_vocab_size > old_vocab_size:
        # new_zeros keeps the dtype and device of the existing weight; a bare
        # torch.zeros would silently fall back to the default dtype on CPU.
        new_emb_data = old_emb_data.new_zeros((new_vocab_size, dim))
        new_emb_data[:old_vocab_size, :] = old_emb_data
        # initialize new embeddings
        new_emb_data[old_vocab_size:, :] = old_emb_data[copy_from_token_id]
        emb.weight.data = new_emb_data

    # Keep the bookkeeping attributes in sync with the weight. This also
    # covers a module whose (shared) weight was already extended elsewhere.
    if hasattr(emb, "num_embeddings"):
        emb.num_embeddings = new_vocab_size
    if hasattr(emb, "out_features"):
        emb.out_features = new_vocab_size

    if getattr(emb, "bias", None) is None:
        return

    # Fix the bias.
    # Bias shape can be different from the previous vocab size
    # if the weight matrix was shared and already extended but not the bias.
    (old_bias_size,) = emb.bias.shape
    assert new_vocab_size >= old_bias_size
    if new_vocab_size > old_bias_size:
        old_bias = emb.bias.data
        new_bias = old_bias.new_zeros((new_vocab_size,))
        new_bias[:old_bias_size] = old_bias
        emb.bias.data = new_bias
# ported from UnsupervisedMT
def parse_lambda_config(x):
    """
    Parse the configuration of lambda coefficient (for scheduling).
    x = "3"                  # lambda will be a constant equal to x
    x = "0:1,1000:0"         # lambda will start from 1 and linearly decrease
                             # to 0 during the first 1000 iterations
    x = "0:0,1000:0,2000:1"  # lambda will be equal to 0 for the first 1000
                             # iterations, then will linearly increase to 1 until iteration 2000

    Returns:
        ``(initial_value, schedule)`` where ``schedule`` is ``None`` for a
        constant, otherwise a list of ``(step, value)`` pairs with strictly
        increasing steps.
    """
    split = x.split(",")
    if len(split) == 1:
        return float(x), None

    # The schedule syntax is always "step:value". os.pathsep only happens to
    # be ":" on POSIX (it is ";" on Windows), so split on the literal colon.
    split = [s.split(":") for s in split]
    assert all(len(s) == 2 for s in split)
    assert all(k.isdigit() for k, _ in split)
    assert all(
        int(split[i][0]) < int(split[i + 1][0]) for i in range(len(split) - 1)
    )
    return float(split[0][1]), [(int(k), float(v)) for k, v in split]
We iterate round-robin over batches from multiple language pairs, ordered according to the `--lang-pairs` argument. The training loop is roughly: for i in range(len(epoch)): for lang_pair in args.lang_pairs: batch = next_batch_for_lang_pair(lang_pair) loss = criterion(model_for_lang_pair(lang_pair), batch) loss.backward() optimizer.step() In practice, `next_batch_for_lang_pair` is abstracted in a FairseqDataset (e.g., `RoundRobinZipDatasets`) and `model_for_lang_pair` is a model that implements the `FairseqMultiModel` interface. During inference it is required to specify a single `--source-lang` and `--target-lang`, instead of `--lang-pairs`. """ @staticmethod def add_args(parser): """Add task-specific arguments to the parser.""" # fmt: off MultilingualTranslationTask.add_args(parser) parser.add_argument('--lambda-parallel-config', default="1.0", type=str, metavar='CONFIG', help='cross-entropy reconstruction coefficient (parallel data). ' 'use fixed weight during training if set to floating point number. ' 'use piecewise linear function over number of updates to schedule the ' 'weight with the format: w0:step0,w1:step1,...') parser.add_argument('--lambda-denoising-config', default="0.0", type=str, metavar='CONFIG', help='Cross-entropy reconstruction coefficient (denoising autoencoding)' 'use fixed weight during training if set to floating point number. ' 'use piecewise linear function over number of updates to schedule the ' 'weight with the format: w0:step0,w1:step1,...') parser.add_argument('--lambda-otf-bt-config', default="0.0", type=str, metavar='CONFIG', help='cross-entropy reconstruction coefficient (on-the-fly back-translation parallel data)' 'use fixed weight during training if set to floating point number. 
' 'use piecewise linear function over number of updates to schedule the ' 'weight with the format: w0:step0,w1:step1,...') parser.add_argument('--bt-max-len-a', default=1.1, type=float, metavar='N', help='generate back-translated sequences of maximum length ax + b, where x is the ' 'source length') parser.add_argument('--bt-max-len-b', default=10.0, type=float, metavar='N', help='generate back-translated sequences of maximum length ax + b, where x is the ' 'source length') parser.add_argument('--bt-beam-size', default=1, type=int, metavar='N', help='beam size used in beam search of online back-translation') parser.add_argument('--max-word-shuffle-distance', default=3.0, type=float, metavar='N', help='maximum word shuffle distance for denoising autoencoding data generation') parser.add_argument('--word-dropout-prob', default=0.1, type=float, metavar='N', help='word dropout probability for denoising autoencoding data generation') parser.add_argument('--word-blanking-prob', default=0.2, type=float, metavar='N', help='word blanking probability for denoising autoencoding data generation') # fmt: on def __init__(self, args, dicts, training): super().__init__(args, dicts, training) self.lambda_parallel, self.lambda_parallel_steps = parse_lambda_config( args.lambda_parallel_config ) self.lambda_otf_bt, self.lambda_otf_bt_steps = parse_lambda_config( args.lambda_otf_bt_config ) self.lambda_denoising, self.lambda_denoising_steps = parse_lambda_config( args.lambda_denoising_config ) if self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None: denoising_lang_pairs = [ "%s-%s" % (tgt, tgt) for tgt in {lang_pair.split("-")[1] for lang_pair in args.lang_pairs} ] self.model_lang_pairs = self.model_lang_pairs + denoising_lang_pairs self.backtranslate_datasets = {} self.backtranslators = {} @classmethod def setup_task(cls, args, **kwargs): dicts, training = MultilingualTranslationTask.prepare(args, **kwargs) return cls(args, dicts, training) def load_dataset(self, 
split, epoch=1, **kwargs): """Load a dataset split.""" paths = utils.split_paths(self.args.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] def split_exists(split, src, tgt, lang): if src is not None: filename = os.path.join( data_path, "{}.{}-{}.{}".format(split, src, tgt, lang) ) else: filename = os.path.join( data_path, "{}.{}-None.{}".format(split, src, tgt) ) return indexed_dataset.dataset_exists(filename, impl=self.args.dataset_impl) def load_indexed_dataset(path, dictionary): return data_utils.load_indexed_dataset( path, dictionary, self.args.dataset_impl ) # load parallel datasets src_datasets, tgt_datasets = {}, {} if ( self.lambda_parallel > 0.0 or self.lambda_parallel_steps is not None or not split.startswith("train") ): for lang_pair in self.lang_pairs: src, tgt = lang_pair.split("-") if split_exists(split, src, tgt, src): prefix = os.path.join( data_path, "{}.{}-{}.".format(split, src, tgt) ) elif split_exists(split, tgt, src, src): prefix = os.path.join( data_path, "{}.{}-{}.".format(split, tgt, src) ) else: continue src_datasets[lang_pair] = load_indexed_dataset( prefix + src, self.dicts[src] ) tgt_datasets[lang_pair] = load_indexed_dataset( prefix + tgt, self.dicts[tgt] ) logger.info( "parallel-{} {} {} examples".format( data_path, split, len(src_datasets[lang_pair]) ) ) if len(src_datasets) == 0: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) # back translation datasets backtranslate_datasets = {} if ( self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None ) and split.startswith("train"): for lang_pair in self.lang_pairs: src, tgt = lang_pair.split("-") if not split_exists(split, tgt, None, tgt): raise FileNotFoundError( "Dataset not found: backtranslation {} ({})".format( split, data_path ) ) filename = os.path.join( data_path, "{}.{}-None.{}".format(split, tgt, tgt) ) dataset = load_indexed_dataset(filename, self.dicts[tgt]) lang_pair_dataset_tgt = LanguagePairDataset( dataset, 
dataset.sizes, self.dicts[tgt], left_pad_source=self.args.left_pad_source, left_pad_target=self.args.left_pad_target, ) lang_pair_dataset = LanguagePairDataset( dataset, dataset.sizes, src_dict=self.dicts[src], tgt=dataset, tgt_sizes=dataset.sizes, tgt_dict=self.dicts[tgt], left_pad_source=self.args.left_pad_source, left_pad_target=self.args.left_pad_target, ) backtranslate_datasets[lang_pair] = BacktranslationDataset( tgt_dataset=self.alter_dataset_langtok( lang_pair_dataset_tgt, src_eos=self.dicts[tgt].eos(), src_lang=tgt, tgt_lang=src, ), backtranslation_fn=self.backtranslators[lang_pair], src_dict=self.dicts[src], tgt_dict=self.dicts[tgt], output_collater=self.alter_dataset_langtok( lang_pair_dataset=lang_pair_dataset, src_eos=self.dicts[src].eos(), src_lang=src, tgt_eos=self.dicts[tgt].eos(), tgt_lang=tgt, ).collater, ) logger.info( "backtranslate-{}: {} {} {} examples".format( tgt, data_path, split, len(backtranslate_datasets[lang_pair]), ) ) self.backtranslate_datasets[lang_pair] = backtranslate_datasets[ lang_pair ] # denoising autoencoder noising_datasets = {} if ( self.lambda_denoising > 0.0 or self.lambda_denoising_steps is not None ) and split.startswith("train"): for lang_pair in self.lang_pairs: _, tgt = lang_pair.split("-") if not split_exists(split, tgt, None, tgt): continue filename = os.path.join( data_path, "{}.{}-None.{}".format(split, tgt, tgt) ) tgt_dataset1 = load_indexed_dataset(filename, self.dicts[tgt]) tgt_dataset2 = load_indexed_dataset(filename, self.dicts[tgt]) noising_dataset = NoisingDataset( tgt_dataset1, self.dicts[tgt], seed=1, max_word_shuffle_distance=self.args.max_word_shuffle_distance, word_dropout_prob=self.args.word_dropout_prob, word_blanking_prob=self.args.word_blanking_prob, ) noising_datasets[lang_pair] = self.alter_dataset_langtok( LanguagePairDataset( noising_dataset, tgt_dataset1.sizes, self.dicts[tgt], tgt_dataset2, tgt_dataset2.sizes, self.dicts[tgt], left_pad_source=self.args.left_pad_source, 
left_pad_target=self.args.left_pad_target, ), src_eos=self.dicts[tgt].eos(), src_lang=tgt, tgt_eos=self.dicts[tgt].eos(), tgt_lang=tgt, ) logger.info( "denoising-{}: {} {} {} examples".format( tgt, data_path, split, len(noising_datasets[lang_pair]), ) ) def language_pair_dataset(lang_pair): src, tgt = lang_pair.split("-") src_dataset, tgt_dataset = src_datasets[lang_pair], tgt_datasets[lang_pair] return self.alter_dataset_langtok( LanguagePairDataset( src_dataset, src_dataset.sizes, self.dicts[src], tgt_dataset, tgt_dataset.sizes, self.dicts[tgt], left_pad_source=self.args.left_pad_source, left_pad_target=self.args.left_pad_target, ), self.dicts[src].eos(), src, self.dicts[tgt].eos(), tgt, ) self.datasets[split] = RoundRobinZipDatasets( OrderedDict( [ (lang_pair, language_pair_dataset(lang_pair)) for lang_pair in src_datasets.keys() ] + [ (_get_bt_dataset_key(lang_pair), dataset) for lang_pair, dataset in backtranslate_datasets.items() ] + [ (_get_denoising_dataset_key(lang_pair), dataset) for lang_pair, dataset in noising_datasets.items() ] ), eval_key=None if self.training else "%s-%s" % (self.args.source_lang, self.args.target_lang), ) def build_model(self, args, from_checkpoint=False): from fairseq import models model = models.build_model(args, self, from_checkpoint) if not isinstance(model, FairseqMultiModel): raise ValueError( "SemisupervisedTranslationTask requires a FairseqMultiModel architecture" ) # create SequenceGenerator for each model that has backtranslation dependency on it self.sequence_generators = {} if ( self.lambda_otf_bt > 0.0 or self.lambda_otf_bt_steps is not None ) and self.training: for lang_pair in self.lang_pairs: src, tgt = lang_pair.split("-") key = "{}-{}".format(tgt, src) self.sequence_generators[key] = SequenceGenerator( [model.models[key]], tgt_dict=self.dicts[src], beam_size=args.bt_beam_size, max_len_a=args.bt_max_len_a, max_len_b=args.bt_max_len_b, ) decoder_lang_tok_idx = self.get_decoder_langtok(src) def backtranslate_fn( 
sample, model=model.models[key], bos_token=decoder_lang_tok_idx, sequence_generator=self.sequence_generators[key], ): return sequence_generator.generate( [model], sample, bos_token=bos_token, ) self.backtranslators[lang_pair] = backtranslate_fn return model def train_step( self, sample, model, criterion, optimizer, update_num, ignore_grad=False ): model.train() if update_num > 0: self.update_step(update_num) agg_loss, agg_sample_size, agg_logging_output = 0.0, 0.0, {} def forward_backward(model, samples, logging_output_key, weight): nonlocal agg_loss, agg_sample_size, agg_logging_output if samples is None or len(samples) == 0: return loss, sample_size, logging_output = criterion(model, samples) if ignore_grad: loss *= 0 else: loss *= weight optimizer.backward(loss) agg_loss += loss.detach().item() # TODO make summing of the sample sizes configurable agg_sample_size += sample_size for k in logging_output: agg_logging_output[k] += logging_output[k] agg_logging_output[logging_output_key] += logging_output[k] if self.lambda_parallel > 0.0: for lang_pair in self.lang_pairs: forward_backward( model.models[lang_pair], sample[lang_pair], lang_pair, self.lambda_parallel, ) if self.lambda_otf_bt > 0.0: for lang_pair in self.lang_pairs: sample_key = _get_bt_dataset_key(lang_pair) forward_backward( model.models[lang_pair], sample[sample_key], sample_key, self.lambda_otf_bt, ) if self.lambda_denoising > 0.0: for lang_pair in self.lang_pairs: _, tgt = lang_pair.split("-") sample_key = _get_denoising_dataset_key(lang_pair) forward_backward( model.models["{0}-{0}".format(tgt)], sample[sample_key], sample_key, self.lambda_denoising, ) return agg_loss, agg_sample_size, agg_logging_output def update_step(self, num_updates): def lambda_step_func(config, n_iter): """ Update a lambda value according to its schedule configuration. 
""" ranges = [ i for i in range(len(config) - 1) if config[i][0] <= n_iter < config[i + 1][0] ] if len(ranges) == 0: assert n_iter >= config[-1][0] return config[-1][1] assert len(ranges) == 1 i = ranges[0] x_a, y_a = config[i] x_b, y_b = config[i + 1] return y_a + (n_iter - x_a) * float(y_b - y_a) / float(x_b - x_a) if self.lambda_parallel_steps is not None: self.lambda_parallel = lambda_step_func( self.lambda_parallel_steps, num_updates ) if self.lambda_denoising_steps is not None: self.lambda_denoising = lambda_step_func( self.lambda_denoising_steps, num_updates ) if self.lambda_otf_bt_steps is not None: self.lambda_otf_bt = lambda_step_func(self.lambda_otf_bt_steps, num_updates) ================================================ FILE: fairseq/tasks/sentence_prediction.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging import os import contextlib from dataclasses import dataclass, field from typing import Optional from omegaconf import MISSING, II, open_dict, OmegaConf import numpy as np from fairseq.data import ( ConcatSentencesDataset, Dictionary, IdDataset, NestedDictionaryDataset, NumelDataset, NumSamplesDataset, OffsetTokensDataset, PrependTokenDataset, RawLabelDataset, RightPadDataset, RightPaddingMaskDataset, RollDataset, SortDataset, StripTokenDataset, data_utils, ) from fairseq.data.shorten_dataset import maybe_shorten_dataset from fairseq.tasks import FairseqDataclass, FairseqTask, register_task from fairseq.dataclass import ChoiceEnum logger = logging.getLogger(__name__) SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) @dataclass class SentencePredictionConfig(FairseqDataclass): data: str = field(default=MISSING, metadata={"help": "path to data directory"}) num_classes: int = field( default=-1, metadata={"help": "number of classes or regression targets"}, ) init_token: Optional[int] = field( default=None, metadata={"help": "add token at the beginning of each batch item"}, ) separator_token: Optional[int] = field( default=None, metadata={"help": "add separator token between inputs"}, ) no_shuffle: bool = field( default=False, ) shorten_method: SHORTEN_METHOD_CHOICES = field( default="none", metadata={ "help": "if not none, shorten sequences that exceed tokens_per_sample" }, ) shorten_data_split_list: str = field( default="", metadata={ "help": "comma-separated list of dataset splits to apply shortening to, " 'e.g., "train,valid" (default: all dataset splits)' }, ) add_prev_output_tokens: bool = field( default=False, metadata={ "help": "add prev_output_tokens to sample, used for encoder-decoder arch" }, ) max_positions: int = field( default=512, metadata={"help": "max tokens per example"}, ) regression_target: bool = II("criterion.regression_target") classification_head_name: str = II("criterion.classification_head_name") seed: int 
= II("common.seed") d2v2_multi: bool = field( default=False, metadata={"help": "prepare dataset for data2vec_multi"}, ) @register_task("sentence_prediction", dataclass=SentencePredictionConfig) class SentencePredictionTask(FairseqTask): """ Sentence (or sentence pair) prediction (classification or regression) task. Args: dictionary (Dictionary): the dictionary for the input of the task """ def __init__(self, cfg, data_dictionary, label_dictionary): super().__init__(cfg) self.dictionary = data_dictionary self._label_dictionary = label_dictionary @classmethod def load_dictionary(cls, filename): """Load the dictionary from the filename Args: filename (str): the filename """ dictionary = Dictionary.load(filename) dictionary.add_symbol("<mask>") return dictionary @classmethod def setup_task(cls, cfg, **kwargs): assert cfg.num_classes > 0, "Must set task.num_classes" # load data dictionary data_dict = cls.load_dictionary( os.path.join(cfg.data, "input0", "dict.txt"), ) logger.info("[input] dictionary: {} types".format(len(data_dict))) # load label dictionary if not cfg.regression_target: label_dict = cls.load_dictionary( os.path.join(cfg.data, "label", "dict.txt"), ) logger.info("[label] dictionary: {} types".format(len(label_dict))) else: label_dict = data_dict return cls(cfg, data_dict, label_dict) def load_dataset(self, split, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" def get_path(key, split): return os.path.join(self.cfg.data, key, split) def make_dataset(key, dictionary): split_path = get_path(key, split) try: dataset = data_utils.load_indexed_dataset( split_path, dictionary, combine=combine, ) except Exception as e: if "StorageException: [404] Path not found" in str(e): logger.warning(f"dataset {e} not found") dataset = None else: raise e return dataset input0 = make_dataset("input0", self.source_dictionary) assert input0 is not None, "could not find dataset: {}".format( get_path("input0", split) ) input1 = 
make_dataset("input1", self.source_dictionary) if self.cfg.init_token is not None: input0 = PrependTokenDataset(input0, self.cfg.init_token) if input1 is None: src_tokens = input0 else: if self.cfg.separator_token is not None: input1 = PrependTokenDataset(input1, self.cfg.separator_token) src_tokens = ConcatSentencesDataset(input0, input1) with data_utils.numpy_seed(self.cfg.seed): shuffle = np.random.permutation(len(src_tokens)) src_tokens = maybe_shorten_dataset( src_tokens, split, self.cfg.shorten_data_split_list, self.cfg.shorten_method, self.max_positions(), self.cfg.seed, ) if self.cfg.d2v2_multi: net_input = { "source": RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), "id": IdDataset(), "padding_mask": RightPaddingMaskDataset(src_tokens), } else: net_input = { "src_tokens": RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), "src_lengths": NumelDataset(src_tokens, reduce=False), } if self.cfg.add_prev_output_tokens: prev_tokens_dataset = RightPadDataset( RollDataset(src_tokens, 1), pad_idx=self.dictionary.pad(), ) net_input.update( prev_output_tokens=prev_tokens_dataset, ) dataset = { "id": IdDataset(), "net_input": net_input, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_tokens, reduce=True), } if not self.cfg.regression_target: label_dataset = make_dataset("label", self.label_dictionary) if label_dataset is not None: dataset.update( target=OffsetTokensDataset( StripTokenDataset( label_dataset, id_to_strip=self.label_dictionary.eos(), ), offset=-self.label_dictionary.nspecial, ) ) else: label_path = "{0}.label".format(get_path("label", split)) if os.path.exists(label_path): def parse_regression_target(i, line): values = line.split() assert ( len(values) == self.cfg.num_classes ), f'expected num_classes={self.cfg.num_classes} regression target values on line {i}, found: "{line}"' return [float(x) for x in values] with open(label_path) as h: dataset.update( target=RawLabelDataset( [ 
parse_regression_target(i, line.strip()) for i, line in enumerate(h.readlines()) ] ) ) nested_dataset = NestedDictionaryDataset( dataset, sizes=[src_tokens.sizes], ) if self.cfg.no_shuffle: dataset = nested_dataset else: dataset = SortDataset( nested_dataset, # shuffle sort_order=[shuffle], ) logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset))) self.datasets[split] = dataset return self.datasets[split] def build_model(self, cfg, from_checkpoint=False): from fairseq import models with open_dict(cfg) if OmegaConf.is_config(cfg) else contextlib.ExitStack(): cfg.max_positions = self.cfg.max_positions model = models.build_model(cfg, self, from_checkpoint) model.register_classification_head( self.cfg.classification_head_name, num_classes=self.cfg.num_classes, ) return model def max_positions(self): return self.cfg.max_positions @property def source_dictionary(self): return self.dictionary @property def target_dictionary(self): return self.dictionary @property def label_dictionary(self): return self._label_dictionary ================================================ FILE: fairseq/tasks/sentence_prediction_adapters.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging import contextlib from omegaconf import open_dict, OmegaConf from fairseq.tasks import register_task from fairseq.tasks.sentence_prediction import ( SentencePredictionTask, SentencePredictionConfig, ) logger = logging.getLogger(__name__) @register_task("sentence_prediction_adapters", dataclass=SentencePredictionConfig) class SentencePredictionAdapterTask(SentencePredictionTask): def build_model(self, cfg): from fairseq import models with open_dict(cfg) if OmegaConf.is_config(cfg) else contextlib.ExitStack(): cfg.max_positions = self.cfg.max_positions model = models.build_model(cfg, self) model.register_classification_head( self.cfg.classification_head_name, num_classes=self.cfg.num_classes, ) logger.info("Freezing Embedding Parameters") for parameter in model.encoder.sentence_encoder.embed_positions.parameters(): parameter.requires_grad = False for ( parameter ) in model.encoder.sentence_encoder.layernorm_embedding.parameters(): parameter.requires_grad = False for parameter in model.encoder.sentence_encoder.embed_tokens.parameters(): parameter.requires_grad = False logger.info("Freezing Adapters") for k, v in model.encoder.sentence_encoder.layers._modules.items(): logger.info("Freezing Adapters in Layer " + str(k)) if hasattr(v, "adapter_layer_norm"): logger.info("Freezing Adapter LN") for parameter in v.adapter_layer_norm.parameters(): parameter.requires_grad = False for parameter in v.adapter_modules.parameters(): parameter.requires_grad = False return model ================================================ FILE: fairseq/tasks/sentence_ranking.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import logging import os import numpy as np from fairseq import utils from fairseq.data import ( ConcatSentencesDataset, Dictionary, IdDataset, NestedDictionaryDataset, NumelDataset, NumSamplesDataset, PrependTokenDataset, RawLabelDataset, RightPadDataset, SortDataset, TruncateDataset, data_utils, ) from fairseq.data.shorten_dataset import maybe_shorten_dataset from fairseq.tasks import LegacyFairseqTask, register_task logger = logging.getLogger(__name__) @register_task("sentence_ranking") class SentenceRankingTask(LegacyFairseqTask): """ Ranking task on multiple sentences. Args: dictionary (Dictionary): the dictionary for the input of the task """ @staticmethod def add_args(parser): """Add task-specific arguments to the parser.""" parser.add_argument("data", metavar="FILE", help="file prefix for data") parser.add_argument( "--num-classes", type=int, help="number of sentences to be ranked" ) parser.add_argument( "--init-token", type=int, help="add token at the beginning of each batch item", ) parser.add_argument( "--separator-token", type=int, help="add separator token between inputs" ) parser.add_argument("--no-shuffle", action="store_true") parser.add_argument( "--shorten-method", default="none", choices=["none", "truncate", "random_crop"], help="if not none, shorten sequences that exceed --tokens-per-sample", ) parser.add_argument( "--shorten-data-split-list", default="", help="comma-separated list of dataset splits to apply shortening to, " 'e.g., "train,valid" (default: all dataset splits)', ) parser.add_argument( "--max-option-length", type=int, help="max length for each option" ) def __init__(self, args, dictionary): super().__init__(args) self.dictionary = dictionary @classmethod def load_dictionary(cls, args, filename, source=True): """Load the dictionary from the filename Args: filename (str): the filename """ dictionary = Dictionary.load(filename) dictionary.add_symbol("<mask>") return dictionary @classmethod def setup_task(cls, args, **kwargs): assert ( 
args.criterion == "sentence_ranking" ), "Must set --criterion=sentence_ranking" # load data dictionary data_dict = cls.load_dictionary( args, os.path.join(args.data, "input0", "dict.txt"), source=True, ) logger.info("[input] dictionary: {} types".format(len(data_dict))) return SentenceRankingTask(args, data_dict) def load_dataset(self, split, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" def get_path(type, split): return os.path.join(self.args.data, type, split) def make_dataset(type, dictionary): split_path = get_path(type, split) dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, self.args.dataset_impl, combine=combine, ) return dataset input0 = make_dataset("input0", self.source_dictionary) input_options = [ make_dataset("input{idx}".format(idx=idx + 1), self.source_dictionary) for idx in range(self.args.num_classes) ] if self.args.separator_token is not None: input0 = PrependTokenDataset(input0, self.args.separator_token) src_tokens = [] for input_option in input_options: if self.args.init_token is not None: input_option = PrependTokenDataset(input_option, self.args.init_token) if self.args.max_option_length is not None: input_option = TruncateDataset( input_option, self.args.max_option_length ) src_token = ConcatSentencesDataset(input_option, input0) src_token = maybe_shorten_dataset( src_token, split, self.args.shorten_data_split_list, self.args.shorten_method, self.args.max_positions, self.args.seed, ) src_tokens.append(src_token) with data_utils.numpy_seed(self.args.seed): shuffle = np.random.permutation(len(src_tokens[0])) dataset = { "id": IdDataset(), "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_tokens[0], reduce=True), } for src_token_idx in range(len(src_tokens)): dataset.update( { "net_input{idx}".format(idx=src_token_idx + 1): { "src_tokens": RightPadDataset( src_tokens[src_token_idx], pad_idx=self.source_dictionary.pad(), ), "src_lengths": NumelDataset( 
src_tokens[src_token_idx], reduce=False ), } } ) label_path = "{}.label".format(get_path("label", split)) if os.path.exists(label_path): with open(label_path) as h: dataset.update( target=RawLabelDataset([int(x.strip()) for x in h.readlines()]) ) nested_dataset = NestedDictionaryDataset( dataset, sizes=[np.maximum.reduce([src_token.sizes for src_token in src_tokens])], ) if self.args.no_shuffle: dataset = nested_dataset else: dataset = SortDataset( nested_dataset, # shuffle sort_order=[shuffle], ) logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset))) self.datasets[split] = dataset return self.datasets[split] def build_model(self, args, from_checkpoint=False): from fairseq import models model = models.build_model(args, self, from_checkpoint) model.register_classification_head( getattr(args, "ranking_head_name", "sentence_classification_head"), num_classes=1, ) return model def max_positions(self): return self.args.max_positions @property def source_dictionary(self): return self.dictionary @property def target_dictionary(self): return self.dictionary ================================================ FILE: fairseq/tasks/simultaneous_translation.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging from fairseq.tasks import register_task from fairseq.tasks.speech_to_text import SpeechToTextTask from fairseq.tasks.translation import TranslationTask, TranslationConfig try: import examples.simultaneous_translation # noqa import_successful = True except BaseException: import_successful = False logger = logging.getLogger(__name__) def check_import(flag): if not flag: raise ImportError( "'examples.simultaneous_translation' is not correctly imported. " "Please considering `pip install -e $FAIRSEQ_DIR`." 
) @register_task("simul_speech_to_text") class SimulSpeechToTextTask(SpeechToTextTask): def __init__(self, args, tgt_dict): check_import(import_successful) super().__init__(args, tgt_dict) @register_task("simul_text_to_text", dataclass=TranslationConfig) class SimulTextToTextTask(TranslationTask): def __init__(self, cfg, src_dict, tgt_dict): check_import(import_successful) super().__init__(cfg, src_dict, tgt_dict) ================================================ FILE: fairseq/tasks/span_masked_lm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import os from dataclasses import dataclass, field from typing import Optional import numpy as np from omegaconf import II, MISSING from fairseq import utils from fairseq.data import ( AppendTokenDataset, Dictionary, IdDataset, NestedDictionaryDataset, NumelDataset, PadDataset, PrependTokenDataset, StripTokenDataset, TokenBlockDataset, data_utils, ) from fairseq.data.shorten_dataset import maybe_shorten_dataset from fairseq.data.span_mask_tokens_dataset import SpanMaskedTokensDataset from fairseq.dataclass import ChoiceEnum, FairseqDataclass from fairseq.tasks import FairseqTask, register_task from ..data.indexed_dataset import get_available_dataset_impl logger = logging.getLogger(__name__) SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) @dataclass class SpanMaskedLMConfig(FairseqDataclass): shuffle: bool = field( default=False, ) noise_density: float = field( default=0.15, metadata={"help": "What fraction of the tokens to select as noise"}, ) mean_noise_span_length: float = field( default=3, metadata={"help": "Mean noise span length, must be >= 1"}, ) data: str = field( default=MISSING, metadata={ "help": "colon separated 
path to data directories list, " "will be iterated upon during epochs in round-robin manner" }, ) sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( default="none", metadata={ "help": 'If omitted or "none", fills each sample with tokens-per-sample ' 'tokens. If set to "complete", splits samples only at the end ' "of sentence, but may include multiple sentences per sample. " '"complete_doc" is similar but respects doc boundaries. ' 'If set to "eos", includes only one sentence per sample.' }, ) tokens_per_sample: int = field( default=1024, metadata={"help": "max number of tokens per sample for LM dataset"}, ) shorten_method: SHORTEN_METHOD_CHOICES = field( default="none", metadata={ "help": "if not none, shorten sequences that exceed --tokens-per-sample" }, ) shorten_data_split_list: str = field( default="", metadata={ "help": "comma-separated list of dataset splits to apply shortening to, " 'e.g., "train,valid" (default: all dataset splits)' }, ) seed: int = II("common.seed") dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II( "dataset.dataset_impl" ) max_source_positions: int = field( default=1024, metadata={"help": "max number of tokens in the source sequence"} ) max_target_positions: int = field( default=1024, metadata={"help": "max number of tokens in the target sequence"} ) include_target_tokens: bool = field( default=False, metadata={ "help": "include target tokens in model input. this is used for data2vec" }, ) @register_task("span_masked_lm", dataclass=SpanMaskedLMConfig) class SpanMaskedLMTask(FairseqTask): """ Span masked language modeling task. (ie. 
T5) """ cfg: SpanMaskedLMConfig def __init__(self, cfg, dictionary): super().__init__(cfg) self.dictionary = dictionary @classmethod def setup_task(cls, cfg: SpanMaskedLMConfig, **kwargs): """Setup the task.""" paths = utils.split_paths(cfg.data) assert len(paths) > 0 dictionary = Dictionary.load(os.path.join(paths[0], "dict.txt")) logger.info("dictionary: {} types".format(len(dictionary))) if not hasattr(cfg, "shuffle"): cfg.shuffle = False return cls(cfg, dictionary) def _load_dataset_split(self, split, epoch, combine): paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[(epoch - 1) % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.cfg.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) dataset = StripTokenDataset(dataset, self.dictionary.eos()) dataset = maybe_shorten_dataset( dataset, split, self.cfg.shorten_data_split_list, self.cfg.shorten_method, self.cfg.tokens_per_sample, self.cfg.seed, ) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, self.cfg.tokens_per_sample - 2, # one less for <s> and one for </s> pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.cfg.sample_break_mode, document_sep_len=0, ) logger.info("loaded {} blocks from: {}".format(len(dataset), split_path)) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) return dataset def load_dataset(self, split, epoch=1, combine=False, **kwargs): """Load a given dataset split. 
Args: split (str): name of the split (e.g., train, valid, test) """ dataset = self._load_dataset_split(split, epoch, combine) self.datasets[split] = SpanMaskedTokensDataset( dataset, self.dictionary, noise_density=self.cfg.noise_density, mean_noise_span_length=self.cfg.mean_noise_span_length, shuffle=self.cfg.shuffle, seed=self.cfg.seed, ) logger.info( "Split: {0}, Loaded {1} samples of span_masked_tokens_dataset".format( split, len(self.datasets[split]), ) ) def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs): """ Generate batches for inference. We assume that the input begins with a bos symbol (`<s>`) and ends with an eos symbol (`</s>`). """ pad = self.source_dictionary.pad() eos = self.source_dictionary.eos() src_dataset = TokenBlockDataset( src_tokens, src_lengths, block_size=self.cfg.tokens_per_sample - 2, # for <s> and </s> pad=pad, eos=eos, break_mode=self.cfg.sample_break_mode, document_sep_len=0, ) prev_output_tokens = PrependTokenDataset( StripTokenDataset(src_dataset, eos), eos ) src_dataset = PadDataset(src_dataset, pad_idx=pad, left_pad=False) return NestedDictionaryDataset( { "id": IdDataset(), "net_input": { "src_tokens": src_dataset, "src_lengths": NumelDataset(src_dataset, reduce=False), "prev_output_tokens": PadDataset( prev_output_tokens, pad_idx=pad, left_pad=False ), }, "target": src_dataset, }, sizes=[np.array(src_lengths)], ) def max_positions(self): """Return the max sentence length allowed by the task.""" return (self.cfg.max_source_positions, self.cfg.max_target_positions) @property def source_dictionary(self): """Return the source :class:`~fairseq.data.Dictionary`.""" return self.dictionary @property def target_dictionary(self): """Return the target :class:`~fairseq.data.Dictionary`.""" return self.dictionary ================================================ FILE: fairseq/tasks/speech_dlm_task.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging import os from dataclasses import dataclass, field from typing import Optional from collections import OrderedDict import numpy as np import torch from fairseq import utils from fairseq.data import ( AppendTokenDataset, Dictionary, IdDataset, LMContextWindowDataset, MonolingualDataset, NestedDictionaryDataset, NumelDataset, PadDataset, PrependTokenDataset, SpeechDLMDataset, StripTokenDataset, TokenBlockDataset, TruncatedDictionary, data_utils, ) from fairseq.data.indexed_dataset import get_available_dataset_impl from fairseq.data.shorten_dataset import maybe_shorten_dataset from fairseq.dataclass import ChoiceEnum, FairseqDataclass from fairseq.tasks import LegacyFairseqTask, register_task from omegaconf import II SAMPLE_BREAK_MODE_CHOICES = ChoiceEnum(["none", "complete", "complete_doc", "eos"]) SHORTEN_METHOD_CHOICES = ChoiceEnum(["none", "truncate", "random_crop"]) logger = logging.getLogger(__name__) @dataclass class SpeechDLMConfig(FairseqDataclass): data: Optional[str] = field( default=None, metadata={"help": "path to data directory"} ) channels: Optional[str] = field( default=None, metadata={ "help": 'comma-separated list of channels to load e.g., "unitA,unitB"' "(default: load all possible channels in the data path)" }, ) channel_weights: Optional[str] = field( default=None, metadata={ "help": "comma-separated list of weights for different losses" "(default: None, which means all losses are treated equally)" }, ) sample_break_mode: SAMPLE_BREAK_MODE_CHOICES = field( default="none", metadata={ "help": 'If omitted or "none", fills each sample with tokens-per-sample ' 'tokens. If set to "complete", splits samples only at the end ' "of sentence, but may include multiple sentences per sample. " '"complete_doc" is similar but respects doc boundaries. ' 'If set to "eos", includes only one sentence per sample.' 
}, )
    tokens_per_sample: int = field(
        default=1024,
        metadata={"help": "max number of tokens per sample for LM dataset"},
    )
    output_dictionary_size: int = field(
        default=-1, metadata={"help": "limit the size of output dictionary"}
    )
    # str type is a workaround to put **default=True** here
    next_unit_prediction: str = field(
        default="False",
        metadata={
            "help": "Perform Next Unit Prediction, expected str input ('True' or 'False')"
        },
    )
    edge_unit_prediction: str = field(
        default="True",
        metadata={
            "help": "Perform Edge Unit Prediction, expected str input ('True' or 'False')"
        },
    )
    duration_prediction: str = field(
        default="True",
        metadata={
            "help": "Perform Duration Prediction, expected str input ('True' or 'False')"
        },
    )
    delayed_duration_target: str = field(
        default="True",
        metadata={
            "help": "Perform Delayed Duration Prediction, expected str input ('True' or 'False')"
            "(default: 'True')"
        },
    )
    max_target_durations: Optional[int] = field(
        default=256,
        metadata={"help": "max duration considered (cut off to this value)"},
    )
    add_bos_token: bool = field(
        default=False, metadata={"help": "prepend beginning of sentence token (<s>)"}
    )
    max_target_positions: Optional[int] = field(
        default=None, metadata={"help": "max number of tokens in the target sequence"}
    )
    shorten_method: SHORTEN_METHOD_CHOICES = field(
        default="none",
        metadata={
            "help": "if not none, shorten sequences that exceed --tokens-per-sample"
        },
    )
    shorten_data_split_list: str = field(
        default="",
        metadata={
            "help": "comma-separated list of dataset splits to apply shortening to, "
            'e.g., "train,valid" (default: all dataset splits)'
        },
    )
    # TODO common vars below add to parent
    seed: int = II("common.seed")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    data_buffer_size: int = II("dataset.data_buffer_size")
    tpu: bool = II("common.tpu")


@register_task("speech_dlm_task", dataclass=SpeechDLMConfig)
class SpeechDLMTask(LegacyFairseqTask):
    """Task for the SpeechDLM model as described in the paper:
    https://arxiv.org/pdf/2203.16502.pdf

    It creates a multi-channel dataset (SpeechDLMDataset) from multiple
    dictionaries.

    Args:
        dicts (Dict[str, ~fairseq.data.Dictionary]): the dictionaries for
            each input channel of the SpeechDLM model
        output_dicts (Dict[str, ~fairseq.data.Dictionary]): the dictionaries
            for the output of each channel of the SpeechDLM model. In most cases it
            will be the same as *dicts* (and it falls back to *dicts* when omitted).
        targets (List[str]): list of the target types that the SpeechDLM model
            should predict.  Can be one of "next", "edge", "duration".
            Defaults to "next".

    .. note::

        The SpeechDLM task is only compatible with :mod:`fairseq-train` and
        :mod:`fairseq-validate`.
        To generate new samples, please refer to example codes
        at examples/textless_nlp/dgslm .
    """

    def __init__(self, args, dicts, output_dicts=None, targets=None):
        """Store the dictionaries and parse the task options found on *args*.

        Attributes read from *args* here: ``channel_weights``,
        ``next_unit_prediction``, ``edge_unit_prediction``,
        ``duration_prediction``, ``delayed_duration_target`` and
        ``max_target_durations``.

        Raises:
            AssertionError: if the number of channel weights differs from the
                number of channels, or if one of the boolean-like string flags
                is neither "true" nor "false" (case-insensitive).
        """
        super().__init__(args)
        self.dicts = dicts
        self.output_dicts = output_dicts or dicts

        if targets is None:
            targets = ["next"]
        self.targets = targets

        # Channel order follows the key order of ``dicts``.
        self.channels = list(dicts.keys())

        # ``channel_weights`` is a comma-separated string; default is 1.0 each.
        if args.channel_weights is not None:
            self.channel_weights = [float(w) for w in args.channel_weights.split(",")]
        else:
            self.channel_weights = [1.0 for _ in self.channels]
        assert len(self.channel_weights) == len(
            self.channels
        ), "number of channel_weights must be the same as number of channels"

        # The prediction switches are configured as strings (see the config
        # fields), so validate them before converting to real booleans.
        assert str(args.next_unit_prediction).lower() in [
            "true",
            "false",
        ], f"Expected to be a string of boolean, found {args.next_unit_prediction}"
        assert str(args.edge_unit_prediction).lower() in [
            "true",
            "false",
        ], f"Expected to be a string of boolean, found {args.edge_unit_prediction}"
        assert str(args.duration_prediction).lower() in [
            "true",
            "false",
        ], f"Expected to be a string of boolean, found {args.duration_prediction}"
        assert str(args.delayed_duration_target).lower() in [
            "true",
            "false",
        ], f"Expected to be a string of boolean, found {args.delayed_duration_target}"
        self.next_unit_prediction = bool(
            str(args.next_unit_prediction).lower() == "true"
        )
        self.edge_unit_prediction = bool(
            str(args.edge_unit_prediction).lower() == "true"
        )
        self.duration_prediction = bool(str(args.duration_prediction).lower() == "true")
        self.delayed_duration_target = bool(
            str(args.delayed_duration_target).lower() == "true"
        )

        self.max_target_durations = args.max_target_durations

    @classmethod
    def setup_dictionary(cls, args, **kwargs):
        """Load one dictionary per channel from the first data path.

        The dictionaries will be a dict over channel keys and values of type
        ~fairseq.data.Dictionary.

        Channels come from ``args.channels`` (comma-separated) or, when that is
        None, from the files named ``dict.<channel>.txt`` in the data directory;
        in both cases they are processed in sorted order.

        Returns:
            tuple: ``(dicts, output_dicts)``, two ordered mappings keyed by
            channel. An output dictionary is a ``TruncatedDictionary`` when
            ``args.output_dictionary_size >= 0``, else the input dictionary.
        """
        paths = utils.split_paths(args.data)
        assert len(paths) > 0
        data_path = paths[0]

        dicts = None
        output_dicts = None
        if args.channels is None:
            # "dict." is 5 characters and ".txt" is 4: strip both to get the name.
            sorted_channels = sorted(
                name[5:-4]
                for name in os.listdir(data_path)
                if name[:5] == "dict." and name[-4:] == ".txt"
            )
        else:
            sorted_channels = sorted(args.channels.split(","))
        logger.info("channels: {}".format(sorted_channels))
        # load dictionaries
        dicts = OrderedDict()
        output_dicts = OrderedDict()
        for channel in sorted_channels:
            dictionary = Dictionary.load(
                os.path.join(data_path, "dict.{}.txt".format(channel))
            )
            logger.info("[{}] dictionary: {} types".format(channel, len(dictionary)))
            output_dictionary = dictionary
            if args.output_dictionary_size >= 0:
                output_dictionary = TruncatedDictionary(
                    dictionary, args.output_dictionary_size
                )
            dicts[channel] = dictionary
            output_dicts[channel] = output_dictionary
            # All channels must agree on the special-symbol indices of the
            # first (sorted) channel.
            if len(dicts) > 0:
                assert dicts[channel].pad() == dicts[sorted_channels[0]].pad()
                assert dicts[channel].bos() == dicts[sorted_channels[0]].bos()
                assert dicts[channel].eos() == dicts[sorted_channels[0]].eos()
                assert dicts[channel].unk() == dicts[sorted_channels[0]].unk()
        return (dicts, output_dicts)

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries).

        The target list is built from the ``next_unit_prediction``,
        ``edge_unit_prediction`` and ``duration_prediction`` string flags and
        falls back to ``["next"]`` when none of them is "true".

        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        dicts, output_dicts = cls.setup_dictionary(args, **kwargs)

        targets = []
        if str(getattr(args, "next_unit_prediction", "false")).lower() == "true":
            targets.append("next")
        if str(getattr(args, "edge_unit_prediction", "false")).lower() == "true":
            targets.append("edge")
        if str(getattr(args, "duration_prediction", "false")).lower() == "true":
            targets.append("duration")
        if len(targets) == 0:
            # standard language modeling
            targets = ["next"]

        return cls(args, dicts, output_dicts, targets=targets)

    def build_model(self, args):
        """Build the model and check that it supports every configured target.

        Raises:
            ValueError: if a target in ``self.targets`` is missing from
                ``model.supported_targets``.
        """
        model = super().build_model(args)
        for target in self.targets:
            if target not in model.supported_targets:
                raise ValueError("Unsupported SpeechDLM target: {}".format(target))
        return model

    def load_dataset(
        self, split: str, epoch=1, combine=False, **kwargs
    ) -> None:
        """Load a given dataset split.

        One ``MonolingualDataset`` is built per channel from the indexed data at
        ``<data_path>/<split>.<channel>``; they are wrapped in a
        ``SpeechDLMDataset`` that is stored in ``self.datasets[split]``.
        Nothing is returned.

        Args:
            split (str): name of the split (e.g., train, valid, test)

        Raises:
            FileNotFoundError: if the data of a channel cannot be loaded.
        """
        paths = utils.split_paths(self.args.data)
        assert len(paths) > 0

        # Cycle through the data paths, one per epoch (epoch is 1-based).
        data_path = paths[(epoch - 1) % len(paths)]

        channel_datasets = {}
        for channel in self.channels:
            split_path = os.path.join(data_path, split + "." + channel)
            dictionary = self.dicts[channel]
            output_dictionary = self.output_dicts[channel]

            dataset = data_utils.load_indexed_dataset(
                split_path, dictionary, self.args.dataset_impl, combine=combine
            )

            if dataset is None:
                raise FileNotFoundError(
                    "[{}] Dataset not found: {} ({})".format(channel, split, split_path)
                )

            dataset = maybe_shorten_dataset(
                dataset,
                split,
                self.args.shorten_data_split_list,
                self.args.shorten_method,
                self.args.tokens_per_sample,
                self.args.seed,
            )

            dataset = TokenBlockDataset(
                dataset,
                dataset.sizes,
                self.args.tokens_per_sample,
                pad=dictionary.pad(),
                eos=dictionary.eos(),
                break_mode=self.args.sample_break_mode,
                include_targets=True,
            )

            add_eos_for_other_targets = (
                self.args.sample_break_mode is not None
                and self.args.sample_break_mode != "none"
            )

            # Per-channel datasets are not shuffled; the wrapping
            # SpeechDLMDataset below is created with shuffle=True instead.
            channel_datasets[channel] = MonolingualDataset(
                dataset=dataset,
                sizes=dataset.sizes,
                src_vocab=dictionary,
                tgt_vocab=output_dictionary,
                add_eos_for_other_targets=add_eos_for_other_targets,
                shuffle=False,
                targets=["future"],
                add_bos_token=self.args.add_bos_token,
            )

        self.datasets[split] = SpeechDLMDataset(
            datasets=channel_datasets,
            targets=self.targets,
            max_target_durations=self.max_target_durations,
            shuffle=True,
        )

    def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
        """
        Generate batches for inference. We prepend an eos token to src_tokens
        (or bos if `--add-bos-token` is set) and we append a <pad> to target.
        This is convenient both for generation with a prefix and LM scoring.

        ``src_tokens`` is indexed as ``src_tokens[i][channel]``; the channel
        names are taken from the first element.
        """
        src_datasets = {}
        tgt_datasets = {}
        for channel in src_tokens[0]:
            dataset = StripTokenDataset(
                TokenBlockDataset(
                    [src_tokens[i][channel] for i in range(len(src_tokens))],
                    src_lengths,
                    block_size=None,  # ignored for "eos" break mode
                    pad=self.source_dictionaries[channel].pad(),
                    eos=self.source_dictionaries[channel].eos(),
                    break_mode="eos",
                ),
                # remove eos from (end of) target sequence
                self.source_dictionaries[channel].eos(),
            )
            src_dataset = PrependTokenDataset(
                dataset,
                token=(
                    self.source_dictionaries[channel].bos()
                    if getattr(self.args, "add_bos_token", False)
                    else self.source_dictionaries[channel].eos()
                ),
            )
            tgt_dataset = AppendTokenDataset(
                dataset, token=self.source_dictionaries[channel].pad()
            )

            src_datasets[channel] = src_dataset
            tgt_datasets[channel] = tgt_dataset

        return NestedDictionaryDataset(
            {
                "id": IdDataset(),
                "net_input": {
                    "src_tokens": OrderedDict(
                        [
                            (
                                channel,
                                PadDataset(
                                    src_datasets[channel],
                                    pad_idx=self.source_dictionaries[channel].pad(),
                                    left_pad=False,
                                ),
                            )
                            for channel in src_datasets
                        ]
                    ),
                    # Lengths are taken from the first channel only.
                    "src_lengths": NumelDataset(
                        next(iter(src_datasets.values())), reduce=False
                    ),
                },
                "target": OrderedDict(
                    [
                        (
                            channel,
                            PadDataset(
                                tgt_datasets[channel],
                                pad_idx=self.source_dictionaries[channel].pad(),
                                left_pad=False,
                            ),
                        )
                        for channel in tgt_datasets
                    ]
                ),
            },
            sizes=[np.array(src_lengths)],
        )

    def inference_step(
        self, generator, models, sample, prefix_tokens=None, constraints=None
    ):
        """Run *generator* on *sample*, using the source tokens as prefix.

        When ``prefix_tokens`` is None, a per-channel prefix dict is built from
        ``sample["net_input"]["src_tokens"]`` (dropping a leading bos column);
        if any channel has no elements the prefix is reset to None.

        Raises:
            NotImplementedError: if ``constraints`` is given.
        """
        with torch.no_grad():
            # Generation will always be conditioned on bos_token
            if getattr(self.args, "add_bos_token", False):
                bos_token = self.source_dictionary.bos()
            else:
                bos_token = self.source_dictionary.eos()

            if constraints is not None:
                raise NotImplementedError(
                    "Constrained decoding with the SpeechDLM task is not supported"
                )
            # SequenceGenerator doesn't use src_tokens directly, we need to
            # pass the `prefix_tokens` argument instead
            if prefix_tokens is None:
                prefix_tokens = {}
                for channel in sample["net_input"]["src_tokens"]:
                    if sample["net_input"]["src_tokens"][channel].nelement():
                        prefix_tokens_channel = sample["net_input"]["src_tokens"][
                            channel
                        ]
                        if prefix_tokens_channel[:, 0].eq(bos_token).all():
                            prefix_tokens_channel = prefix_tokens_channel[:, 1:]
                        prefix_tokens[channel] = prefix_tokens_channel
                    else:
                        prefix_tokens = None
                        break
            return generator.generate(
                models, sample, prefix_tokens=prefix_tokens, bos_token=bos_token
            )

    def eval_lm_dataloader(
        self,
        dataset,
        max_tokens: Optional[int] = 36000,
        batch_size: Optional[int] = None,
        max_positions: Optional[int] = None,
        num_shards: int = 1,
        shard_id: int = 0,
        num_workers: int = 1,
        data_buffer_size: int = 10,
        # ensures that every evaluated token has access to a context of at least
        # this size, if possible
        context_window: int = 0,
    ):
        """Return a non-shuffled epoch iterator over *dataset* for LM evaluation.

        When ``context_window > 0`` the dataset is first wrapped in an
        ``LMContextWindowDataset``.
        """
        if context_window > 0:
            dataset = LMContextWindowDataset(
                dataset=dataset,
                tokens_per_sample=self.args.tokens_per_sample,
                context_window=context_window,
                pad_idx=self.source_dictionary.pad(),
            )
        return self.get_batch_iterator(
            dataset=dataset,
            max_tokens=max_tokens,
            max_sentences=batch_size,
            max_positions=max_positions,
            ignore_invalid_inputs=True,
            num_shards=num_shards,
            shard_id=shard_id,
            num_workers=num_workers,
            data_buffer_size=data_buffer_size,
        ).next_epoch_itr(shuffle=False)

    @property
    def source_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model (the input dictionary of the first channel)."""
        return self.dicts[self.channels[0]]

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model (the output dictionary of the first channel)."""
        return self.output_dicts[self.channels[0]]

    @property
    def source_dictionaries(self):
        """Return the dict of :class:`~fairseq.data.Dictionary` for the
        multichannel language model."""
        return self.dicts

    @property
    def target_dictionaries(self):
        """Return the dict of :class:`~fairseq.data.Dictionary` for the
        multichannel language model."""
        return self.output_dicts

    def build_generator(self, models, args, extra_gen_cls_kwargs=None):
        """Build a ``MultichannelSequenceGenerator`` for *models*.

        Sampling is used when ``args.sampling`` is truthy, beam search
        otherwise; generation options are read from *args* with defaults.

        Raises:
            AssertionError: if ``sampling_topk`` / ``sampling_topp`` is set
                (>= 0) without sampling.
        """
        # Imported lazily, inside the method.
        from fairseq.models.speech_dlm.sequence_generator import (
            multichannel_search,
            MultichannelSequenceGenerator,
        )

        # Choose search strategy. Defaults to Beam Search.
        sampling = getattr(args, "sampling", False)
        sampling_topk = getattr(args, "sampling_topk", -1)
        sampling_topp = getattr(args, "sampling_topp", -1.0)
        assert (
            sampling_topk < 0 or sampling
        ), "--sampling-topk requires sampling (not beam search)"
        assert (
            sampling_topp < 0 or sampling
        ), "--sampling-topp requires sampling (not beam search)"

        if sampling:
            search_strategy = multichannel_search.ContiguousMultichannelSampling(
                self.target_dictionaries, sampling_topk, sampling_topp
            )
        else:
            search_strategy = multichannel_search.ContiguousMultichannelBeamSearch(
                self.target_dictionaries
            )

        extra_gen_cls_kwargs = extra_gen_cls_kwargs or {}

        return MultichannelSequenceGenerator(
            models,
            self.target_dictionaries,
            beam_size=getattr(args, "beam", 5),
            max_len_a=getattr(args, "max_len_a", 0),
            max_len_b=getattr(args, "max_len_b", 500),
            min_len=getattr(args, "min_len", 1),
            normalize_scores=(not getattr(args, "unnormalized", False)),
            len_penalty=getattr(args, "lenpen", 1),
            unk_penalty=getattr(args, "unkpen", 0),
            temperature=getattr(args, "temperature", 1.0),
            match_source_len=getattr(args, "match_source_len", False),
            no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0),
            search_strategy=search_strategy,
            duration_temperature=getattr(args, "duration_temperature", 1.0),
            **extra_gen_cls_kwargs,
        )


================================================
FILE: fairseq/tasks/speech_to_speech.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import json import logging import math from argparse import Namespace from pathlib import Path from typing import List import torch import torch.nn as nn from fairseq import utils from fairseq.data import Dictionary from fairseq.data.audio.data_cfg import MultitaskConfig, S2SDataConfig from fairseq.data.audio.speech_to_speech_dataset import SpeechToSpeechDatasetCreator from fairseq.data.audio.speech_to_text_dataset import ( SpeechToTextDataset, TextTargetMultitaskData, ) from fairseq.tasks import LegacyFairseqTask, register_task from fairseq.tasks.speech_to_text import DummyMultiTask from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion logger = logging.getLogger(__name__) class StackUnitSequenceGenerator(nn.Module): def __init__(self, tgt_dict, vocab_size): super().__init__() self.pad = tgt_dict.pad() self.eos = tgt_dict.eos() self.unk = tgt_dict.unk() self.offset = len(tgt_dict) - vocab_size self.vocab_size = vocab_size def pack_units(self, input: torch.Tensor, n_frames_per_step) -> torch.Tensor: if n_frames_per_step <= 1: return input bsz, _, n = input.shape assert n == n_frames_per_step scale = [ pow(self.vocab_size, n_frames_per_step - 1 - i) for i in range(n_frames_per_step) ] scale = torch.LongTensor(scale).squeeze(0).to(input.device) mask = input >= self.offset res = ((input - self.offset) * scale * mask).sum(dim=2) + self.offset return res @torch.no_grad() def generate(self, models, sample, **kwargs): # currently only support viterbi search for stacked units model = models[0] model.eval() max_len = model.max_decoder_positions() # TODO: incorporate max_len_a and max_len_b src_tokens = sample["net_input"]["src_tokens"] src_lengths = sample["net_input"]["src_lengths"] bsz, src_len, _ = src_tokens.size() n_frames_per_step = model.decoder.n_frames_per_step # initialize encoder_out = model.forward_encoder( src_tokens, src_lengths, speaker=sample["speaker"] ) incremental_state = {} pred_out, attn, scores = [], [], [] finished = 
src_tokens.new_zeros((bsz,)).bool() prev_output_tokens = src_lengths.new_zeros((bsz, 1)).long().fill_(self.eos) for _ in range(max_len): cur_out, cur_extra = model.forward_decoder( prev_output_tokens, encoder_out=encoder_out, incremental_state=incremental_state, ) lprobs = model.get_normalized_probs([cur_out], log_probs=True) # never select pad, unk lprobs[:, :, self.pad] = -math.inf lprobs[:, :, self.unk] = -math.inf cur_pred_lprob, cur_pred_out = torch.max(lprobs, dim=2) scores.append(cur_pred_lprob) pred_out.append(cur_pred_out) prev_output_tokens = torch.cat( ( prev_output_tokens, self.pack_units( cur_pred_out.view(bsz, 1, n_frames_per_step), n_frames_per_step ), ), dim=1, ) attn.append(cur_extra["attn"][0]) cur_finished = torch.any(cur_pred_out.squeeze(1) == self.eos, dim=1) finished = finished | cur_finished if finished.sum().item() == bsz: break pred_out = torch.cat(pred_out, dim=1).view(bsz, -1) attn = torch.cat(attn, dim=2) alignment = attn.max(dim=1)[1] attn = attn.repeat_interleave(n_frames_per_step, dim=2) alignment = alignment.repeat_interleave(n_frames_per_step, dim=1) scores = torch.cat(scores, dim=1) eos_idx = (pred_out == self.eos).nonzero(as_tuple=True) out_lens = src_lengths.new_zeros((bsz,)).long().fill_(max_len) for b, l in zip(eos_idx[0], eos_idx[1]): out_lens[b] = min(l, out_lens[b]) hypos = [ [ { "tokens": pred_out[b, :out_len], "attn": attn[b, :, :out_len], "alignment": alignment[b, :out_len], "positional_scores": scores[b, :out_len], "score": utils.item(scores[b, :out_len].sum().data), } ] for b, out_len in zip(range(bsz), out_lens) ] return hypos @register_task("speech_to_speech") class SpeechToSpeechTask(LegacyFairseqTask): @classmethod def add_args(cls, parser): parser.add_argument("data", help="manifest root path") parser.add_argument( "--config-yaml", type=str, default="config.yaml", help="Configuration YAML filename (under manifest root)", ) parser.add_argument( "--multitask-config-yaml", type=str, default=None, help="Configuration 
YAML filename for the multitasks (under manifest root)", ) parser.add_argument( "--max-source-positions", default=6000, type=int, metavar="N", help="max number of tokens in the source sequence", ) parser.add_argument( "--max-target-positions", default=1024, type=int, metavar="N", help="max number of tokens in the target sequence", ) parser.add_argument( "--target-is-code", action="store_true", help="set if target is discrete unit instead of spectrogram", ) parser.add_argument( "--target-code-size", type=int, default=None, help="# discrete units" ) parser.add_argument( "--n-frames-per-step", type=int, default=1, help="# stacked frames, use 0 for reduced discrete unit sequence", ) parser.add_argument("--eval-inference", action="store_true") parser.add_argument( "--eval-args", type=str, default="{}", help='generation args for speech-to-unit model , e.g., \'{"beam": 5, "max_len_a": 1}\', as JSON string', ) parser.add_argument("--eos-prob-threshold", type=float, default=0.5) parser.add_argument( "--mcd-normalize-type", type=str, default="targ", choices=["targ", "pred", "path"], ) parser.add_argument( "--vocoder", type=str, default="griffin_lim", choices=["griffin_lim", "hifigan", "code_hifigan"], ) parser.add_argument("--spec-bwd-max-iter", type=int, default=8) parser.add_argument( "--infer-target-lang", type=str, default="", help="target language for inference", ) def __init__(self, args, tgt_dict, infer_tgt_lang_id=None): super().__init__(args) self.tgt_dict = tgt_dict self.data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml) self.multitask_tasks = {} self.tgt_dict_mt = None self.eos_token_mt = None if getattr(args, "multitask_config_yaml", None) is not None: multitask_cfg = MultitaskConfig( Path(args.data) / args.multitask_config_yaml ) first_pass_task_idx = multitask_cfg.first_pass_decoder_task_index for i, (task_name, task_config) in enumerate( multitask_cfg.get_all_tasks().items() ): task_obj = DummyMultiTask( task_config, task_config.tgt_dict, 
first_pass=i == first_pass_task_idx, ) self.multitask_tasks[task_name] = task_obj if task_obj.is_first_pass_decoder: self.tgt_dict_mt = task_obj.target_dictionary if task_config.prepend_bos_and_append_tgt_lang_tag: self.eos_token_mt = task_config.eos_token assert not isinstance(self.eos_token_mt, List) if not self.eos_token_mt: raise Warning( "Please provide eos_token in --multitask-config-yaml to replace eos in sequence generator" ) self._infer_tgt_lang_id = infer_tgt_lang_id @classmethod def setup_task(cls, args, **kwargs): data_cfg = data_cfg = S2SDataConfig(Path(args.data) / args.config_yaml) tgt_dict = None infer_tgt_lang_id = None if args.target_is_code: if data_cfg.prepend_tgt_lang_tag_as_bos: # dictionary with language tags dict_path = Path(args.data) / data_cfg.vocab_filename if not dict_path.is_file(): raise FileNotFoundError( f"Dict has to be provided when setting prepend_tgt_lang_tag_as_bos: true, but dict not found: {dict_path}" ) tgt_dict = Dictionary.load(dict_path.as_posix()) # target langauge for inference if args.infer_target_lang != "": tgt_lang_tag = SpeechToTextDataset.LANG_TAG_TEMPLATE.format( args.infer_target_lang ) infer_tgt_lang_id = tgt_dict.index(tgt_lang_tag) assert infer_tgt_lang_id != tgt_dict.unk() else: assert args.target_code_size is not None tgt_dict = Dictionary() for i in range(args.target_code_size): tgt_dict.add_symbol(str(i)) logger.info(f"dictionary size: " f"{len(tgt_dict):,}") if getattr(args, "train_subset", None) is not None: if not all(s.startswith("train") for s in args.train_subset.split(",")): raise ValueError('Train splits should be named like "train*".') assert args.n_frames_per_step >= 1 assert ( not args.eval_inference or (args.target_is_code and args.vocoder == "code_hifigan") or (not args.target_is_code and args.vocoder != "code_hifigan") ) return cls(args, tgt_dict, infer_tgt_lang_id=infer_tgt_lang_id) def build_criterion(self, args): from fairseq import criterions if len(self.multitask_tasks) > 0: if 
self.args.target_is_code and not args._name.startswith("speech_to_unit"): raise ValueError( "set --criterion speech_to_unit for speech-to-unit loss with multitask" ) elif not self.args.target_is_code and not args._name.startswith( "speech_to_spectrogram" ): raise ValueError( "set --criterion speech_to_spectrogram for speech-to-spectrogram loss with multitask" ) return criterions.build_criterion(args, self) def load_dataset(self, split, epoch=1, combine=False, **kwargs): self.datasets[split] = SpeechToSpeechDatasetCreator.from_tsv( root=self.args.data, data_cfg=self.data_cfg, splits=split, is_train_split=split.startswith("train"), epoch=epoch, seed=self.args.seed, target_is_code=self.args.target_is_code, tgt_dict=self.target_dictionary, n_frames_per_step=self.args.n_frames_per_step, multitask=self.multitask_tasks, ) @property def target_dictionary(self): return self.tgt_dict @property def target_dictionary_mt(self): return self.tgt_dict_mt @property def source_dictionary(self): return None def max_positions(self): return self.args.max_source_positions, self.args.max_target_positions def build_model(self, args, from_checkpoint=False): args.input_feat_per_channel = self.data_cfg.input_feat_per_channel args.input_channels = self.data_cfg.input_transformed_channels args.target_speaker_embed = self.data_cfg.target_speaker_embed is not None args.n_frames_per_step = self.args.n_frames_per_step model = super().build_model(args, from_checkpoint) if len(self.multitask_tasks) > 0: from fairseq.models.speech_to_speech.s2s_transformer import ( S2STransformerMultitaskModelBase, ) assert isinstance(model, S2STransformerMultitaskModelBase) if self.args.eval_inference: self.eval_gen_args = json.loads(self.args.eval_args) self.generator = self.build_generator( [model], Namespace(**self.eval_gen_args) ) return model def build_generator_dual_decoder( self, models, args, extra_gen_cls_kwargs=None, ): from examples.speech_to_speech.unity.sequence_generator_multi_decoder import ( 
MultiDecoderSequenceGenerator, ) return MultiDecoderSequenceGenerator( models, self.target_dictionary, self.target_dictionary_mt, beam_size=max(1, getattr(args, "beam", 1)), beam_size_mt=max(1, getattr(args, "beam_mt", 1)), max_len_a=getattr(args, "max_len_a", 0), max_len_b=getattr(args, "max_len_b", 200), max_len_a_mt=getattr(args, "max_len_a_mt", 0), max_len_b_mt=getattr(args, "max_len_b_mt", 200), min_len=getattr(args, "min_len", 1), normalize_scores=(not getattr(args, "unnormalized", False)), len_penalty=getattr(args, "lenpen", 1), unk_penalty=getattr(args, "unkpen", 0), temperature=getattr(args, "temperature", 1.0), match_source_len=getattr(args, "match_source_len", False), no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), **extra_gen_cls_kwargs, ) def build_generator( self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, ): if not self.args.target_is_code or self.args.eval_inference: from fairseq.models.text_to_speech.vocoder import get_vocoder self.vocoder = get_vocoder(self.args, self.data_cfg) self.vocoder = ( self.vocoder.cuda() if torch.cuda.is_available() and not self.args.cpu else self.vocoder.cpu() ) has_dual_decoder = getattr(models[0], "mt_task_name", None) is not None if self.args.target_is_code: if self.args.n_frames_per_step == 1: if has_dual_decoder: seq_generator = self.build_generator_dual_decoder( models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs, ) else: seq_generator = super().build_generator( models, args, seq_gen_cls=None, extra_gen_cls_kwargs=extra_gen_cls_kwargs, ) else: assert ( getattr(args, "beam", 1) == 1 and getattr(args, "nbest", 1) == 1 ), "only support viterbi search for stacked units" seq_generator = StackUnitSequenceGenerator( self.tgt_dict, self.args.target_code_size, ) else: if has_dual_decoder: if getattr(args, "teacher_forcing", False): raise NotImplementedError else: from fairseq.speech_generator import MultiDecoderSpeechGenerator generator = MultiDecoderSpeechGenerator lang_token_ids_aux = { 
i for s, i in self.tgt_dict_mt.indices.items() if TextTargetMultitaskData.is_lang_tag(s) } if extra_gen_cls_kwargs is None: extra_gen_cls_kwargs = {} extra_gen_cls_kwargs[ "symbols_to_strip_from_output" ] = lang_token_ids_aux eos_id_mt = ( self.tgt_dict_mt.index(self.eos_token_mt) if self.eos_token_mt else None ) assert eos_id_mt != self.tgt_dict_mt.unk() extra_gen_cls_kwargs["eos_mt"] = eos_id_mt seq_generator = generator( models, args, self.vocoder, self.data_cfg, self.target_dictionary_mt, max_iter=self.args.max_target_positions, eos_prob_threshold=self.args.eos_prob_threshold, **extra_gen_cls_kwargs, ) else: if getattr(args, "teacher_forcing", False): from fairseq.speech_generator import ( TeacherForcingAutoRegressiveSpeechGenerator, ) generator = TeacherForcingAutoRegressiveSpeechGenerator logger.info("Teacher forcing mode for generation") else: from fairseq.speech_generator import AutoRegressiveSpeechGenerator generator = AutoRegressiveSpeechGenerator seq_generator = generator( models[0], self.vocoder, self.data_cfg, max_iter=self.args.max_target_positions, eos_prob_threshold=self.args.eos_prob_threshold, ) return seq_generator def train_step( self, sample, model, criterion, optimizer, update_num, ignore_grad=False ): for task_name, task_obj in self.multitask_tasks.items(): criterion.set_multitask_loss_weight( task_name, task_obj.args.get_loss_weight(update_num) ) if task_name in model.multitask_decoders: model.multitask_decoders[task_name].train() loss, sample_size, logging_output = super().train_step( sample, model, criterion, optimizer, update_num, ignore_grad ) return loss, sample_size, logging_output def valid_step(self, sample, model, criterion): for task_name in self.multitask_tasks.keys(): if task_name in model.multitask_decoders: model.multitask_decoders[task_name].eval() loss, sample_size, logging_output = super().valid_step(sample, model, criterion) if self.args.eval_inference: hypos, inference_losses = self.valid_step_with_inference( sample, 
model, self.generator ) for k, v in inference_losses.items(): assert k not in logging_output logging_output[k] = v return loss, sample_size, logging_output def valid_step_with_inference(self, sample, model, generator): if self.args.target_is_code: hypos = generator.generate([model], sample) tgt_lens = ( sample["target_lengths"] - 1 ) * self.args.n_frames_per_step # strip <eos> for b, (f, l) in enumerate(zip(sample["target"], tgt_lens)): hypos[b][0]["targ_waveform"] = self.vocoder( {"code": f[:l] - 4}, # remove <bos>, <pad>, <eos>, <unk> dur_prediction=self.eval_gen_args.get("dur_prediction", False), ) if len(hypos[b][0]["tokens"]) > 0: hypos[b][0]["waveform"] = self.vocoder( {"code": hypos[b][0]["tokens"] - 4}, dur_prediction=self.eval_gen_args.get("dur_prediction", False), ) else: hypos[b][0]["waveform"] = torch.flip( hypos[b][0]["targ_waveform"], dims=[0] ) else: hypos = [ [hypo] for hypo in generator.generate(model, sample, has_targ=True) ] losses = { "mcd_loss": 0.0, "targ_frames": 0.0, "pred_frames": 0.0, "path_frames": 0.0, "nins": 0.0, "ndel": 0.0, } rets = batch_mel_cepstral_distortion( [hypo[0]["targ_waveform"] for hypo in hypos], [hypo[0]["waveform"] for hypo in hypos], self.data_cfg.output_sample_rate, normalize_type=None, ) for d, extra in rets: pathmap = extra[-1] losses["mcd_loss"] += d.item() losses["targ_frames"] += pathmap.size(0) losses["pred_frames"] += pathmap.size(1) losses["path_frames"] += pathmap.sum().item() losses["nins"] += (pathmap.sum(dim=1) - 1).sum().item() losses["ndel"] += (pathmap.sum(dim=0) - 1).sum().item() losses["norm_frames"] = losses[ f"{getattr(self.args, 'mcd_normalize_type', 'targ')}_frames" ] return hypos, losses def inference_step( self, generator, models, sample, prefix_tokens=None, constraints=None ): with torch.no_grad(): if self._infer_tgt_lang_id is not None: return generator.generate( models, sample, prefix_tokens=prefix_tokens, constraints=constraints, bos_token=self._infer_tgt_lang_id, ) else: return 
super().inference_step( generator, models, sample, prefix_tokens=prefix_tokens, constraints=constraints, ) ================================================ FILE: fairseq/tasks/speech_to_text.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import logging from argparse import Namespace from pathlib import Path from typing import List from fairseq.data import Dictionary, encoders from fairseq.data.audio.audio_utils import get_features_or_waveform from fairseq.data.audio.data_cfg import MultitaskConfig from fairseq.data.audio.speech_to_text_dataset import ( S2TDataConfig, SpeechToTextDataset, SpeechToTextDatasetCreator, TextTargetMultitaskData, ) from fairseq.tasks import LegacyFairseqTask, register_task logger = logging.getLogger(__name__) @register_task("speech_to_text") class SpeechToTextTask(LegacyFairseqTask): @classmethod def add_args(cls, parser): parser.add_argument("data", help="manifest root path") parser.add_argument( "--config-yaml", type=str, default="config.yaml", help="Configuration YAML filename (under manifest root)", ) parser.add_argument( "--multitask-config-yaml", type=str, default=None, help="Configuration YAML filename for the multitasks (under manifest root)", ) parser.add_argument( "--max-source-positions", default=6000, type=int, metavar="N", help="max number of tokens in the source sequence", ) parser.add_argument( "--max-target-positions", default=1024, type=int, metavar="N", help="max number of tokens in the target sequence", ) def __init__(self, args, tgt_dict): super().__init__(args) self.tgt_dict = tgt_dict self.data_cfg = S2TDataConfig(Path(args.data) / args.config_yaml) self.speaker_to_id = self._get_speaker_to_id() if ( self.data_cfg.prepend_tgt_lang_tag and self.data_cfg.prepend_bos_and_append_tgt_lang_tag ): raise ValueError( "Please set only one of the 
two options to avoid adding target token multiple times" ) self.multitask_tasks = {} self.tgt_dict_mt = None self.eos_token_mt = None if getattr(args, "multitask_config_yaml", None) is not None: multitask_cfg = MultitaskConfig( Path(args.data) / args.multitask_config_yaml ) first_pass_task_idx = multitask_cfg.first_pass_decoder_task_index for i, (task_name, task_config) in enumerate( multitask_cfg.get_all_tasks().items() ): task_obj = DummyMultiTask( task_config, task_config.tgt_dict, first_pass=i == first_pass_task_idx, ) self.multitask_tasks[task_name] = task_obj if task_obj.is_first_pass_decoder: self.tgt_dict_mt = task_obj.target_dictionary if task_config.prepend_bos_and_append_tgt_lang_tag: self.eos_token_mt = task_config.eos_token assert not isinstance(self.eos_token_mt, List) if not self.eos_token_mt: raise Warning( "Please provide eos_token in --multitask-config-yaml to replace eos in sequence generator" ) def _get_speaker_to_id(self): speaker_to_id = None speaker_set_filename = self.data_cfg.config.get("speaker_set_filename") if speaker_set_filename is not None: speaker_set_path = Path(self.args.data) / speaker_set_filename with open(speaker_set_path) as f: speaker_to_id = {r.strip(): i for i, r in enumerate(f)} return speaker_to_id @classmethod def setup_task(cls, args, **kwargs): data_cfg = S2TDataConfig(Path(args.data) / args.config_yaml) dict_path = Path(args.data) / data_cfg.vocab_filename if not dict_path.is_file(): raise FileNotFoundError(f"Dict not found: {dict_path.as_posix()}") tgt_dict = Dictionary.load(dict_path.as_posix()) logger.info( f"dictionary size ({data_cfg.vocab_filename}): " f"{len(tgt_dict):,}" ) if getattr(args, "train_subset", None) is not None: if not all(s.startswith("train") for s in args.train_subset.split(",")): raise ValueError('Train splits should be named like "train*".') return cls(args, tgt_dict) def build_criterion(self, args): from fairseq import criterions if self.data_cfg.prepend_tgt_lang_tag and 
args.ignore_prefix_size != 1: raise ValueError( 'Please set "--ignore-prefix-size 1" since ' "target language ID token is prepended as BOS." ) return criterions.build_criterion(args, self) def load_dataset(self, split, epoch=1, combine=False, **kwargs): is_train_split = split.startswith("train") pre_tokenizer = self.build_tokenizer(self.args) bpe_tokenizer = self.build_bpe(self.args) self.datasets[split] = SpeechToTextDatasetCreator.from_tsv( root=self.args.data, cfg=self.data_cfg, splits=split, tgt_dict=self.tgt_dict, pre_tokenizer=pre_tokenizer, bpe_tokenizer=bpe_tokenizer, is_train_split=is_train_split, epoch=epoch, seed=self.args.seed, speaker_to_id=self.speaker_to_id, multitask=self.multitask_tasks, ) @property def target_dictionary(self): return self.tgt_dict @property def target_dictionary_mt(self): return self.tgt_dict_mt @property def source_dictionary(self): return None def max_positions(self): return self.args.max_source_positions, self.args.max_target_positions def build_model(self, args, from_checkpoint=False): args.input_feat_per_channel = self.data_cfg.input_feat_per_channel args.input_channels = self.data_cfg.input_channels args.speaker_to_id = self.speaker_to_id return super(SpeechToTextTask, self).build_model(args, from_checkpoint) def build_generator_dual_decoder( self, models, args, extra_gen_cls_kwargs, ): from examples.speech_to_speech.unity.sequence_generator_multi_decoder import ( MultiDecoderSequenceGenerator, ) lang_token_ids_aux = { i for s, i in self.tgt_dict_mt.indices.items() if TextTargetMultitaskData.is_lang_tag(s) } extra_gen_cls_kwargs["symbols_to_strip_from_output"].update(lang_token_ids_aux) eos_id_mt = ( self.tgt_dict_mt.index(self.eos_token_mt) if self.eos_token_mt else None ) assert eos_id_mt != self.tgt_dict_mt.unk() extra_gen_cls_kwargs["eos_mt"] = eos_id_mt return MultiDecoderSequenceGenerator( models, self.target_dictionary, self.target_dictionary_mt, beam_size=max(1, getattr(args, "beam", 1)), beam_size_mt=max(1, 
getattr(args, "beam_mt", 1)), max_len_a=getattr(args, "max_len_a", 0), max_len_b=getattr(args, "max_len_b", 200), max_len_a_mt=getattr(args, "max_len_a_mt", 0), max_len_b_mt=getattr(args, "max_len_b_mt", 0), min_len=getattr(args, "min_len", 1), normalize_scores=(not getattr(args, "unnormalized", False)), len_penalty=getattr(args, "lenpen", 1), len_penalty_mt=getattr(args, "lenpen_mt", 1), unk_penalty=getattr(args, "unkpen", 0), temperature=getattr(args, "temperature", 1.0), match_source_len=getattr(args, "match_source_len", False), no_repeat_ngram_size=getattr(args, "no_repeat_ngram_size", 0), **extra_gen_cls_kwargs, ) def build_generator( self, models, args, seq_gen_cls=None, extra_gen_cls_kwargs=None, ): if self.data_cfg.prepend_tgt_lang_tag and args.prefix_size != 1: raise ValueError( 'Please set "--prefix-size 1" since ' "target language ID token is prepended as BOS." ) lang_token_ids = { i for s, i in self.tgt_dict.indices.items() if SpeechToTextDataset.is_lang_tag(s) } if extra_gen_cls_kwargs is None: extra_gen_cls_kwargs = {} extra_gen_cls_kwargs["symbols_to_strip_from_output"] = lang_token_ids eos_token = ( args.eos_token if "eos_token" in args and args.eos_token is not None else self.data_cfg.config.get("eos_token", None) ) if self.data_cfg.prepend_bos_and_append_tgt_lang_tag and not eos_token: raise Warning( "Please provide --eos_token to replace eos in sequence generator" ) eos_id = self.tgt_dict.index(eos_token) if eos_token else None extra_gen_cls_kwargs["eos"] = eos_id has_dual_decoder = getattr(models[0], "mt_task_name", None) is not None if has_dual_decoder: return self.build_generator_dual_decoder( models, args, extra_gen_cls_kwargs=extra_gen_cls_kwargs, ) else: return super().build_generator( models, args, seq_gen_cls=None, extra_gen_cls_kwargs=extra_gen_cls_kwargs, ) def train_step( self, sample, model, criterion, optimizer, update_num, ignore_grad=False ): for task_name, task_obj in self.multitask_tasks.items(): 
def valid_step(self, sample, model, criterion):
    """Run one validation step with multitask decoders in eval mode.

    Every multitask decoder of ``model`` whose name is registered in
    ``self.multitask_tasks`` is switched to ``eval()`` before the parent
    ``valid_step`` is invoked.

    Returns:
        The ``(loss, sample_size, logging_output)`` triple produced by the
        parent implementation.
    """
    for name, _task in self.multitask_tasks.items():
        decoders = model.multitask_decoders
        if name in decoders:
            decoders[name].eval()

    step_result = super().valid_step(sample, model, criterion)
    loss, sample_size, logging_output = step_result
    return loss, sample_size, logging_output
class UnitDictionary(Dictionary):
    """
    A fixed-size Dictionary for integer-valued tokens.

    The ``n_units`` unit symbols ``"0" .. str(n_units - 1)`` are added
    first, followed by the special symbols (bos, pad, eos, unk) and any
    extra special symbols, so special symbols are registered after all
    unit symbols.
    """

    def __init__(
        self,
        *,  # begin keyword-only arguments
        n_units,
        bos="<s>",
        pad="<pad>",
        eos="</s>",
        unk="<unk>",
        extra_special_symbols=None,
        clip=False,
    ):
        """Populate the symbol table.

        Args:
            n_units: number of integer unit symbols to register.
            bos, pad, eos, unk: strings used for the special symbols.
            extra_special_symbols: optional iterable of additional symbols
                appended after the standard special symbols.
            clip: if true, ``encode_line`` caps values at ``n_units - 1``.
        """
        self.n_units = n_units
        self.clip = clip
        self.bos_word = bos
        self.unk_word = unk
        self.pad_word = pad
        self.eos_word = eos

        self.symbols, self.count, self.indices = [], [], {}

        # unit symbols come first, in numeric order
        for unit in map(str, range(n_units)):
            self.add_symbol(unit)

        self.bos_index = self.add_symbol(bos)
        self.pad_index = self.add_symbol(pad)
        self.eos_index = self.add_symbol(eos)
        self.unk_index = self.add_symbol(unk)

        for extra in extra_special_symbols or ():
            self.add_symbol(extra)

        self.nspecial = len(self.symbols)

    def encode_line(self, line, append_eos=True, prepend_bos=False) -> torch.IntTensor:
        """Convert a whitespace-separated line of integers into a tensor.

        Values are capped at ``n_units - 1`` when ``self.clip`` is set; the
        bos / eos indices are added on request.
        """
        token_ids = list(map(int, line.split()))
        if self.clip:
            ceiling = self.n_units - 1
            token_ids = [min(ceiling, tok) for tok in token_ids]
        if prepend_bos:
            token_ids.insert(0, self.bos_index)
        if append_eos:
            token_ids.append(self.eos_index)
        return torch.IntTensor(token_ids)
@register_task("speech_unit_modeling", dataclass=SpeechUnitModelingConfig)
class SpeechUnitLanguageModelingTask(FairseqTask):
    """Language-modeling task over three streams: token, duration and f0.

    The token stream always has a :class:`UnitDictionary`. The duration
    and f0 streams only get one when ``cfg.discrete_duration`` /
    ``cfg.discrete_f0`` is set; otherwise their dictionary is ``None`` and
    the corresponding channel size is 1. Source and target share the same
    dictionary object for every stream.
    """

    def __init__(self, cfg: SpeechUnitModelingConfig) -> None:
        super().__init__(cfg)
        # normalizing by the speaker stddev requires mean normalization too
        assert not self.cfg.normalize_f0_std or self.cfg.normalize_f0_mean

        self.data_config = ExpressiveCodeDataConfig(cfg.data)
        self._source_dictionary = self._target_dictionary = UnitDictionary(
            n_units=self.data_config.n_units
        )
        self._source_duration_dictionary = self._target_duration_dictionary = (
            UnitDictionary(n_units=self.cfg.max_token_duration + 1, clip=True)
            if self.cfg.discrete_duration
            else None
        )
        self._source_f0_dictionary = self._target_f0_dictionary = (
            UnitDictionary(n_units=self.data_config.f0_vq_n_units)
            if self.cfg.discrete_f0
            else None
        )

        self._channel_names = ["token", "duration", "f0"]
        self._channel_sizes = [
            len(self.target_dictionary),
            len(self.target_duration_dictionary) if self.cfg.discrete_duration else 1,
            len(self.target_f0_dictionary) if self.cfg.discrete_f0 else 1,
        ]

    @property
    def source_dictionary(self) -> Optional[Dictionary]:
        """Dictionary of the token stream (input side)."""
        return self._source_dictionary

    @property
    def source_duration_dictionary(self) -> Optional[Dictionary]:
        """Dictionary of the duration stream, or ``None`` if continuous."""
        return self._source_duration_dictionary

    @property
    def source_f0_dictionary(self) -> Optional[Dictionary]:
        """Dictionary of the f0 stream, or ``None`` if continuous."""
        return self._source_f0_dictionary

    @property
    def channel_names(self) -> List[str]:
        """Names of the streams, in channel order."""
        return self._channel_names

    @property
    def channel_sizes(self) -> List[int]:
        """Per-channel size: dictionary length if discrete, else 1."""
        return self._channel_sizes

    @property
    def dictionary(self) -> Optional[Dictionary]:
        """Alias of :attr:`source_dictionary`."""
        return self._source_dictionary

    @property
    def target_dictionary(self) -> Optional[Dictionary]:
        """Dictionary of the token stream (output side)."""
        return self._target_dictionary

    @property
    def target_duration_dictionary(self) -> Optional[Dictionary]:
        """Target dictionary of the duration stream, or ``None``."""
        return self._target_duration_dictionary

    @property
    def target_f0_dictionary(self) -> Optional[Dictionary]:
        """Target dictionary of the f0 stream, or ``None``."""
        return self._target_f0_dictionary

    @property
    def dictionaries(self) -> List[Dictionary]:
        """Dictionaries of the discrete streams, in channel order.

        The previous body read ``self._dictionaries`` and
        ``self.cfg.labels``, neither of which this task or its config
        defines, so every access raised ``AttributeError``. The list is
        now assembled from the dictionaries this task actually owns;
        streams modeled as continuous (dictionary ``None``) are omitted.
        NOTE(review): confirm this is the intended meaning for callers.
        """
        candidates = (
            self._target_dictionary,
            self._target_duration_dictionary,
            self._target_f0_dictionary,
        )
        return [d for d in candidates if d is not None]

    @classmethod
    def setup_task(
        cls, cfg: SpeechUnitModelingConfig, **kwargs
    ) -> "SpeechUnitLanguageModelingTask":
        """Instantiate the task from its config; ``kwargs`` are ignored."""
        return cls(cfg)

    def load_dataset(self, split: str, **kwargs) -> None:
        """Build the :class:`CodeDataset` for ``split`` from its manifest
        and store it in ``self.datasets[split]``."""
        self.datasets[split] = CodeDataset(
            manifest=self.data_config.manifests[split],
            dictionary=self.source_dictionary,
            dur_dictionary=self.source_duration_dictionary,
            f0_dictionary=self.source_f0_dictionary,
            config=self.data_config,
            discrete_dur=self.cfg.discrete_duration,
            discrete_f0=self.cfg.discrete_f0,
            log_f0=self.cfg.log_f0,
            normalize_f0_mean=self.cfg.normalize_f0_mean,
            normalize_f0_std=self.cfg.normalize_f0_std,
            interpolate_f0=self.cfg.interpolate_f0,
            shifts=self.cfg.stream_shifts,
        )

    def max_positions(self) -> Tuple[int, int]:
        """Return ``(sys.maxsize, sys.maxsize)``: no task-level limit."""
        return (sys.maxsize, sys.maxsize)

    def build_criterion(self, cfg: DictConfig):
        """Build the criterion through ``fairseq.criterions``."""
        # imported here rather than at module level, as in the original
        import fairseq.criterions

        return fairseq.criterions.build_criterion(cfg, self)
import logging import os import os.path as op import torch import torch.nn.functional as F import numpy as np from fairseq.data.audio.text_to_speech_dataset import TextToSpeechDatasetCreator from fairseq.tasks import register_task from fairseq.tasks.speech_to_text import SpeechToTextTask from fairseq.speech_generator import ( AutoRegressiveSpeechGenerator, NonAutoregressiveSpeechGenerator, TeacherForcingAutoRegressiveSpeechGenerator, ) logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO, ) logger = logging.getLogger(__name__) try: from tensorboardX import SummaryWriter except ImportError: logger.info("Please install tensorboardX: pip install tensorboardX") SummaryWriter = None @register_task("text_to_speech") class TextToSpeechTask(SpeechToTextTask): @staticmethod def add_args(parser): parser.add_argument("data", help="manifest root path") parser.add_argument( "--config-yaml", type=str, default="config.yaml", help="Configuration YAML filename (under manifest root)", ) parser.add_argument( "--max-source-positions", default=1024, type=int, metavar="N", help="max number of tokens in the source sequence", ) parser.add_argument( "--max-target-positions", default=1200, type=int, metavar="N", help="max number of tokens in the target sequence", ) parser.add_argument("--n-frames-per-step", type=int, default=1) parser.add_argument("--eos-prob-threshold", type=float, default=0.5) parser.add_argument("--eval-inference", action="store_true") parser.add_argument("--eval-tb-nsample", type=int, default=8) parser.add_argument("--vocoder", type=str, default="griffin_lim") parser.add_argument("--spec-bwd-max-iter", type=int, default=8) def __init__(self, args, src_dict): super().__init__(args, src_dict) self.src_dict = src_dict self.sr = self.data_cfg.config.get("features").get("sample_rate") self.tensorboard_writer = None self.tensorboard_dir = "" if args.tensorboard_logdir and SummaryWriter is not None: 
def get_speaker_embeddings_path(self):
    """Return the path of the speaker-embedding file, if one is configured.

    The file name is read from the ``speaker_emb_filename`` entry of the
    data config and joined onto ``self.args.data``. ``None`` is returned
    when the entry is absent or ``None``.
    """
    filename = self.data_cfg.config.get("speaker_emb_filename")
    if filename is None:
        return None
    return op.join(self.args.data, filename)
def build_generator(self, models, cfg, vocoder=None, **unused):
    """Build the speech generator for ``models[0]``.

    A default vocoder is created when none is supplied. Models flagged
    with a truthy ``NON_AUTOREGRESSIVE`` attribute get a
    ``NonAutoregressiveSpeechGenerator``; all others get the
    autoregressive generator, or its teacher-forcing variant when
    ``cfg.teacher_forcing`` is truthy.
    """
    if vocoder is None:
        vocoder = self.build_default_vocoder()

    model = models[0]
    if getattr(model, "NON_AUTOREGRESSIVE", False):
        return NonAutoregressiveSpeechGenerator(model, vocoder, self.data_cfg)

    use_teacher_forcing = getattr(cfg, "teacher_forcing", False)
    if use_teacher_forcing:
        generator_cls = TeacherForcingAutoRegressiveSpeechGenerator
        logger.info("Teacher forcing mode for generation")
    else:
        generator_cls = AutoRegressiveSpeechGenerator

    return generator_cls(
        model,
        vocoder,
        self.data_cfg,
        max_iter=self.args.max_target_positions,
        eos_prob_threshold=self.args.eos_prob_threshold,
    )
def save_figure_to_numpy(fig):
    """Return the rendered pixels of ``fig`` as a ``(height, width, 3)`` array.

    The canvas is read as-is; the caller is responsible for drawing it
    first. The result is a writable ``uint8`` RGB array.

    Args:
        fig: a figure-like object whose ``canvas`` provides
            ``get_width_height()`` and either ``tostring_rgb()`` or
            ``buffer_rgba()``.
    """
    canvas = fig.canvas
    # get_width_height() yields (width, height); image arrays are (rows, cols)
    height_width = tuple(canvas.get_width_height())[::-1]

    render_rgb = getattr(canvas, "tostring_rgb", None)
    if render_rgb is not None:
        # np.frombuffer replaces the deprecated binary mode of
        # np.fromstring(..., sep=""); copy() keeps the result writable.
        pixels = np.frombuffer(render_rgb(), dtype=np.uint8)
        return pixels.reshape(height_width + (3,)).copy()

    # Canvases without tostring_rgb expose RGBA data; drop the alpha channel.
    rgba = np.asarray(canvas.buffer_rgba(), dtype=np.uint8)
    return rgba.reshape(height_width + (4,))[..., :3].copy()
def get_divisor(pathmap, normalize_type):
    """Return the normalization divisor selected by ``normalize_type``.

    * ``None``   -> 1 (no normalization)
    * ``"len1"`` -> ``pathmap.size(0)``
    * ``"len2"`` -> ``pathmap.size(1)``
    * ``"path"`` -> sum of all entries of ``pathmap``

    Raises:
        ValueError: for any other ``normalize_type``.
    """
    if normalize_type is None:
        return 1

    # the two length-based modes differ only in the axis they read
    for axis, tag in enumerate(("len1", "len2")):
        if normalize_type == tag:
            return pathmap.size(axis)

    if normalize_type == "path":
        return pathmap.sum().item()

    raise ValueError(f"normalize_type {normalize_type} not supported")
def batch_mel_cepstral_distortion(y1, y2, sr, normalize_type="path", mfcc_fn=None):
    """
    https://arxiv.org/pdf/2011.03568.pdf

    Mel cepstral distortion between paired waveforms: the root mean
    squared error on 13-dimensional MFCC, with DTW used for alignment.
    MFCC features come from an 80-channel log-mel spectrogram using a 50ms
    Hann window and a hop of 12.5ms.

    y1: list of waveforms
    y2: list of waveforms
    sr: sampling rate
    normalize_type: normalization mode forwarded to batch_compute_distortion
    mfcc_fn: optional MFCC transform; rebuilt when missing or when its
        sample_rate differs from sr
    """
    try:
        import torchaudio
    except ImportError:
        raise ImportError("Please install torchaudio: pip install torchaudio")

    needs_new_transform = mfcc_fn is None or mfcc_fn.sample_rate != sr
    if needs_new_transform:
        window = int(0.05 * sr)  # 50ms, used for both n_fft and win_length
        hop = int(0.0125 * sr)  # 12.5ms
        mel_options = {
            "n_fft": window,
            "win_length": window,
            "hop_length": hop,
            "f_min": 20,
            "n_mels": 80,
            "window_fn": torch.hann_window,
        }
        transform = torchaudio.transforms.MFCC(
            sr, n_mfcc=13, log_mels=True, melkwargs=mel_options
        )
        mfcc_fn = transform.to(y1[0].device)

    def extract_features(waveform):
        # swap the last two axes of the MFCC output before distances are computed
        return mfcc_fn(waveform).transpose(-1, -2)

    return batch_compute_distortion(
        y1,
        y2,
        sr,
        extract_features,
        compute_rms_dist,
        normalize_type,
    )
lang, data_path): filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) src_dataset = data_utils.load_indexed_dataset( prefix + src, src_dict, dataset_impl ) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset( prefix + tgt, tgt_dict, dataset_impl ) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info( "{} {} {}-{} {} examples".format( data_path, split_k, src, tgt, len(src_datasets[-1]) ) ) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) elif prepend_bos_src is not None: logger.info(f"prepending src bos: {prepend_bos_src}") src_dataset = 
@dataclass
class TranslationConfig(FairseqDataclass):
    """Configuration options for the ``translation`` task."""

    # --- data location and language pair ---
    data: Optional[str] = field(
        default=None,
        metadata={
            "help": "colon separated path to data directories list, will be iterated upon during epochs "
            "in round-robin manner; however, valid and test data are always in the first directory "
            "to avoid the need for repeating them in all directories"
        },
    )
    source_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "source language",
            "argparse_alias": "-s",
        },
    )
    target_lang: Optional[str] = field(
        default=None,
        metadata={
            "help": "target language",
            "argparse_alias": "-t",
        },
    )

    # --- dataset construction ---
    load_alignments: bool = field(
        default=False, metadata={"help": "load the binarized alignments"}
    )
    left_pad_source: bool = field(
        default=True, metadata={"help": "pad the source on the left"}
    )
    left_pad_target: bool = field(
        default=False, metadata={"help": "pad the target on the left"}
    )
    max_source_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the source sequence"}
    )
    max_target_positions: int = field(
        default=1024, metadata={"help": "max number of tokens in the target sequence"}
    )
    upsample_primary: int = field(
        default=-1, metadata={"help": "the amount of upsample primary dataset"}
    )
    truncate_source: bool = field(
        default=False, metadata={"help": "truncate source to max-source-positions"}
    )
    num_batch_buckets: int = field(
        default=0,
        metadata={
            "help": "if >0, then bucket source and target lengths into "
            "N buckets and pad accordingly; this is useful on TPUs to minimize the number of compilations"
        },
    )

    # --- values interpolated from the dataset config group ---
    train_subset: str = II("dataset.train_subset")
    dataset_impl: Optional[ChoiceEnum(get_available_dataset_impl())] = II(
        "dataset.dataset_impl"
    )
    required_seq_len_multiple: int = II("dataset.required_seq_len_multiple")

    # options for reporting BLEU during validation
    eval_bleu: bool = field(
        default=False, metadata={"help": "evaluation with BLEU scores"}
    )
    eval_bleu_args: Optional[str] = field(
        default="{}",
        metadata={
            # help text previously misspelled the metric as "BLUE"
            "help": 'generation args for BLEU scoring, e.g., \'{"beam": 4, "lenpen": 0.6}\', as JSON string'
        },
    )
    eval_bleu_detok: str = field(
        default="space",
        metadata={
            "help": "detokenize before computing BLEU (e.g., 'moses'); required if using --eval-bleu; "
            "use 'space' to disable detokenization; see fairseq.data.encoders for other options"
        },
    )
    eval_bleu_detok_args: Optional[str] = field(
        default="{}",
        metadata={"help": "args for building the tokenizer, if needed, as JSON string"},
    )
    eval_tokenized_bleu: bool = field(
        default=False, metadata={"help": "compute tokenized BLEU instead of sacrebleu"}
    )
    eval_bleu_remove_bpe: Optional[str] = field(
        default=None,
        metadata={
            "help": "remove BPE before computing BLEU",
            "argparse_const": "@@ ",
        },
    )
    eval_bleu_print_samples: bool = field(
        default=False, metadata={"help": "print sample generations during validation"}
    )
@classmethod
def setup_task(cls, cfg: TranslationConfig, **kwargs):
    """Set up the task: resolve the language pair and load dictionaries.

    Args:
        cfg (TranslationConfig): task configuration. ``cfg.source_lang``
            and ``cfg.target_lang`` are filled in from the first data
            path when either is missing.

    Raises:
        Exception: if the language pair is not given and cannot be
            inferred.
    """
    paths = utils.split_paths(cfg.data)
    assert len(paths) > 0
    primary_path = paths[0]

    # find language pair automatically
    pair_missing = cfg.source_lang is None or cfg.target_lang is None
    if pair_missing:
        cfg.source_lang, cfg.target_lang = data_utils.infer_language_pair(
            primary_path
        )
    if cfg.source_lang is None or cfg.target_lang is None:
        raise Exception(
            "Could not infer language pair, please provide it explicitly"
        )

    # load dictionaries
    src_dict = cls.load_dictionary(
        os.path.join(primary_path, f"dict.{cfg.source_lang}.txt")
    )
    tgt_dict = cls.load_dictionary(
        os.path.join(primary_path, f"dict.{cfg.target_lang}.txt")
    )

    # both sides must agree on the ids of the shared special symbols
    for special in ("pad", "eos", "unk"):
        assert getattr(src_dict, special)() == getattr(tgt_dict, special)()

    for lang, dictionary in ((cfg.source_lang, src_dict), (cfg.target_lang, tgt_dict)):
        logger.info(f"[{lang}] dictionary: {len(dictionary)} types")

    return cls(cfg, src_dict, tgt_dict)
def build_model(self, cfg, from_checkpoint=False):
    """Build the model and, if BLEU evaluation is on, its helpers.

    When ``self.cfg.eval_bleu`` is set, a detokenizer is stored in
    ``self.tokenizer`` and a sequence generator for the new model in
    ``self.sequence_generator``; both are configured from JSON strings in
    the task config.
    """
    model = super().build_model(cfg, from_checkpoint)
    if not self.cfg.eval_bleu:
        return model

    detok_options = json.loads(self.cfg.eval_bleu_detok_args)
    tokenizer_spec = Namespace(tokenizer=self.cfg.eval_bleu_detok, **detok_options)
    self.tokenizer = encoders.build_tokenizer(tokenizer_spec)

    generation_options = json.loads(self.cfg.eval_bleu_args)
    self.sequence_generator = self.build_generator(
        [model], Namespace(**generation_options)
    )
    return model
def reduce_metrics(self, logging_outputs, criterion):
    """Aggregate logging outputs and, when BLEU eval is on, log a derived BLEU.

    The parent implementation runs first. If ``self.cfg.eval_bleu`` is set,
    the per-order ``_bleu_counts_<i>`` / ``_bleu_totals_<i>`` entries and the
    ``_bleu_sys_len`` / ``_bleu_ref_len`` entries are summed over
    ``logging_outputs`` and logged; a derived ``bleu`` metric is registered
    only when at least one total is positive.

    Args:
        logging_outputs: iterable of dict-like logging outputs (read with ``.get``).
        criterion: forwarded to the parent implementation.
    """
    super().reduce_metrics(logging_outputs, criterion)
    if self.cfg.eval_bleu:

        def sum_logs(key):
            # Sum ``key`` over all logging outputs; a missing key counts as 0.
            import torch

            result = sum(log.get(key, 0) for log in logging_outputs)
            if torch.is_tensor(result):
                # Move a tensor sum to the CPU before it is logged.
                result = result.cpu()
            return result

        counts, totals = [], []
        for i in range(EVAL_BLEU_ORDER):
            counts.append(sum_logs("_bleu_counts_" + str(i)))
            totals.append(sum_logs("_bleu_totals_" + str(i)))

        if max(totals) > 0:
            # log counts as numpy arrays -- log_scalar will sum them correctly
            metrics.log_scalar("_bleu_counts", np.array(counts))
            metrics.log_scalar("_bleu_totals", np.array(totals))
            metrics.log_scalar("_bleu_sys_len", sum_logs("_bleu_sys_len"))
            metrics.log_scalar("_bleu_ref_len", sum_logs("_bleu_ref_len"))

            def compute_bleu(meters):
                # Turn the summed meters into a BLEU score rounded to 2 places.
                import inspect

                try:
                    from sacrebleu.metrics import BLEU

                    comp_bleu = BLEU.compute_bleu
                except ImportError:
                    # compatibility API for sacrebleu 1.x
                    import sacrebleu

                    comp_bleu = sacrebleu.compute_bleu

                # The smoothing keyword is named differently depending on the
                # signature found, so pick the name by inspecting it.
                fn_sig = inspect.getfullargspec(comp_bleu)[0]
                if "smooth_method" in fn_sig:
                    smooth = {"smooth_method": "exp"}
                else:
                    smooth = {"smooth": "exp"}
                bleu = comp_bleu(
                    correct=meters["_bleu_counts"].sum,
                    total=meters["_bleu_totals"].sum,
                    sys_len=int(meters["_bleu_sys_len"].sum),
                    ref_len=int(meters["_bleu_ref_len"].sum),
                    **smooth,
                )
                return round(bleu.score, 2)

            metrics.log_derived("bleu", compute_bleu)
def _inference_with_bleu(self, generator, sample, model):
    """Generate hypotheses for ``sample`` and score them against its targets.

    Args:
        generator: generator passed to ``self.inference_step``.
        sample: batch; ``sample["target"]`` supplies the references.
        model: the single model to decode with.

    Returns:
        the result of ``sacrebleu.corpus_bleu`` over the decoded batch
        (computed with ``tokenize="none"`` when ``cfg.eval_tokenized_bleu``).
    """
    import sacrebleu

    def decode(toks, escape_unk=False):
        # The default unknown string in fairseq is `<unk>`, but
        # this is tokenized by sacrebleu as `< unk >`, inflating
        # BLEU scores. Instead, we use a somewhat more verbose
        # alternative that is unlikely to appear in the real
        # reference, but doesn't get split into multiple tokens.
        unk_marker = "UNKNOWNTOKENINREF" if escape_unk else "UNKNOWNTOKENINHYP"
        text = self.tgt_dict.string(
            toks.int().cpu(),
            self.cfg.eval_bleu_remove_bpe,
            unk_string=unk_marker,
        )
        if self.tokenizer:
            text = self.tokenizer.decode(text)
        return text

    gen_out = self.inference_step(generator, [model], sample, prefix_tokens=None)

    hyps = []
    refs = []
    for idx, candidates in enumerate(gen_out):
        # Only the first candidate of each sentence is scored.
        hyps.append(decode(candidates[0]["tokens"]))
        reference_tokens = utils.strip_pad(sample["target"][idx], self.tgt_dict.pad())
        # don't count <unk> as matches to the hypo
        refs.append(decode(reference_tokens, escape_unk=True))

    if self.cfg.eval_bleu_print_samples:
        logger.info("example hypothesis: " + hyps[0])
        logger.info("example reference: " + refs[0])

    bleu_options = {"tokenize": "none"} if self.cfg.eval_tokenized_bleu else {}
    return sacrebleu.corpus_bleu(hyps, [refs], **bleu_options)
def __init__(self, args, src_dict, tgt_dict):
    """Initialise the task and extend both dictionaries with mBART symbols.

    Args:
        args: task arguments; ``args.langs`` must be a comma-separated list
            of language codes.
        src_dict: source dictionary; receives a ``[lang]`` symbol for every
            language in ``args.langs`` followed by ``<mask>``.
        tgt_dict: target dictionary; extended in the same way.
    """
    super().__init__(args, src_dict, tgt_dict)
    # Every other method of this task reads its settings from ``self.args``,
    # so bind it here explicitly.
    # NOTE(review): the parent constructor is not expected to set ``args``
    # itself -- confirm against the base task class.
    self.args = args
    self.langs = args.langs.split(",")
    for dictionary in (src_dict, tgt_dict):
        for lang in self.langs:
            dictionary.add_symbol("[{}]".format(lang))
        dictionary.add_symbol("<mask>")
def build_generator(self, models, args, **unused):
    """Return a scorer or generator that ends sequences on the target language tag.

    Args:
        models: models handed to ``SequenceGenerator``.
        args: generation options, read with ``getattr`` and defaults.
        **unused: accepted and ignored.

    Returns:
        a ``SequenceScorer`` when ``args.score_reference`` is truthy,
        otherwise a ``SequenceGenerator``. Both use the dictionary index of
        ``[<target_lang>]`` as ``eos``.
    """

    def _target_lang_eos():
        return self.tgt_dict.index("[{}]".format(self.args.target_lang))

    if not getattr(args, "score_reference", False):
        from fairseq.sequence_generator import SequenceGenerator

        search_options = {
            "beam_size": getattr(args, "beam", 5),
            "max_len_a": getattr(args, "max_len_a", 0),
            "max_len_b": getattr(args, "max_len_b", 200),
            "min_len": getattr(args, "min_len", 1),
            "normalize_scores": (not getattr(args, "unnormalized", False)),
            "len_penalty": getattr(args, "lenpen", 1),
            "unk_penalty": getattr(args, "unkpen", 0),
            "temperature": getattr(args, "temperature", 1.0),
            "match_source_len": getattr(args, "match_source_len", False),
            "no_repeat_ngram_size": getattr(args, "no_repeat_ngram_size", 0),
            "eos": _target_lang_eos(),
        }
        return SequenceGenerator(models, self.target_dictionary, **search_options)

    from fairseq.sequence_scorer import SequenceScorer

    return SequenceScorer(self.target_dictionary, eos=_target_lang_eos())
@register_task(
    "translation_from_pretrained_xlm", dataclass=TranslationFromPretrainedXLMConfig
)
class TranslationFromPretrainedXLMTask(TranslationTask):
    """
    Translation task whose dictionaries are loaded with MaskedLMDictionary.

    Apart from dictionary loading this behaves like TranslationTask. Loading
    through MaskedLMDictionary lets the task read data that was binarized
    with that class. Use it for the whole pipeline when training an NMT model
    from a pretrained XLM checkpoint: binarizing the NMT data, training with
    the pretrained checkpoint, and evaluating the resulting model.
    """

    @classmethod
    def load_dictionary(cls, filename):
        """Load the masked LM dictionary stored at ``filename``.

        Args:
            filename (str): the filename
        """
        masked_lm_dictionary = MaskedLMDictionary.load(filename)
        return masked_lm_dictionary
def inject_noise(self, target_tokens):
    """Corrupt ``target_tokens`` according to ``self.cfg.noise``.

    Args:
        target_tokens: 2-D tensor of token indices (indexed along dim 1).

    Returns:
        the corrupted tokens. ``"no_noise"`` returns the input object itself;
        ``"random_delete"`` may return fewer columns than the input.

    Raises:
        NotImplementedError: for any other value of ``self.cfg.noise``.
    """

    def _random_delete(tokens):
        vocab = self.tgt_dict
        pad = vocab.pad()
        bos = vocab.bos()
        eos = vocab.eos()

        width = tokens.size(1)
        is_pad = tokens.eq(pad)

        # Random score per position; <bos>/<eos> get 0 and padding gets 1 so
        # that sorting puts them first and last respectively.
        scores = tokens.clone().float().uniform_()
        scores.masked_fill_(tokens.eq(bos) | tokens.eq(eos), 0.0)
        scores.masked_fill_(is_pad, 1)
        scores, rank = scores.sort(1)
        lengths = is_pad.size(1) - is_pad.float().sum(1, keepdim=True)

        # do not delete <bos> and <eos> (we assign 0 score for them)
        keep_ratio = scores.new_zeros(scores.size(0), 1).uniform_()
        keep_count = 2 + ((lengths - 2) * keep_ratio).long()
        dropped = scores.sort(1)[1] >= keep_count

        # Blank the dropped entries in score order, then undo the ordering;
        # dropped slots sort to the end because their rank becomes ``width``.
        ranked = tokens.gather(1, rank)
        ranked.masked_fill_(dropped, pad)
        restore = rank.masked_fill_(dropped, width).sort(1)[1]
        result = ranked.gather(1, restore)

        # Trim trailing columns that are padding in every row.
        longest = result.ne(pad).sum(1).max()
        return result[:, :longest]

    def _random_mask(tokens):
        vocab = self.tgt_dict
        pad = vocab.pad()
        bos = vocab.bos()
        eos = vocab.eos()
        unk = vocab.unk()

        maskable = tokens.ne(pad) & tokens.ne(bos) & tokens.ne(eos)
        scores = tokens.clone().float().uniform_()
        scores.masked_fill_(~maskable, 2.0)

        budget = maskable.sum(1).float()
        budget = budget * budget.clone().uniform_()
        budget = budget + 1  # make sure to mask at least one token.

        _, rank = scores.sort(1)
        chosen = new_arange(rank) < budget[:, None].long()
        return tokens.masked_fill(chosen.scatter(1, rank, chosen), unk)

    def _full_mask(tokens):
        vocab = self.tgt_dict
        pad = vocab.pad()
        bos = vocab.bos()
        eos = vocab.eos()
        unk = vocab.unk()

        special = tokens.eq(bos) | tokens.eq(eos) | tokens.eq(pad)
        return tokens.masked_fill(~special, unk)

    noise = self.cfg.noise
    if noise == "random_delete":
        return _random_delete(target_tokens)
    if noise == "random_mask":
        return _random_mask(target_tokens)
    if noise == "full_mask":
        return _full_mask(target_tokens)
    if noise == "no_noise":
        return target_tokens
    raise NotImplementedError
def get_time_gap(s, e):
    """Return the time elapsed from timestamp ``s`` to timestamp ``e`` as text.

    Both arguments are POSIX timestamps (seconds). The result is the string
    form of the resulting ``datetime.timedelta``, e.g. ``"0:01:30"``.
    """
    started = datetime.datetime.fromtimestamp(s)
    finished = datetime.datetime.fromtimestamp(e)
    return str(finished - started)
@staticmethod
def add_args(parser):
    """Add task-specific arguments to the parser.

    Registers the task's own options first, then lets ``SamplingMethod`` and
    ``MultilingualDatasetManager`` add theirs to the same parser.
    """
    option_table = (
        (
            ("-s", "--source-lang"),
            {"default": None, "metavar": "SRC", "help": "inference source language"},
        ),
        (
            ("-t", "--target-lang"),
            {"default": None, "metavar": "TARGET", "help": "inference target language"},
        ),
        (
            ("--lang-pairs",),
            {
                "default": None,
                "metavar": "PAIRS",
                "help": "comma-separated list of language pairs (in training order): en-de,en-fr,de-fr",
                "action": FileContentsAction,
            },
        ),
        (
            ("--keep-inference-langtok",),
            {
                "action": "store_true",
                "help": "keep language tokens in inference output (e.g. for analysis or debugging)",
            },
        ),
    )
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)

    SamplingMethod.add_arguments(parser)
    MultilingualDatasetManager.add_args(parser)
def check_dicts(self, dicts, source_langs, target_langs):
    """Assert that all source languages share one dictionary, and all targets too.

    The check is skipped when ``self.args.source_dict`` or
    ``self.args.target_dict`` is given.

    Args:
        dicts: mapping from language to its dictionary.
        source_langs: source languages; the first one is the reference.
        target_langs: target languages; the first one is the reference.

    Raises:
        AssertionError: if any language's dictionary differs from the
            reference dictionary of its side.
    """
    if self.args.source_dict is not None or self.args.target_dict is not None:
        # no need to check whether the source side and target side are sharing dictionaries
        return
    src_dict = dicts[source_langs[0]]
    tgt_dict = dicts[target_langs[0]]
    # The messages are parenthesized so that both string literals belong to
    # the assert and the complete explanation reaches the user.
    for src_lang in source_langs:
        assert src_dict == dicts[src_lang], (
            "Diffrent dictionary are specified for different source languages; "
            "TranslationMultiSimpleEpochTask only supports one shared dictionary across all source languages"
        )
    for tgt_lang in target_langs:
        assert tgt_dict == dicts[tgt_lang], (
            "Diffrent dictionary are specified for different target languages; "
            "TranslationMultiSimpleEpochTask only supports one shared dictionary across all target languages"
        )
def build_generator(
    self,
    models,
    args,
    seq_gen_cls=None,
    extra_gen_cls_kwargs=None,
):
    """Build a generator that strips the target language token from its output.

    Unless ``args.keep_inference_langtok`` is truthy, and provided the
    ``"main"`` entry of ``self.args.langtoks`` has a target-side spec, the
    decoder language token for ``self.args.target_lang`` is passed to the
    parent as ``symbols_to_strip_from_output``.

    Args:
        models: models forwarded to the parent implementation.
        args: generation arguments forwarded to the parent implementation.
        seq_gen_cls: optional generator class, forwarded to the parent.
        extra_gen_cls_kwargs: optional extra keyword arguments for the
            generator; the caller's dict is not modified.

    Returns:
        whatever the parent ``build_generator`` returns.
    """
    if not getattr(args, "keep_inference_langtok", False):
        _, tgt_langtok_spec = self.args.langtoks["main"]
        if tgt_langtok_spec:
            tgt_lang_tok = self.data_manager.get_decoder_langtok(
                self.args.target_lang, tgt_langtok_spec
            )
            # Work on a copy so the caller's dict is left untouched.
            extra_gen_cls_kwargs = dict(extra_gen_cls_kwargs or {})
            extra_gen_cls_kwargs["symbols_to_strip_from_output"] = {tgt_lang_tok}

    # Forward the caller's generator class rather than a hard-coded None.
    return super().build_generator(
        models,
        args,
        seq_gen_cls=seq_gen_cls,
        extra_gen_cls_kwargs=extra_gen_cls_kwargs,
    )
def create_batch_sampler_func(
    self,
    max_positions,
    ignore_invalid_inputs,
    max_tokens,
    max_sentences,
    required_batch_size_multiple=1,
    seed=1,
):
    """Return a ``(dataset, epoch) -> batch sampler`` callable.

    The returned callable sets the dataset epoch (when given), orders the
    indices under ``seed``, optionally filters them by ``max_positions`` and
    batches them with ``dataset.batch_by_size``, logging timing and memory
    usage after each stage.
    """

    def construct_batch_sampler(dataset, epoch):
        # Name of the first registered split holding this dataset, if any;
        # used only to label the log lines.
        split = next(
            (name for name, loaded in self.datasets.items() if loaded == dataset),
            None,
        )

        def _elapsed(since):
            return get_time_gap(since, time.time())

        def _log_mem_usage():
            logger.info(f"mem usage: {data_utils.get_mem_usage()}")

        # NEW implementation
        if epoch is not None:
            # initialize the dataset with the correct starting epoch
            dataset.set_epoch(epoch)

        # get indices ordered by example size
        start_time = time.time()
        logger.info(f"start batch sampler: mem usage: {data_utils.get_mem_usage()}")

        with data_utils.numpy_seed(seed):
            indices = dataset.ordered_indices()
        logger.info(
            f"[{split}] @batch_sampler order indices time: {_elapsed(start_time)}"
        )
        _log_mem_usage()

        # filter examples that are too large
        if max_positions is not None:
            stage_start = time.time()
            indices = self.filter_indices_by_size(
                indices, dataset, max_positions, ignore_invalid_inputs
            )
            logger.info(
                f"[{split}] @batch_sampler filter_by_size time: {_elapsed(stage_start)}"
            )
            _log_mem_usage()

        # create mini-batches with given size constraints
        stage_start = time.time()
        batch_sampler = dataset.batch_by_size(
            indices,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
        )
        logger.info(
            f"[{split}] @batch_sampler batch_by_size time: {_elapsed(stage_start)}"
        )
        logger.info(
            f"[{split}] per epoch batch_sampler set-up time: {_elapsed(start_time)}"
        )
        _log_mem_usage()

        return batch_sampler

    return construct_batch_sampler
required_batch_size_multiple=1, seed=1, num_shards=1, shard_id=0, num_workers=0, epoch=1, data_buffer_size=0, disable_iterator_cache=False, skip_remainder_batch=False, grouped_shuffling=False, update_epoch_batch_itr=False, ): """ Get an iterator that yields batches of data from the given dataset. Args: dataset (~fairseq.data.FairseqDataset): dataset to batch max_tokens (int, optional): max number of tokens in each batch (default: None). max_sentences (int, optional): max number of sentences in each batch (default: None). max_positions (optional): max sentence length supported by the model (default: None). ignore_invalid_inputs (bool, optional): don't raise Exception for sentences that are too long (default: False). required_batch_size_multiple (int, optional): require batch size to be a multiple of N (default: 1). seed (int, optional): seed for random number generator for reproducibility (default: 1). num_shards (int, optional): shard the data iterator into N shards (default: 1). shard_id (int, optional): which shard of the data iterator to return (default: 0). num_workers (int, optional): how many subprocesses to use for data loading. 0 means the data will be loaded in the main process (default: 0). epoch (int, optional): the epoch to start the iterator from (default: 0). data_buffer_size (int, optional): number of batches to preload (default: 0). disable_iterator_cache (bool, optional): don't cache the EpochBatchIterator (ignores `FairseqTask::can_reuse_epoch_itr`) (default: False). grouped_shuffling (bool, optional): group batches with each groups containing num_shards batches and shuffle groups. Reduces difference between sequence lengths among workers for batches sorted by length. 
update_epoch_batch_itr (bool optional): if true then donot use the cached batch iterator for the epoch Returns: ~fairseq.iterators.EpochBatchIterator: a batched iterator over the given dataset split """ # initialize the dataset with the correct starting epoch assert isinstance(dataset, FairseqDataset) if dataset in self.dataset_to_epoch_iter: return self.dataset_to_epoch_iter[dataset] if self.args.sampling_method == "RoundRobin": batch_iter = super().get_batch_iterator( dataset, max_tokens=max_tokens, max_sentences=max_sentences, max_positions=max_positions, ignore_invalid_inputs=ignore_invalid_inputs, required_batch_size_multiple=required_batch_size_multiple, seed=seed, num_shards=num_shards, shard_id=shard_id, num_workers=num_workers, epoch=epoch, data_buffer_size=data_buffer_size, disable_iterator_cache=disable_iterator_cache, skip_remainder_batch=skip_remainder_batch, update_epoch_batch_itr=update_epoch_batch_itr, ) self.dataset_to_epoch_iter[dataset] = batch_iter return batch_iter construct_batch_sampler = self.create_batch_sampler_func( max_positions, ignore_invalid_inputs, max_tokens, max_sentences, required_batch_size_multiple=required_batch_size_multiple, seed=seed, ) epoch_iter = iterators.EpochBatchIterator( dataset=dataset, collate_fn=dataset.collater, batch_sampler=construct_batch_sampler, seed=seed, num_shards=num_shards, shard_id=shard_id, num_workers=num_workers, epoch=epoch, ) return epoch_iter ================================================ FILE: fairseq/token_generation_constraints.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """Implements tracking of constraints for a beam item. A list of constraints is given as a list of one or more token sequences, each of length at least one token. 
def pack_constraints(batch_constraints: List[List[torch.Tensor]]) -> torch.Tensor:
    """Pack per-sentence constraint tensors into one padded long tensor.

    For example, here is a batch of size 3 with 3, 0, and 1 constraints:

        [ [ [3 1 2], [3], [4 5 6 7], ]
          [],
          [ [1 8 9 10 1 4 11 12], ]
        ]

    Its corresponding packed structure is:

        [ [ 3  3  1  2  0  3  0  4  5  6  7  0],
          [ 0  0  0  0  0  0  0  0  0  0  0  0],
          [ 1  1  8  9 10  1  4 11 12  0  0  0] ]

    Each row starts with the number of constraints for that sentence,
    followed by the tokens of each constraint with a 0 appended after each
    one. The result has shape (batch size, maxlen), where maxlen is the
    largest value of 1 + (number of constraints) + (sum of constraint
    lengths) over the batch, and at least 1.

    Args:
        batch_constraints: for each sentence, a list of 1-D constraint tensors.

    Returns:
        a ``torch.long`` tensor of shape (batch size, maxlen).
    """
    # Row width: the count slot, plus the sum of constraint lengths, plus a
    # zero after each constraint. An empty sentence needs just the count slot.
    max_constraints_len = max(
        (
            1 + sum(c.size(0) for c in sentence_constraints) + len(sentence_constraints)
            for sentence_constraints in batch_constraints
        ),
        default=1,
    )

    constraints_tensor = torch.zeros(
        (len(batch_constraints), max_constraints_len), dtype=torch.long
    )
    for i, sentence_constraints in enumerate(batch_constraints):
        constraints_tensor[i, 0] = len(sentence_constraints)
        offset = 1
        for constraint in sentence_constraints:
            this_len = constraint.size(0)
            constraints_tensor[i, offset : offset + this_len] = constraint
            # Skip one extra slot: it stays 0 and terminates this constraint.
            offset += this_len + 1

    return constraints_tensor
""" def __init__(self, token: int = None, parent=None): # The token associate with this node (None for the root) self.token = int(token) if token is not None else None # The parent (None at the root) self.parent = parent # Whether this node is a completed constraint self.terminal = 0 # List of child nodes self.children = {} # The cumulative number of constraints from this point in the # trie forward self.num_constraints = 0 @property def id(self): return self.token def __str__(self): term = self.terminal != 0 return f"[{self.token}].{term}#{self.num_constraints}" def __getitem__(self, key: int): return self.children.get(key, None) def next_tokens(self) -> Set[int]: """The set of child labels.""" return set(self.children.keys()) @staticmethod def create(constraints: List[List[int]]): root = ConstraintNode() for sequence in constraints: root.add_sequence(sequence) return root @staticmethod def print_graph(node: "ConstraintNode"): if len(node.children) == 0: return str(node) else: s = f"({node}" for child in node.children.values(): s += " " + ConstraintNode.print_graph(child) s += ")" return s def token_counts(self) -> Counter: """Returns a counter of the number of times each token is used in a constraint. 
""" token_counts = Counter() kids = list(self.children.values()) while len(kids) > 0: kid = kids.pop() token_counts[kid.id] += kid.num_constraints kids += list(kid.children.values()) return token_counts def tokens(self) -> Set[int]: """Returns the set of tokens in constraints.""" return set(self.token_counts().keys()) def add_sequence(self, sequence: List[int]): """Adds a constraint, represented as a list of integers, to the trie.""" assert len(sequence) > 0 token = int(sequence[0]) if token not in self.children: self.children[token] = ConstraintNode(token, parent=self) node = self.children[token] if len(sequence) == 1: node.terminal += 1 node.num_constraints += 1 parent = node.parent while parent is not None: parent.num_constraints += 1 parent = parent.parent else: node.add_sequence(sequence[1:]) class UnorderedConstraintState(ConstraintState): """ Records progress through the set of constraints for each item in the beam using a trie. """ def __init__(self, node: ConstraintNode, copy_from: "ConstraintState" = None): self.node = node if copy_from is None: # The root node self.root = node # The set of states in the graph that have been completed self.completed = Counter() # The... 
self.generated = Counter() # The list of tokens we need to generate self.needed_tokens = self.root.tokens() else: self.completed = Counter(copy_from.completed) self.generated = Counter(copy_from.generated) self.root = copy_from.root # Mark the node as generated if self.node != self.root: self.generated[node] += 1 @staticmethod def create(constraint_tensor: torch.Tensor): constraint_list = unpack_constraints(constraint_tensor) constraint_trie_root = ConstraintNode.create(constraint_list) return UnorderedConstraintState(constraint_trie_root) def __str__(self): gen_str = ",".join([str(node) for node in self.generated]) return f"{self.name}/{self.bank}({gen_str})x{self.num_completed}" def __copy__(self): copied_state = UnorderedConstraintState(self.node, copy_from=self) return copied_state def copy(self): return self.__copy__() @property def name(self): if self.node.id is None: return "ROOT" else: return str(self.node.id) @property def is_root(self): return self.node == self.root @property def bank(self): return sum(self.generated.values()) @property def num_completed(self): """The number of constraints (not constraint tokens) that are completed. In addition to the already-completed states, we need to account for the current state, which might get marked as completed when another token is generated. """ in_final = self.node.terminal and self.completed[self.node] < self.node.terminal return sum(self.completed.values()) + in_final @property def finished(self): return self.root.num_constraints - self.num_completed == 0 @property def token_counts(self): return self.root.token_counts() @property def tokens(self): return self.root.tokens() @property def num_constraint_tokens(self): return sum(self.token_counts.values()) def next_tokens(self) -> Set[int]: """Returns the list of tokens that could come next. 
These are (a) all tokens extending the root state and, for non-root states, additionally all tokens extending the current state.""" if self.node != self.root: return self.root.next_tokens().union(self.node.next_tokens()) else: return self.root.next_tokens() def advance(self, token: int): """Reads in a token and advances the state. Here's how it works. We can advance to the next state if: - there is a matching child - its path isn't blocked A path is blocked when all constraints that are descendants of that node have already been generated, in the current state. If we are not able to advance from the current state, we "fall off the graph" and return to the root state. There, we again try to advance, checking the same criteria. In any case, when falling off the graph, we need to do some bookkeeping. We: - check whether any constraints were met (all prefixes of current state) - if one is found, mark it as completed - adjust visited nodes accordingly """ token = int(token) next_state = None child = self.node[token] if child is not None and self.generated[child] < child.num_constraints: next_state = UnorderedConstraintState(child, copy_from=self) def rewind(): """If we're mid-trie and an "illegal" token is chosen next, we need to reset our state to the root state. However, along the way, we need to check whether a prefix of the current trie state represents a state we could mark as completed. 
class ConstraintSequence:
    def __init__(self, sequences: List[List[int]]):
        """Represents a set of possibly multitoken constraints by
        concatenating them and internally recording the end points.

        Args:
            sequences: the constraints, each an iterable of token ids
                (plain ints or anything ``int()`` accepts, e.g. the
                elements of a 1-D tensor).
        """
        # All constraint tokens, concatenated in order
        self.sequences = []
        # endpoints[i] is True when sequences[i] is the last token of a constraint
        self.endpoints = []
        # Total number of constraint tokens
        self.num_tokens = 0
        # The distinct tokens used by any constraint
        self.tokens = set()
        for sequence in sequences:
            # Normalise to plain ints: tensor elements hash by identity, so
            # storing them directly would never deduplicate in the set.
            tokens = [int(token) for token in sequence]
            if not tokens:
                # An empty constraint has no final token; recording an end
                # point for it would misalign endpoints with sequences.
                continue
            self.tokens.update(tokens)
            self.num_tokens += len(tokens)
            self.endpoints += [False] * (len(tokens) - 1) + [True]
            self.sequences += tokens

    def __getitem__(self, key: int):
        return self.sequences[key]

    def __len__(self):
        return len(self.sequences)

    def __str__(self):
        return str(self.sequences)

    def token_counts(self) -> Counter:
        """Returns a counter of how often each token occurs across the
        concatenated constraints (the values sum to ``num_tokens``).
        """
        return Counter(self.sequences)
""" def __init__(self, sequence: ConstraintSequence, state: int = -1): self.sequence = sequence self.state = state @staticmethod def create(constraint_tensor: torch.Tensor): constraint_list = unpack_constraints(constraint_tensor) return OrderedConstraintState(ConstraintSequence(constraint_list), -1) def __str__(self): return f"{self.state}/{self.bank}x{self.num_completed}" def __copy__(self): return OrderedConstraintState(self.sequence, self.state) def copy(self): return self.__copy__() @property def num_completed(self): if self.state == -1: return 0 count = len( list(filter(lambda x: x, self.sequence.endpoints[0 : self.state + 1])) ) return count @property def is_root(self): return self.state == -1 @property def name(self): if self.state == -1: return "ROOT" else: return str(self.sequence[self.state]) @property def bank(self) -> int: return self.state + 1 @property def finished(self): return self.state + 1 == len(self.sequence) @property def token_counts(self): return self.sequence.token_counts() @property def tokens(self): return self.sequence.tokens @property def num_constraint_tokens(self): return sum(self.token_counts.values()) def next_tokens(self) -> Set[int]: """Returns the list of tokens that could come next. These are (a) all tokens extending the root state and, for non-root states, additionally all tokens extending the current state.""" tokens = set() if self.state > 0: tokens.add(self.sequence[0]) if not self.finished: tokens.add(self.sequence[self.state + 1]) return tokens def advance(self, token: int): """Reads in a token and advances the state. Here's how it works. We can advance to the next state if: - there is a matching child - its path isn't blocked A path is blocked when all constraints that are descendants of that node have already been generated, in the current state. If we are not able to advance from the current state, we "fall off the graph" and return to the root state. There, we again try to advance, checking the same criteria. 
SPACE_NORMALIZER = re.compile(r"\s+")


def tokenize_line(line):
    """Split *line* into the tokens separated by runs of whitespace."""
    collapsed = SPACE_NORMALIZER.sub(" ", line)
    return collapsed.strip().split()
def __init__(self, cfg: FairseqConfig, task, model, criterion, quantizer=None):
    """Set up device placement, precision and bookkeeping for training.

    Args:
        cfg: the training configuration; a legacy ``argparse.Namespace``
            is converted to OmegaConf with a deprecation warning.
        task: the task object, stored as ``self.task``.
        model: the model to train; cast and moved according to ``cfg``.
        criterion: the training criterion; cast and moved like the model.
        quantizer: optional quantizer; if given, ``set_trainer(self)`` is
            called on it.

    Raises:
        ValueError: for option combinations rejected below (FSDP with
            bf16 or zero-sharding, cpu-offload without FSDP).
        RuntimeError: when FSDP is combined with ``update_freq > 1`` on a
            fairscale release older than 0.4.0.
    """
    if isinstance(cfg, Namespace):
        logger.warning(
            "argparse.Namespace configuration is deprecated! Automatically converting to OmegaConf"
        )
        cfg = convert_namespace_to_omegaconf(cfg)

    self.cfg = cfg
    self.task = task

    # catalog shared parameters
    shared_params = _catalog_shared_params(model)
    self.tpu = cfg.common.tpu
    self.cuda = torch.cuda.is_available() and not cfg.common.cpu and not self.tpu
    if self.cuda:
        self.device = torch.device("cuda")
    elif self.tpu:
        self.device = utils.get_tpu_device()
    else:
        self.device = torch.device("cpu")

    if self.is_fsdp:
        import re

        import fairscale

        if self.cfg.common.bf16:
            raise ValueError(
                "FullyShardedDataParallel is not compatible with --bf16 or "
                "--memory-efficient-bf16"
            )
        if self.cfg.distributed_training.zero_sharding != "none":
            raise ValueError(
                "FullyShardedDataParallel is not compatible with --zero-sharding "
                "option (it's already built in)"
            )
        if max(self.cfg.optimization.update_freq) > 1:
            # Compare the numeric release components rather than the raw
            # strings: lexicographically "0.10.0" sorts before "0.4.0".
            release = [
                int(re.match(r"\d*", part).group() or 0)
                for part in fairscale.__version__.split(".")[:3]
            ]
            release += [0] * (3 - len(release))
            if tuple(release) < (0, 4, 0):
                raise RuntimeError(
                    "Please update to fairscale 0.4.0 or newer when combining "
                    "--update-freq with FullyShardedDataParallel"
                )
    else:
        if (
            hasattr(self.cfg.distributed_training, "cpu_offload")
            and self.cfg.distributed_training.cpu_offload
        ):
            raise ValueError("--cpu-offload requires --ddp-backend=fully_sharded")

    # copy model and criterion to current device/dtype
    self._criterion = criterion
    self._model = model
    if not self.is_fsdp:
        if cfg.common.fp16:
            assert not cfg.common.amp, "Cannot use fp16 and AMP together"
            self._criterion = self._criterion.half()
            self._model = self._model.half()
        elif cfg.common.bf16:
            self._criterion = self._criterion.to(dtype=torch.bfloat16)
            self._model = self._model.to(dtype=torch.bfloat16)
        elif cfg.common.amp:
            self._amp_retries = 0
    if (
        not cfg.distributed_training.pipeline_model_parallel
        # the DistributedFairseqModel wrapper will handle moving to device,
        # so only handle cases which don't use the wrapper
        and not self.use_distributed_wrapper
    ):
        self._criterion = self._criterion.to(device=self.device)
        self._model = self._model.to(device=self.device)
    self.pipeline_model_parallel = cfg.distributed_training.pipeline_model_parallel
    self.last_device = None
    if self.cuda and self.pipeline_model_parallel:
        self.last_device = torch.device(
            cfg.distributed_training.pipeline_devices[-1]
        )

    # check that shared parameters are preserved after device transfer
    for shared_param in shared_params:
        ref = _get_module_by_path(self._model, shared_param[0])
        for path in shared_param[1:]:
            logger.info(
                "detected shared parameter: {} <- {}".format(shared_param[0], path)
            )
            _set_module_by_path(self._model, path, ref)

    self._dummy_batch = None  # indicates we don't have a dummy batch at first
    self._lr_scheduler = None
    self._num_updates = 0
    self._num_xla_compiles = 0  # for TPUs
    self._optim_history = None
    self._optimizer = None
    self._warn_once = set()
    self._wrapped_criterion = None
    self._wrapped_model = None
    self._ema = None

    # TODO(myleott): support tpu
    if self.cuda and self.data_parallel_world_size > 1:
        self._grad_norm_buf = torch.cuda.DoubleTensor(self.data_parallel_world_size)
    else:
        self._grad_norm_buf = None

    self.quantizer = quantizer
    if self.quantizer is not None:
        self.quantizer.set_trainer(self)

    # get detailed cuda environment
    if self.cuda:
        self.cuda_env = utils.CudaEnvironment()
        if self.data_parallel_world_size > 1:
            self.cuda_env_arr = distributed_utils.all_gather_list(
                self.cuda_env, group=distributed_utils.get_global_group()
            )
        else:
            self.cuda_env_arr = [self.cuda_env]
        if self.data_parallel_rank == 0:
            utils.CudaEnvironment.pretty_print_cuda_env_list(self.cuda_env_arr)
    else:
        self.cuda_env = None
        self.cuda_env_arr = None

    metrics.log_start_time("wall", priority=790, round=0)

    self._start_time = time.time()
    self._previous_training_time = 0
    self._cumulative_training_time = None
== 1: return 1 return distributed_utils.get_data_parallel_world_size() @property def data_parallel_process_group(self): return distributed_utils.get_data_parallel_group() @property def data_parallel_rank(self): if self.cfg.distributed_training.distributed_world_size == 1: return 0 return distributed_utils.get_data_parallel_rank() @property def is_data_parallel_master(self): # NOTE: this returns true for all model parallel replicas with data # parallel rank 0 return self.data_parallel_rank == 0 @property def use_distributed_wrapper(self) -> bool: return ( self.data_parallel_world_size > 1 and not self.cfg.optimization.use_bmuf ) or (self.is_fsdp and self.cfg.distributed_training.cpu_offload) @property def should_save_checkpoint_on_current_rank(self) -> bool: """Indicates whether to save checkpoints on the current DDP rank.""" if ( self.is_fsdp and self.cfg.distributed_training.use_sharded_state ) or getattr(self.cfg.model, "base_layers", 0) > 0: return True else: return self.is_data_parallel_master @property def always_call_state_dict_during_save_checkpoint(self) -> bool: if self.is_fsdp and not self.cfg.distributed_training.use_sharded_state: # FSDP calls communication collective when consolidating checkpoints return True else: return False @property def checkpoint_suffix(self) -> str: """Suffix to add to the checkpoint file name.""" if self.is_fsdp and self.cfg.distributed_training.use_sharded_state: return self.cfg.checkpoint.checkpoint_suffix + "-shard{0}".format( self.data_parallel_rank ) else: return self.cfg.checkpoint.checkpoint_suffix or "" @property def criterion(self): if self._wrapped_criterion is None: if utils.has_parameters(self._criterion) and self.use_distributed_wrapper: self._wrapped_criterion = models.DistributedFairseqModel( self.cfg.distributed_training, self._criterion, process_group=self.data_parallel_process_group, device=self.device, ) else: self._wrapped_criterion = self._criterion return self._wrapped_criterion @property def model(self): 
def _build_optimizer(self):
    """Create ``self._optimizer`` and ``self._lr_scheduler`` from ``self.cfg``.

    Collects the trainable parameters of the model and criterion, picks the
    optimizer wrapper matching the precision options, applies the BMUF and
    optimizer-state-sharding options, and finally builds the LR scheduler
    and steps it to update 0.
    """
    track_names = (
        self.cfg.optimization.debug_param_names
        and self.cfg.common.fp16_no_flatten_grads
    )
    if track_names:
        params = []
        self.param_names = []
        named = chain(
            self.model.named_parameters(), self.criterion.named_parameters()
        )
        for name, param in named:
            if param.requires_grad:
                params.append(param)
                self.param_names.append(name)
    else:
        params = [
            param
            for param in chain(self.model.parameters(), self.criterion.parameters())
            if param.requires_grad
        ]

    common = self.cfg.common
    if self.is_fsdp and common.fp16:
        # FullyShardedDataParallel always uses MemoryEfficientFP16 wrapper,
        # mostly for the grad scaling. But if we don't have the
        # --memory-efficient-fp16 flag set, then we're effectively doing
        # regular --fp16 and can allow the use of optimizers that would
        # otherwise be unsupported by MemoryEfficientFP16Optimizer.
        self._optimizer = optim.MemoryEfficientFP16Optimizer.build_optimizer(
            self.cfg,
            params,
            allow_unsupported=not common.memory_efficient_fp16,
        )
    elif common.fp16 or common.bf16 or common.amp:
        if self.cuda and torch.cuda.get_device_capability(0)[0] < 7:
            logger.info(
                "NOTE: your device does NOT support faster training with --fp16 or --amp, "
                "please switch to FP32 which is likely to be faster"
            )
        if common.memory_efficient_fp16 or common.memory_efficient_bf16:
            builder = optim.MemoryEfficientFP16Optimizer
        elif common.amp:
            builder = optim.AMPOptimizer
        else:
            builder = optim.FP16Optimizer
        self._optimizer = builder.build_optimizer(self.cfg, params)
    else:
        if self.cuda and torch.cuda.get_device_capability(0)[0] >= 7:
            logger.info(
                "NOTE: your device may support faster training with --fp16 or --amp"
            )
        self._optimizer = optim.build_optimizer(self.cfg.optimizer, params)

    if self.is_fsdp:
        assert (
            not self.cfg.optimization.use_bmuf
        ), "--ddp-backend=fully_sharded is not compatible with BMUF"
        assert self._optimizer.supports_flat_params, (
            "--ddp-backend=fully_sharded is only compatible with pointwise "
            "optimizers (e.g., Adam, AdamW, Adadelta, Adamax, SGD, etc.). "
            "However, the sharding will result in slightly different results when "
            "using non-pointwise optimizers (e.g., Adagrad, Adafactor, LAMB)"
        )

    if self.cfg.optimization.use_bmuf:
        self._optimizer = optim.FairseqBMUF(self.cfg.bmuf, self._optimizer)

    if self.cfg.distributed_training.zero_sharding == "os":
        # Plain fp16 with flattened grads cannot be combined with sharding.
        flattened_fp16 = (
            common.fp16
            and not common.memory_efficient_fp16
            and not common.memory_efficient_bf16
            and not common.fp16_no_flatten_grads
        )
        if flattened_fp16:
            raise ValueError(
                "ZeRO is incomptabile with fp16 and flattened grads. "
                "Please use --fp16-no-flatten-grads"
            )
        optim.shard_(self._optimizer, self.data_parallel_process_group)

    # We should initialize the learning rate scheduler immediately after
    # building the optimizer, so that the initial learning rate is set.
    scheduler = lr_scheduler.build_lr_scheduler(
        self.cfg.lr_scheduler,
        self.optimizer,
    )
    self._lr_scheduler = scheduler
    self._lr_scheduler.step_update(0)
def load_checkpoint(
    self,
    filename,
    reset_optimizer=False,
    reset_lr_scheduler=False,
    optimizer_overrides=None,
    reset_meters=False,
):
    """
    Load all training state from a checkpoint file.
    rank = 0 will load the checkpoint, and then broadcast it to all
    other ranks.

    Args:
        filename: path of the checkpoint file.
        reset_optimizer: if True, do not restore the optimizer state.
        reset_lr_scheduler: if True, do not restore the LR scheduler state.
        optimizer_overrides: passed through to ``optimizer.load_state_dict``.
        reset_meters: if True, do not restore the logged metrics.

    Returns:
        The checkpoint's ``extra_state`` dict, or None when ``filename``
        does not exist.

    Raises:
        Exception: if the model (or criterion) parameters cannot be loaded;
            the underlying error is attached as the cause.
    """
    extra_state, self._optim_history, last_optim_state = None, [], None
    # Model weights kept only when they are needed later to seed the EMA.
    ema_init_state = None

    logger.info(f"Preparing to load checkpoint {filename}")
    is_distributed = self.data_parallel_world_size > 1
    bexists = PathManager.isfile(filename)
    if bexists:
        load_on_all_ranks = (
            self.cfg.checkpoint.load_checkpoint_on_all_dp_ranks
            # TPUs don't support broadcast yet, so load checkpoints
            # on every worker for now
            or self.tpu
            # FSDP requires loading checkpoint shards on all ranks
            or (self.is_fsdp and self.cfg.distributed_training.use_sharded_state)
            or getattr(self.cfg.model, "base_layers", 0) > 0
        )

        if load_on_all_ranks or self.data_parallel_rank == 0:
            state = checkpoint_utils.load_checkpoint_to_cpu(
                filename, load_on_all_ranks=load_on_all_ranks
            )
            last_optim_state = state.get("last_optimizer_state", None)

            # If doing zero_sharding, do not broadcast global optimizer
            # state. Later we will broadcast sharded states to each rank
            # to avoid memory from exploding.
            if (
                not load_on_all_ranks
                and self.cfg.distributed_training.zero_sharding == "os"
                and "last_optimizer_state" in state
                and is_distributed
            ):
                state["last_optimizer_state"] = "SHARDED"
        else:
            last_optim_state = None
            state = None

        if is_distributed and not load_on_all_ranks:
            state = distributed_utils.broadcast_object(
                state,
                src_rank=0,
                group=self.data_parallel_process_group,
                dist_device=self.device,
            )
            if self.data_parallel_rank > 0:
                last_optim_state = state.get("last_optimizer_state", None)

        # load model parameters
        try:
            if (
                "optimizer_history" in state
                and len(state["optimizer_history"]) > 0
                and "num_updates" in state["optimizer_history"][-1]
            ):
                self.model.set_num_updates(
                    state["optimizer_history"][-1]["num_updates"]
                )

            # AdaPrune (https://openreview.net/forum?id=_CMSV7FTzGI), attention
            # heads: a model fine-tuned with regularization is pruned to its
            # most important heads and saved manually. To load such a pruned
            # checkpoint without registering a separate architecture,
            # --mha-heads-to-keep prunes the freshly built model to the same
            # shape before its weights are loaded.
            if (
                safe_hasattr(self.model, "args")
                and safe_hasattr(self.model.args, "mha_heads_to_keep")
                and self.model.args.mha_heads_to_keep != -1
            ):
                logger.info(
                    f"Prune model: keep {self.model.args.mha_heads_to_keep} heads for each multihead attention module"
                )
                for layer in self.model.encoder.sentence_encoder.layers:
                    reserve_head_index = layer.self_attn._get_reserve_head_index(
                        num_heads_to_keep=self.model.args.mha_heads_to_keep
                    )
                    layer.self_attn._adaptive_prune_heads(
                        reserve_head_index=reserve_head_index
                    )
                    layer.self_attn._set_skip_embed_dim_check()
                logger.info(self.model)
            # AdaPrune, feed-forward units: same idea as above, driven by
            # --ffn-blocks-to-remove.
            if (
                safe_hasattr(self.model, "args")
                and safe_hasattr(self.model.args, "ffn_blocks_to_remove")
                and self.model.args.ffn_blocks_to_remove != -1
            ):
                logger.info(
                    f"Prune model: remove {self.model.args.ffn_blocks_to_remove} ffn blocks for each transformer layer"
                )
                for layer in self.model.encoder.sentence_encoder.layers:
                    remove_index = layer._get_fc_rank(
                        remove_num=self.model.args.ffn_blocks_to_remove
                    )
                    layer._prune_fc_layer(remove_index=remove_index)
                logger.info(self.model)

            self.model.load_state_dict(
                state["model"], strict=True, model_cfg=self.cfg.model
            )
            # The EMA branch further down re-initializes the EMA from the
            # model weights when the checkpoint carries no "ema" entry, so
            # keep a reference in exactly that case before dropping them.
            checkpoint_extra = state.get("extra_state")
            if (
                self.cfg.ema.store_ema
                and checkpoint_extra is not None
                and "ema" not in checkpoint_extra
            ):
                ema_init_state = state["model"]
            # save memory for later steps
            del state["model"]
            if utils.has_parameters(self.get_criterion()):
                self.get_criterion().load_state_dict(
                    state["criterion"], strict=True
                )
                del state["criterion"]

        except Exception as err:
            raise Exception(
                "Cannot load model parameters from checkpoint {}; "
                "please ensure that the architectures match.".format(filename)
            ) from err
        extra_state = state["extra_state"]
        self._optim_history = state["optimizer_history"]

    if last_optim_state is not None and not reset_optimizer:
        # rebuild optimizer after loading model, since params may have changed
        self._build_optimizer()

        # only reload optimizer and lr_scheduler if they match
        last_optim = self._optim_history[-1]
        assert (
            last_optim["criterion_name"] == self.get_criterion().__class__.__name__
        ), f"Criterion does not match; please reset the optimizer (--reset-optimizer). {last_optim['criterion_name']} vs {self.get_criterion().__class__.__name__}"
        assert (
            last_optim["optimizer_name"] == self.optimizer.__class__.__name__
        ), f"Optimizer does not match; please reset the optimizer (--reset-optimizer). {last_optim['optimizer_name']} vs {self.optimizer.__class__.__name__}"

        if not reset_lr_scheduler:
            self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"])

        if self.is_fsdp and not self.model.use_sharded_state:
            # if use_sharded_state, the last_optim_state is already sharded, skip this
            last_optim_state = self.model.get_shard_from_optim_state_dict(
                last_optim_state
            )
        elif not load_on_all_ranks and is_distributed:
            last_optim_state = self.optimizer.broadcast_global_state_dict(
                last_optim_state
            )

        self.optimizer.load_state_dict(last_optim_state, optimizer_overrides)

        self.set_num_updates(last_optim["num_updates"])

    if extra_state is not None:
        itr_state = extra_state["train_iterator"]
        epoch = itr_state["epoch"]

        if "previous_training_time" in extra_state:
            self._previous_training_time = extra_state["previous_training_time"]
            self._start_time = time.time()

        self.lr_step(epoch)

        if (
            itr_state.get("version", 1) >= 2
            and itr_state["iterations_in_epoch"] == 0
        ):
            # reset meters at start of epoch
            reset_meters = True

        if "metrics" in extra_state and not reset_meters:
            metrics.load_state_dict(extra_state["metrics"])

            # reset TimeMeters, since their start times don't make sense anymore
            for meter in metrics.get_meters("default"):
                if isinstance(meter, meters.TimeMeter):
                    meter.reset()

        if self.cfg.ema.store_ema:
            if "ema" not in extra_state:
                logger.warning(
                    "EMA not found in checkpoint. But store_ema is True. "
                    "EMA is re-initialized from checkpoint."
                )
                # state["model"] has been deleted by now; use the reference
                # that was saved before the deletion.
                self.ema.restore(
                    ema_init_state, build_fp32_params=self.cfg.ema.ema_fp32
                )
            else:
                logger.info("Loading EMA from checkpoint")
                self.ema.restore(extra_state["ema"], build_fp32_params=False)

                if self.cfg.ema.ema_fp32:
                    if "ema_fp32_params" in extra_state:
                        logger.info("Loading EMA fp32 params from checkpoint")
                        self.ema.build_fp32_params(extra_state["ema_fp32_params"])
                    else:
                        logger.info(
                            "Building EMA fp32 params from EMA model in checkpoint"
                        )
                        self.ema.build_fp32_params()

        logger.info(
            "Loaded checkpoint {} (epoch {} @ {} updates)".format(
                filename, epoch, self.get_num_updates()
            )
        )

    else:
        logger.info("No existing checkpoint found {}".format(filename))

    return extra_state
@metrics.aggregate("train")
def train_step(self, samples, raise_oom=False):
    """Do forward, backward and parameter update.

    Runs one optimizer update over *samples*: each entry is passed through
    ``self.task.train_step`` (gradients accumulate across entries), then
    gradients are reduced, rescaled by the aggregated sample size, clipped,
    and the optimizer is stepped via ``self.task.optimizer_step``.

    Args:
        samples: sequence of mini-batches for this update; it is both
            enumerated and measured with ``len()``.
        raise_oom: when True, a ``RuntimeError`` whose message contains
            "out of memory" raised during forward/backward is re-raised
            instead of being recovered from.

    Returns:
        The value produced by ``self._reduce_and_log_stats``; ``{}`` on TPU
        for updates that are not a multiple of ``log_interval``; or ``None``
        when the update was skipped (gradient overflow without the "slowmo"
        backend, or an OOM with ``distributed_world_size == 1``).

    Raises:
        FloatingPointError: gradients are NaN/Inf (and AMP is off) or the
            grad-norm consistency check fails; a "crash.pt" checkpoint is
            saved and the batch is replayed under ``NanDetector`` first.
        RuntimeError: errors from forward/backward that are not OOM, and
            any ``RuntimeError`` raised during the optimization phase.
    """
    self._set_seed()
    self.model.train()
    self.criterion.train()
    self.zero_grad()

    metrics.log_start_time("train_wall", priority=800, round=0)

    # If EMA is enabled through store_ema=True
    # and task.uses_ema is True, pass the EMA model as a keyword
    # argument to the task.
    extra_kwargs = {}
    if self.cfg.ema.store_ema and getattr(self.task, "uses_ema", False):
        extra_kwargs["ema_model"] = self.ema.get_model()

    # NOTE(review): this flag is initialised once, before the loop, and is
    # never cleared inside it -- confirm that is the intended behaviour when
    # a later mini-batch succeeds after an earlier OOM.
    has_oom = False

    # forward and backward pass
    logging_outputs, sample_size, ooms = [], 0, 0
    for i, sample in enumerate(samples):  # delayed update loop
        sample, is_dummy_batch = self._prepare_sample(sample)

        def maybe_no_sync():
            """
            Whenever *samples* contains more than one mini-batch, we
            want to accumulate gradients locally and only call
            all-reduce in the last backwards pass.
            """
            if (
                self.data_parallel_world_size > 1
                and hasattr(self.model, "no_sync")
                and i < len(samples) - 1
                # The no_sync context manager results in increased memory
                # usage with FSDP, since full-size gradients will be
                # accumulated on each GPU. It's typically a better tradeoff
                # to do the extra communication with FSDP.
                and not self.is_fsdp
            ):
                return self.model.no_sync()
            else:
                return contextlib.ExitStack()  # dummy contextmanager

        try:
            with maybe_no_sync():
                # forward and backward
                loss, sample_size_i, logging_output = self.task.train_step(
                    sample=sample,
                    model=self.model,
                    criterion=self.criterion,
                    optimizer=self.optimizer,
                    update_num=self.get_num_updates(),
                    ignore_grad=is_dummy_batch,
                    **extra_kwargs,
                )
                # the loss value itself is not used here; drop the reference
                del loss

            logging_outputs.append(logging_output)
            sample_size += sample_size_i

            # emptying the CUDA cache after the first step can
            # reduce the chance of OOM
            if self.cuda and self.get_num_updates() == 0:
                torch.cuda.empty_cache()
        except RuntimeError as e:
            # OOM is detected by matching the exception message
            if "out of memory" in str(e):
                self._log_oom(e)
                has_oom = True
                if raise_oom:
                    raise e
            else:
                raise e
        except Exception:
            # any other failure: save a "crash.pt" checkpoint, then re-raise
            self.consolidate_optimizer()
            self.save_checkpoint(
                os.path.join(self.cfg.checkpoint.save_dir, "crash.pt"), {}
            )
            raise

        if has_oom:
            logger.warning(
                "attempting to recover from OOM in forward/backward pass"
            )
            ooms += 1
            self.zero_grad()
            if self.cuda:
                torch.cuda.empty_cache()

            # with a single worker the whole update is abandoned; note that
            # this early return happens before "train_wall" is stopped
            if self.cfg.distributed_training.distributed_world_size == 1:
                return None

        if self.tpu and i < len(samples) - 1:
            # tpu-comment: every XLA operation before marking step is
            # appended to the IR graph, and processing too many batches
            # before marking step can lead to OOM errors.
            # To handle gradient accumulation use case, we explicitly
            # mark step here for every forward pass without a backward pass
            self._xla_markstep_and_send_to_cpu()

    # a dummy batch contributes gradients of no value, so its sample size is
    # zeroed (``is_dummy_batch`` here is the value from the last mini-batch)
    if is_dummy_batch:
        if torch.is_tensor(sample_size):
            sample_size.zero_()
        else:
            sample_size *= 0.0

    if torch.is_tensor(sample_size):
        sample_size = sample_size.float()
    else:
        sample_size = float(sample_size)

    # gather logging outputs from all replicas
    if self._sync_stats():
        train_time = self._local_cumulative_training_time()
        (
            logging_outputs,
            (
                sample_size,
                ooms,
                total_train_time,
            ),
        ) = self._aggregate_logging_outputs(
            logging_outputs, sample_size, ooms, train_time, ignore=is_dummy_batch
        )
        # the aggregated time is divided by the worker count and stored
        self._cumulative_training_time = (
            total_train_time / self.data_parallel_world_size
        )

    overflow = False
    try:
        with torch.autograd.profiler.record_function("reduce-grads"):
            # reduce gradients across workers
            self.optimizer.all_reduce_grads(self.model)
            if utils.has_parameters(self.criterion):
                self.optimizer.all_reduce_grads(self.criterion)

        with torch.autograd.profiler.record_function("multiply-grads"):
            # multiply gradients by (data_parallel_size / sample_size) since
            # DDP normalizes by the number of data parallel workers for
            # improved fp16 precision.
            # Thus we get (sum_of_gradients / sample_size) at the end.
            # In case of fp16, this step also undoes loss scaling.
            # (Debugging note: Some optimizers perform this scaling on the
            # fly, so inspecting model.parameters() or optimizer.params may
            # still show the original, unscaled gradients.)
            numer = (
                self.data_parallel_world_size
                if not self.cfg.optimization.use_bmuf or self._sync_stats()
                else 1
            )
            self.optimizer.multiply_grads(numer / (sample_size or 1.0))
            # Note: (sample_size or 1.0) handles the case of a zero gradient, in a
            # way that avoids CPU/device transfers in case sample_size is a GPU or
            # TPU object. The assumption is that the gradient itself is also 0.

        with torch.autograd.profiler.record_function("clip-grads"):
            # clip grads
            grad_norm = self.clip_grad_norm(self.cfg.optimization.clip_norm)

        # check that grad norms are consistent across workers
        # on tpu check tensor is slow
        if not self.tpu:
            if (
                not self.cfg.optimization.use_bmuf
                and self.cfg.distributed_training.ddp_backend != "slowmo"
            ):
                self._check_grad_norms(grad_norm)
            if not torch.isfinite(grad_norm).all():
                # in case of AMP, if gradients are Nan/Inf then
                # optimizer step is still required
                if self.cfg.common.amp:
                    overflow = True
                else:
                    # check local gradnorm single GPU case, trigger NanDetector
                    raise FloatingPointError("gradients are Nan/Inf")

        with torch.autograd.profiler.record_function("optimizer"):
            # take an optimization step
            self.task.optimizer_step(
                self.optimizer, model=self.model, update_num=self.get_num_updates()
            )
            if self.cfg.common.amp and overflow:
                # retry the same batch until the retry counter reaches
                # amp_batch_retries, then give up and skip the batch
                if self._amp_retries == self.cfg.common.amp_batch_retries:
                    logger.info("AMP: skipping this batch.")
                    self._amp_retries = 0
                else:
                    self._amp_retries += 1
                    return self.train_step(
                        samples, raise_oom
                    )  # recursion to feed in same batch

    except FloatingPointError:
        self.consolidate_optimizer()
        self.save_checkpoint(
            os.path.join(self.cfg.checkpoint.save_dir, "crash.pt"), {}
        )
        # re-run the forward and backward pass with hooks attached to print
        # out where it fails
        self.zero_grad()
        with NanDetector(self.get_model()):
            for _, sample in enumerate(samples):
                sample, _ = self._prepare_sample(sample)
                self.task.train_step(
                    sample,
                    self.model,
                    self.criterion,
                    self.optimizer,
                    self.get_num_updates(),
                    ignore_grad=False,
                    **extra_kwargs,
                )
        raise
    except OverflowError as e:
        overflow = True
        logger.info(
            f"NOTE: gradient overflow detected, ignoring gradient, {str(e)}"
        )

        # when parameter names are available, report which ones overflowed
        if hasattr(self, "param_names") and hasattr(
            self.optimizer, "fp32_optimizer"
        ):
            for p, n in zip(self.optimizer.fp32_optimizer.params, self.param_names):
                if torch.isinf(p.grad).any() or torch.isnan(p.grad).any():
                    logger.info(f"overflow in param {n}")

        # NOTE(review): the placeholder norm is created with .cuda(), so this
        # recovery path requires a CUDA device -- confirm that is acceptable.
        grad_norm = torch.tensor(0.0).cuda()
        self.zero_grad()
    except RuntimeError as e:
        if "out of memory" in str(e):
            self._log_oom(e)
            logger.error("OOM during optimization, irrecoverable")
        raise e

    # Some distributed wrappers (e.g., SlowMo) need access to the optimizer
    # after the step
    if hasattr(self.model, "perform_slowmo"):
        self.model.perform_slowmo(
            self.optimizer.optimizer, getattr(self.optimizer, "fp32_params", None)
        )

    # stays None when the update is skipped below
    logging_output = None
    if not overflow or self.cfg.distributed_training.ddp_backend == "slowmo":
        self.set_num_updates(self.get_num_updates() + 1)

        if self.cfg.ema.store_ema:
            # Step EMA forward with new model.
            self.ema.step(
                self.get_model(),
                self.get_num_updates(),
            )
            metrics.log_scalar(
                "ema_decay",
                self.ema.get_decay(),
                priority=10000,
                round=5,
                weight=0,
            )

        if self.tpu:
            import torch_xla.core.xla_model as xm

            # mark step on TPUs
            self._xla_markstep_and_send_to_cpu()

            # only log stats every log_interval steps
            # this causes wps to be misreported when log_interval > 1
            logging_output = {}
            if self.get_num_updates() % self.cfg.common.log_interval == 0:
                # log memory usage
                mem_info = xm.get_memory_info(self.device)
                gb_free = mem_info["kb_free"] / 1024 / 1024
                gb_total = mem_info["kb_total"] / 1024 / 1024
                metrics.log_scalar(
                    "gb_free", gb_free, priority=1500, round=1, weight=0
                )
                metrics.log_scalar(
                    "gb_total", gb_total, priority=1600, round=1, weight=0
                )
                logging_outputs = self._xla_markstep_and_send_to_cpu(
                    logging_outputs
                )
                logging_output = self._reduce_and_log_stats(
                    logging_outputs, sample_size, grad_norm
                )

            # log whenever there's an XLA compilation, since these
            # slow down training and may indicate opportunities for
            # optimization
            self._check_xla_compilation()
        else:
            if self.cuda and self.cuda_env is not None:
                # log minimum free memory over the iteration
                gb_used = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024
                torch.cuda.reset_peak_memory_stats()
                gb_free = self.cuda_env.total_memory_in_GB - gb_used
                metrics.log_scalar(
                    "gb_free", gb_free, priority=1500, round=1, weight=0
                )

            # log stats
            logging_output = self._reduce_and_log_stats(
                logging_outputs, sample_size, grad_norm
            )

            # clear CUDA cache to reduce memory fragmentation
            if (
                self.cuda
                and self.cfg.common.empty_cache_freq > 0
                and (
                    (self.get_num_updates() + self.cfg.common.empty_cache_freq - 1)
                    % self.cfg.common.empty_cache_freq
                )
                == 0
            ):
                torch.cuda.empty_cache()

    if self.cfg.common.fp16 or self.cfg.common.amp:
        # the fp16 scaler exposes ``loss_scale``; otherwise ``get_scale()``
        metrics.log_scalar(
            "loss_scale",
            (
                self.optimizer.scaler.loss_scale
                if self.cfg.common.fp16
                else self.optimizer.scaler.get_scale()
            ),
            priority=700,
            round=4,
            weight=0,
        )

    metrics.log_stop_time("train_wall")
    return logging_output
def lr_step_update(self):
    """Step the LR scheduler for the current update count and log the result.

    The scheduler may return either a single value or a dict of values.
    A single value is logged under ``lr``; each dict entry is logged under
    ``lr_<key>``.

    Returns:
        The scheduler's value, or for a dict the ``"default"`` entry,
        falling back to the first entry when there is no ``"default"``.
    """
    lr = self.lr_scheduler.step_update(self.get_num_updates())

    if not isinstance(lr, dict):
        metrics.log_scalar("lr", lr, weight=0, priority=300)
        return lr

    for group, value in lr.items():
        metrics.log_scalar(f"lr_{group}", value, weight=0, priority=300)

    # the fallback is computed up front, exactly as the inline default
    # argument was evaluated before
    first_value = next(iter(lr.values()))
    return lr.get("default", first_value)
def cumulative_training_time(self):
    """Return the cumulative training time.

    Uses the stored ``_cumulative_training_time`` when it has been set;
    otherwise (single GPU) falls back to the locally measured time.
    """
    synced = self._cumulative_training_time
    if synced is not None:
        return synced
    # single GPU
    return self._local_cumulative_training_time()
_local_cumulative_training_time(self): """Aggregate training time in seconds.""" return time.time() - self._start_time + self._previous_training_time def _fp_convert_sample(self, sample): def apply_half(t): if t.dtype is torch.float32: return t.to(dtype=torch.half) return t def apply_bfloat16(t): if t.dtype is torch.float32: return t.to(dtype=torch.bfloat16) return t if self.cfg.common.fp16: sample = utils.apply_to_sample(apply_half, sample) if self.cfg.common.bf16: sample = utils.apply_to_sample(apply_bfloat16, sample) return sample def _prepare_sample(self, sample, is_dummy=False): if sample == "DUMMY": raise Exception( "Trying to use an uninitialized 'dummy' batch. This usually indicates " "that the total number of batches is smaller than the number of " "participating GPUs. Try reducing the batch size or using fewer GPUs." ) if sample is None or len(sample) == 0: assert ( self._dummy_batch is not None and len(self._dummy_batch) > 0 ), "Invalid dummy batch: {}".format(self._dummy_batch) sample, _ = self._prepare_sample(self._dummy_batch, is_dummy=True) return sample, True # Given that PCIe/NVLink bandwidth is significantly smaller than DRAM bandwidth # it makes sense to do the format conversion on the CPU and then transfer # a smaller buffer to the device. This also saves GPU memory capacity. 
if self.cfg.common.on_cpu_convert_precision: sample = self._fp_convert_sample(sample) if self.cuda: if self.pipeline_model_parallel: if "target" in sample: sample["target"] = utils.move_to_cuda( sample["target"], device=self.last_device ) else: sample = utils.move_to_cuda(sample) elif self.tpu and is_dummy: # the dummy batch may not be on the appropriate device sample = utils.move_to_cuda(sample, device=self.device) if not self.cfg.common.on_cpu_convert_precision: sample = self._fp_convert_sample(sample) if self._dummy_batch == "DUMMY": self._dummy_batch = sample return sample, False def _set_seed(self): # Set seed based on args.seed and the update number so that we get # reproducible results when resuming from checkpoints seed = self.cfg.common.seed + self.get_num_updates() utils.set_torch_seed(seed) def _sync_stats(self): # Return True if it's using multiple GPUs and DDP or multiple GPUs with # BMUF and it's a bmuf sync with warmup iterations completed before. if self.data_parallel_world_size == 1: return False elif self.cfg.optimization.use_bmuf: return ( self.get_num_updates() + 1 ) % self.cfg.bmuf.global_sync_iter == 0 and ( self.get_num_updates() + 1 ) > self.cfg.bmuf.warmup_iterations else: return True def _log_oom(self, exc): msg = "OOM: Ran out of memory with exception: {}".format(exc) logger.warning(msg) if torch.cuda.is_available() and hasattr(torch.cuda, "memory_summary"): for device_idx in range(torch.cuda.device_count()): logger.warning(torch.cuda.memory_summary(device=device_idx)) sys.stderr.flush() def _aggregate_logging_outputs( self, logging_outputs: List[Dict[str, Any]], *extra_stats_to_sum, ignore=False, ): if self.task.__class__.logging_outputs_can_be_summed(self.get_criterion()): return self._fast_stat_sync_sum( logging_outputs, *extra_stats_to_sum, ignore=ignore ) else: return self._all_gather_list_sync( logging_outputs, *extra_stats_to_sum, ignore=ignore ) def _all_gather_list_sync( self, logging_outputs: List[Dict[str, Any]], 
def _fast_stat_sync_sum(
    self,
    logging_outputs: List[Dict[str, Any]],
    *extra_stats_to_sum,
    ignore=False,
):
    """
    Sync logging outputs across workers by summing them with one all-reduce.

    This is the faster alternative to all_gather_list_sync, usable only
    when every logging value is a scalar that can be summed; nested
    dicts/lists inside *logging_outputs* are not supported.

    The keys of the first logging output decide which stats are reduced.
    With *ignore* set, this worker contributes zeros for every key.
    Returns ``(logging_outputs, extra_stats_to_sum)`` holding the reduced
    values, where *logging_outputs* is a list with at most one dict.
    """
    # extra stats go in first, followed by the per-key logging sums
    payload = {}
    for idx, stat in enumerate(extra_stats_to_sum):
        payload["extra_stats_" + str(idx)] = stat

    keys = list(logging_outputs[0].keys()) if len(logging_outputs) > 0 else None
    if keys is not None:
        for key in keys:
            if ignore:
                reference = logging_outputs[0][key]
                total = (
                    torch.zeros_like(reference) if torch.is_tensor(reference) else 0
                )
            else:
                total = sum(entry[key] for entry in logging_outputs if key in entry)
            payload["logging_outputs_" + key] = total

    payload = distributed_utils.all_reduce_dict(
        payload, device=self.device, group=self.data_parallel_process_group
    )

    reduced_extras = [
        payload["extra_stats_" + str(idx)] for idx in range(len(extra_stats_to_sum))
    ]
    if keys is None:
        return [], reduced_extras
    reduced_logs = [{key: payload["logging_outputs_" + key] for key in keys}]
    return reduced_logs, reduced_extras
def _reduce_and_log_stats(self, logging_outputs, sample_size, grad_norm=None):
    """Log update-level metrics and reduce *logging_outputs* through the task.

    When *grad_norm* is given and is either a non-tensor or a finite
    tensor, logs ``ups`` and ``gnorm``, plus ``clip`` if a positive
    ``clip_norm`` is configured. The task's ``reduce_metrics`` is then run
    inside a fresh metrics aggregator.

    Returns:
        ``{}`` on TPU; otherwise the aggregator's smoothed values with
        ``sample_size`` added and the ``ppl``/``wps``/``wpb``/``bsz``
        entries removed.
    """
    norm_is_loggable = grad_norm is not None and (
        not torch.is_tensor(grad_norm) or torch.isfinite(grad_norm)
    )
    if norm_is_loggable:
        metrics.log_speed("ups", 1.0, priority=100, round=2)
        metrics.log_scalar("gnorm", grad_norm, priority=400, round=3)
        if self.cfg.optimization.clip_norm > 0:
            # 100 when the norm exceeded the threshold, 0 otherwise
            clip_flag = torch.where(
                grad_norm > self.cfg.optimization.clip_norm,
                grad_norm.new_tensor(100),
                grad_norm.new_tensor(0),
            )
            metrics.log_scalar("clip", clip_flag, priority=500, round=1)

    with metrics.aggregate() as agg:
        if logging_outputs is not None:
            self.task.reduce_metrics(logging_outputs, self.get_criterion())
            del logging_outputs

        # extra warning for criterions that don't properly log a loss value
        if "loss" not in agg:
            if "loss" not in self._warn_once:
                self._warn_once.add("loss")
                logger.warning(
                    "Criterion.reduce_metrics did not log a 'loss' value, "
                    "which may break some functionality"
                )
            metrics.log_scalar("loss", -1)

        # support legacy interface
        if self.tpu:
            return {}

        summary = agg.get_smoothed_values()
        summary["sample_size"] = sample_size
        for stale_key in ("ppl", "wps", "wpb", "bsz"):
            summary.pop(stale_key, None)
        return summary
_catalog_shared_params(module, memo=None, prefix=""): if memo is None: first_call = True memo = {} else: first_call = False for name, param in module._parameters.items(): param_prefix = prefix + ("." if prefix else "") + name if param not in memo: memo[param] = [] memo[param].append(param_prefix) for name, m in module._modules.items(): if m is None: continue submodule_prefix = prefix + ("." if prefix else "") + name _catalog_shared_params(m, memo, submodule_prefix) if first_call: return [x for x in memo.values() if len(x) > 1] def _get_module_by_path(module, path): path = path.split(".") for name in path: module = getattr(module, name) return module def _set_module_by_path(module, path, value): path = path.split(".") for name in path[:-1]: module = getattr(module, name) setattr(module, path[-1], value) ================================================ FILE: fairseq/utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class FileContentsAction(argparse.Action):
    """argparse action that stores a file's contents when given a file path.

    If the supplied value names an existing file (as reported by
    ``PathManager.isfile``), the stripped file contents are stored on the
    namespace; otherwise the value itself is stored. ``nargs`` is not
    supported.
    """

    def __init__(self, option_strings, dest, nargs=None, **kwargs):
        if nargs is not None:
            raise ValueError("nargs not allowed")
        super().__init__(option_strings, dest, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        from fairseq.file_io import PathManager

        # default to the raw value; replace it only when it is a file
        argument = values
        if PathManager.isfile(values):
            with PathManager.open(values) as handle:
                argument = handle.read().strip()
        setattr(namespace, self.dest, argument)
def apply_to_sample(f, sample):
    """Apply *f* to every tensor found inside *sample*.

    Dicts, OrderedDicts, lists, tuples and sets are walked recursively and
    rebuilt with the transformed contents; any other non-tensor value is
    returned unchanged. A sample that has a length of zero yields ``{}``.
    """
    if hasattr(sample, "__len__") and len(sample) == 0:
        return {}

    def _walk(node):
        if torch.is_tensor(node):
            return f(node)
        if isinstance(node, collections.OrderedDict):
            # OrderedDict has attributes that need to be preserved
            rebuilt = collections.OrderedDict(
                (name, _walk(child)) for name, child in node.items()
            )
            rebuilt.__dict__ = node.__dict__
            return rebuilt
        if isinstance(node, dict):
            return {name: _walk(child) for name, child in node.items()}
        if isinstance(node, list):
            return [_walk(child) for child in node]
        if isinstance(node, tuple):
            return tuple(_walk(child) for child in node)
        if isinstance(node, set):
            return {_walk(child) for child in node}
        return node

    return _walk(sample)
def load_align_dict(replace_unk):
    """Build the alignment dictionary used for unknown-word replacement.

    Args:
        replace_unk: ``None`` to disable replacement; a non-empty string to
            load the dictionary from the file at that path; any other value
            to enable replacement without a dictionary.

    Returns:
        ``None`` when *replace_unk* is ``None``. Otherwise a dict mapping
        the first whitespace-separated column of each file line to the
        second. The dict is empty when no file was given, in which case
        unknown words are replaced by copying the original source word.

    Lines with fewer than two columns (for example blank lines) are
    skipped instead of raising ``IndexError``.
    """
    if replace_unk is None:
        return None

    align_dict = {}
    if isinstance(replace_unk, str) and len(replace_unk) > 0:
        # Load alignment dictionary for unknown word replacement if it was
        # passed as an argument.
        with open(replace_unk, "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) < 2:
                    # blank or malformed line: nothing to map
                    continue
                align_dict[cols[0]] = cols[1]
    # Otherwise no alignment dictionary was provided but we still want to
    # perform unknown word replacement by copying the original source word.
    return align_dict
def load_embedding(embed_dict, vocab, embedding):
    """Copy pre-trained vectors from *embed_dict* into *embedding*.

    For every index of *vocab* whose token has an entry in *embed_dict*,
    the matching row of ``embedding.weight.data`` is overwritten with that
    entry. Rows for tokens without an entry are left as they are.

    Returns:
        The same *embedding* object, modified in place.
    """
    for position in range(len(vocab)):
        symbol = vocab[position]
        if symbol not in embed_dict:
            continue
        embedding.weight.data[position] = embed_dict[symbol]
    return embedding
def buffered_arange(max, device="cpu"):
    """Return ``arange(max)`` as a long tensor, served from a cached buffer.

    A buffer is kept per device and regrown only when a larger range is
    requested, so repeated calls with small values reuse the same storage.
    Previously a single buffer was created on the device of the very first
    call, and every later call returned a tensor on that device regardless
    of the *device* argument.

    Args:
        max: exclusive upper bound of the range (name kept for
            backward compatibility although it shadows the builtin).
        device: device on which the returned tensor should live.

    Returns:
        A view of the first *max* elements of the cached buffer. Callers
        should treat it as read-only, since the storage is shared.
    """
    # the per-device cache lives on the function object itself
    cache = buffered_arange.__dict__.setdefault("_bufs", {})
    target = torch.device(device)
    key = str(target)

    buf = cache.get(key)
    if buf is None or max > buf.numel():
        buf = torch.arange(max, dtype=torch.long, device=target)
        cache[key] = buf
    return buf[:max]
if torch.is_tensor(tensor) and tensor.device.type == "xla": return tensor.detach() if hasattr(tensor, "item"): return tensor.item() if hasattr(tensor, "__getitem__"): return tensor[0] return tensor def multi_tensor_total_norm(grads, chunk_size=2048 * 32) -> torch.Tensor: per_device_grads = {} norms = [] for grad in grads: device = grad.device cur_device_grads = per_device_grads.get(device) if cur_device_grads is None: cur_device_grads = [] per_device_grads[device] = cur_device_grads cur_device_grads.append(grad) for device in per_device_grads.keys(): cur_device_grads = per_device_grads[device] if device.type == "cuda": # TODO(msb) return has_inf has_inf = torch.zeros((1, 1), dtype=torch.int, device=device) with torch.cuda.device(device): norm = multi_tensor_l2norm( chunk_size, has_inf, [cur_device_grads], False ) norms.append(norm[0].to(torch.cuda.current_device())) else: norms += [torch.norm(g, p=2, dtype=torch.float32) for g in cur_device_grads] total_norm = torch.norm(torch.stack(norms)) return total_norm @torch.no_grad() def clip_grad_norm_(params, max_norm, aggregate_norm_fn=None) -> torch.Tensor: def grad_exists(p): return p is not None and getattr(p, "grad", None) is not None if isinstance(params, torch.Tensor): params = [params] params = list(params) grads = [ p.grad.detach() for p in params if grad_exists(p) and not hasattr(p, "expert") ] expert_grads = [ p.grad.detach() for p in params if grad_exists(p) and hasattr(p, "expert") ] if len(grads) == 0: if len(params) > 0: return params[0].new_tensor(0.0) else: return torch.tensor(0.0) if len(grads) == 1: total_norm = torch.norm(grads[0], p=2, dtype=torch.float32) else: if multi_tensor_l2norm_available: total_norm = multi_tensor_total_norm(grads) else: if torch.cuda.is_available(): warnings.warn( "amp_C fused kernels unavailable, disabling multi_tensor_l2norm; " "you may get better performance by installing NVIDIA's apex library" ) device = torch.cuda.current_device() elif grads[0].device.type == "xla": 
device = grads[0].device else: device = torch.device("cpu") total_norm = torch.norm( torch.stack( [torch.norm(g, p=2, dtype=torch.float32).to(device) for g in grads] ) ) if aggregate_norm_fn is not None: total_norm = aggregate_norm_fn(total_norm) if max_norm > 0: max_norm = float(max_norm) clip_coef = (max_norm / (total_norm + 1e-6)).clamp_(max=1) torch._foreach_mul_(grads + expert_grads, clip_coef) return total_norm def fill_with_neg_inf(t): """FP16-compatible function that fills a tensor with -inf.""" return t.float().fill_(float("-inf")).type_as(t) def _match_types(arg1, arg2): """Convert the numerical argument to the same type as the other argument""" def upgrade(arg_number, arg_structure): if isinstance(arg_structure, tuple): return tuple([arg_number] * len(arg_structure)) elif isinstance(arg_structure, dict): arg = copy.deepcopy(arg_structure) for k in arg: arg[k] = upgrade(arg_number, arg_structure[k]) return arg else: return arg_number if isinstance(arg1, float) or isinstance(arg1, int): return upgrade(arg1, arg2), arg2 elif isinstance(arg2, float) or isinstance(arg2, int): return arg1, upgrade(arg2, arg1) return arg1, arg2 def resolve_max_positions(*args): """Resolve max position constraints from multiple sources.""" def map_value_update(d1, d2): updated_value = copy.deepcopy(d1) for key in d2: if key not in updated_value: updated_value[key] = d2[key] else: updated_value[key] = min(d1[key], d2[key]) return updated_value def nullsafe_min(l): minim = None for item in l: if minim is None: minim = item elif item is not None and item < minim: minim = item return minim max_positions = None for arg in args: if max_positions is None: max_positions = arg elif arg is not None: max_positions, arg = _match_types(max_positions, arg) if isinstance(arg, float) or isinstance(arg, int): max_positions = min(max_positions, arg) elif isinstance(arg, dict): max_positions = map_value_update(max_positions, arg) else: max_positions = tuple(map(nullsafe_min, zip(max_positions, 
arg))) return max_positions def import_user_module(args): module_path = getattr(args, "user_dir", None) if module_path is not None: module_path = os.path.abspath(args.user_dir) if not os.path.exists(module_path) and not os.path.isfile( os.path.dirname(module_path) ): fairseq_rel_path = os.path.join(os.path.dirname(__file__), args.user_dir) if os.path.exists(fairseq_rel_path): module_path = fairseq_rel_path else: fairseq_rel_path = os.path.join( os.path.dirname(__file__), "..", args.user_dir ) if os.path.exists(fairseq_rel_path): module_path = fairseq_rel_path else: raise FileNotFoundError(module_path) # ensure that user modules are only imported once import_user_module.memo = getattr(import_user_module, "memo", set()) if module_path not in import_user_module.memo: import_user_module.memo.add(module_path) module_parent, module_name = os.path.split(module_path) if module_name not in sys.modules: sys.path.insert(0, module_parent) importlib.import_module(module_name) tasks_path = os.path.join(module_path, "tasks") if os.path.exists(tasks_path): from fairseq.tasks import import_tasks import_tasks(tasks_path, f"{module_name}.tasks") models_path = os.path.join(module_path, "models") if os.path.exists(models_path): from fairseq.models import import_models import_models(models_path, f"{module_name}.models") elif module_path in sys.modules[module_name].__path__: logger.info(f"--user-dir={module_path} has already been imported.") else: raise ImportError( "Failed to import --user-dir={} because the corresponding module name " "({}) is not globally unique. 
Please rename the directory to " "something unique and try again.".format(module_path, module_name) ) def softmax(x, dim: int, onnx_trace: bool = False): if onnx_trace: return F.softmax(x.float(), dim=dim) else: return F.softmax(x, dim=dim, dtype=torch.float32) def log_softmax(x, dim: int, onnx_trace: bool = False): if onnx_trace: return F.log_softmax(x.float(), dim=dim) else: return F.log_softmax(x, dim=dim, dtype=torch.float32) def get_perplexity(loss, round=2, base=2): from fairseq.logging.meters import safe_round if loss is None: return 0.0 try: return safe_round(base**loss, round) except OverflowError: return float("inf") def deprecation_warning(message, stacklevel=3): # don't use DeprecationWarning, since it's ignored by default warnings.warn(message, stacklevel=stacklevel) def relu_squared(x: torch.Tensor): return F.relu(x).pow(2) def get_activation_fn(activation: str) -> Callable: """Returns the activation function corresponding to `activation`""" from fairseq.modules import gelu, gelu_accurate if activation == "relu": return F.relu elif activation == "relu_squared": return relu_squared elif activation == "gelu": return gelu elif activation == "gelu_fast": deprecation_warning( "--activation-fn=gelu_fast has been renamed to gelu_accurate" ) return gelu_accurate elif activation == "gelu_accurate": return gelu_accurate elif activation == "tanh": return torch.tanh elif activation == "linear": return lambda x: x elif activation == "swish": return torch.nn.SiLU else: raise RuntimeError("--activation-fn {} not supported".format(activation)) def get_available_activation_fns() -> List: return [ "relu", "gelu", "gelu_fast", # deprecated "gelu_accurate", "tanh", "linear", ] @contextlib.contextmanager def model_eval(model): is_training = model.training model.eval() yield model.train(is_training) def has_parameters(module): try: next(module.parameters()) return True except StopIteration: return False def get_rng_state(): state = {"torch_rng_state": 
torch.get_rng_state()} if xm is not None: state["xla_rng_state"] = xm.get_rng_state() if torch.cuda.is_available(): state["cuda_rng_state"] = torch.cuda.get_rng_state() return state def set_rng_state(state): torch.set_rng_state(state["torch_rng_state"]) if xm is not None: xm.set_rng_state(state["xla_rng_state"]) if torch.cuda.is_available(): torch.cuda.set_rng_state(state["cuda_rng_state"]) class set_torch_seed(object): def __init__(self, seed): assert isinstance(seed, int) self.rng_state = get_rng_state() torch.manual_seed(seed) if xm is not None: xm.set_rng_state(seed) if torch.cuda.is_available(): torch.cuda.manual_seed(seed) def __enter__(self): return self def __exit__(self, *exc): set_rng_state(self.rng_state) def parse_alignment(line): """ Parses a single line from the alingment file. Args: line (str): String containing the alignment of the format: <src_idx_1>-<tgt_idx_1> <src_idx_2>-<tgt_idx_2> .. <src_idx_m>-<tgt_idx_m>. All indices are 0 indexed. Returns: torch.IntTensor: packed alignments of shape (2 * m). 
""" alignments = line.strip().split() parsed_alignment = torch.IntTensor(2 * len(alignments)) for idx, alignment in enumerate(alignments): src_idx, tgt_idx = alignment.split("-") parsed_alignment[2 * idx] = int(src_idx) parsed_alignment[2 * idx + 1] = int(tgt_idx) return parsed_alignment def get_token_to_word_mapping(tokens, exclude_list): n = len(tokens) word_start = [int(token not in exclude_list) for token in tokens] word_idx = list(accumulate(word_start)) token_to_word = {i: word_idx[i] for i in range(n)} return token_to_word def extract_hard_alignment(attn, src_sent, tgt_sent, pad, eos): tgt_valid = ( ((tgt_sent != pad) & (tgt_sent != eos)).nonzero(as_tuple=False).squeeze(dim=-1) ) src_invalid = ( ((src_sent == pad) | (src_sent == eos)).nonzero(as_tuple=False).squeeze(dim=-1) ) src_token_to_word = get_token_to_word_mapping(src_sent, [eos, pad]) tgt_token_to_word = get_token_to_word_mapping(tgt_sent, [eos, pad]) alignment = [] if len(tgt_valid) != 0 and len(src_invalid) < len(src_sent): attn_valid = attn[tgt_valid] attn_valid[:, src_invalid] = float("-inf") _, src_indices = attn_valid.max(dim=1) for tgt_idx, src_idx in zip(tgt_valid, src_indices): alignment.append( ( src_token_to_word[src_idx.item()] - 1, tgt_token_to_word[tgt_idx.item()] - 1, ) ) return alignment def extract_soft_alignment(attn, src_sent, tgt_sent, pad, eos): tgt_valid = ((tgt_sent != pad)).nonzero(as_tuple=False) src_valid = ((src_sent != pad)).nonzero(as_tuple=False).squeeze(dim=-1) alignment = [] if len(tgt_valid) != 0 and len(src_valid) != 0: attn_valid = attn[tgt_valid, src_valid] alignment = [ ["{:.6f}".format(p) for p in src_probs.tolist()] for src_probs in attn_valid ] return alignment def new_arange(x, *size): """ Return a Tensor of `size` filled with a range function on the device of x. If size is empty, using the size of the variable x. 
""" if len(size) == 0: size = x.size() return torch.arange(size[-1], device=x.device).expand(*size).contiguous() def get_tpu_device(): return xm.xla_device() def tpu_data_loader(itr): import torch_xla.core.xla_model as xm import torch_xla.distributed.parallel_loader as pl from fairseq.data import iterators xm.rendezvous("tpu_data_loader") # wait for all workers xm.mark_step() device = xm.xla_device() return iterators.CountingIterator( pl.ParallelLoader(itr, [device]).per_device_loader(device), start=getattr(itr, "n", 0), total=len(itr), ) def is_xla_tensor(tensor): return torch.is_tensor(tensor) and tensor.device.type == "xla" def index_put(tensor, indices, value): if is_xla_tensor(tensor): for _ in range(indices.dim(), tensor.dim()): indices = indices.unsqueeze(-1) if indices.size(-1) < tensor.size(-1): indices = indices.expand_as(tensor) tensor = torch.mul(tensor, ~indices) + torch.mul(value, indices) else: tensor[indices] = value return tensor def xla_device_to_cpu(dat): import torch_xla.core.xla_model as xm return xm._maybe_convert_to_cpu(dat) class CudaEnvironment(object): def __init__(self): cur_device = torch.cuda.current_device() prop = torch.cuda.get_device_properties("cuda:{}".format(cur_device)) self.name = prop.name self.major = prop.major self.minor = prop.minor self.total_memory_in_GB = prop.total_memory / 1024 / 1024 / 1024 @staticmethod def pretty_print_cuda_env_list(cuda_env_list): """ Given a list of CudaEnviorments, pretty print them """ num_workers = len(cuda_env_list) center = "CUDA enviroments for all {} workers".format(num_workers) banner_len = 40 - len(center) // 2 first_line = "*" * banner_len + center + "*" * banner_len logger.info(first_line) for r, env in enumerate(cuda_env_list): logger.info( "rank {:3d}: ".format(r) + "capabilities = {:2d}.{:<2d} ; ".format(env.major, env.minor) + "total memory = {:.3f} GB ; ".format(env.total_memory_in_GB) + "name = {:40s}".format(env.name) ) logger.info(first_line) def csv_str_list(x): return 
x.split(",") def eval_str_list(x, type=float): if x is None: return None if isinstance(x, str): x = eval(x) try: return list(map(type, x)) except TypeError: return [type(x)] def eval_str_dict(x, type=dict): if x is None: return None if isinstance(x, str): x = eval(x) return x def eval_bool(x, default=False): if x is None: return default try: return bool(eval(x)) except TypeError: return default def reset_logging(): root = logging.getLogger() for handler in root.handlers: root.removeHandler(handler) root.setLevel(os.environ.get("LOGLEVEL", "INFO").upper()) handler = logging.StreamHandler(sys.stdout) handler.setFormatter( logging.Formatter( fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", ) ) root.addHandler(handler) def safe_getattr(obj, k, default=None): """Returns obj[k] if it exists and is not None, otherwise returns default.""" from omegaconf import OmegaConf if OmegaConf.is_config(obj): return obj[k] if k in obj and obj[k] is not None else default return getattr(obj, k, default) def safe_hasattr(obj, k): """Returns True if the given key exists and is not None.""" return getattr(obj, k, None) is not None def hotreload_function(name=None): """ Decorator to function to enable hot-reload for debugging. It allows you to debug a function without having reloading all heavy models, dataset loading and preprocessing, allow faster debugging. If you want to change model or dataset loading, consider relaunching your code ----------------------------------- This will run the decorated function func: if func run successful: It will pause, allow user to edit code, and prompt user to: Press enter to re-run the function with updated code Type "done" to finish the function, return output Type "disable" to stop pausing this function and let code continue without pause Ctril + C to terminal if func raise error: it will prompt user to 1. Edit code, and press enter to retry 2. Ctrl + C to terminate 3. 
Type "raise" to raise that exception * Requirements: 0. Fairseq was installed with `pip install --editable .` 1. pip install jurigged[develoop] 2. set environment HOTRELOAD_PAUSE=1 CUDA_LAUNCH_BLOCKING=1 3. Run on only 1 GPU (no distributed) * How to use: 1. in python, import and decorate the top-level function to be re-run after code edits: ```python from fairseq.utils import hotreload_function .... @hotreload_function("train_step") def train_step(self, sample ....): .... .... ``` 2. in bash run scripts: ```bash watch_dir=<home>/fairseq-py/fairseq/tasks # directory to watch for file changes export CUDA_VISIBLE_DEVICES=0 # single-gpu HOTRELOAD_PAUSE=1 CUDA_LAUNCH_BLOCKING=1 python -m jurigged -w ${watch_dir} --poll 2 -v train.py ...... ``` * NOTE: 1. -w ${watch_dir} specify all the files to be watched for changes once functions, class, ... code are changed, all instances in the process will get updated (hot-reload) * Limitation: * Currently distributed debugging not working * Need to launch train.py locally (cannot submit jobs) """ try: import jurigged except ImportError as e: logger.warning("Please install jurigged: pip install jurigged[develoop]") raise e from fairseq.distributed import utils as distributed_utils import traceback def hotreload_decorator(func): assert callable(func), f"not callable: {func}" jname = name or func.__name__ logger.info(f"jurigged-hotreload:Apply jurigged on {jname}:{func.__name__}") HOTRELOAD_PAUSE = bool(os.environ.get("HOTRELOAD_PAUSE", 0)) cublk = bool(os.environ.get("CUDA_LAUNCH_BLOCKING", 0)) prefix = f"HOTRELOAD:{jname}:[cublk={cublk}]" hot_reload_state = {"disable": False} def func_wrapper(*args, **kwargs): if not HOTRELOAD_PAUSE or hot_reload_state["disable"]: return func(*args, **kwargs) world_size = distributed_utils.get_global_world_size() assert ( world_size <= 1 ), f"HOTRELOAD_PAUSE:{jname} currently cannot do distributed training" success = False while not success: try: output = func(*args, **kwargs) # success = True 
end_action = input( f"{prefix}: PAUSE, you may edit code now. Enter to re-run, ctrl+C to terminate, " f'type "done" to continue (function still being watched), or type "disable" to stop pausing this function :' ) if end_action.strip().lower() in ["disable", "done"]: success = True else: logger.warning( f"{prefix}: action={end_action} function will re-run now." ) except Exception as e: action = input( f"{prefix}:ERROR: \n{traceback.format_exc()}\n" f'Edit code to try again: enter to continue, ctrl+C to terminate, or type "raise" to raise the exception: ' ) if action.strip().lower() == "raise": raise e if end_action.strip().lower() == "disable": logger.warning( f"{prefix}: Stop pausing {jname}. The function is still being watched and newly editted code will take effect " f"if the {jname} is called again later." f' "unset HOTRELOAD_PAUSE" before relaunch to disable hotreload and' f" remove @hotreload_function decorator in the code." ) hot_reload_state["disable"] = True return output return func_wrapper return hotreload_decorator ================================================ FILE: fairseq/version.txt ================================================ 0.12.2 ================================================ FILE: fairseq_cli/__init__.py ================================================ ================================================ FILE: fairseq_cli/eval_lm.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Evaluate the perplexity of a trained language model. 
""" import logging import math import os import sys from argparse import Namespace from typing import Iterable, List, Optional import torch from omegaconf import DictConfig import fairseq from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging import progress_bar from fairseq.logging.meters import StopwatchMeter from fairseq.sequence_scorer import SequenceScorer logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=sys.stdout, ) logger = logging.getLogger("fairseq_cli.eval_lm") def eval_lm( models: List[fairseq.models.FairseqModel], source_dictionary: fairseq.data.Dictionary, batch_iterator: Iterable, post_process: Optional[str] = None, output_word_probs: bool = False, output_word_stats: bool = False, target_dictionary: Optional[fairseq.data.Dictionary] = None, softmax_batch: int = 0, remove_bos_token: bool = False, device: Optional[torch.device] = None, ): """ Args: models (List[~fairseq.models.FairseqModel]): list of models to evaluate. Models are essentially `nn.Module` instances, but must be compatible with fairseq's `SequenceScorer`. source_dictionary (~fairseq.data.Dictionary): dictionary for applying any relevant post processing or outputing word probs/stats. batch_iterator (Iterable): yield batches of data post_process (Optional[str]): post-process text by removing BPE, letter segmentation, etc. Valid options can be found in fairseq.data.utils.post_process, although not all options are implemented here. 
output_word_probs (Optional[bool]): output words and their predicted log probabilities output_word_stats (Optional[bool]): output word statistics such as word count and average probability target_dictionary (Optional[~fairseq.data.Dictionary]): output dictionary (defaults to *source_dictionary*) softmax_batch (Optional[bool]): if BxT is more than this, will batch the softmax over vocab to this amount of tokens, in order to fit into GPU memory remove_bos_token (Optional[bool]): if True, confirm that the first token is the beginning-of-sentence symbol (according to the relevant dictionary) and remove it from the output device (Optional[torch.device]): device to use for evaluation (defaults to device of first model parameter) """ if target_dictionary is None: target_dictionary = source_dictionary if device is None: device = next(models[0].parameters()).device gen_timer = StopwatchMeter() scorer = SequenceScorer(target_dictionary, softmax_batch) score_sum = 0.0 count = 0 if post_process is not None: if post_process in {"subword_nmt", "@@ "}: bpe_cont = post_process.rstrip() bpe_toks = { i for i in range(len(source_dictionary)) if source_dictionary[i].endswith(bpe_cont) } else: raise NotImplementedError( f"--post-process={post_process} is not implemented" ) bpe_len = len(bpe_cont) else: bpe_toks = None bpe_len = 0 word_stats = dict() for sample in batch_iterator: if "net_input" not in sample: continue sample = utils.move_to_cuda(sample, device=device) gen_timer.start() hypos = scorer.generate(models, sample) gen_timer.stop(sample["ntokens"]) for i, hypos_i in enumerate(hypos): hypo = hypos_i[0] sample_id = sample["id"][i] tokens = hypo["tokens"] tgt_len = tokens.numel() pos_scores = hypo["positional_scores"].float() if remove_bos_token: assert hypo["tokens"][0].item() == target_dictionary.bos() tokens = tokens[1:] pos_scores = pos_scores[1:] skipped_toks = 0 if bpe_toks is not None: for i in range(tgt_len - 1): if tokens[i].item() in bpe_toks: skipped_toks += 1 
pos_scores[i + 1] += pos_scores[i] pos_scores[i] = 0 inf_scores = pos_scores.eq(float("inf")) | pos_scores.eq(float("-inf")) if inf_scores.any(): logger.info( "skipping tokens with inf scores:", target_dictionary.string(tokens[inf_scores.nonzero()]), ) pos_scores = pos_scores[(~inf_scores).nonzero()] score_sum += pos_scores.sum().cpu() count += pos_scores.numel() - skipped_toks if output_word_probs or output_word_stats: w = "" word_prob = [] is_bpe = False for i in range(len(tokens)): w_ind = tokens[i].item() w += source_dictionary[w_ind] if bpe_toks is not None and w_ind in bpe_toks: w = w[:-bpe_len] is_bpe = True else: word_prob.append((w, pos_scores[i].item())) next_prob = None ind = i + 1 while ind < len(tokens): if pos_scores[ind].item() != 0: next_prob = pos_scores[ind] break ind += 1 word_stats.setdefault(w, WordStat(w, is_bpe)).add( pos_scores[i].item(), next_prob ) is_bpe = False w = "" if output_word_probs: logger.info( str(int(sample_id)) + " " + ( "\t".join( "{} [{:2f}]".format(x[0], x[1]) for x in word_prob ) ) ) avg_nll_loss = ( -score_sum / count / math.log(2) if count > 0 else 0 ) # convert to base 2 logger.info( "Evaluated {:,} tokens in {:.1f}s ({:.2f} tokens/s)".format( gen_timer.n, gen_timer.sum, 1.0 / gen_timer.avg if gen_timer.avg > 0 else 0 ) ) if output_word_stats: for ws in sorted(word_stats.values(), key=lambda x: x.count, reverse=True): logger.info(ws) return { "loss": avg_nll_loss, "perplexity": 2**avg_nll_loss, } class WordStat(object): def __init__(self, word, is_bpe): self.word = word self.is_bpe = is_bpe self.log_prob = 0 self.next_word_prob = 0 self.count = 0 self.missing_next_words = 0 def add(self, log_prob, next_word_prob): """increments counters for the sum of log probs of current word and next word (given context ending at current word). 
Since the next word might be at the end of the example, or it might be not counted because it is not an ending subword unit, also keeps track of how many of those we have seen""" if next_word_prob is not None: self.next_word_prob += next_word_prob else: self.missing_next_words += 1 self.log_prob += log_prob self.count += 1 def __str__(self): return "{}\t{}\t{}\t{}\t{}\t{}".format( self.word, self.count, self.log_prob, self.is_bpe, self.next_word_prob, self.count - self.missing_next_words, ) def main(cfg: DictConfig, **unused_kwargs): if isinstance(cfg, Namespace): cfg = convert_namespace_to_omegaconf(cfg) utils.import_user_module(cfg.common) logger.info(cfg) if cfg.eval_lm.context_window > 0: # reduce tokens per sample by the required context window size cfg.task.tokens_per_sample -= cfg.eval_lm.context_window # Initialize the task using the current *cfg* task = tasks.setup_task(cfg.task) # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) models, model_args, task = checkpoint_utils.load_model_ensemble_and_task( [cfg.common_eval.path], arg_overrides=eval(cfg.common_eval.model_overrides), suffix=cfg.checkpoint.checkpoint_suffix, strict=(cfg.checkpoint.checkpoint_shard_count == 1), num_shards=cfg.checkpoint.checkpoint_shard_count, task=task, ) use_fp16 = cfg.common.fp16 use_cuda = torch.cuda.is_available() and not cfg.common.cpu if use_cuda: torch.cuda.set_device(cfg.distributed_training.device_id) # Optimize ensemble for generation and set the source and dest dicts on the model # (required by scorer) for model in models: if use_fp16: model.half() if use_cuda and not cfg.distributed_training.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(cfg) assert len(models) > 0 logger.info( "num. 
model params: {:,}".format(sum(p.numel() for p in models[0].parameters())) ) # Load dataset splits task.load_dataset(cfg.dataset.gen_subset) dataset = task.dataset(cfg.dataset.gen_subset) logger.info( "{} {} {:,} examples".format( cfg.task.data, cfg.dataset.gen_subset, len(dataset) ) ) itr = task.eval_lm_dataloader( dataset=dataset, max_tokens=cfg.dataset.max_tokens or 36000, batch_size=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions( *[model.max_positions() for model in models] ), num_shards=max( cfg.dataset.num_shards, cfg.distributed_training.distributed_world_size, ), shard_id=max( cfg.dataset.shard_id, cfg.distributed_training.distributed_rank, ), num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, context_window=cfg.eval_lm.context_window, ) itr = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) results = eval_lm( models=models, source_dictionary=task.source_dictionary, batch_iterator=itr, post_process=cfg.common_eval.post_process, output_word_probs=cfg.eval_lm.output_word_probs, output_word_stats=cfg.eval_lm.output_word_stats, target_dictionary=task.target_dictionary, softmax_batch=cfg.eval_lm.softmax_batch, remove_bos_token=getattr(cfg.task, "add_bos_token", False), ) logger.info( "Loss (base 2): {:.4f}, Perplexity: {:.2f}".format( results["loss"], results["perplexity"] ) ) return results def cli_main(): parser = options.get_eval_lm_parser() args = options.parse_args_and_arch(parser) distributed_utils.call_main(convert_namespace_to_omegaconf(args), main) if __name__ == "__main__": cli_main() ================================================ FILE: fairseq_cli/generate.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Translate pre-processed data with a trained model. """ import ast import logging import math import os import sys from argparse import Namespace from itertools import chain import numpy as np import torch from omegaconf import DictConfig from fairseq import checkpoint_utils, options, scoring, tasks, utils from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.logging import progress_bar from fairseq.logging.meters import StopwatchMeter, TimeMeter def main(cfg: DictConfig): if isinstance(cfg, Namespace): cfg = convert_namespace_to_omegaconf(cfg) assert cfg.common_eval.path is not None, "--path required for generation!" assert ( not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam ), "--sampling requires --nbest to be equal to --beam" assert ( cfg.generation.replace_unk is None or cfg.dataset.dataset_impl == "raw" ), "--replace-unk requires a raw text dataset (--dataset-impl=raw)" if cfg.common_eval.results_path is not None: os.makedirs(cfg.common_eval.results_path, exist_ok=True) output_path = os.path.join( cfg.common_eval.results_path, "generate-{}.txt".format(cfg.dataset.gen_subset), ) with open(output_path, "w", buffering=1, encoding="utf-8") as h: return _main(cfg, h) else: return _main(cfg, sys.stdout) def get_symbols_to_strip_from_output(generator): if hasattr(generator, "symbols_to_strip_from_output"): return generator.symbols_to_strip_from_output else: return {generator.eos} def _main(cfg: DictConfig, output_file): logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=output_file, ) logger = logging.getLogger("fairseq_cli.generate") utils.import_user_module(cfg.common) if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None: cfg.dataset.max_tokens = 12000 
logger.info(cfg) # Fix seed for stochastic decoding if cfg.common.seed is not None and not cfg.generation.no_seed_provided: np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) use_cuda = torch.cuda.is_available() and not cfg.common.cpu # Load dataset splits task = tasks.setup_task(cfg.task) # Set dictionaries try: src_dict = getattr(task, "source_dictionary", None) except NotImplementedError: src_dict = None tgt_dict = task.target_dictionary overrides = ast.literal_eval(cfg.common_eval.model_overrides) # Load ensemble logger.info("loading model(s) from {}".format(cfg.common_eval.path)) models, saved_cfg = checkpoint_utils.load_model_ensemble( utils.split_paths(cfg.common_eval.path), arg_overrides=overrides, task=task, suffix=cfg.checkpoint.checkpoint_suffix, strict=(cfg.checkpoint.checkpoint_shard_count == 1), num_shards=cfg.checkpoint.checkpoint_shard_count, ) # loading the dataset should happen after the checkpoint has been loaded so we can give it the saved task config task.load_dataset(cfg.dataset.gen_subset, task_cfg=saved_cfg.task) if cfg.generation.lm_path is not None: overrides["data"] = cfg.task.data try: lms, _ = checkpoint_utils.load_model_ensemble( [cfg.generation.lm_path], arg_overrides=overrides, task=None ) except: logger.warning( f"Failed to load language model! 
Please make sure that the language model dict is the same " f"as target dict and is located in the data dir ({cfg.task.data})" ) raise assert len(lms) == 1 else: lms = [None] # Optimize ensemble for generation for model in chain(models, lms): if model is None: continue if cfg.common.fp16: model.half() if use_cuda and not cfg.distributed_training.pipeline_model_parallel: model.cuda() model.prepare_for_inference_(cfg) # Load alignment dictionary for unknown word replacement # (None if no unknown word replacement, empty if no path to align dictionary) align_dict = utils.load_align_dict(cfg.generation.replace_unk) # Load dataset (possibly sharded) itr = task.get_batch_iterator( dataset=task.dataset(cfg.dataset.gen_subset), max_tokens=cfg.dataset.max_tokens, max_sentences=cfg.dataset.batch_size, max_positions=utils.resolve_max_positions( task.max_positions(), *[m.max_positions() for m in models] ), ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test, required_batch_size_multiple=cfg.dataset.required_batch_size_multiple, seed=cfg.common.seed, num_shards=cfg.distributed_training.distributed_world_size, shard_id=cfg.distributed_training.distributed_rank, num_workers=cfg.dataset.num_workers, data_buffer_size=cfg.dataset.data_buffer_size, ).next_epoch_itr(shuffle=False) progress = progress_bar.progress_bar( itr, log_format=cfg.common.log_format, log_interval=cfg.common.log_interval, default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"), ) # Initialize generator gen_timer = StopwatchMeter() extra_gen_cls_kwargs = {"lm_model": lms[0], "lm_weight": cfg.generation.lm_weight} generator = task.build_generator( models, cfg.generation, extra_gen_cls_kwargs=extra_gen_cls_kwargs ) # Handle tokenization and BPE tokenizer = task.build_tokenizer(cfg.tokenizer) bpe = task.build_bpe(cfg.bpe) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x scorer = scoring.build_scorer(cfg.scoring, 
tgt_dict) num_sentences = 0 has_target = True wps_meter = TimeMeter() for sample in progress: sample = utils.move_to_cuda(sample) if use_cuda else sample if "net_input" not in sample: continue prefix_tokens = None if cfg.generation.prefix_size > 0: prefix_tokens = sample["target"][:, : cfg.generation.prefix_size] constraints = None if "constraints" in sample: constraints = sample["constraints"] gen_timer.start() hypos = task.inference_step( generator, models, sample, prefix_tokens=prefix_tokens, constraints=constraints, ) num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos) gen_timer.stop(num_generated_tokens) for i, sample_id in enumerate(sample["id"].tolist()): has_target = sample["target"] is not None # Remove padding if "src_tokens" in sample["net_input"]: src_tokens = utils.strip_pad( sample["net_input"]["src_tokens"][i, :], tgt_dict.pad() ) else: src_tokens = None target_tokens = None if has_target: target_tokens = ( utils.strip_pad(sample["target"][i, :], tgt_dict.pad()).int().cpu() ) # Either retrieve the original sentences or regenerate them from tokens. 
if align_dict is not None: src_str = task.dataset(cfg.dataset.gen_subset).src.get_original_text( sample_id ) target_str = task.dataset(cfg.dataset.gen_subset).tgt.get_original_text( sample_id ) else: if src_dict is not None: src_str = src_dict.string(src_tokens, cfg.common_eval.post_process) else: src_str = "" if has_target: target_str = tgt_dict.string( target_tokens, cfg.common_eval.post_process, escape_unk=True, extra_symbols_to_ignore=get_symbols_to_strip_from_output( generator ), ) src_str = decode_fn(src_str) if has_target: target_str = decode_fn(target_str) if not cfg.common_eval.quiet: if src_dict is not None: print("S-{}\t{}".format(sample_id, src_str), file=output_file) if has_target: print("T-{}\t{}".format(sample_id, target_str), file=output_file) # Process top predictions for j, hypo in enumerate(hypos[i][: cfg.generation.nbest]): hypo_tokens, hypo_str, alignment = utils.post_process_prediction( hypo_tokens=hypo["tokens"].int().cpu(), src_str=src_str, alignment=hypo["alignment"], align_dict=align_dict, tgt_dict=tgt_dict, remove_bpe=cfg.common_eval.post_process, extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator), ) detok_hypo_str = decode_fn(hypo_str) if not cfg.common_eval.quiet: score = hypo["score"] / math.log(2) # convert to base 2 # original hypothesis (after tokenization and BPE) print( "H-{}\t{}\t{}".format(sample_id, score, hypo_str), file=output_file, ) # detokenized hypothesis print( "D-{}\t{}\t{}".format(sample_id, score, detok_hypo_str), file=output_file, ) print( "P-{}\t{}".format( sample_id, " ".join( map( lambda x: "{:.4f}".format(x), # convert from base e to base 2 hypo["positional_scores"] .div_(math.log(2)) .tolist(), ) ), ), file=output_file, ) if cfg.generation.print_alignment == "hard": print( "A-{}\t{}".format( sample_id, " ".join( [ "{}-{}".format(src_idx, tgt_idx) for src_idx, tgt_idx in alignment ] ), ), file=output_file, ) if cfg.generation.print_alignment == "soft": print( "A-{}\t{}".format( sample_id, " 
".join( [",".join(src_probs) for src_probs in alignment] ), ), file=output_file, ) if cfg.generation.print_step: print( "I-{}\t{}".format(sample_id, hypo["steps"]), file=output_file, ) if cfg.generation.retain_iter_history: for step, h in enumerate(hypo["history"]): _, h_str, _ = utils.post_process_prediction( hypo_tokens=h["tokens"].int().cpu(), src_str=src_str, alignment=None, align_dict=None, tgt_dict=tgt_dict, remove_bpe=None, ) print( "E-{}_{}\t{}".format(sample_id, step, h_str), file=output_file, ) # Score only the top hypothesis if has_target and j == 0: if ( align_dict is not None or cfg.common_eval.post_process is not None ): # Convert back to tokens for evaluation with unk replacement and/or without BPE target_tokens = tgt_dict.encode_line( target_str, add_if_not_exist=True ) hypo_tokens = tgt_dict.encode_line( detok_hypo_str, add_if_not_exist=True ) if hasattr(scorer, "add_string"): scorer.add_string(target_str, detok_hypo_str) else: scorer.add(target_tokens, hypo_tokens) wps_meter.update(num_generated_tokens) progress.log({"wps": round(wps_meter.avg)}) num_sentences += ( sample["nsentences"] if "nsentences" in sample else sample["id"].numel() ) logger.info("NOTE: hypothesis and token scores are output in base 2") logger.info( "Translated {:,} sentences ({:,} tokens) in {:.1f}s ({:.2f} sentences/s, {:.2f} tokens/s)".format( num_sentences, gen_timer.n, gen_timer.sum, num_sentences / gen_timer.sum, 1.0 / gen_timer.avg, ) ) if has_target: if cfg.bpe and not cfg.generation.sacrebleu: if cfg.common_eval.post_process: logger.warning( "BLEU score is being computed by splitting detokenized string on spaces, this is probably not what you want. Use --sacrebleu for standard 13a BLEU tokenization" ) else: logger.warning( "If you are using BPE on the target side, the BLEU score is computed on BPE tokens, not on proper words. 
def cli_main():
    """Command-line entry point for generation.

    Builds the generation argument parser, registers an additional
    ``--arch``/``-a`` option (default ``"wav2vec2"``), parses the arguments
    and passes the resulting namespace to ``main``.
    """
    arch_option = {
        "metavar": "ARCH",
        "default": "wav2vec2",
        "help": (
            "Model architecture. For constructing tasks that rely on "
            "model args (e.g. `AudioPretraining`)"
        ),
    }

    generation_parser = options.get_generation_parser()
    # TODO: replace this workaround with refactoring of `AudioPretraining`
    generation_parser.add_argument("--arch", "-a", **arch_option)

    main(options.parse_args_and_arch(generation_parser))
@hydra.main(config_path=os.path.join("..", "fairseq", "config"), config_name="config")
def hydra_main(cfg: FairseqConfig) -> float:
    """Hydra task function for training.

    Args:
        cfg: the composed configuration handed over by Hydra.

    Returns:
        The value returned by ``_hydra_main`` (the smoothed best-checkpoint
        validation metric, or ``inf`` when it is unavailable).
    """
    # The result must be returned, not just computed: `_hydra_main` derives it
    # specifically so that sweepers can read it from the task function.
    return _hydra_main(cfg)


def _hydra_main(cfg: FairseqConfig, **kwargs) -> float:
    """Run training through ``distributed_utils.call_main`` and report a metric.

    Args:
        cfg: training configuration; it is completed with defaults and then
            re-created as a resolved, struct-mode OmegaConf object.
        **kwargs: forwarded unchanged to ``distributed_utils.call_main``.

    Returns:
        The smoothed ``"valid"`` value of ``cfg.checkpoint.best_checkpoint_metric``,
        or ``float("inf")`` if that value is ``None`` or cannot be looked up.

    Raises:
        BaseException: whatever training raised, unless
            ``cfg.common.suppress_crashes`` is set, in which case the error is
            only logged.
    """
    add_defaults(cfg)

    if cfg.common.reset_logging:
        reset_logging()  # Hydra hijacks logging, fix that
    else:
        # check if directly called or called through hydra_main
        if HydraConfig.initialized():
            with open_dict(cfg):
                # make hydra logging work with ddp
                # (see https://github.com/facebookresearch/hydra/issues/1126)
                cfg.job_logging_cfg = OmegaConf.to_container(
                    HydraConfig.get().job_logging, resolve=True
                )

    # Resolve interpolations and convert enums to strings, then lock the
    # config structure.
    with omegaconf_no_object_check():
        cfg = OmegaConf.create(
            OmegaConf.to_container(cfg, resolve=True, enum_to_str=True)
        )
    OmegaConf.set_struct(cfg, True)

    try:
        if cfg.common.profile:
            with torch.cuda.profiler.profile():
                with torch.autograd.profiler.emit_nvtx():
                    distributed_utils.call_main(cfg, pre_main, **kwargs)
        else:
            distributed_utils.call_main(cfg, pre_main, **kwargs)
    except BaseException as e:
        # NOTE(review): BaseException also covers KeyboardInterrupt/SystemExit;
        # kept as-is because suppress_crashes is an explicit opt-in.
        if not cfg.common.suppress_crashes:
            raise
        else:
            logger.error("Crashed! " + str(e))

    # get best val and return - useful for sweepers
    try:
        best_val = metrics.get_smoothed_value(
            "valid", cfg.checkpoint.best_checkpoint_metric
        )
    except Exception:
        # A missing metric is not fatal, but interrupts and interpreter exit
        # requests must not be swallowed here.
        best_val = None

    if best_val is None:
        best_val = float("inf")

    return best_val
def validate(cfg):
    """Evaluate a checkpoint on every subset listed in ``cfg.dataset.valid_subset``.

    Loads the model(s), task and saved config from ``cfg.common_eval.path``,
    builds the criterion from the saved config, then for each comma-separated
    subset runs ``task.valid_step`` over all batches and prints the aggregated
    metrics through the progress bar.

    Args:
        cfg: evaluation configuration (reads the ``common``, ``common_eval``,
            ``checkpoint``, ``dataset``, ``task`` and ``distributed_training``
            groups).

    Raises:
        Exception: if loading a subset raises ``KeyError``; the original
            ``KeyError`` is chained as the cause.
    """
    utils.import_user_module(cfg.common)

    use_fp16 = cfg.common.fp16
    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    if use_cuda:
        torch.cuda.set_device(cfg.distributed_training.device_id)

    if cfg.distributed_training.distributed_world_size > 1:
        data_parallel_world_size = distributed_utils.get_data_parallel_world_size()
        data_parallel_rank = distributed_utils.get_data_parallel_rank()
    else:
        data_parallel_world_size = 1
        data_parallel_rank = 0

    overrides = {"task": {"data": cfg.task.data}}

    # Load ensemble
    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
        [cfg.common_eval.path],
        arg_overrides=overrides,
        suffix=cfg.checkpoint.checkpoint_suffix,
    )
    model = models[0]

    # Move models to GPU
    # NOTE: the loop variable rebinds `model`; after the loop it refers to the
    # last entry of `models`, which is what `valid_step` receives below.
    for model in models:
        model.eval()
        if use_fp16:
            model.half()
        if use_cuda:
            model.cuda()

    # Print args
    logger.info(saved_cfg)

    # Build criterion
    criterion = task.build_criterion(saved_cfg.criterion, from_checkpoint=True)
    criterion.eval()

    for subset in cfg.dataset.valid_subset.split(","):
        try:
            task.load_dataset(subset, combine=False, epoch=1, task_cfg=saved_cfg.task)
            dataset = task.dataset(subset)
        except KeyError as err:
            raise Exception("Cannot find dataset: " + subset) from err

        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=cfg.dataset.max_tokens,
            max_sentences=cfg.dataset.batch_size,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                *[m.max_positions() for m in models],
            ),
            ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
            seed=cfg.common.seed,
            num_shards=data_parallel_world_size,
            shard_id=data_parallel_rank,
            num_workers=cfg.dataset.num_workers,
            data_buffer_size=cfg.dataset.data_buffer_size,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=cfg.common.log_format,
            log_interval=cfg.common.log_interval,
            prefix=f"valid on '{subset}' subset",
            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
        )

        def apply_half(t):
            # Only float32 tensors are down-cast; other dtypes pass through.
            if t.dtype is torch.float32:
                return t.to(dtype=torch.half)
            return t

        log_outputs = []
        # Pre-bind the step counter: if this shard yields no batches the loop
        # body never runs, and the final `progress.print` would otherwise hit
        # an unbound `i` (or reuse the value left over from a previous subset).
        i = 0
        for i, sample in enumerate(progress):
            sample = utils.move_to_cuda(sample) if use_cuda else sample

            if use_fp16:
                sample = utils.apply_to_sample(apply_half, sample)

            _loss, _sample_size, log_output = task.valid_step(sample, model, criterion)
            # Per-batch metrics for the running progress display.
            with metrics.aggregate() as agg:
                task.reduce_metrics([log_output], criterion)
                progress.log(agg.get_smoothed_values(), step=i)
            # progress.log(log_output, step=i) from vision
            log_outputs.append(log_output)

        if data_parallel_world_size > 1:
            # Collect the per-batch outputs of all data-parallel workers and
            # flatten the per-worker lists into one.
            log_outputs = distributed_utils.all_gather_list(
                log_outputs,
                max_size=cfg.common.all_gather_list_size,
                group=distributed_utils.get_data_parallel_group(),
            )
            log_outputs = list(chain.from_iterable(log_outputs))

        with metrics.aggregate() as agg:
            task.reduce_metrics(log_outputs, criterion)
            log_output = agg.get_smoothed_values()

        progress.print(log_output, tag=subset, step=i)
def buffered_read(input, buffer_size):
    """Yield the lines of ``input`` in lists of at most ``buffer_size`` items.

    The source is opened through ``fileinput`` with UTF-8 decoding and each
    line is whitespace-stripped before it is buffered. A list is emitted as
    soon as it holds ``buffer_size`` lines; whatever is left once the input is
    exhausted is emitted as a final, shorter list.
    """
    pending = []
    reader = fileinput.input(
        files=[input], openhook=fileinput.hook_encoded("utf-8")
    )
    with reader as stream:
        for raw_line in stream:
            pending.append(raw_line.strip())
            if len(pending) < buffer_size:
                continue
            yield pending
            # Start a fresh list so the one just handed out is not mutated.
            pending = []

    if pending:
        yield pending
def main(cfg: FairseqConfig):
    """Translate lines read from ``cfg.interactive.input`` and print the results.

    Loads the task, model ensemble and generator, then repeatedly reads a
    buffer of input lines, batches and translates them, and prints the source
    (``S-``), per-sentence time (``W-``), constraints (``C-``), hypotheses
    (``H-``/``D-``), positional scores (``P-``) and optional alignments
    (``A-``) in input order.

    Args:
        cfg: configuration; an ``argparse.Namespace`` is converted to an
            OmegaConf config first.
    """
    if isinstance(cfg, Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    start_time = time.time()
    total_translate_time = 0

    utils.import_user_module(cfg.common)

    # Fall back to the smallest usable buffer / batch size when none is given.
    if cfg.interactive.buffer_size < 1:
        cfg.interactive.buffer_size = 1
    if cfg.dataset.max_tokens is None and cfg.dataset.batch_size is None:
        cfg.dataset.batch_size = 1

    assert (
        not cfg.generation.sampling or cfg.generation.nbest == cfg.generation.beam
    ), "--sampling requires --nbest to be equal to --beam"
    assert (
        not cfg.dataset.batch_size
        or cfg.dataset.batch_size <= cfg.interactive.buffer_size
    ), "--batch-size cannot be larger than --buffer-size"

    logger.info(cfg)

    # Fix seed for stochastic decoding
    if cfg.common.seed is not None and not cfg.generation.no_seed_provided:
        np.random.seed(cfg.common.seed)
        utils.set_torch_seed(cfg.common.seed)

    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    # Setup task, e.g., translation
    task = tasks.setup_task(cfg.task)

    # Load ensemble
    overrides = ast.literal_eval(cfg.common_eval.model_overrides)
    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
    models, _model_args = checkpoint_utils.load_model_ensemble(
        utils.split_paths(cfg.common_eval.path),
        arg_overrides=overrides,
        task=task,
        suffix=cfg.checkpoint.checkpoint_suffix,
        strict=(cfg.checkpoint.checkpoint_shard_count == 1),
        num_shards=cfg.checkpoint.checkpoint_shard_count,
    )

    # Set dictionaries
    src_dict = task.source_dictionary
    tgt_dict = task.target_dictionary

    # Optimize ensemble for generation
    for model in models:
        if model is None:
            continue
        if cfg.common.fp16:
            model.half()
        if use_cuda and not cfg.distributed_training.pipeline_model_parallel:
            model.cuda()
        model.prepare_for_inference_(cfg)

    # Initialize generator
    generator = task.build_generator(models, cfg.generation)

    # Handle tokenization and BPE
    tokenizer = task.build_tokenizer(cfg.tokenizer)
    bpe = task.build_bpe(cfg.bpe)

    def encode_fn(x):
        # Tokenize first, then apply BPE; either step is skipped when unset.
        if tokenizer is not None:
            x = tokenizer.encode(x)
        if bpe is not None:
            x = bpe.encode(x)
        return x

    def decode_fn(x):
        # Inverse of encode_fn: undo BPE first, then detokenize.
        if bpe is not None:
            x = bpe.decode(x)
        if tokenizer is not None:
            x = tokenizer.decode(x)
        return x

    # Load alignment dictionary for unknown word replacement
    # (None if no unknown word replacement, empty if no path to align dictionary)
    align_dict = utils.load_align_dict(cfg.generation.replace_unk)

    max_positions = utils.resolve_max_positions(
        task.max_positions(), *[model.max_positions() for model in models]
    )

    if cfg.generation.constraints:
        logger.warning(
            "NOTE: Constrained decoding currently assumes a shared subword vocabulary."
        )

    if cfg.interactive.buffer_size > 1:
        logger.info("Sentence buffer size: %s", cfg.interactive.buffer_size)
    logger.info("NOTE: hypothesis and token scores are output in base 2")
    logger.info("Type the input sentence and press return:")
    # Offset added to per-buffer ids so printed ids keep increasing across buffers.
    start_id = 0
    for inputs in buffered_read(cfg.interactive.input, cfg.interactive.buffer_size):
        results = []
        for batch in make_batches(inputs, cfg, task, max_positions, encode_fn):
            bsz = batch.src_tokens.size(0)
            src_tokens = batch.src_tokens
            src_lengths = batch.src_lengths
            constraints = batch.constraints
            if use_cuda:
                src_tokens = src_tokens.cuda()
                src_lengths = src_lengths.cuda()
                if constraints is not None:
                    constraints = constraints.cuda()

            sample = {
                "net_input": {
                    "src_tokens": src_tokens,
                    "src_lengths": src_lengths,
                },
            }
            translate_start_time = time.time()
            translations = task.inference_step(
                generator, models, sample, constraints=constraints
            )
            translate_time = time.time() - translate_start_time
            total_translate_time += translate_time
            # One (possibly empty) constraint list per sentence of the batch.
            list_constraints = [[] for _ in range(bsz)]
            if cfg.generation.constraints:
                list_constraints = [unpack_constraints(c) for c in constraints]
            for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
                # NOTE(review): source tokens are stripped with the *target*
                # dictionary's pad index — presumably both dictionaries share
                # it; confirm.
                src_tokens_i = utils.strip_pad(src_tokens[i], tgt_dict.pad())
                constraints = list_constraints[i]
                results.append(
                    (
                        start_id + id,
                        src_tokens_i,
                        hypos,
                        {
                            "constraints": constraints,
                            # Batch time split evenly over its sentences.
                            "time": translate_time / len(translations),
                        },
                    )
                )

        # sort output to match input order
        for id_, src_tokens, hypos, info in sorted(results, key=lambda x: x[0]):
            src_str = ""
            if src_dict is not None:
                src_str = src_dict.string(src_tokens, cfg.common_eval.post_process)
                print("S-{}\t{}".format(id_, src_str))
                print("W-{}\t{:.3f}\tseconds".format(id_, info["time"]))
                for constraint in info["constraints"]:
                    print(
                        "C-{}\t{}".format(
                            id_,
                            tgt_dict.string(constraint, cfg.common_eval.post_process),
                        )
                    )

            # Process top predictions
            for hypo in hypos[: min(len(hypos), cfg.generation.nbest)]:
                hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
                    hypo_tokens=hypo["tokens"].int().cpu(),
                    src_str=src_str,
                    alignment=hypo["alignment"],
                    align_dict=align_dict,
                    tgt_dict=tgt_dict,
                    remove_bpe=cfg.common_eval.post_process,
                    extra_symbols_to_ignore=get_symbols_to_strip_from_output(generator),
                )
                detok_hypo_str = decode_fn(hypo_str)
                score = hypo["score"] / math.log(2)  # convert to base 2
                # original hypothesis (after tokenization and BPE)
                print("H-{}\t{}\t{}".format(id_, score, hypo_str))
                # detokenized hypothesis
                print("D-{}\t{}\t{}".format(id_, score, detok_hypo_str))
                print(
                    "P-{}\t{}".format(
                        id_,
                        " ".join(
                            map(
                                lambda x: "{:.4f}".format(x),
                                # convert from base e to base 2
                                # (div_ modifies the stored scores in place)
                                hypo["positional_scores"].div_(math.log(2)).tolist(),
                            )
                        ),
                    )
                )
                if cfg.generation.print_alignment:
                    alignment_str = " ".join(
                        ["{}-{}".format(src, tgt) for src, tgt in alignment]
                    )
                    print("A-{}\t{}".format(id_, alignment_str))

        # update running id_ counter
        start_id += len(inputs)

    logger.info(
        "Total time: {:.3f} seconds; translation time: {:.3f}".format(
            time.time() - start_time, total_translate_time
        )
    )
def dataset_dest_prefix(args, output_prefix, lang):
    """Return the destination path prefix for a preprocessed dataset.

    The prefix is ``args.destdir/output_prefix`` followed by:

    * ``.{source_lang}-{target_lang}.{lang}`` when ``lang`` is given,
    * nothing when ``lang`` is ``None`` and ``args.only_source`` is set,
    * ``.{source_lang}-{target_lang}`` otherwise.
    """
    stem = os.path.join(args.destdir, output_prefix)

    # Monolingual output without an explicit language carries no suffix.
    if lang is None and args.only_source:
        return "{}{}".format(stem, "")

    pair_suffix = f".{args.source_lang}-{args.target_lang}"
    if lang is None:
        return "{}{}".format(stem, pair_suffix)
    return "{}{}".format(stem, f"{pair_suffix}.{lang}")
def _make_binary_dataset(
    vocab: Dictionary,
    input_prefix: str,
    output_prefix: str,
    lang: tp.Optional[str],
    num_workers: int,
    args: Namespace,
):
    """Binarize one text file with ``vocab`` and log a summary.

    The input file is ``input_prefix`` plus ``.{lang}`` when ``lang`` is not
    ``None``; the output prefix comes from ``dataset_dest_prefix``. The work is
    delegated to ``FileBinarizer.multiprocess_dataset`` using a
    ``VocabularyDatasetBinarizer`` that appends EOS.
    """
    vocab_size = len(vocab)
    logger.info("[{}] Dictionary: {} types".format(lang, vocab_size))

    lang_suffix = "" if lang is None else "." + lang
    input_file = "{}{}".format(input_prefix, lang_suffix)

    final_summary = FileBinarizer.multiprocess_dataset(
        input_file,
        args.dataset_impl,
        VocabularyDatasetBinarizer(vocab, append_eos=True),
        dataset_dest_prefix(args, output_prefix, lang),
        vocab_size=len(vocab),
        num_workers=num_workers,
    )

    logger.info(f"[{lang}] {input_file}: {final_summary} (by {vocab.unk_word})")
def _make_all(lang, vocab, args):
    """Create the train/valid/test datasets for one language.

    ``args.trainpref`` yields the ``train`` output. ``args.validpref`` and
    ``args.testpref`` may each hold several comma-separated prefixes: the
    first is written as ``valid``/``test``, later ones as ``valid1``,
    ``valid2``, ... (resp. ``test1``, ...). Unset prefixes are skipped.
    """

    def emit(input_prefix, output_name):
        _make_dataset(
            vocab, input_prefix, output_name, lang, args=args, num_workers=args.workers
        )

    if args.trainpref:
        emit(args.trainpref, "train")

    for split in ("valid", "test"):
        joined_prefixes = getattr(args, split + "pref")
        if not joined_prefixes:
            continue
        for position, input_prefix in enumerate(joined_prefixes.split(",")):
            # Only the second and later prefixes get a numeric suffix.
            output_name = split if position == 0 else split + str(position)
            emit(input_prefix, output_name)
def _align_files(args, src_dict, tgt_dict):
    """Write a word-alignment dictionary derived from ``args.alignfile``.

    The alignment file and the source/target training files are read in
    lockstep. Every ``i-j`` link on an alignment line pairs source token ``i``
    with target token ``j``; links involving an unknown token are ignored.
    For each source token the most frequently aligned target token is written
    as ``"<src word> <tgt word>"`` to
    ``{args.destdir}/alignment.{source_lang}-{target_lang}.txt``.

    Args:
        args: reads ``trainpref``, ``alignfile``, ``source_lang``,
            ``target_lang`` and ``destdir``.
        src_dict: source dictionary (``encode_line``, ``unk``, ``pad``,
            ``eos`` and index lookup are used).
        tgt_dict: target dictionary, used the same way.

    Raises:
        AssertionError: if ``args.trainpref`` is unset, or a link points at a
            pad/eos token.
        ValueError: if the three files do not have the same number of lines.
    """
    assert args.trainpref, "--trainpref must be set if --alignfile is specified"
    src_file_name = _train_path(args.source_lang, args.trainpref)
    tgt_file_name = _train_path(args.target_lang, args.trainpref)

    # freq_map[src_id][tgt_id] -> how often the two tokens were aligned
    freq_map = {}
    with open(args.alignfile, "r", encoding="utf-8") as align_file, open(
        src_file_name, "r", encoding="utf-8"
    ) as src_file, open(tgt_file_name, "r", encoding="utf-8") as tgt_file:
        lockstep = zip_longest(align_file, src_file, tgt_file)
        for line_no, (a, s, t) in enumerate(lockstep, start=1):
            # zip_longest pads the shorter files with None; report that
            # clearly instead of failing inside encode_line/split.
            if a is None or s is None or t is None:
                raise ValueError(
                    "{}, {} and {} must have the same number of lines "
                    "(mismatch at line {})".format(
                        args.alignfile, src_file_name, tgt_file_name, line_no
                    )
                )
            si = src_dict.encode_line(s, add_if_not_exist=False)
            ti = tgt_dict.encode_line(t, add_if_not_exist=False)
            for link in a.split():
                sai, tai = link.split("-")
                # Convert to plain ints before using the ids as dict keys.
                # NOTE(review): encode_line presumably returns a tensor; tensor
                # elements hash by identity, so without int() every link would
                # become its own key and no counts would ever accumulate.
                srcidx = int(si[int(sai)])
                tgtidx = int(ti[int(tai)])
                if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                    assert srcidx != src_dict.pad()
                    assert srcidx != src_dict.eos()
                    assert tgtidx != tgt_dict.pad()
                    assert tgtidx != tgt_dict.eos()
                    counts = freq_map.setdefault(srcidx, {})
                    counts[tgtidx] = counts.get(tgtidx, 0) + 1

    # Keep, per source token, the target token with the highest count.
    align_dict = {
        srcidx: max(counts, key=counts.get) for srcidx, counts in freq_map.items()
    }

    with open(
        os.path.join(
            args.destdir,
            "alignment.{}-{}.txt".format(args.source_lang, args.target_lang),
        ),
        "w",
        encoding="utf-8",
    ) as f:
        for k, v in align_dict.items():
            print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
logger.info(args) assert ( args.dataset_impl != "huffman" ), "preprocessing.py doesn't support Huffman yet, use HuffmanCodeBuilder directly." # build dictionaries target = not args.only_source if not args.srcdict and os.path.exists(_dict_path(args.source_lang, args.destdir)): raise FileExistsError(_dict_path(args.source_lang, args.destdir)) if ( target and not args.tgtdict and os.path.exists(_dict_path(args.target_lang, args.destdir)) ): raise FileExistsError(_dict_path(args.target_lang, args.destdir)) task = tasks.get_task(args.task) if args.joined_dictionary: assert ( not args.srcdict or not args.tgtdict ), "cannot use both --srcdict and --tgtdict with --joined-dictionary" if args.srcdict: src_dict = task.load_dictionary(args.srcdict) elif args.tgtdict: src_dict = task.load_dictionary(args.tgtdict) else: assert ( args.trainpref ), "--trainpref must be set if --srcdict is not specified" src_dict = _build_dictionary( { _train_path(lang, args.trainpref) for lang in [args.source_lang, args.target_lang] }, task=task, args=args, src=True, ) tgt_dict = src_dict else: if args.srcdict: src_dict = task.load_dictionary(args.srcdict) else: assert ( args.trainpref ), "--trainpref must be set if --srcdict is not specified" src_dict = _build_dictionary( [_train_path(args.source_lang, args.trainpref)], task=task, args=args, src=True, ) if target: if args.tgtdict: tgt_dict = task.load_dictionary(args.tgtdict) else: assert ( args.trainpref ), "--trainpref must be set if --tgtdict is not specified" tgt_dict = _build_dictionary( [_train_path(args.target_lang, args.trainpref)], task=task, args=args, tgt=True, ) else: tgt_dict = None # save dictionaries src_dict.save(_dict_path(args.source_lang, args.destdir)) if target and tgt_dict is not None: tgt_dict.save(_dict_path(args.target_lang, args.destdir)) if args.dict_only: return _make_all(args.source_lang, src_dict, args) if target: _make_all(args.target_lang, tgt_dict, args) # align the datasets if needed if args.align_suffix: 
def cli_main():
    """Score system output against a reference with BLEU and print the result.

    Parses the command line, checks that the input files exist, and then
    scores in one of three modes: ``--sacrebleu`` (delegates to the
    ``sacrebleu`` package), ``--sentence-bleu`` (prints one smoothed score per
    line pair) or the default corpus-level score. System output is read from
    stdin when ``--sys`` is ``-``.
    """
    parser = get_parser()
    args = parser.parse_args()
    print(args)

    assert args.sys == "-" or os.path.exists(
        args.sys
    ), "System output file {} does not exist".format(args.sys)
    assert os.path.exists(args.ref), "Reference file {} does not exist".format(args.ref)

    # Named `vocab` rather than `dict` so the builtin is not shadowed.
    vocab = dictionary.Dictionary()

    def readlines(fd):
        for line in fd.readlines():
            if args.ignore_case:
                yield line.lower()
            else:
                yield line

    def build_scorer():
        # Both BLEU modes must construct the scorer the same way; the
        # sentence-level mode previously passed three positional ids instead
        # of a BleuConfig, unlike the corpus-level mode.
        return bleu.Scorer(
            bleu.BleuConfig(
                pad=vocab.pad(),
                eos=vocab.eos(),
                unk=vocab.unk(),
            )
        )

    if args.sacrebleu:
        import sacrebleu

        def score(fdsys):
            with open(args.ref) as fdref:
                print(sacrebleu.corpus_bleu(fdsys, [fdref]).format())

    elif args.sentence_bleu:

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = build_scorer()
                for i, (sys_tok, ref_tok) in enumerate(
                    zip(readlines(fdsys), readlines(fdref))
                ):
                    # Reset per sentence so each line gets its own score.
                    scorer.reset(one_init=True)
                    sys_tok = vocab.encode_line(sys_tok)
                    ref_tok = vocab.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                    print(i, scorer.result_string(args.order))

    else:

        def score(fdsys):
            with open(args.ref) as fdref:
                scorer = build_scorer()
                for sys_tok, ref_tok in zip(readlines(fdsys), readlines(fdref)):
                    sys_tok = vocab.encode_line(sys_tok)
                    ref_tok = vocab.encode_line(ref_tok)
                    scorer.add(ref_tok, sys_tok)
                print(scorer.result_string(args.order))

    if args.sys == "-":
        score(sys.stdin)
    else:
        with open(args.sys, "r") as f:
            score(f)
logging.basicConfig( format="%(asctime)s | %(levelname)s | %(name)s | %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=os.environ.get("LOGLEVEL", "INFO").upper(), stream=sys.stdout, ) logger = logging.getLogger("fairseq_cli.train") import numpy as np import torch from omegaconf import DictConfig, OmegaConf from fairseq import checkpoint_utils, options, quantization_utils, tasks, utils from fairseq.data import data_utils, iterators from fairseq.data.plasma_utils import PlasmaStore from fairseq.dataclass.configs import FairseqConfig from fairseq.dataclass.initialize import add_defaults from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.distributed import fsdp_enable_wrap, fsdp_wrap from fairseq.distributed import utils as distributed_utils from fairseq.file_io import PathManager from fairseq.logging import meters, metrics, progress_bar from fairseq.model_parallel.megatron_trainer import MegatronTrainer from fairseq.trainer import Trainer def main(cfg: FairseqConfig) -> None: if isinstance(cfg, argparse.Namespace): cfg = convert_namespace_to_omegaconf(cfg) utils.import_user_module(cfg.common) add_defaults(cfg) if ( distributed_utils.is_master(cfg.distributed_training) and "job_logging_cfg" in cfg ): # make hydra logging work with ddp (see # see https://github.com/facebookresearch/hydra/issues/1126) logging.config.dictConfig(OmegaConf.to_container(cfg.job_logging_cfg)) assert ( cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None ), "Must specify batch size either with --max-tokens or --batch-size" metrics.reset() if cfg.common.log_file is not None: handler = logging.FileHandler(filename=cfg.common.log_file) logger.addHandler(handler) np.random.seed(cfg.common.seed) utils.set_torch_seed(cfg.common.seed) if distributed_utils.is_master(cfg.distributed_training): checkpoint_utils.verify_checkpoint_directory(cfg.checkpoint.save_dir) # Print args logger.info(cfg) if cfg.checkpoint.write_checkpoints_asynchronously: try: import 
iopath # noqa: F401 except ImportError: logging.exception( "Asynchronous checkpoint writing is specified but iopath is " "not installed: `pip install iopath`" ) return # Setup task, e.g., translation, language modeling, etc. task = tasks.setup_task(cfg.task) assert cfg.criterion, "Please specify criterion to train a model" # Build model and criterion if cfg.distributed_training.ddp_backend == "fully_sharded": with fsdp_enable_wrap(cfg.distributed_training): model = fsdp_wrap(task.build_model(cfg.model)) else: model = task.build_model(cfg.model) criterion = task.build_criterion(cfg.criterion) logger.info(model) logger.info("task: {}".format(task.__class__.__name__)) logger.info("model: {}".format(model.__class__.__name__)) logger.info("criterion: {}".format(criterion.__class__.__name__)) logger.info( "num. shared model params: {:,} (num. trained: {:,})".format( sum( p.numel() for p in model.parameters() if not getattr(p, "expert", False) ), sum( p.numel() for p in model.parameters() if not getattr(p, "expert", False) and p.requires_grad ), ) ) logger.info( "num. expert model params: {} (num. 
trained: {})".format( sum(p.numel() for p in model.parameters() if getattr(p, "expert", False)), sum( p.numel() for p in model.parameters() if getattr(p, "expert", False) and p.requires_grad ), ) ) # Load valid dataset (we load training data below, based on the latest checkpoint) # We load the valid dataset AFTER building the model if not cfg.dataset.disable_validation: data_utils.raise_if_valid_subsets_unintentionally_ignored(cfg) if cfg.dataset.combine_valid_subsets: task.load_dataset("valid", combine=True, epoch=1) else: for valid_sub_split in cfg.dataset.valid_subset.split(","): task.load_dataset(valid_sub_split, combine=False, epoch=1) # (optionally) Configure quantization if cfg.common.quantization_config_path is not None: quantizer = quantization_utils.Quantizer( config_path=cfg.common.quantization_config_path, max_epoch=cfg.optimization.max_epoch, max_update=cfg.optimization.max_update, ) else: quantizer = None # Build trainer if cfg.common.model_parallel_size == 1: trainer = Trainer(cfg, task, model, criterion, quantizer) else: trainer = MegatronTrainer(cfg, task, model, criterion) logger.info( "training on {} devices (GPUs/TPUs)".format( cfg.distributed_training.distributed_world_size ) ) logger.info( "max tokens per device = {} and max sentences per device = {}".format( cfg.dataset.max_tokens, cfg.dataset.batch_size, ) ) # Load the latest checkpoint if one is available and restore the # corresponding train iterator extra_state, epoch_itr = checkpoint_utils.load_checkpoint( cfg.checkpoint, trainer, # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) if cfg.common.tpu: import torch_xla.core.xla_model as xm xm.rendezvous("load_checkpoint") # wait for all workers max_epoch = cfg.optimization.max_epoch or math.inf lr = trainer.get_lr() # TODO: a dry run on validation set to pin the memory valid_subsets = cfg.dataset.valid_subset.split(",") if not cfg.dataset.disable_validation: for subset in 
valid_subsets: logger.info('begin dry-run validation on "{}" subset'.format(subset)) itr = trainer.get_valid_iterator(subset).next_epoch_itr( shuffle=False, set_dataset_epoch=False # use a fixed valid set ) if cfg.common.tpu: itr = utils.tpu_data_loader(itr) for _ in itr: pass # TODO: end of dry run section train_meter = meters.StopwatchMeter() train_meter.start() while epoch_itr.next_epoch_idx <= max_epoch: if lr <= cfg.optimization.stop_min_lr: logger.info( f"stopping training because current learning rate ({lr}) is smaller " "than or equal to minimum learning rate " f"(--stop-min-lr={cfg.optimization.stop_min_lr})" ) break # train for one epoch valid_losses, should_stop = train(cfg, trainer, task, epoch_itr) if should_stop: break # only use first validation loss to update the learning rate lr = trainer.lr_step(epoch_itr.epoch, valid_losses[0]) epoch_itr = trainer.get_train_iterator( epoch_itr.next_epoch_idx, # sharded data: get train iterator for next epoch load_dataset=task.has_sharded_data("train"), # don't cache epoch iterators for sharded datasets disable_iterator_cache=task.has_sharded_data("train"), ) train_meter.stop() logger.info("done training in {:.1f} seconds".format(train_meter.sum)) # ioPath implementation to wait for all asynchronous file writes to complete. if cfg.checkpoint.write_checkpoints_asynchronously: logger.info( "ioPath PathManager waiting for all asynchronous checkpoint " "writes to finish." 
def should_stop_early(cfg: DictConfig, valid_loss: float) -> bool:
    """Decide whether training should stop because validation stopped improving.

    The best loss seen so far and the number of consecutive non-improving
    calls are kept as attributes on this function (``best`` / ``num_runs``).

    Args:
        cfg: config providing ``checkpoint.patience`` and
            ``checkpoint.maximize_best_checkpoint_metric``.
        valid_loss: latest validation metric, or ``None`` when no validation
            was run.

    Returns:
        ``True`` once ``checkpoint.patience`` consecutive calls failed to
        improve on the best value, ``False`` otherwise.
    """
    # nothing to compare when validation was skipped or patience is disabled
    if valid_loss is None or cfg.checkpoint.patience <= 0:
        return False

    state = should_stop_early
    best_so_far = getattr(state, "best", None)

    if best_so_far is None:
        improved = True
    elif cfg.checkpoint.maximize_best_checkpoint_metric:
        improved = valid_loss > best_so_far
    else:
        improved = valid_loss < best_so_far

    if improved:
        state.best = valid_loss
        state.num_runs = 0
        return False

    state.num_runs += 1
    if state.num_runs >= cfg.checkpoint.patience:
        logger.info(
            "early stop since valid performance hasn't improved for last {} runs".format(
                cfg.checkpoint.patience
            )
        )
        return True
    return False
def _flatten_config(cfg: DictConfig):
    """Convert ``cfg`` to a plain container for logging backends.

    Any top-level value that is a legacy ``argparse.Namespace`` is dropped and
    replaced by a single ``"args"`` entry holding ``vars()`` of the last such
    namespace encountered.
    """
    flat = OmegaConf.to_container(cfg)

    # collect first, mutate afterwards, so the dict is not changed mid-iteration
    legacy_keys = [
        key for key, value in flat.items() if isinstance(value, argparse.Namespace)
    ]
    if legacy_keys:
        last_namespace = flat[legacy_keys[-1]]
        for key in legacy_keys:
            del flat[key]
        flat["args"] = vars(last_namespace)
    return flat
def validate(
    cfg: DictConfig,
    trainer: Trainer,
    task: tasks.FairseqTask,
    epoch_itr,
    subsets: List[str],
) -> List[Optional[float]]:
    """Evaluate the model on the validation set(s) and return the losses.

    Args:
        cfg: full training config; reads ``dataset.fixed_validation_seed``,
            ``dataset.max_valid_steps``, ``checkpoint.best_checkpoint_metric``
            and the logging options under ``common``.
        trainer: trainer that provides the validation iterators and runs
            ``valid_step``.
        task: current task; its optional ``post_validate`` hook is called
            once per subset.
        epoch_itr: training epoch iterator, used only for its ``epoch`` number.
        subsets: names of the validation subsets to evaluate, in order.

    Returns:
        One value per subset: the ``best_checkpoint_metric`` entry of that
        subset's validation stats.
    """

    if cfg.dataset.fixed_validation_seed is not None:
        # set fixed seed for every validation
        utils.set_torch_seed(cfg.dataset.fixed_validation_seed)

    trainer.begin_valid_epoch(epoch_itr.epoch)
    valid_losses = []
    for subset_idx, subset in enumerate(subsets):
        logger.info('begin validation on "{}" subset'.format(subset))

        # Initialize data iterator
        itr = trainer.get_valid_iterator(subset).next_epoch_itr(
            shuffle=False, set_dataset_epoch=False  # use a fixed valid set
        )
        if cfg.common.tpu:
            itr = utils.tpu_data_loader(itr)
        progress = progress_bar.progress_bar(
            itr,
            log_format=cfg.common.log_format,
            log_interval=cfg.common.log_interval,
            epoch=epoch_itr.epoch,
            prefix=f"valid on '{subset}' subset",
            aim_repo=(
                cfg.common.aim_repo
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            aim_run_hash=(
                cfg.common.aim_run_hash
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            aim_param_checkpoint_dir=cfg.checkpoint.save_dir,
            tensorboard_logdir=(
                cfg.common.tensorboard_logdir
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
            wandb_project=(
                cfg.common.wandb_project
                if distributed_utils.is_master(cfg.distributed_training)
                else None
            ),
            wandb_run_name=os.environ.get(
                "WANDB_NAME", os.path.basename(cfg.checkpoint.save_dir)
            ),
        )

        # create a new root metrics aggregator so validation metrics
        # don't pollute other aggregators (e.g., train meters)
        with metrics.aggregate(new_root=True) as agg:
            for i, sample in enumerate(progress):
                # ``i`` is 0-based, so ``>=`` caps the run at exactly
                # max_valid_steps batches (``>`` let one extra batch through).
                if (
                    cfg.dataset.max_valid_steps is not None
                    and i >= cfg.dataset.max_valid_steps
                ):
                    break
                trainer.valid_step(sample)

        # log validation stats
        # only tracking the best metric on the 1st validation subset
        tracking_best = subset_idx == 0
        stats = get_valid_stats(cfg, trainer, agg.get_smoothed_values(), tracking_best)

        if hasattr(task, "post_validate"):
            task.post_validate(trainer.get_model(), stats, agg)

        progress.print(stats, tag=subset, step=trainer.get_num_updates())

        valid_losses.append(stats[cfg.checkpoint.best_checkpoint_metric])
    return valid_losses
def main(cfg: DictConfig, override_args=None):
    """Run validation of a saved checkpoint on every requested subset.

    Loads the model from ``cfg.common_eval.path``, builds the criterion from
    the config stored in the checkpoint, iterates over each subset listed in
    ``cfg.dataset.valid_subset`` and prints the aggregated metrics.

    Args:
        cfg: evaluation config; a legacy ``argparse.Namespace`` is converted
            first.
        override_args: optional namespace of values used to override the
            arguments stored in the checkpoint.

    Raises:
        Exception: if a requested subset cannot be found by the task.
    """
    if isinstance(cfg, Namespace):
        cfg = convert_namespace_to_omegaconf(cfg)

    utils.import_user_module(cfg.common)

    reset_logging()

    assert (
        cfg.dataset.max_tokens is not None or cfg.dataset.batch_size is not None
    ), "Must specify batch size either with --max-tokens or --batch-size"

    use_fp16 = cfg.common.fp16
    use_cuda = torch.cuda.is_available() and not cfg.common.cpu

    if use_cuda:
        torch.cuda.set_device(cfg.distributed_training.device_id)

    if cfg.distributed_training.distributed_world_size > 1:
        data_parallel_world_size = distributed_utils.get_data_parallel_world_size()
        data_parallel_rank = distributed_utils.get_data_parallel_rank()
    else:
        data_parallel_world_size = 1
        data_parallel_rank = 0

    if override_args is not None:
        overrides = vars(override_args)
        # NOTE(review): ``eval`` executes whatever string is stored in
        # ``model_overrides``; consider a literal-only parser if this value
        # can come from an untrusted source.
        overrides.update(eval(getattr(override_args, "model_overrides", "{}")))
    else:
        overrides = None

    # Load ensemble
    logger.info("loading model(s) from {}".format(cfg.common_eval.path))
    models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
        [cfg.common_eval.path],
        arg_overrides=overrides,
        suffix=cfg.checkpoint.checkpoint_suffix,
    )
    model = models[0]

    # Move models to GPU
    # (separate loop variable so ``model`` keeps referring to models[0])
    for m in models:
        m.eval()
        if use_fp16:
            m.half()
        if use_cuda:
            m.cuda()

    # Print args
    logger.info(saved_cfg)

    # Build criterion
    criterion = task.build_criterion(saved_cfg.criterion)
    criterion.eval()

    for subset in cfg.dataset.valid_subset.split(","):
        try:
            task.load_dataset(subset, combine=False, epoch=1, task_cfg=saved_cfg.task)
            dataset = task.dataset(subset)
        except KeyError:
            raise Exception("Cannot find dataset: " + subset)

        # Initialize data iterator
        itr = task.get_batch_iterator(
            dataset=dataset,
            max_tokens=cfg.dataset.max_tokens,
            max_sentences=cfg.dataset.batch_size,
            max_positions=utils.resolve_max_positions(
                task.max_positions(),
                *[m.max_positions() for m in models],
            ),
            ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
            required_batch_size_multiple=cfg.dataset.required_batch_size_multiple,
            seed=cfg.common.seed,
            num_shards=data_parallel_world_size,
            shard_id=data_parallel_rank,
            num_workers=cfg.dataset.num_workers,
            data_buffer_size=cfg.dataset.data_buffer_size,
        ).next_epoch_itr(shuffle=False)
        progress = progress_bar.progress_bar(
            itr,
            log_format=cfg.common.log_format,
            log_interval=cfg.common.log_interval,
            prefix=f"valid on '{subset}' subset",
            default_log_format=("tqdm" if not cfg.common.no_progress_bar else "simple"),
        )

        log_outputs = []
        # Pre-bind the step index: when this subset/shard yields no batches
        # the loop body never runs and ``i`` would otherwise be undefined at
        # the final ``progress.print`` call.
        i = 0
        for i, sample in enumerate(progress):
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            _loss, _sample_size, log_output = task.valid_step(sample, model, criterion)
            progress.log(log_output, step=i)
            log_outputs.append(log_output)

        if data_parallel_world_size > 1:
            log_outputs = distributed_utils.all_gather_list(
                log_outputs,
                max_size=cfg.common.all_gather_list_size,
                group=distributed_utils.get_data_parallel_group(),
            )
            log_outputs = list(chain.from_iterable(log_outputs))

        with metrics.aggregate() as agg:
            task.reduce_metrics(log_outputs, criterion)
            log_output = agg.get_smoothed_values()

        progress.print(log_output, tag=subset, step=i)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""isort:skip_file"""

# Module layout, in execution order:
#   1. verify that every package in ``dependencies`` can be imported;
#   2. import the fairseq names that are re-exported from this module;
#   3. try to build the Cython extension in place if it is missing;
#   4. publish one module-level callable per entry of each registered
#      model class's ``hub_models()``.
# The order matters: step 1 must fail with a readable error before any
# fairseq import is attempted.

import functools
import importlib


# Packages that must be importable before fairseq itself is imported below.
dependencies = [
    "dataclasses",
    "hydra",
    "numpy",
    "omegaconf",
    "regex",
    "requests",
    "torch",
]


# Check for required dependencies and raise a RuntimeError if any are missing.
missing_deps = []
for dep in dependencies:
    try:
        importlib.import_module(dep)
    except ImportError:
        # Hack: the hydra package is provided under the "hydra-core" name in
        # pypi. We don't want the user mistakenly calling `pip install hydra`
        # since that will install an unrelated package.
        if dep == "hydra":
            dep = "hydra-core"
        # record the name as it should appear in the error message
        missing_deps.append(dep)
if len(missing_deps) > 0:
    raise RuntimeError("Missing dependencies: {}".format(", ".join(missing_deps)))


# only do fairseq imports after checking for dependencies
from fairseq.hub_utils import (  # noqa; noqa
    BPEHubInterface as bpe,
    TokenizerHubInterface as tokenizer,
)
from fairseq.models import MODEL_REGISTRY  # noqa


# torch.hub doesn't build Cython components, so if they are not found then try
# to build them here
try:
    import fairseq.data.token_block_utils_fast  # noqa
except ImportError:
    try:
        import cython  # noqa
        import os

        from setuptools import sandbox

        # run ``setup.py build_ext --inplace`` from the directory of this file
        sandbox.run_setup(
            os.path.join(os.path.dirname(__file__), "setup.py"),
            ["build_ext", "--inplace"],
        )
    except ImportError:
        # best effort only: report the problem and continue without the extension
        print(
            "Unable to build Cython components. Please make sure Cython is "
            "installed if the torch.hub model you are loading depends on it."
        )


# automatically expose models defined in FairseqModel::hub_models
for _model_type, _cls in MODEL_REGISTRY.items():
    for model_name in _cls.hub_models().keys():
        # bind the model name now so each exported callable loads its own model
        globals()[model_name] = functools.partial(
            _cls.from_pretrained,
            model_name,
        )
class DependencySubmititLauncher(BaseSubmititLauncher):
    """Submitit launcher that runs a follow-up script for every submitted job.

    The job overrides must contain an item with ``next_script=<path>``. That
    item is removed from the overrides before submission, and after the job
    array has been submitted the script is invoked once per job with the
    job id, the array task id and the (sanitised) sweep directory.
    """

    # Used both as the submitit cluster name and as the prefix for
    # executor-specific parameter names.
    _EXECUTOR = "slurm"

    def launch(
        self, job_overrides: Sequence[Sequence[str]], initial_job_idx: int
    ) -> Sequence[JobReturn]:
        """Submit one array job per override list and call the next script.

        Args:
            job_overrides: one list of override strings per job; each list is
                modified in place (the ``next_script`` item is removed).
            initial_job_idx: offset added to the position of each job to
                obtain its job index.

        Returns:
            The first result of every submitted job, in submission order.
        """

        # lazy import to ensure plugin discovery remains fast
        import submitit

        assert self.config is not None

        num_jobs = len(job_overrides)
        assert num_jobs > 0

        next_script = None

        # Locate the ``next_script=...`` item in the first override list and
        # strip it from every list.
        # NOTE(review): this nesting is reconstructed; it assumes every list
        # contains the identical item (``remove`` raises ValueError otherwise)
        # -- confirm against the callers.
        for jo in job_overrides:
            if next_script is None:
                for item in jo:
                    if "next_script=" in item:
                        next_script = item
                        break
                assert (
                    next_script is not None
                ), "job overrides must contain +next_script=path/to/next/script"
            jo.remove(next_script)

        # keep only the part after the first "="
        idx = next_script.find("=")
        next_script = next_script[idx + 1 :]

        params = self.params
        # build executor
        init_params = {"folder": self.params["submitit_folder"]}
        specific_init_keys = {"max_num_timeout"}

        # executor-specific constructor arguments get the "<executor>_" prefix
        init_params.update(
            **{
                f"{self._EXECUTOR}_{x}": y
                for x, y in params.items()
                if x in specific_init_keys
            }
        )
        init_keys = specific_init_keys | {"submitit_folder"}
        executor = submitit.AutoExecutor(cluster=self._EXECUTOR, **init_params)

        # specify resources/parameters
        # keys of the structured DependencySubmititConf keep their name; every
        # other key is prefixed with the executor name
        baseparams = set(OmegaConf.structured(DependencySubmititConf).keys())
        params = {
            x if x in baseparams else f"{self._EXECUTOR}_{x}": y
            for x, y in params.items()
            if x not in init_keys
        }
        executor.update_parameters(**params)

        log.info(
            f"Submitit '{self._EXECUTOR}' sweep output dir : "
            f"{self.config.hydra.sweep.dir}"
        )
        sweep_dir = Path(str(self.config.hydra.sweep.dir))
        sweep_dir.mkdir(parents=True, exist_ok=True)
        if "mode" in self.config.hydra.sweep:
            # the configured mode is parsed as an octal string, e.g. "0o770"
            mode = int(str(self.config.hydra.sweep.mode), 8)
            os.chmod(sweep_dir, mode=mode)

        job_params: List[Any] = []
        for idx, overrides in enumerate(job_overrides):
            idx = initial_job_idx + idx
            lst = " ".join(filter_overrides(overrides))
            log.info(f"\t#{idx} : {lst}")
            job_params.append(
                (
                    list(overrides),
                    "hydra.sweep.dir",
                    idx,
                    f"job_id_for_{idx}",
                    Singleton.get_state(),
                )
            )

        # transpose the per-job tuples into per-argument sequences
        jobs = executor.map_array(self, *zip(*job_params))

        for j, jp in zip(jobs, job_params):
            job_id = str(j.job_id)
            # the task id is the part after "_" in the job id, "0" if there is none
            task_id = "0" if "_" not in job_id else job_id.split("_")[1]
            sweep_config = self.config_loader.load_sweep_config(self.config, jp[0])
            dir = sweep_config.hydra.sweep.dir

            # drop brackets, braces and quotes and turn commas into "_" before
            # handing the directory to the script
            dir = (
                dir.replace("[", "")
                .replace("]", "")
                .replace("{", "")
                .replace("}", "")
                .replace(",", "_")
                .replace("'", "")
                .replace('"', "")
            )

            # the script's return code is not checked
            subprocess.call(
                [next_script, job_id, task_id, dir],
                shell=False,
            )

        return [j.results()[0] for j in jobs]
All Rights Reserved # type: ignore from pathlib import Path from read_version import read_version from setuptools import find_namespace_packages, setup setup( name="dependency-submitit-launcher", version=read_version("hydra_plugins/dependency_submitit_launcher", "__init__.py"), author="Alexei Baevski", author_email="abaevski@fb.com", description="Dependency-supporting Submitit Launcher for Hydra apps", packages=find_namespace_packages(include=["hydra_plugins.*"]), classifiers=[ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Operating System :: MacOS", "Operating System :: POSIX :: Linux", "Development Status :: 4 - Beta", ], install_requires=[ "hydra-core>=1.0.4", "submitit>=1.0.0", ], include_package_data=True, ) ================================================ FILE: pyproject.toml ================================================ [build-system] requires = [ "setuptools>=18.0", "wheel", "cython", "numpy>=1.21.3", "torch>=1.10", ] build-backend = "setuptools.build_meta" [tool.black] extend-exclude = ''' ( ^/examples/| ^/fairseq/model_parallel/megatron| ^/build/ ) ''' [tool.isort] profile = "black" known_third_party = "_cffi_backend,agg_results,aml,bitarray,boto3,botocore,dump_hubert_feature,dynamicconv_cuda,editdistance,faiss,fasttext,feature_utils,ffmpeg,g2p_en,h5py,hydra,hypothesis,indicnlp,inflect,iopath,joblib,kaldi_io,kenlm,libfb,librosa,lightconv_cuda,matplotlib,misc,mmpt,mmpt_cli,model,nltk,npy_append_array,numpy,omegaconf,pandas,pathbuilder,preprocessing,progressbar,pythainlp,random_sequence_shuffler,regex,sacrebleu,sacremoses,scipy,sentencepiece,setuptools,six,sklearn,soundfile,sweep,sweep_wmt_en2de_transformer_big_common,tabulate,torch,torchaudio,tqdm,unidecode,utils,videoreader,wav2vec_cluster_faiss,wget,yaml" skip_gitignore = true ================================================ FILE: release_utils.py 
def get_next_version(release_type) -> Tuple[Tuple[int, int, int], str, str]:
    """Compute the version that follows the one stored in fairseq/version.txt.

    Args:
        release_type: one of ``"major"``, ``"minor"`` or ``"patch"``.

    Returns:
        A triple ``(version_tuple, version_string, tag_string)`` where the tag
        is the version string prefixed with ``"v"``.

    Raises:
        ValueError: if ``release_type`` is not one of the accepted values.
    """
    raw_version = find_version("fairseq/version.txt")
    parts = list(map(int, raw_version.strip("'").split(".")))
    major = parts[0]
    minor = parts[1]
    patch = parts[2]

    # bumping a component resets every less significant one to zero
    if release_type == "major":
        bumped = (major + 1, 0, 0)
    elif release_type == "minor":
        bumped = (major, minor + 1, 0)
    elif release_type == "patch":
        bumped = (major, minor, patch + 1)
    else:
        raise ValueError(
            "Incorrect release type specified. Acceptable types are major, minor and patch."
        )

    version_str = ".".join(map(str, bumped))
    return bumped, version_str, "v" + version_str
def average_checkpoints(inputs):
    """Loads checkpoints from inputs and returns a model with averaged weights.

    Args:
      inputs: An iterable of string paths of checkpoints to load from.

    Returns:
      A dict of string keys mapping to various values. The 'model' key
      from the returned dict should correspond to an OrderedDict mapping
      string parameter names to torch Tensors.

    Raises:
      KeyError: if a checkpoint does not list the same parameters, in the
        same order, as the first checkpoint.
    """
    params_dict = collections.OrderedDict()
    params_keys = None
    new_state = None
    num_models = len(inputs)

    for fpath in inputs:
        with PathManager.open(fpath, "rb") as f:
            state = torch.load(
                f,
                map_location=(
                    lambda s, _: torch.serialization.default_restore_location(s, "cpu")
                ),
            )
        # Copies over the settings from the first checkpoint
        if new_state is None:
            new_state = state

        model_params = state["model"]

        model_params_keys = list(model_params.keys())
        if params_keys is None:
            params_keys = model_params_keys
        elif params_keys != model_params_keys:
            # report the checkpoint *path*; the file handle ``f`` only prints
            # as an opaque object repr
            raise KeyError(
                "For checkpoint {}, expected list of params: {}, "
                "but found: {}".format(fpath, params_keys, model_params_keys)
            )

        for k in params_keys:
            p = model_params[k]
            # accumulate half-precision tensors in float32
            if isinstance(p, torch.HalfTensor):
                p = p.float()
            if k not in params_dict:
                params_dict[k] = p.clone()
                # NOTE: clone() is needed in case of p is a shared parameter
            else:
                params_dict[k] += p

    averaged_params = collections.OrderedDict()
    for k, v in params_dict.items():
        averaged_params[k] = v
        # integer tensors cannot be divided in place by true division
        if averaged_params[k].is_floating_point():
            averaged_params[k].div_(num_models)
        else:
            averaged_params[k] //= num_models
    new_state["model"] = averaged_params
    return new_state


def last_n_checkpoints(paths, n, update_based, upper_bound=None):
    """Return the paths of the ``n`` most recent checkpoints in a directory.

    Args:
      paths: A list containing exactly one directory path.
      n: Number of checkpoints to return.
      update_based: If true, match ``checkpoint_<epoch>_<update>.pt`` and sort
        by update number; otherwise match ``checkpoint<epoch>.pt`` and sort by
        epoch number.
      upper_bound: If given, ignore checkpoints whose sort key exceeds it.

    Returns:
      A list of ``n`` checkpoint paths, highest sort key first.

    Raises:
      Exception: if fewer than ``n`` matching checkpoints are found.
    """
    assert len(paths) == 1
    path = paths[0]
    if update_based:
        pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
    else:
        pt_regexp = re.compile(r"checkpoint(\d+)\.pt")
    files = PathManager.ls(path)

    entries = []
    for f in files:
        m = pt_regexp.fullmatch(f)
        if m is not None:
            sort_key = int(m.group(1))
            if upper_bound is None or sort_key <= upper_bound:
                entries.append((sort_key, m.group(0)))
    if len(entries) < n:
        # the counts must be interpolated into the message; passing them as
        # extra Exception arguments leaves the "{}" placeholders unfilled
        raise Exception(
            "Found {} checkpoint files but need at least {}".format(len(entries), n)
        )
    return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
required=True, nargs='+', help='Input checkpoint file paths.') parser.add_argument('--output', required=True, metavar='FILE', help='Write the new checkpoint containing the averaged weights to this path.') num_group = parser.add_mutually_exclusive_group() num_group.add_argument('--num-epoch-checkpoints', type=int, help='if set, will try to find checkpoints with names checkpoint_xx.pt in the ' 'path specified by input, and average last this many of them.') num_group.add_argument('--num-update-checkpoints', type=int, help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by' ' input, and average last this many of them.') num_group.add_argument('--num-best-checkpoints', type=int, default=0, help='if set, will try to find checkpoints with names checkpoint_best_ee_xx.pt in the path specified by' ' input, and average last this many of them.') parser.add_argument('--checkpoint-upper-bound', type=int, help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, ' 'when using --num-update-checkpoints, this will set an upper bound on which update to use' 'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be' ' averaged.' 
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would' ' be averaged assuming --save-interval-updates 500' ) # fmt: on args = parser.parse_args() print(args) num = None is_update_based = False if args.num_update_checkpoints is not None: num = args.num_update_checkpoints is_update_based = True elif args.num_epoch_checkpoints is not None: num = args.num_epoch_checkpoints assert args.checkpoint_upper_bound is None or ( args.num_epoch_checkpoints is not None or args.num_update_checkpoints is not None ), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints" assert ( args.num_epoch_checkpoints is None or args.num_update_checkpoints is None ), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints" if num is not None: args.inputs = last_n_checkpoints( args.inputs, num, is_update_based, upper_bound=args.checkpoint_upper_bound, ) print("averaging checkpoints: ", args.inputs) if args.num_best_checkpoints > 0: args.inputs = list( sorted( args.inputs, key=lambda x: float( os.path.basename(x).split("_")[-1].replace(".pt", "") ), ) ) args.inputs = args.inputs[: args.num_best_checkpoints] for path in args.inputs: print(os.path.basename(path)) new_state = average_checkpoints(args.inputs) with PathManager.open(args.output, "wb") as f: torch.save(new_state, f) print("Finished writing averaged checkpoint to {}".format(args.output)) if __name__ == "__main__": main() ================================================ FILE: scripts/build_sym_alignment.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Use this script in order to build symmetric alignments for your translation dataset. This script depends on fast_align and mosesdecoder tools. You will need to build those before running the script. 
def main():
    """Build symmetric word alignments with fast_align and the Moses symal tools.

    Writes ``text.joined``, ``align.forward`` and ``align.backward`` under
    ``--output_dir`` and then runs Moses' ``symmetrize-fast-align.perl`` with
    the output prefix ``<output_dir>/aligned``.

    Raises:
        ValueError: if the source and target files have different line counts.
        subprocess.CalledProcessError: if an external tool exits non-zero.
    """
    # Local import: subprocess replaces the former ``assert os.system(...)``
    # calls, which were skipped entirely under ``python -O`` and broke on
    # paths containing spaces or shell metacharacters.
    import subprocess

    parser = argparse.ArgumentParser(description="symmetric alignment builder")
    # fmt: off
    parser.add_argument('--fast_align_dir',
                        help='path to fast_align build directory')
    parser.add_argument('--mosesdecoder_dir',
                        help='path to mosesdecoder root directory')
    parser.add_argument('--sym_heuristic',
                        help='heuristic to use for symmetrization',
                        default='grow-diag-final-and')
    parser.add_argument('--source_file',
                        help='path to a file with sentences '
                             'in the source language')
    parser.add_argument('--target_file',
                        help='path to a file with sentences '
                             'in the target language')
    parser.add_argument('--output_dir',
                        help='output directory')
    # fmt: on
    args = parser.parse_args()

    fast_align_bin = os.path.join(args.fast_align_dir, "fast_align")
    symal_bin = os.path.join(args.mosesdecoder_dir, "bin", "symal")
    sym_fast_align_bin = os.path.join(
        args.mosesdecoder_dir, "scripts", "ems", "support", "symmetrize-fast-align.perl"
    )

    # create joined file
    joined_file = os.path.join(args.output_dir, "text.joined")
    with open(args.source_file, "r", encoding="utf-8") as src, open(
        args.target_file, "r", encoding="utf-8"
    ) as tgt, open(joined_file, "w", encoding="utf-8") as joined:
        for lineno, (s, t) in enumerate(zip_longest(src, tgt), start=1):
            if s is None or t is None:
                # zip_longest pads the shorter file with None
                raise ValueError(
                    "{} and {} have different numbers of lines "
                    "(mismatch at line {})".format(
                        args.source_file, args.target_file, lineno
                    )
                )
            print("{} ||| {}".format(s.strip(), t.strip()), file=joined)

    def run_fast_align(out_path, extra_flags):
        # fast_align writes the alignment to stdout; capture it in out_path
        with open(out_path, "wb") as out:
            subprocess.run(
                [fast_align_bin, "-i", joined_file, "-d", "-o", "-v"] + extra_flags,
                stdout=out,
                check=True,
            )

    # run forward alignment
    fwd_align_file = os.path.join(args.output_dir, "align.forward")
    run_fast_align(fwd_align_file, [])

    # run backward alignment
    bwd_align_file = os.path.join(args.output_dir, "align.backward")
    run_fast_align(bwd_align_file, ["-r"])

    # run symmetrization
    sym_out_file = os.path.join(args.output_dir, "aligned")
    subprocess.run(
        [
            sym_fast_align_bin,
            fwd_align_file,
            bwd_align_file,
            args.source_file,
            args.target_file,
            sym_out_file,
            args.sym_heuristic,
            symal_bin,
        ],
        check=True,
    )
def main():
    """Prompt for two Namespace reprs and print how their attributes differ.

    Three sections are printed: names only in the first namespace, names only
    in the second, and shared names whose values differ.
    """
    # NOTE(security): eval() executes whatever is typed at the prompt. Only
    # paste Namespace reprs from a trusted source (e.g. your own training logs).
    first = eval(input("Namespace 1: "))
    second = eval(input("Namespace 2: "))

    def public_names(ns):
        # every attribute name that does not start with an underscore
        return {name for name in dir(ns) if not name.startswith("_")}

    def show(names, left, right=None):
        for name in names:
            if right is not None:
                print(
                    "{}\t{}\t{}".format(
                        name, getattr(left, name, None), getattr(right, name, None)
                    )
                )
            else:
                print("{}\t{}".format(name, getattr(left, name, None)))

    names_first = public_names(first)
    names_second = public_names(second)

    print("Keys unique to namespace 1:")
    show(names_first - names_second, first)
    print()

    print("Keys unique to namespace 2:")
    show(names_second - names_first, second)
    print()

    print("Overlapping keys with different values:")
    changed = [
        name
        for name in names_first & names_second
        if getattr(first, name, "None") != getattr(second, name, "None")
    ]
    show(changed, first, second)
    print()
"""Extracts random constraints from reference files.""" import argparse import random import sys def get_phrase(words, index, length): assert index < len(words) - length + 1 phr = " ".join(words[index : index + length]) for i in range(index, index + length): words.pop(index) return phr def main(args): if args.seed: random.seed(args.seed) for line in sys.stdin: constraints = [] def add_constraint(constraint): constraints.append(constraint) source = line.rstrip() if "\t" in line: source, target = line.split("\t") if args.add_sos: target = f"<s> {target}" if args.add_eos: target = f"{target} </s>" if len(target.split()) >= args.len: words = [target] num = args.number choices = {} for i in range(num): if len(words) == 0: break segmentno = random.choice(range(len(words))) segment = words.pop(segmentno) tokens = segment.split() phrase_index = random.choice(range(len(tokens))) choice = " ".join( tokens[phrase_index : min(len(tokens), phrase_index + args.len)] ) for j in range( phrase_index, min(len(tokens), phrase_index + args.len) ): tokens.pop(phrase_index) if phrase_index > 0: words.append(" ".join(tokens[0:phrase_index])) if phrase_index + 1 < len(tokens): words.append(" ".join(tokens[phrase_index:])) choices[target.find(choice)] = choice # mask out with spaces target = target.replace(choice, " " * len(choice), 1) for key in sorted(choices.keys()): add_constraint(choices[key]) print(source, *constraints, sep="\t") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--number", "-n", type=int, default=1, help="number of phrases") parser.add_argument("--len", "-l", type=int, default=1, help="phrase length") parser.add_argument( "--add-sos", default=False, action="store_true", help="add <s> token" ) parser.add_argument( "--add-eos", default=False, action="store_true", help="add </s> token" ) parser.add_argument("--seed", "-s", default=0, type=int) args = parser.parse_args() main(args) ================================================ FILE: 
"""Reads in a fairseq output file, and verifies that the constraints
(C- lines) are present in the output (the first H- line). Assumes that
constraints are listed prior to the first hypothesis.
"""

import sys


def count_constraints(lines, err=None):
    """Count how many constraints appear in the hypothesis that follows them.

    Args:
        lines: iterable of output lines. The second tab-separated field of each
            ``C-`` line is a constraint; the third tab-separated field of each
            ``H-`` line is the hypothesis text.
        err: stream that receives a message for each missing constraint
            (defaults to ``sys.stderr``).

    Returns:
        ``(found, total)``: constraints found and constraints checked.
    """
    if err is None:
        err = sys.stderr
    pending = []
    found = 0
    total = 0
    for line in lines:
        if line.startswith("C-"):
            pending.append(line.rstrip().split("\t")[1])
        elif line.startswith("H-"):
            text = line.split("\t")[2]
            for constraint in pending:
                total += 1
                if constraint in text:
                    found += 1
                else:
                    print(f"No {constraint} in {text}", file=err)
            # only the first hypothesis after a group of constraints is checked
            pending = []
    return found, total


def main():
    """Validate the fairseq output on stdin and print a summary line."""
    found, total = count_constraints(sys.stdin)
    # Guard against input without any constraints; this used to raise
    # ZeroDivisionError.
    percent = 100 * found / total if total else 0.0
    print(f"Found {found} / {total} = {percent:.1f}%")


if __name__ == "__main__":
    main()
--
-- This source code is licensed under the MIT license found in the
-- LICENSE file in the root directory of this source tree.
--
-- Usage: convert_model.lua <model_epoch1.th7>
--
-- Loads the model given as the first argument, collects encoder and decoder
-- parameters into a flat table keyed by 'encoder.<name>' / 'decoder.<name>',
-- and saves that table to 'state_dict.t7' in the current directory.
require 'torch'
local fairseq = require 'fairseq'

model = torch.load(arg[1])

-- Returns the first nn.WeightNorm module in `container` whose first wrapped
-- module is `module`. Returns nothing (nil) when there is no such wrapper.
function find_weight_norm(container, module)
  for _, wn in ipairs(container:listModules()) do
    if torch.type(wn) == 'nn.WeightNorm' and wn.modules[1] == module then
      return wn
    end
  end
end

-- Stores the parameters of `module` in `dict` under keys prefixed with `key`.
-- nn.Linear and nn.TemporalConvolutionTBC modules must be wrapped in
-- nn.WeightNorm (asserted); their wrapper's v and g are stored as
-- '.weight_v' / '.weight_g'. Any other module stores '.weight'. A '.bias'
-- entry is added whenever the module has a bias.
function push_state(dict, key, module)
  if torch.type(module) == 'nn.Linear' then
    local wn = find_weight_norm(model.module, module)
    assert(wn)
    dict[key .. '.weight_v'] = wn.v:float()
    dict[key .. '.weight_g'] = wn.g:float()
  elseif torch.type(module) == 'nn.TemporalConvolutionTBC' then
    local wn = find_weight_norm(model.module, module)
    assert(wn)
    -- reshape v to wn.viewOut, then swap dimensions 2 and 3
    local v = wn.v:float():view(wn.viewOut):transpose(2, 3)
    dict[key .. '.weight_v'] = v
    dict[key .. '.weight_g'] = wn.g:float():view(module.weight:size(3), 1, 1)
  else
    dict[key .. '.weight'] = module.weight:float()
  end
  if module.bias then
    dict[key .. '.bias'] = module.bias:float()
  end
end

encoder_dict = {}
decoder_dict = {}
combined_dict = {}

-- Fills encoder_dict. The first two lookup tables become embed_tokens and
-- embed_positions; the first and last Linear modules become fc1 and fc2; the
-- remaining Linear modules are consumed in order as 'projections.<i>'
-- whenever a convolution's weight:size(3) / 2 differs from the running
-- nInputPlane value. Every Linear module must be consumed (asserted).
function encoder_state(encoder)
  luts = encoder:findModules('nn.LookupTable')
  push_state(encoder_dict, 'embed_tokens', luts[1])
  push_state(encoder_dict, 'embed_positions', luts[2])

  fcs = encoder:findModules('nn.Linear')
  assert(#fcs >= 2)
  local nInputPlane = fcs[1].weight:size(1)
  push_state(encoder_dict, 'fc1', table.remove(fcs, 1))
  push_state(encoder_dict, 'fc2', table.remove(fcs, #fcs))

  for i, module in ipairs(encoder:findModules('nn.TemporalConvolutionTBC')) do
    -- keys are zero-based, hence i - 1
    push_state(encoder_dict, 'convolutions.' .. tostring(i - 1), module)
    if nInputPlane ~= module.weight:size(3) / 2 then
      push_state(encoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
    end
    nInputPlane = module.weight:size(3) / 2
  end
  assert(#fcs == 0)
end

-- Fills decoder_dict. The first Linear module becomes fc1 and the last two
-- become fc2 and fc3. For each convolution, an optional 'projections.<i>'
-- Linear is consumed first (same size test as in the encoder), followed by
-- the two Linear modules stored as 'attention.<i>.in_projection' and
-- 'attention.<i>.out_projection'. Every Linear module must be consumed
-- (asserted).
function decoder_state(decoder)
  luts = decoder:findModules('nn.LookupTable')
  push_state(decoder_dict, 'embed_tokens', luts[1])
  push_state(decoder_dict, 'embed_positions', luts[2])

  fcs = decoder:findModules('nn.Linear')
  local nInputPlane = fcs[1].weight:size(1)
  push_state(decoder_dict, 'fc1', table.remove(fcs, 1))
  push_state(decoder_dict, 'fc2', fcs[#fcs - 1])
  push_state(decoder_dict, 'fc3', fcs[#fcs])

  -- drop the two trailing Linear modules that were just stored as fc2 / fc3
  table.remove(fcs, #fcs)
  table.remove(fcs, #fcs)

  for i, module in ipairs(decoder:findModules('nn.TemporalConvolutionTBC')) do
    if nInputPlane ~= module.weight:size(3) / 2 then
      push_state(decoder_dict, 'projections.' .. tostring(i - 1), table.remove(fcs, 1))
    end
    nInputPlane = module.weight:size(3) / 2

    local prefix = 'attention.' .. tostring(i - 1)
    push_state(decoder_dict, prefix .. '.in_projection', table.remove(fcs, 1))
    push_state(decoder_dict, prefix .. '.out_projection', table.remove(fcs, 1))
    push_state(decoder_dict, 'convolutions.' .. tostring(i - 1), module)
  end
  assert(#fcs == 0)
end

-- NOTE(review): the encoder and decoder are taken from positions 2 and 3 of
-- model.module.modules; presumably that is the layout of the saved fairseq
-- model -- confirm against the Lua fairseq code.
_encoder = model.module.modules[2]
_decoder = model.module.modules[3]

encoder_state(_encoder)
decoder_state(_decoder)

-- merge both tables under 'encoder.' / 'decoder.' prefixes
for k, v in pairs(encoder_dict) do
  combined_dict['encoder.' .. k] = v
end
for k, v in pairs(decoder_dict) do
  combined_dict['decoder.' .. k] = v
end

torch.save('state_dict.t7', combined_dict)
""" import argparse import gzip import sys import numpy as np def main(): parser = argparse.ArgumentParser() parser.add_argument("input") parser.add_argument("--gzip", action="store_true") args = parser.parse_args() def gopen(): if args.gzip: return gzip.open(args.input, "r") else: return open(args.input, "r", encoding="utf-8") num_lines = [] num_toks = [] with gopen() as h: num_docs = 1 num_lines_in_doc = 0 num_toks_in_doc = 0 for i, line in enumerate(h): if len(line.strip()) == 0: # empty line indicates new document num_docs += 1 num_lines.append(num_lines_in_doc) num_toks.append(num_toks_in_doc) num_lines_in_doc = 0 num_toks_in_doc = 0 else: num_lines_in_doc += 1 num_toks_in_doc += len(line.rstrip().split()) if i % 1000000 == 0: print(i, file=sys.stderr, end="", flush=True) elif i % 100000 == 0: print(".", file=sys.stderr, end="", flush=True) print(file=sys.stderr, flush=True) print("found {} docs".format(num_docs)) print("average num lines per doc: {}".format(np.mean(num_lines))) print("average num toks per doc: {}".format(np.mean(num_toks))) if __name__ == "__main__": main() ================================================ FILE: scripts/read_binarized.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def get_parser():
    """Build the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="writes text from binarized file to stdout"
    )
    # fmt: off
    parser.add_argument('--dataset-impl', help='dataset implementation',
                        choices=indexed_dataset.get_available_dataset_impl())
    parser.add_argument('--dict', metavar='FP', help='dictionary containing known words', default=None)
    parser.add_argument('--input', metavar='FP', required=True, help='binarized file to read')
    # fmt: on
    return parser


def main():
    """Print every entry of the binarized dataset given by --input, one per line.

    With --dict, entries are rendered through the dictionary's ``string``
    method; without it, the integer ids are printed separated by spaces.
    """
    args = get_parser().parse_args()

    dictionary = None
    if args.dict is not None:
        dictionary = Dictionary.load(args.dict)

    dataset = data_utils.load_indexed_dataset(
        args.input,
        dictionary,
        dataset_impl=args.dataset_impl,
        default="lazy",
    )

    # choose the rendering once instead of on every entry
    if dictionary is None:

        def render(tensor_line):
            return " ".join([str(int(x)) for x in tensor_line])

    else:
        render = dictionary.string

    for tensor_line in dataset:
        print(render(tensor_line))
pt_regexp = re.compile(r"checkpoint(\d+|_\d+_\d+|_[a-z]+)\.pt")
pt_regexp_epoch_based = re.compile(r"checkpoint(\d+)\.pt")
pt_regexp_update_based = re.compile(r"checkpoint_\d+_(\d+)\.pt")


def parse_checkpoints(files):
    """Return ``(number, filename)`` pairs for epoch- or update-based checkpoint names.

    The number is the epoch for ``checkpoint<N>.pt`` and the update count for
    ``checkpoint_<E>_<N>.pt``. Other names are ignored.
    """
    entries = []
    for name in files:
        match = pt_regexp_epoch_based.fullmatch(
            name
        ) or pt_regexp_update_based.fullmatch(name)
        if match is not None:
            entries.append((int(match.group(1)), match.group(0)))
    return entries


def last_n_checkpoints(files, n):
    """Return the names of the ``n`` highest-numbered checkpoints, highest first."""
    ranked = sorted(parse_checkpoints(files), reverse=True)
    return [name for _, name in ranked[:n]]


def every_n_checkpoints(files, n):
    """Return every ``n``-th checkpoint name, counting back from the highest-numbered one.

    The result is in ascending order.
    """
    ascending = sorted(parse_checkpoints(files))
    return [name for _, name in sorted(ascending[::-n])]


def main():
    """Interactively delete checkpoint files below the given root directories.

    Files are classified as preserve, preserve-after-dereferencing-symlink, or
    delete; the plan is printed and executed only after the user answers "y".
    """
    parser = argparse.ArgumentParser(
        description=(
            "Recursively delete checkpoint files from `root_dir`, "
            "but preserve checkpoint_best.pt and checkpoint_last.pt"
        )
    )
    parser.add_argument("root_dirs", nargs="*")
    parser.add_argument(
        "--save-last", type=int, default=0, help="number of last checkpoints to save"
    )
    parser.add_argument(
        "--save-every", type=int, default=0, help="interval of checkpoints to save"
    )
    parser.add_argument(
        "--preserve-test",
        action="store_true",
        help="preserve checkpoints in dirs that start with test_ prefix (default: delete them)",
    )
    parser.add_argument(
        "--delete-best", action="store_true", help="delete checkpoint_best.pt"
    )
    parser.add_argument(
        "--delete-last", action="store_true", help="delete checkpoint_last.pt"
    )
    parser.add_argument(
        "--no-dereference", action="store_true", help="don't dereference symlinks"
    )
    args = parser.parse_args()

    files_to_desymlink = []
    files_to_preserve = []
    files_to_delete = []
    for root_dir in args.root_dirs:
        for root, _subdirs, files in os.walk(root_dir):
            to_save = []
            if args.save_last > 0:
                to_save = last_n_checkpoints(files, args.save_last)
            if args.save_every > 0:
                to_save += every_n_checkpoints(files, args.save_every)

            # checkpoints under test_* directories are only kept on request
            dir_is_eligible = args.preserve_test or not os.path.basename(
                root
            ).startswith("test_")

            for file in files:
                if not pt_regexp.fullmatch(file):
                    continue
                full_path = os.path.join(root, file)
                wanted = (
                    (file == "checkpoint_last.pt" and not args.delete_last)
                    or (file == "checkpoint_best.pt" and not args.delete_best)
                    or file in to_save
                )
                if not (dir_is_eligible and wanted):
                    files_to_delete.append(full_path)
                elif os.path.islink(full_path) and not args.no_dereference:
                    files_to_desymlink.append(full_path)
                else:
                    files_to_preserve.append(full_path)

    if not files_to_desymlink and not files_to_delete:
        print("Nothing to do.")
        sys.exit(0)

    files_to_desymlink.sort()
    files_to_preserve.sort()
    files_to_delete.sort()

    print("Operations to perform (in order):")
    for file in files_to_desymlink:
        print(" - preserve (and dereference symlink): " + file)
    for file in files_to_preserve:
        print(" - preserve: " + file)
    for file in files_to_delete:
        print(" - delete: " + file)

    # keep asking until the answer is y (proceed) or n (quit)
    answer = ""
    while answer != "y":
        answer = input("Continue? (Y/N): ").strip().lower()
        if answer == "n":
            sys.exit(0)

    print("Executing...")
    for file in files_to_desymlink:
        # replace the symlink with a copy of the file it points to
        realpath = os.path.realpath(file)
        print("rm " + file)
        os.remove(file)
        print("cp {} {}".format(realpath, file))
        shutil.copyfile(realpath, file)
    for file in files_to_delete:
        print("rm " + file)
        os.remove(file)
def main():
    """Split the input file into ``--num-shards`` files, keeping documents whole.

    Documents (runs of lines separated by an empty line) are assigned to
    ``<input>.shard<i>`` round-robin. Within a shard, documents are separated
    by a single empty line. Empty documents are skipped.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("--num-shards", type=int)
    args = parser.parse_args()

    # explicit check instead of an assert, which is stripped under ``python -O``
    if args.num_shards is None or args.num_shards <= 1:
        parser.error("--num-shards must be an integer greater than 1")

    with open(args.input, "r", encoding="utf-8") as h:
        with contextlib.ExitStack() as stack:
            outputs = [
                stack.enter_context(
                    open(args.input + ".shard" + str(i), "w", encoding="utf-8")
                )
                for i in range(args.num_shards)
            ]

            doc = []
            first_doc = [True] * args.num_shards
            num_docs = 0

            def flush_doc():
                """Write the pending document to the next shard, if it has any lines."""
                nonlocal num_docs
                if not doc:
                    # A trailing empty line (or repeated empty lines) used to
                    # emit an empty document, leaving stray blank lines in
                    # the shards.
                    return
                i = num_docs % args.num_shards
                if not first_doc[i]:
                    outputs[i].write("\n")
                first_doc[i] = False
                outputs[i].writelines(doc)
                doc.clear()
                num_docs += 1

            for line in h:
                if line.strip() == "":  # empty line indicates new document
                    flush_doc()
                else:
                    doc.append(line)
            flush_doc()
def main():
    """Randomly split the input into a sample of ``-k`` items and a remainder.

    Items are documents (runs of lines separated by an empty line) or, with
    ``--lines``, individual non-empty lines. The sample is drawn by reservoir
    sampling and written to ``sample_output``; every other item is written to
    ``remainder_output``. Empty items are skipped.

    Raises:
        ValueError: if the input holds fewer than ``k`` items.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input")
    parser.add_argument("sample_output", help="train output file")
    parser.add_argument("remainder_output", help="valid output file")
    # k is the size of the sample written to sample_output (the help text
    # used to call it the remainder size)
    parser.add_argument(
        "-k",
        type=int,
        help="sample size (number of docs, or lines with --lines, written to sample_output)",
    )
    parser.add_argument(
        "--lines", action="store_true", help="split lines instead of docs"
    )
    args = parser.parse_args()

    # explicit checks instead of asserts, which are stripped under ``python -O``
    if args.k is None or args.k < 0:
        parser.error("-k is required and must be non-negative")

    sample = []
    remainder = []
    num_docs = 0

    def update_sample(doc):
        """Offer ``doc`` to the reservoir and clear it."""
        nonlocal num_docs
        if not doc:
            # Empty lines used to create empty items that took up sample slots.
            return
        if len(sample) < args.k:
            sample.append(doc.copy())
        else:
            # reservoir sampling: replace a random slot with probability
            # k / (num_docs + 1)
            j = random.randrange(num_docs + 1)
            if j < args.k:
                remainder.append(sample[j])
                sample[j] = doc.copy()
            else:
                remainder.append(doc.copy())
        num_docs += 1
        doc.clear()

    with open(args.input, "r", encoding="utf-8") as h:
        doc = []
        for i, line in enumerate(h):
            if line.strip() == "":  # empty line indicates new document
                update_sample(doc)
            else:
                doc.append(line)
                if args.lines:
                    update_sample(doc)
            if i % 1000000 == 0:
                print(i, file=sys.stderr, end="", flush=True)
            elif i % 100000 == 0:
                print(".", file=sys.stderr, end="", flush=True)
        if len(doc) > 0:
            update_sample(doc)
    print(file=sys.stderr, flush=True)

    if len(sample) != args.k:
        raise ValueError(
            "requested a sample of {} but the input only has {} items".format(
                args.k, len(sample)
            )
        )

    def write_items(path, items):
        # documents are separated by an empty line; lines are written back to back
        with open(path, "w", encoding="utf-8") as out:
            for index, item in enumerate(items):
                if index > 0 and not args.lines:
                    out.write("\n")
                out.writelines(item)

    write_items(args.sample_output, sample)
    write_items(args.remainder_output, remainder)
def main():
    """Decode a file of sentencepiece pieces or ids and print one decoded line per input line."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model", required=True, help="sentencepiece model to use for decoding"
    )
    parser.add_argument("--input", required=True, help="input file to decode")
    parser.add_argument("--input_format", choices=["piece", "id"], default="piece")
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)

    # pick the per-line conversion once, based on the input format
    if args.input_format == "piece":

        def to_text(fields):
            return "".join(sp.DecodePieces(fields))

    elif args.input_format == "id":

        def to_text(fields):
            # remap reference-side <unk> (represented as <<unk>>) to 0
            ids = [0 if tok == "<<unk>>" else int(tok) for tok in fields]
            return "".join(sp.DecodeIds(ids))

    else:
        raise NotImplementedError

    with open(args.input, "r", encoding="utf-8") as h:
        for line in h:
            print(to_text(line.rstrip().split()))
def main():
    """Encode parallel text files with a SentencePiece model.

    Reads the ``--inputs`` files in lockstep (``-`` means stdin), encodes each
    line as pieces or ids, and writes the encoded line to the matching
    ``--outputs`` file (``-`` means stdout). A tuple of parallel lines is
    dropped as a whole when any of its lines is empty or falls outside the
    optional ``--min-len`` / ``--max-len`` token bounds. Progress and the
    final counters are reported on stderr.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model", required=True, help="sentencepiece model to use for encoding"
    )
    cli.add_argument(
        "--inputs", nargs="+", default=["-"], help="input files to filter/encode"
    )
    cli.add_argument(
        "--outputs", nargs="+", default=["-"], help="path to save encoded outputs"
    )
    cli.add_argument("--output_format", choices=["piece", "id"], default="piece")
    cli.add_argument(
        "--min-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with fewer than N tokens",
    )
    cli.add_argument(
        "--max-len",
        type=int,
        metavar="N",
        help="filter sentence pairs with more than N tokens",
    )
    args = cli.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    processor = spm.SentencePieceProcessor()
    processor.Load(args.model)

    if args.output_format == "piece":

        def encode(text):
            return processor.EncodeAsPieces(text)

    elif args.output_format == "id":

        def encode(text):
            return [str(token_id) for token_id in processor.EncodeAsIds(text)]

    else:
        raise NotImplementedError

    shortest, longest = args.min_len, args.max_len

    def valid(tokens):
        # With neither bound set both checks are skipped and every line passes.
        if shortest is not None and len(tokens) < shortest:
            return False
        if longest is not None and len(tokens) > longest:
            return False
        return True

    with contextlib.ExitStack() as stack:
        # Open all inputs first, then all outputs; "-" maps to the std streams,
        # which are not registered with the exit stack.
        inputs = []
        for path in args.inputs:
            if path != "-":
                inputs.append(
                    stack.enter_context(open(path, "r", encoding="utf-8"))
                )
            else:
                inputs.append(sys.stdin)
        outputs = []
        for path in args.outputs:
            if path != "-":
                outputs.append(
                    stack.enter_context(open(path, "w", encoding="utf-8"))
                )
            else:
                outputs.append(sys.stdout)

        num_empty = 0
        num_filtered = 0

        def encode_line(line):
            """Return the encoded tokens, or None when the line is dropped."""
            nonlocal num_empty, num_filtered
            text = line.strip()
            if len(text) == 0:
                num_empty += 1
                return None
            tokens = encode(text)
            if not valid(tokens):
                num_filtered += 1
                return None
            return tokens

        for i, lines in enumerate(zip(*inputs), start=1):
            enc_lines = [encode_line(line) for line in lines]
            if all(enc_line is not None for enc_line in enc_lines):
                for enc_line, output_h in zip(enc_lines, outputs):
                    print(" ".join(enc_line), file=output_h)
            if i % 10000 == 0:
                print("processed {} lines".format(i), file=sys.stderr)

        print("skipped {} empty lines".format(num_empty), file=sys.stderr)
        print("filtered {} lines".format(num_filtered), file=sys.stderr)
def write_version_py():
    """Generate ``fairseq/version.py`` from ``fairseq/version.txt``.

    Both paths are resolved relative to the current working directory.

    Returns:
        The version string read from ``version.txt``, stripped of surrounding
        whitespace.
    """
    package_dir = "fairseq"
    txt_path = os.path.join(package_dir, "version.txt")
    py_path = os.path.join(package_dir, "version.py")

    with open(txt_path) as src:
        version = src.read().strip()

    # write version info to fairseq/version.py
    with open(py_path, "w") as dst:
        dst.write('__version__ = "{}"\n'.format(version))

    return version
"fairseq/clib/libbleu/module.cpp", ], extra_compile_args=extra_compile_args, ), NumpyExtension( "fairseq.data.data_utils_fast", sources=["fairseq/data/data_utils_fast.pyx"], language="c++", extra_compile_args=extra_compile_args, ), NumpyExtension( "fairseq.data.token_block_utils_fast", sources=["fairseq/data/token_block_utils_fast.pyx"], language="c++", extra_compile_args=extra_compile_args, ), ] extensions.extend( [ cpp_extension.CppExtension( "fairseq.libbase", sources=[ "fairseq/clib/libbase/balanced_assignment.cpp", ], ), cpp_extension.CppExtension( "fairseq.libnat", sources=[ "fairseq/clib/libnat/edit_dist.cpp", ], ), cpp_extension.CppExtension( "alignment_train_cpu_binding", sources=[ "examples/operators/alignment_train_cpu.cpp", ], ), ] ) if "CUDA_HOME" in os.environ: extensions.extend( [ cpp_extension.CppExtension( "fairseq.libnat_cuda", sources=[ "fairseq/clib/libnat_cuda/edit_dist.cu", "fairseq/clib/libnat_cuda/binding.cpp", ], ), cpp_extension.CppExtension( "fairseq.ngram_repeat_block_cuda", sources=[ "fairseq/clib/cuda/ngram_repeat_block_cuda.cpp", "fairseq/clib/cuda/ngram_repeat_block_cuda_kernel.cu", ], ), cpp_extension.CppExtension( "alignment_train_cuda_binding", sources=[ "examples/operators/alignment_train_kernel.cu", "examples/operators/alignment_train_cuda.cpp", ], ), ] ) cmdclass = {"build_ext": cpp_extension.BuildExtension} if "READTHEDOCS" in os.environ: # don't build extensions when generating docs extensions = [] if "build_ext" in cmdclass: del cmdclass["build_ext"] # use CPU build of PyTorch dependency_links = [ "https://download.pytorch.org/whl/cpu/torch-1.7.0%2Bcpu-cp36-cp36m-linux_x86_64.whl" ] else: dependency_links = [] if "clean" in sys.argv[1:]: # Source: https://bit.ly/2NLVsgE print("deleting Cython files...") subprocess.run( ["rm -f fairseq/*.so fairseq/**/*.so fairseq/*.pyd fairseq/**/*.pyd"], shell=True, ) extra_packages = [] if os.path.exists(os.path.join("fairseq", "model_parallel", "megatron", "mpu")): 
def get_files(path, relative_to="fairseq"):
    """List every file below *path*, expressed relative to *relative_to*.

    The tree is walked with symlinked directories followed. Files whose name
    ends in ``.pyc`` are left out.

    Args:
        path: directory to walk.
        relative_to: directory the returned paths are made relative to.

    Returns:
        A list of relative file paths in ``os.walk`` order.
    """
    collected = []
    for dirpath, _subdirs, filenames in os.walk(path, followlinks=True):
        rel_dir = os.path.relpath(dirpath, relative_to)
        collected.extend(
            os.path.join(rel_dir, name)
            for name in filenames
            if not name.endswith(".pyc")
        )
    return collected
package_data accepts them fairseq_examples = os.path.join("fairseq", "examples") if "build_ext" not in sys.argv[1:] and not os.path.exists(fairseq_examples): os.symlink(os.path.join("..", "examples"), fairseq_examples) package_data = { "fairseq": ( get_files(fairseq_examples) + get_files(os.path.join("fairseq", "config")) ) } do_setup(package_data) finally: if "build_ext" not in sys.argv[1:] and os.path.islink(fairseq_examples): os.unlink(fairseq_examples) ================================================ FILE: tests/__init__.py ================================================ ================================================ FILE: tests/distributed/__init__.py ================================================ ================================================ FILE: tests/distributed/test_bmuf.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
def single_gpu_training(cfg, args, rank, iterations, shared_results):
    """Train the toy model in one worker and publish its final parameters.

    Args:
        cfg: config object forwarded to ``setup_model_loss_criterion``.
        args: namespace providing ``input_size``, ``nb_classes`` and
            ``batch_size`` (plus whatever the setup helper reads).
        rank: index of this worker; also used as the CUDA device index when
            CUDA is available.
        iterations: number of training steps to run.
        shared_results: mapping that receives, under key ``rank``, all model
            parameters flattened and concatenated into one CPU tensor.
    """
    is_cuda = torch.cuda.is_available()
    if is_cuda:
        torch.cuda.set_device(rank)

    model, loss_fn, optimizer = setup_model_loss_criterion(cfg, args, rank, is_cuda)

    for _ in range(iterations):
        # The input batch dimension must match the target's; the original
        # hard-coded 1 here, which only worked because batch_size is 1.
        inputs = torch.randn(args.batch_size, args.input_size)
        target = torch.empty(args.batch_size, dtype=torch.long).random_(args.nb_classes)

        if is_cuda:
            inputs = inputs.cuda()
            target = target.cuda()
        train_step(inputs, target, model, loss_fn, optimizer)

    flat_params = [param.flatten().cpu().data for param in model.parameters()]
    # Keep the historical "empty list" result for a parameter-less model.
    shared_results[rank] = torch.cat(flat_params, 0) if flat_params else []
@unittest.skipIf(torch.cuda.device_count() < 2, "test requires 2 GPUs")
class TestBMUF(unittest.TestCase):
    """Checks that workers trained with the BMUF optimizer end up in sync.

    Each test spawns ``args.distributed_world_size`` processes running
    ``single_gpu_training`` and compares the flattened parameters that every
    rank reports back.
    """

    def bmuf_process(self, cfg, args, iterations):
        """Spawn the workers and return the rank -> parameters mapping."""
        shared = Manager().dict()
        worker = functools.partial(single_gpu_training, cfg, args)
        torch.multiprocessing.spawn(
            fn=worker,
            args=(iterations, shared),
            nprocs=args.distributed_world_size,
            join=True,
        )
        return shared

    def test_bmuf_sync(self):
        # Train model for 1 iteration and do bmuf sync without doing warmup
        cfg, args = setup_args()
        outcome = self.bmuf_process(cfg, args, 1)
        # Make sure params in both machines are same
        assert len(outcome) == 2
        self.assertAlmostEqual(outcome[0], outcome[1])

    def test_warmup_sync(self):
        # Train model for 20 iteration and do warmup sync without doing bmuf sync
        cfg, args = setup_args()
        args.warmup_iterations = 20
        cfg.bmuf.warmup_iterations = args.warmup_iterations
        outcome = self.bmuf_process(cfg, args, 20)
        # Make sure params in both machines are same
        assert len(outcome) == 2
        self.assertAlmostEqual(outcome[0], outcome[1])

    def test_warmup_sync_bmuf_sync(self):
        # Train model for 25 iteration and do warmup sync after 20 iteration
        # and bmuf sync after 25 iteration
        cfg, args = setup_args()
        args.warmup_iterations = 20
        args.global_sync_iter = 5
        cfg.bmuf.warmup_iterations = args.warmup_iterations
        cfg.bmuf.global_sync_iter = args.global_sync_iter
        outcome = self.bmuf_process(cfg, args, 25)
        # Make sure params in both machines are same
        assert len(outcome) == 2
        self.assertAlmostEqual(outcome[0], outcome[1])

    def test_single_gpu_bmuf(self):
        # Single-worker run: 20 iterations, of which 5 are warmup
        cfg, args = setup_args()
        args.distributed_world_size = 1
        args.warmup_iterations = 5
        cfg.distributed_training.distributed_world_size = args.distributed_world_size
        cfg.bmuf.distributed_world_size = args.distributed_world_size
        cfg.bmuf.warmup_iterations = args.warmup_iterations
        outcome = self.bmuf_process(cfg, args, 20)
        assert len(outcome) == 1

    def assertAlmostEqual(self, t1, t2):
        """Assert two tensors have equal size and differ by less than 1e-4.

        Note: this replaces ``unittest.TestCase.assertAlmostEqual`` with a
        tensor-specific, two-argument version.
        """
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        largest_gap = (t1 - t2).abs().max()
        self.assertLess(largest_gap, 1e-4)
class TestDistributedTimeoutWrapper(unittest.TestCase):
    """Exercises ``DistributedTimeoutWrapper`` around a deliberately slow module.

    Every test wraps a ``ModuleWithDelay`` with a given timeout and
    ``signal.SIGINT``, runs one forward pass and then stops the timeout.
    """

    def setUp(self):
        # Silence log output for the duration of each test.
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def _run_wrapped(self, delay, timeout):
        """Run one forward pass through a wrapped module, then stop the timeout.

        ``stop_timeout`` is called in a ``finally`` block so that it also runs
        when the forward pass raises (the original called it only on the
        success path, making the call unreachable in the "killed" scenario).
        """
        module = DistributedTimeoutWrapper(
            ModuleWithDelay(delay), timeout, signal.SIGINT
        )
        try:
            module(torch.rand(5))
        finally:
            module.stop_timeout()

    def test_no_timeout(self):
        self._run_wrapped(delay=1, timeout=0)

    def test_timeout_safe(self):
        # The forward pass (1s) finishes well within the 10s timeout.
        self._run_wrapped(delay=1, timeout=10)

    def test_timeout_killed(self):
        # The forward pass (5s) outlasts the 1s timeout; SIGINT is expected to
        # surface as KeyboardInterrupt in this process.
        with self.assertRaises(KeyboardInterrupt):
            self._run_wrapped(delay=5, timeout=1)
class TestModuleProxyWrapper(unittest.TestCase):
    """Tests for ``ModuleProxyWrapper`` wrapped around a DDP-like module."""

    def _get_module(self):
        """Return ``(proxy, inner_model)`` for a fresh ``Model``."""
        module = Model()
        wrapped_module = MockDDPWrapper(module)
        wrapped_module = ModuleProxyWrapper(wrapped_module)
        return wrapped_module, module

    def test_getattr_forwarding(self):
        wrapped_module, module = self._get_module()
        assert module.xyz == "hello"
        assert module.get_xyz() == "hello"
        assert wrapped_module.xyz == "hello"

        # Setting an attribute on the proxy must not leak into the inner model.
        wrapped_module.xyz = "world"
        assert wrapped_module.xyz == "world"
        assert module.get_xyz() == "hello"

    def test_state_dict(self):
        wrapped_module, module = self._get_module()
        assert objects_are_equal(wrapped_module.state_dict(), module.state_dict())

    def test_load_state_dict(self):
        wrapped_module, module = self._get_module()
        wrapped_module.load_state_dict(module.state_dict())
        batch = torch.rand(4, 5)
        # assert_close replaces assert_allclose, deprecated since torch 1.12.
        torch.testing.assert_close(wrapped_module(batch), module(batch))

    def test_forward(self):
        wrapped_module, module = self._get_module()
        batch = torch.rand(4, 5)
        torch.testing.assert_close(wrapped_module(batch), module(batch))
class DistributedTest(unittest.TestCase):
    """Base class that skips a test unless a multi-GPU setup is present."""

    def setUp(self):
        """Skip when CUDA is missing, on Windows, or with fewer than two GPUs.

        The conditions are evaluated lazily and in order; the first one that
        holds determines the skip message.
        """
        skip_rules = (
            (
                lambda: not torch.cuda.is_available(),
                "CUDA not available, skipping test",
            ),
            (
                lambda: sys.platform == "win32",
                "NCCL doesn't support Windows, skipping test",
            ),
            (
                lambda: torch.cuda.device_count() < 2,
                "distributed tests require 2+ GPUs, skipping",
            ),
        )
        for should_skip, why in skip_rules:
            if should_skip():
                raise unittest.SkipTest(why)
def objects_are_equal(a, b) -> bool:
    """Recursively compare two (possibly nested) objects for equality.

    Dicts, lists, tuples and sets are compared element-wise; tensors must
    agree in size, dtype, device and values; anything else falls back to
    ``==``. Objects of different types are never equal.

    Sets are compared independently of iteration order: every element of
    ``a`` must match a distinct element of ``b``. Pairing elements by
    iteration order is unreliable because tensors hash by identity.
    """
    if type(a) is not type(b):
        return False
    if isinstance(a, dict):
        if set(a.keys()) != set(b.keys()):
            return False
        return all(objects_are_equal(a[k], b[k]) for k in a)
    if isinstance(a, set):
        if len(a) != len(b):
            return False
        unmatched = list(b)
        for x in a:
            for idx, y in enumerate(unmatched):
                if objects_are_equal(x, y):
                    del unmatched[idx]
                    break
            else:
                # No remaining element of b matches x.
                return False
        return True
    if isinstance(a, (list, tuple)):
        if len(a) != len(b):
            return False
        return all(objects_are_equal(x, y) for x, y in zip(a, b))
    if torch.is_tensor(a):
        # bool(...) honours the annotated return type; without it the final
        # operand (a 0-dim tensor) would be returned.
        return bool(
            a.size() == b.size()
            and a.dtype == b.dtype
            and a.device == b.device
            and torch.all(a == b)
        )
    return a == b
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestMultiGPU(unittest.TestCase):
    """Helpers for checking that resumed multi-GPU LM training matches a full run."""

    @staticmethod
    def parse_logs(logfile):
        """Return the JSON objects found in *logfile*, one per line.

        Lines that are not valid JSON are skipped. The file is opened in a
        ``with`` block so the handle is closed deterministically.
        """
        logs = []
        with open(logfile, "r") as f:
            for ln in f:
                try:
                    logs.append(json.loads(ln))
                except json.JSONDecodeError:
                    continue
        return logs

    @property
    def world_size(self):
        """Number of visible CUDA devices."""
        return torch.cuda.device_count()

    def train_flags(self, mu):
        """Return the base training flags for a run capped at *mu* updates."""
        return [
            "--memory-efficient-fp16",
            "--update-freq",
            "1",
            "--seed",
            "1",
            "--log-format",
            "json",
            "--max-update",
            str(mu),
            "--tokens-per-sample",
            "20",
            "--batch-size",
            "2",
            "--share-decoder-input-output-embed",
            "--optimizer",
            "adam",
            "--max-valid-steps",
            "1",
            "--pad-to-fixed-length",
            "--sample-break-mode",
            "none",
        ]

    def _test_resume_multilingual_training(
        self, extra_clargs, arch="transformer_lm_gpt2_tiny"
    ):
        """Train for ``mu`` updates, resume from a mid-run checkpoint, compare.

        The first run saves a checkpoint every ``save_interval`` updates. The
        second run restores ``checkpoint_1_{save_interval}.pt`` and must end
        with the same final training statistics as the first.
        """
        languages = ["en_XX", "fr_XX", "zh_CN"]
        save_interval = 5
        mu = 10
        flags = (
            self.train_flags(mu)
            + ["--save-interval-updates", str(save_interval), "--log-interval", "1"]
            + extra_clargs
        )
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fp16") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(
                    data_dir,
                    num_examples=int(
                        mu * 20 * self.world_size * 1.5
                    ),  # make sure enough data for max updates
                    languages=languages,
                )
                preprocess_lm_data(data_dir, languages)
                train_language_model(
                    data_dir,
                    arch,
                    flags + ["--log-file", log],
                    task="multilingual_language_modeling",
                    world_size=self.world_size,
                )
                log2 = os.path.join(data_dir, "resume.log")
                ckpt_name = f"checkpoint_1_{save_interval}.pt"
                restore_file = os.path.join(data_dir, ckpt_name)
                train_language_model(
                    data_dir,
                    arch,
                    flags
                    + ["--log-file", log2, "--restore-file", restore_file, "--no-save"],
                    task="multilingual_language_modeling",
                    world_size=self.world_size,
                )

                l1 = self.parse_logs(log)
                assert (
                    int(l1[-1]["train_num_updates"]) == mu
                ), f"The first run did not complete {mu} updates. Add more data"
                l2 = self.parse_logs(log2)

                if int(l2[0]["num_updates"]) != save_interval + 1:
                    all_ckpt_files = [
                        x for x in os.listdir(data_dir) if x.endswith(".pt")
                    ]
                    import shutil

                    # Keep the failing run's artefacts around for inspection.
                    shutil.move(data_dir, "last_failed_resume")
                    raise AssertionError(
                        f"Likely failed to load {ckpt_name}. {all_ckpt_files} \n LOGS: {l1} \n\n {l2}. "
                    )
                for k in [
                    "train_loss",
                    "train_num_updates",
                    "train_ppl",
                    "train_gnorm",
                ]:
                    from_scratch, resumed = float(l1[-1][k]), float(l2[-1][k])
                    # This fails without rounding!
                    assert (
                        from_scratch == resumed
                    ), f"difference at {k} {from_scratch} != {resumed}"
create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, arch, flags + ["--log-file", log], world_size=world_size, ) log2 = os.path.join(data_dir, "resume.log") restore_file = os.path.join(data_dir, "checkpoint_1_2.pt") train_translation_model( data_dir, arch, flags + ["--log-file", log2, "--restore-file", restore_file], world_size=world_size, ) l1 = self.parse_logs(log) l2 = self.parse_logs(log2) assert int(l2[0]["num_updates"]) == 3, f"{l1}\n\n {l2}" for k in [ "train_loss", "train_num_updates", "train_ppl", "train_gnorm", ]: from_scratch, resumed = l1[-1][k], l2[-1][k] assert ( from_scratch == resumed ), f"difference at {k} {from_scratch} != {resumed}" def test_memory_efficient_fp16(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_memory_efficient_fp16") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "fconv_iwslt_de_en", ["--memory-efficient-fp16"] ) generate_main(data_dir) def test_transformer_fp16(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "transformer_iwslt_de_en", [ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "64", "--decoder-embed-dim", "64", "--fp16", ], run_validation=True, ) generate_main(data_dir) @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_amp(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_amp") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model(data_dir, "fconv_iwslt_de_en", ["--amp"]) generate_main(data_dir) @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_transformer_amp(self): with contextlib.redirect_stdout(StringIO()): with 
tempfile.TemporaryDirectory("test_transformer") as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                train_translation_model(
                    data_dir,
                    "transformer_iwslt_de_en",
                    [
                        "--encoder-layers",
                        "2",
                        "--decoder-layers",
                        "2",
                        "--encoder-embed-dim",
                        "64",
                        "--decoder-embed-dim",
                        "64",
                        "--amp",
                    ],
                    run_validation=True,
                )
                generate_main(data_dir)

    @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
    def test_levenshtein_transformer(self):
        """Train a Levenshtein transformer on dummy data, then generate twice.

        Generation is run once with the single checkpoint and once with the
        same checkpoint listed twice (joined with ``os.pathsep``) to exercise
        the ensemble code path.
        """
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory(
                "test_levenshtein_transformer"
            ) as data_dir:
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir, ["--joined-dictionary"])
                train_translation_model(
                    data_dir,
                    "levenshtein_transformer",
                    [
                        "--apply-bert-init",
                        "--early-exit",
                        "6,6,6",
                        "--criterion",
                        "nat_loss",
                    ],
                    task="translation_lev",
                )
                gen_config = [
                    "--task",
                    "translation_lev",
                    "--iter-decode-max-iter",
                    "9",
                    "--iter-decode-eos-penalty",
                    "0",
                    "--print-step",
                ]
                # non-ensemble generation
                generate_main(data_dir, gen_config)
                # ensemble generation
                generate_main(
                    data_dir,
                    gen_config,
                    path=os.pathsep.join(
                        [
                            os.path.join(data_dir, "checkpoint_last.pt"),
                            os.path.join(data_dir, "checkpoint_last.pt"),
                        ]
                    ),
                )

    def test_fsdp_checkpoint_generate(self):
        """Train with ``--ddp-backend fully_sharded`` and generate from the result.

        Uses at most two GPUs and checks that the requested log file was
        actually written.
        """
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                world_size = min(torch.cuda.device_count(), 2)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    ["--log-file", log, "--ddp-backend", "fully_sharded"],
                    world_size=world_size,
                )
                generate_main(data_dir)
                assert os.path.exists(log)

    def test_fsdp_sharded_checkpoint_generate(self):
        """Same as the FSDP test above, but with ``--use-sharded-state``.

        Generation is told how many checkpoint shards to expect via
        ``--checkpoint-shard-count``, set to the training world size.
        """
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_fsdp_sharded") as data_dir:
                log = os.path.join(data_dir, "train.log")
                create_dummy_data(data_dir)
                preprocess_translation_data(data_dir)
                world_size = min(torch.cuda.device_count(), 2)
                train_translation_model(
                    data_dir,
                    "fconv_iwslt_de_en",
                    [
                        "--log-file",
                        log,
                        "--ddp-backend",
                        "fully_sharded",
                        "--use-sharded-state",
                    ],
                    world_size=world_size,
                )
                generate_main(data_dir, ["--checkpoint-shard-count", str(world_size)])
                assert os.path.exists(log)


def _quantize_language_model(data_dir, arch, extra_flags=None, run_validation=False):
    """Run three consecutive language-model training jobs on ``data_dir``.

    1. A plain one-epoch training run that saves checkpoints to ``data_dir``.
    2. A short run (3 updates) with scalar quantization noise enabled.
    3. A short run (6 updates) restored from ``checkpoint_last.pt`` with the
       quantization config that sits next to this file.

    Args:
        data_dir: directory holding the preprocessed data; also used as the
            checkpoint directory for the first two runs.
        arch: value passed to ``--arch``.
        extra_flags: optional extra command-line flags appended to every run.
        run_validation: accepted but not used by this function.
    """
    train_parser = options.get_training_parser()
    train_args = options.parse_args_and_arch(
        train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-epoch",
            "1",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )
    train.main(train_args)

    # try scalar quantization
    scalar_quant_train_parser = options.get_training_parser()
    scalar_quant_train_args = options.parse_args_and_arch(
        scalar_quant_train_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "500",
            "--tokens-per-sample",
            "500",
            "--save-dir",
            data_dir,
            "--max-update",
            "3",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--quant-noise-scalar",
            "0.5",
        ]
        + (extra_flags or []),
    )
    train.main(scalar_quant_train_args)

    # try iterative PQ quantization
    quantize_parser = options.get_training_parser()
    quantize_args = options.parse_args_and_arch(
        quantize_parser,
        [
            "--task",
            "language_modeling",
            data_dir,
            "--arch",
            arch,
            "--optimizer",
            "adam",
            "--lr",
            "0.0001",
            "--criterion",
            "adaptive_loss",
            "--adaptive-softmax-cutoff",
            "5,10,15",
            "--max-tokens",
            "50",
            "--tokens-per-sample",
            "50",
            "--max-update",
            "6",
            "--no-progress-bar",
            "--distributed-world-size",
            "1",
            "--ddp-backend",
            "no_c10d",
            "--num-workers",
            "0",
            "--restore-file",
            os.path.join(data_dir, "checkpoint_last.pt"),
            "--reset-optimizer",
            "--quantization-config-path",
            os.path.join(
                os.path.dirname(__file__), "transformer_quantization_config.yaml"
            ),
        ]
        + (extra_flags or []),
    )
    train.main(quantize_args)


# NOTE(review): ``torch.__version__[2]`` is a single character, so for any
# digit the comparison ``< 10`` is always true and this class is always
# skipped (e.g. "1.13.0"[2] == "1"). Confirm whether an unconditional skip is
# intended or whether the minor version was meant to be parsed.
@unittest.skipIf(
    int(torch.__version__[2]) < 10, reason="quantized kernels are only supported on CPU"
)
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestQuantization(unittest.TestCase):
    """End-to-end smoke test for scalar and iterative PQ quantization."""

    def setUp(self):
        # Silence all logging for the duration of each test.
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_quantization(self):
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_quantization") as data_dir:
                create_dummy_data(data_dir)
                preprocess_lm_data(data_dir)
                # tests both scalar and iterative PQ quantization
                _quantize_language_model(data_dir, "transformer_lm")


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestOptimizersGPU(unittest.TestCase):
    """Optimizer/FP16 interaction tests that need a GPU."""

    def setUp(self):
        # Silence all logging for the duration of each test.
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_flat_grads(self):
        """Adafactor with ``--fp16`` must fail unless grads are left unflattened."""
        with contextlib.redirect_stdout(StringIO()):
            with tempfile.TemporaryDirectory("test_flat_grads") as data_dir:
                # Use just a bit of data and tiny model to keep this test runtime reasonable
                create_dummy_data(data_dir, num_examples=10, maxlen=5)
                preprocess_translation_data(data_dir)
                with self.assertRaises(RuntimeError):
                    # adafactor isn't compatible with flat grads, which
                    # are used by default with --fp16
                    train_translation_model(
                        data_dir,
                        "lstm",
                        [
                            "--required-batch-size-multiple",
                            "1",
                            "--encoder-layers",
                            "1",
                            "--encoder-hidden-size",
                            "32",
                            "--decoder-layers",
                            "1",
                            "--optimizer",
                            "adafactor",
                            "--fp16",
                        ],
                    )
                # but it should pass once we set --fp16-no-flatten-grads
                train_translation_model(
                    data_dir,
                    "lstm",
                    [
                        "--required-batch-size-multiple",
                        "1",
                        "--encoder-layers",
                        "1",
                        "--encoder-hidden-size",
                        "32",
                        "--decoder-layers",
                        "1",
                        "--optimizer",
                        "adafactor",
                        "--fp16",
                        "--fp16-no-flatten-grads",
                    ],
                )


if __name__ == "__main__":
    unittest.main()

================================================
FILE: tests/gpu/test_ema_gpu.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import unittest
from copy import deepcopy
from dataclasses import dataclass
from typing import Optional

import torch
from fairseq.models.ema import EMA


class DummyModule(torch.nn.Module):
    def __init__(self) -> None:
        """Small two-layer linear module used as the model under test.

        Takes no arguments; builds a 32 -> 2 linear layer followed by a
        2 -> 2 linear layer.
        """
        super().__init__()
        self.layer = torch.nn.Linear(in_features=32, out_features=2)
        self.another_layer = torch.nn.Linear(in_features=2, out_features=2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply ``layer`` then ``another_layer`` to ``x``."""
        x = self.layer(x)
        return self.another_layer(x)


@dataclass
class EMAConfig(object):
    # Configuration object handed to ``EMA`` in the tests below.
    # ``ema_decay`` and ``ema_fp32`` are the only fields these tests read or
    # override directly; the rest are presumably consumed by ``EMA`` itself.
    ema_decay: float = 0.99
    ema_start_update: int = 0
    ema_fp32: bool = False
    ema_seed_model: Optional[str] = None
    ema_update_freq: int = 1


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestEMAGPU(unittest.TestCase):
    """GPU tests for ``fairseq.models.ema.EMA`` in fp32 and fp16 modes."""

    def assertTorchAllClose(self, x, y, atol=1e-8, rtol=1e-5, msg=None):
        """Assert ``|x - y| <= atol + rtol * |y|`` using tensor norms.

        Both tensors are cast to float first. The comparison is on the norm
        of the whole difference, not element-wise.
        """
        diff = x.float() - y.float()
        diff_norm = torch.norm(diff)
        other_norm = torch.norm(y.float())

        if msg is None:
            msg = "|input - other| > {} + {} * |other|".format(atol, rtol)

        self.assertLessEqual(
            diff_norm,
            atol + rtol * other_norm,
            msg=msg,
        )

    def test_ema(self):
        """One SGD step followed by one EMA step matches the decay formula."""
        model = DummyModule().cuda()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        # Snapshot of the parameters before the optimizer step.
        state = deepcopy(model.state_dict())
        config = EMAConfig()
        ema = EMA(model, config)

        # set decay
        ema._set_decay(config.ema_decay)
        self.assertEqual(ema.get_decay(), config.ema_decay)

        # get model
        self.assertEqual(ema.get_model(), ema.model)

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)

        # EMA step
        x = torch.randn(32).cuda()
        y = model(x)
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model)

        ema_state_dict = ema.get_model().state_dict()

        for key, param in model.state_dict().items():
            prev_param = state[key]
            ema_param = ema_state_dict[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue
            self.assertTorchAllClose(
                ema_param,
                config.ema_decay * prev_param + (1 - config.ema_decay) * param,
            )

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)

        # Load EMA into model
        model2 = DummyModule().cuda()
        ema.reverse(model2)

        for key, param in model2.state_dict().items():
            ema_param = ema_state_dict[key]
            self.assertTrue(torch.allclose(ema_param, param))

    def test_ema_fp32(self):
        """With ``ema_fp32=True`` the EMA of a half model is computed in fp32."""
        model = DummyModule().cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        state = deepcopy(model.state_dict())
        config = EMAConfig(ema_fp32=True)
        ema = EMA(model, config)

        x = torch.randn(32).cuda()
        y = model(x.half())
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model)

        for key, param in model.state_dict().items():
            prev_param = state[key]
            ema_param = ema.get_model().state_dict()[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue
            self.assertIn(key, ema.fp32_params)

            # EMA update is done in fp32, and hence the EMA param must be
            # closer to the EMA update done in fp32 than in fp16.
            self.assertLessEqual(
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param.float()
                        + (1 - config.ema_decay) * param.float()
                    )
                    .half()
                    .float()
                ),
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param + (1 - config.ema_decay) * param
                    ).float()
                ),
            )
            self.assertTorchAllClose(
                ema_param,
                (
                    config.ema_decay * prev_param.float()
                    + (1 - config.ema_decay) * param.float()
                ).half(),
            )

    def test_ema_fp16(self):
        """With ``ema_fp32=False`` the EMA of a half model is computed in fp16."""
        model = DummyModule().cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        state = deepcopy(model.state_dict())
        config = EMAConfig(ema_fp32=False)
        ema = EMA(model, config)

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)

        x = torch.randn(32).cuda()
        y = model(x.half())
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model)

        for key, param in model.state_dict().items():
            prev_param = state[key]
            ema_param = ema.get_model().state_dict()[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue

            # EMA update is done in fp16, and hence the EMA param must be
            # closer to the EMA update done in fp16 than in fp32.
            self.assertLessEqual(
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param + (1 - config.ema_decay) * param
                    ).float()
                ),
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param.float()
                        + (1 - config.ema_decay) * param.float()
                    )
                    .half()
                    .float()
                ),
            )
            self.assertTorchAllClose(
                ema_param,
                config.ema_decay * prev_param + (1 - config.ema_decay) * param,
            )

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)


if __name__ == "__main__":
    unittest.main()

================================================
FILE: tests/gpu/transformer_quantization_config.yaml
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# This file defines example configuration arguments for quantizing # a transformer model with product quantization n_centroids: Linear: key: in_features value: {"*": 8} Embedding: key: embedding_dim value: {"*": 8} block_sizes: Linear: key: fuzzy_name value: {fc: 8, attn: 4, emb: 4} Embedding: key: fuzzy_name value: {emb: 8} layers_to_quantize: - decoder\\.layers\\.\d+\\.fc[12] - decoder\\.embed_tokens\\.embeddings\\.[012]\\.[01] - decoder\\.layers\\.\d+\\.self_attn\\.(k_proj|v_proj|q_proj|out_proj) ================================================ FILE: tests/speech/__init__.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. from argparse import Namespace import os import re import unittest from pathlib import Path from tqdm import tqdm from typing import List, Dict, Optional import torch from fairseq.checkpoint_utils import load_model_ensemble_and_task from fairseq.scoring.wer import WerScorer from fairseq.scoring.bleu import SacrebleuScorer from fairseq import utils import zipfile S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq" class TestFairseqSpeech(unittest.TestCase): @classmethod def download(cls, base_url: str, out_root: Path, filename: str): url = f"{base_url}/{filename}" path = out_root / filename if not path.exists(): torch.hub.download_url_to_file(url, path.as_posix(), progress=True) return path def _set_up(self, dataset_id: str, s3_dir: str, data_filenames: List[str]): self.use_cuda = torch.cuda.is_available() self.root = Path.home() / ".cache" / "fairseq" / dataset_id self.root.mkdir(exist_ok=True, parents=True) os.chdir(self.root) self.base_url = ( s3_dir if re.search("^https:", s3_dir) else f"{S3_BASE_URL}/{s3_dir}" ) for filename in data_filenames: self.download(self.base_url, self.root, filename) def set_up_librispeech(self): self._set_up( "librispeech", 
"s2t/librispeech", [ "cfg_librispeech.yaml", "spm_librispeech_unigram10000.model", "spm_librispeech_unigram10000.txt", "librispeech_test-other.tsv", "librispeech_test-other.zip", ], ) def set_up_ljspeech(self): self._set_up( "ljspeech", "s2/ljspeech", [ "cfg_ljspeech_g2p.yaml", "ljspeech_g2p_gcmvn_stats.npz", "ljspeech_g2p.txt", "ljspeech_test.tsv", "ljspeech_test.zip", ], ) def set_up_sotasty_es_en(self): self._set_up( "sotasty_es_en", "s2t/big/es-en", [ "cfg_es_en.yaml", "spm_bpe32768_es_en.model", "spm_bpe32768_es_en.txt", "sotasty_es_en_test_ted.tsv", "sotasty_es_en_test_ted.zip", ], ) def set_up_mustc_de_fbank(self): self._set_up( "mustc_de_fbank", "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/must_c/en_de", [ "config.yaml", "spm.model", "dict.txt", "src_dict.txt", "tgt_dict.txt", "tst-COMMON.tsv", "tst-COMMON.zip", ], ) def download_and_load_checkpoint( self, checkpoint_filename: str, arg_overrides: Optional[Dict[str, str]] = None, strict: bool = True, ): path = self.download(self.base_url, self.root, checkpoint_filename) _arg_overrides = arg_overrides or {} _arg_overrides["data"] = self.root.as_posix() models, cfg, task = load_model_ensemble_and_task( [path.as_posix()], arg_overrides=_arg_overrides, strict=strict ) if self.use_cuda: for model in models: model.cuda() return models, cfg, task, self.build_generator(task, models, cfg) def build_generator( self, task, models, cfg, ): return task.build_generator(models, cfg) @classmethod def get_batch_iterator(cls, task, test_split, max_tokens, max_positions): task.load_dataset(test_split) return task.get_batch_iterator( dataset=task.dataset(test_split), max_tokens=max_tokens, max_positions=max_positions, num_workers=1, ).next_epoch_itr(shuffle=False) @classmethod def get_wer_scorer( cls, tokenizer="none", lowercase=False, remove_punct=False, char_level=False ): scorer_args = { "wer_tokenizer": tokenizer, "wer_lowercase": lowercase, "wer_remove_punct": remove_punct, "wer_char_level": char_level, } return 
WerScorer(Namespace(**scorer_args)) @classmethod def get_bleu_scorer(cls, tokenizer="13a", lowercase=False, char_level=False): scorer_args = { "sacrebleu_tokenizer": tokenizer, "sacrebleu_lowercase": lowercase, "sacrebleu_char_level": char_level, } return SacrebleuScorer(Namespace(**scorer_args)) @torch.no_grad() def base_test( self, ckpt_name, reference_score, score_delta=0.3, dataset="librispeech_test-other", max_tokens=65_536, max_positions=(4_096, 1_024), arg_overrides=None, strict=True, score_type="wer", ): models, _, task, generator = self.download_and_load_checkpoint( ckpt_name, arg_overrides=arg_overrides, strict=strict ) if not self.use_cuda: return batch_iterator = self.get_batch_iterator( task, dataset, max_tokens, max_positions ) if score_type == "bleu": scorer = self.get_bleu_scorer() elif score_type == "wer": scorer = self.get_wer_scorer() else: raise Exception(f"Unsupported score type {score_type}") progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator)) for batch_idx, sample in progress: sample = utils.move_to_cuda(sample) if self.use_cuda else sample hypo = task.inference_step(generator, models, sample) for i, sample_id in enumerate(sample["id"].tolist()): tgt_str, hypo_str = self.postprocess_tokens( task, sample["target"][i, :], hypo[i][0]["tokens"].int().cpu(), ) if batch_idx == 0 and i < 3: print(f"T-{sample_id} {tgt_str}") print(f"H-{sample_id} {hypo_str}") scorer.add_string(tgt_str, hypo_str) print(scorer.result_string() + f" (reference: {reference_score})") self.assertAlmostEqual(scorer.score(), reference_score, delta=score_delta) def postprocess_tokens(self, task, target, hypo_tokens): tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu() tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece") hypo_str = task.tgt_dict.string(hypo_tokens, "sentencepiece") return tgt_str, hypo_str def unzip_files(self, zip_file_name): zip_file_path = self.root / zip_file_name with zipfile.ZipFile(zip_file_path, "r") as 
zip_ref: zip_ref.extractall(self.root / zip_file_name.strip(".zip")) ================================================ FILE: tests/speech/test_convtransformer_simul_trans.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest from tests.speech import TestFairseqSpeech S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq/" class TestConvtransformerSimulTrans(TestFairseqSpeech): def setUp(self): self._set_up( "simul", "speech_tests/simul", ["config_gcmvn_specaug.yaml", "dict.txt", "dev.tsv"], ) def test_waitk_checkpoint(self): """Only test model loading since fairseq currently doesn't support inference of simultaneous models""" _, _, _, _ = self.download_and_load_checkpoint( "checkpoint_best.pt", arg_overrides={ "config_yaml": "config_gcmvn_specaug.yaml", "load_pretrained_encoder_from": None, }, ) return if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_dual_input_wav_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest from collections import namedtuple from pathlib import Path import torch from tqdm import tqdm import fairseq from fairseq import utils from fairseq.checkpoint_utils import load_model_ensemble_and_task from fairseq.scoring.bleu import SacrebleuScorer from fairseq.tasks import import_tasks from tests.speech import S3_BASE_URL, TestFairseqSpeech @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestLibrispeechDualInputWavTransformer(TestFairseqSpeech): def setUp(self): dataset_id = "librispeech_wvtrasnformer" base_url = "https://dl.fbaipublicfiles.com/joint_speech_text_4_s2t/acl2022/librispeech/finetuned" data_filenames = [ "checkpoint_ave_10.pt", "spm.model", "src_dict.txt", "tgt_dict.txt", "config.yaml", ] self._set_up( dataset_id, "s2t", [ "librispeech_flac_test-other.tsv", "librispeech_flac_test-other.zip", ], ) for filename in data_filenames: self.download(base_url, self.root, filename) def import_user_module(self): user_dir = ( Path(fairseq.__file__).parent.parent / "examples/speech_text_joint_to_text" ) Arg = namedtuple("Arg", ["user_dir"]) arg = Arg(user_dir.__str__()) utils.import_user_module(arg) @torch.no_grad() def test_librispeech_dualinput_wav_transformer_checkpoint(self): self.import_user_module() checkpoint_filename = "checkpoint_ave_10.pt" arg_overrides = { "config_yaml": "config.yaml", "load_pretrained_speech_text_encoder": "", "load_pretrained_speech_text_decoder": "", "beam": 10, "nbest": 1, "lenpen": 1.0, "load_speech_only": True, } self.base_test( checkpoint_filename, 4.6, dataset="librispeech_flac_test-other", max_tokens=800000, max_positions=(800000, 1024), arg_overrides=arg_overrides, ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_dualinput_s2t_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. 
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest from argparse import Namespace from collections import namedtuple from pathlib import Path import torch from tqdm import tqdm import fairseq from fairseq import utils from fairseq.checkpoint_utils import load_model_ensemble_and_task from fairseq.scoring.bleu import SacrebleuScorer from fairseq.tasks import import_tasks from tests.speech import TestFairseqSpeech @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestDualInputS2TTransformer(TestFairseqSpeech): def setUp(self): self.set_up_mustc_de_fbank() def import_user_module(self): user_dir = ( Path(fairseq.__file__).parent.parent / "examples/speech_text_joint_to_text" ) Arg = namedtuple("Arg", ["user_dir"]) arg = Arg(user_dir.__str__()) utils.import_user_module(arg) @torch.no_grad() def test_mustc_de_fbank_dualinput_s2t_transformer_checkpoint(self): self.import_user_module() checkpoint_filename = "checkpoint_ave_10.pt" path = self.download(self.base_url, self.root, checkpoint_filename) models, cfg, task = load_model_ensemble_and_task( [path.as_posix()], arg_overrides={ "data": self.root.as_posix(), "config_yaml": "config.yaml", "load_pretrain_speech_encoder": "", "load_pretrain_text_encoder_last": "", "load_pretrain_decoder": "", "beam": 10, "nbest": 1, "lenpen": 1.0, "load_speech_only": True, }, ) if self.use_cuda: for model in models: model.cuda() generator = task.build_generator(models, cfg) test_split = "tst-COMMON" task.load_dataset(test_split) batch_iterator = task.get_batch_iterator( dataset=task.dataset(test_split), max_tokens=250_000, max_positions=(10_000, 1_024), num_workers=1, ).next_epoch_itr(shuffle=False) tokenizer = task.build_tokenizer(cfg.tokenizer) bpe = task.build_bpe(cfg.bpe) def decode_fn(x): if bpe is not None: x = bpe.decode(x) if tokenizer is not None: x = tokenizer.decode(x) return x scorer_args = { "sacrebleu_tokenizer": 
"13a", "sacrebleu_lowercase": False, "sacrebleu_char_level": False, } scorer = SacrebleuScorer(Namespace(**scorer_args)) progress = tqdm(enumerate(batch_iterator), total=len(batch_iterator)) for batch_idx, sample in progress: sample = utils.move_to_cuda(sample) if self.use_cuda else sample hypo = task.inference_step(generator, models, sample) for i, sample_id in enumerate(sample["id"].tolist()): tgt_tokens = ( utils.strip_pad(sample["target"][i, :], task.tgt_dict.pad()) .int() .cpu() ) tgt_str = task.tgt_dict.string(tgt_tokens, "sentencepiece") hypo_str = task.tgt_dict.string( hypo[i][0]["tokens"].int().cpu(), "sentencepiece" ) if batch_idx == 0 and i < 3: print(f"T-{sample_id} {tgt_str}") print(f"D-{sample_id} {hypo_str}") scorer.add_string(tgt_str, hypo_str) reference_bleu = 27.3 result = scorer.result_string() print(result + f" (reference: {reference_bleu})") res_bleu = float(result.split()[2]) self.assertAlmostEqual(res_bleu, reference_bleu, delta=0.3) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_fastspeech2.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest import torch from tqdm import tqdm from fairseq import utils from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion from tests.speech import TestFairseqSpeech @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestFastSpeech2(TestFairseqSpeech): def setUp(self): self.set_up_ljspeech() @torch.no_grad() def test_ljspeech_fastspeech2_checkpoint(self): models, cfg, task, generator = self.download_and_load_checkpoint( "ljspeech_fastspeech2_g2p.pt", arg_overrides={ "config_yaml": "cfg_ljspeech_g2p.yaml", "vocoder": "griffin_lim", "fp16": False, }, ) batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 4_096) progress = tqdm(batch_iterator, total=len(batch_iterator)) mcd, n_samples = 0.0, 0 for sample in progress: sample = utils.move_to_cuda(sample) if self.use_cuda else sample hypos = generator.generate(models[0], sample, has_targ=True) rets = batch_mel_cepstral_distortion( [hypo["targ_waveform"] for hypo in hypos], [hypo["waveform"] for hypo in hypos], sr=task.sr, ) mcd += sum(d.item() for d, _ in rets) n_samples += len(sample["id"].tolist()) mcd = round(mcd / n_samples, 1) reference_mcd = 3.2 print(f"MCD: {mcd} (reference: {reference_mcd})") self.assertAlmostEqual(mcd, reference_mcd, delta=0.1) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_s2s_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest from tests.speech import TestFairseqSpeech from fairseq import utils S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq/" class TestS2STransformer(TestFairseqSpeech): def setUp(self): self._set_up( "s2s", "speech_tests/s2s", [ "dev_shuf200.tsv", "src_feat.zip", "config_specaug_lb.yaml", "vocoder", "vocoder_config.json", ], ) def test_s2s_transformer_checkpoint(self): self.base_test( ckpt_name="s2u_transformer_reduced_fisher.pt", reference_score=38.3, dataset="dev_shuf200", arg_overrides={ "config_yaml": "config_specaug_lb.yaml", "multitask_config_yaml": None, "target_is_code": True, "target_code_size": 100, "eval_inference": False, }, score_type="bleu", strict=False, ) def postprocess_tokens(self, task, target, hypo_tokens): tgt_tokens = utils.strip_pad(target, task.tgt_dict.pad()).int().cpu() tgt_str = task.tgt_dict.string(tgt_tokens) hypo_str = task.tgt_dict.string(hypo_tokens) return tgt_str, hypo_str if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_s2t_conformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest from tests.speech import TestFairseqSpeech class TestS2TConformer(TestFairseqSpeech): def setUp(self): self.set_up_librispeech() def test_librispeech_s2t_conformer_s_checkpoint(self): self.base_test( ckpt_name="librispeech_conformer_rel_pos_s.pt", reference_score=12, arg_overrides={"config_yaml": "cfg_librispeech.yaml"}, ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_s2t_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest from tests.speech import TestFairseqSpeech class TestS2TTransformer(TestFairseqSpeech): def setUp(self): self.set_up_librispeech() def test_librispeech_s2t_transformer_s_checkpoint(self): self.base_test( ckpt_name="librispeech_transformer_s.pt", reference_score=9, arg_overrides={"config_yaml": "cfg_librispeech.yaml"}, ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_tts_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest import torch from tqdm import tqdm from fairseq import utils from fairseq.tasks.text_to_speech import batch_mel_cepstral_distortion from tests.speech import TestFairseqSpeech @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestTTSTransformer(TestFairseqSpeech): def setUp(self): self.set_up_ljspeech() @torch.no_grad() def test_ljspeech_tts_transformer_checkpoint(self): models, cfg, task, generator = self.download_and_load_checkpoint( "ljspeech_transformer_g2p.pt", arg_overrides={ "config_yaml": "cfg_ljspeech_g2p.yaml", "vocoder": "griffin_lim", "fp16": False, }, ) batch_iterator = self.get_batch_iterator(task, "ljspeech_test", 65_536, 1024) progress = tqdm(batch_iterator, total=len(batch_iterator)) mcd, n_samples = 0.0, 0 for sample in progress: sample = utils.move_to_cuda(sample) if self.use_cuda else sample hypos = generator.generate(models[0], sample, has_targ=True) rets = batch_mel_cepstral_distortion( [hypo["targ_waveform"] for hypo in hypos], [hypo["waveform"] for hypo in hypos], sr=task.sr, ) mcd += sum(d.item() for d, _ in rets) n_samples += len(sample["id"].tolist()) mcd = round(mcd / n_samples, 1) reference_mcd = 3.3 print(f"MCD: {mcd} (reference: {reference_mcd})") self.assertAlmostEqual(mcd, reference_mcd, delta=0.1) if 
__name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_wav2vec2.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest import torch from tests.speech import TestFairseqSpeech from fairseq.data.data_utils import post_process from fairseq import utils from omegaconf import open_dict S3_BASE_URL = "https://dl.fbaipublicfiles.com/fairseq" @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") class TestWav2Vec2(TestFairseqSpeech): def setUp(self): self._set_up( "librispeech_w2v2", "conformer/wav2vec2/librispeech", [ "test_librispeech-other.ltr", "test_librispeech-other.tsv", "test_librispeech-other_small.ltr_100", "test_librispeech-other_small.tsv", "test-other.zip", "dict.ltr.txt", "dict.ltr_100.txt", ], ) self.unzip_files( "test-other.zip", ) def test_transformer_w2v2(self): self.base_test( ckpt_name="transformer_oss_small_100h.pt", reference_score=38, score_delta=1, dataset="test_librispeech-other", max_tokens=1000000, max_positions=(700000, 1000), arg_overrides={ "task": "audio_finetuning", "labels": "ltr", "nbest": 1, "tpu": False, }, strict=False, ) def test_conformer_w2v2(self): self.base_test( ckpt_name="conformer_LS_PT_LS_FT_rope.pt", reference_score=4.5, score_delta=1, dataset="test_librispeech-other_small", max_tokens=1000000, max_positions=(700000, 1000), arg_overrides={ "task": "audio_finetuning", "labels": "ltr_100", "nbest": 1, "tpu": False, }, strict=True, ) def build_generator(self, task, models, cfg): try: from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder except Exception: raise Exception("Cannot run this test without flashlight dependency") with open_dict(cfg): cfg.nbest = 1 return W2lViterbiDecoder(cfg, task.target_dictionary) def postprocess_tokens(self, task, target, 
hypo_tokens): tgt_tokens = utils.strip_pad(target, task.target_dictionary.pad()).int().cpu() tgt_str = task.target_dictionary.string(tgt_tokens) tgt_str = post_process(tgt_str, "letter") hypo_pieces = task.target_dictionary.string(hypo_tokens) hypo_str = post_process(hypo_pieces, "letter") return tgt_str, hypo_str if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech/test_xm_transformer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest from tests.speech import TestFairseqSpeech class TestXMTransformer(TestFairseqSpeech): def setUp(self): self.set_up_sotasty_es_en() # TODO: investigate increases BLEU score (30.42 -> 31.74) def test_sotasty_es_en_600m_checkpoint(self): self.base_test( ckpt_name="xm_transformer_600m_es_en_md.pt", reference_score=31.74, score_delta=0.2, max_tokens=3_000_000, max_positions=(1_000_000, 1_024), dataset="sotasty_es_en_test_ted", arg_overrides={"config_yaml": "cfg_es_en.yaml"}, score_type="bleu", ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech_recognition/__init__.py ================================================ ================================================ FILE: tests/speech_recognition/asr_test_base.py ================================================ #!/usr/bin/env python3 import argparse import os import unittest from inspect import currentframe, getframeinfo import numpy as np import torch from examples.speech_recognition.data.data_utils import lengths_to_encoder_padding_mask from fairseq.data import data_utils as fairseq_data_utils from fairseq.data.dictionary import Dictionary from fairseq.models import ( BaseFairseqModel, FairseqDecoder, FairseqEncoder, FairseqEncoderDecoderModel, FairseqEncoderModel, FairseqModel, ) 
from fairseq.tasks.fairseq_task import LegacyFairseqTask DEFAULT_TEST_VOCAB_SIZE = 100 # /////////////////////////////////////////////////////////////////////////// # utility function to setup dummy dict/task/input # /////////////////////////////////////////////////////////////////////////// def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE): dummy_dict = Dictionary() # add dummy symbol to satisfy vocab size for id, _ in enumerate(range(vocab_size)): dummy_dict.add_symbol("{}".format(id), 1000) return dummy_dict class DummyTask(LegacyFairseqTask): def __init__(self, args): super().__init__(args) self.dictionary = get_dummy_dictionary() if getattr(self.args, "ctc", False): self.dictionary.add_symbol("<ctc_blank>") self.tgt_dict = self.dictionary @property def target_dictionary(self): return self.dictionary def get_dummy_task_and_parser(): """ to build a fariseq model, we need some dummy parse and task. This function is used to create dummy task and parser to faciliate model/criterion test Note: we use FbSpeechRecognitionTask as the dummy task. 
You may want to use other task by providing another function """ parser = argparse.ArgumentParser( description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS ) DummyTask.add_args(parser) args = parser.parse_args([]) task = DummyTask.setup_task(args) return task, parser def get_dummy_input(T=100, D=80, B=5, K=100): forward_input = {} # T max sequence length # D feature vector dimension # B batch size # K target dimension size feature = torch.randn(B, T, D) # this (B, T, D) layout is just a convention, you can override it by # write your own _prepare_forward_input function src_lengths = torch.from_numpy( np.random.randint(low=1, high=T, size=B, dtype=np.int64) ) src_lengths[0] = T # make sure the maximum length matches prev_output_tokens = [] for b in range(B): token_length = np.random.randint(low=1, high=src_lengths[b].item() + 1) tokens = np.random.randint(low=0, high=K, size=token_length, dtype=np.int64) prev_output_tokens.append(torch.from_numpy(tokens)) prev_output_tokens = fairseq_data_utils.collate_tokens( prev_output_tokens, pad_idx=1, eos_idx=2, left_pad=False, move_eos_to_beginning=False, ) src_lengths, sorted_order = src_lengths.sort(descending=True) forward_input["src_tokens"] = feature.index_select(0, sorted_order) forward_input["src_lengths"] = src_lengths forward_input["prev_output_tokens"] = prev_output_tokens return forward_input def get_dummy_encoder_output(encoder_out_shape=(100, 80, 5)): """ This only provides an example to generate dummy encoder output """ (T, B, D) = encoder_out_shape encoder_out = {} encoder_out["encoder_out"] = torch.from_numpy( np.random.randn(*encoder_out_shape).astype(np.float32) ) seq_lengths = torch.from_numpy(np.random.randint(low=1, high=T, size=B)) # some dummy mask encoder_out["encoder_padding_mask"] = torch.arange(T).view(1, T).expand( B, -1 ) >= seq_lengths.view(B, 1).expand(-1, T) encoder_out["encoder_padding_mask"].t_() # encoer_padding_mask is (T, B) tensor, with (t, b)-th element indicate # whether 
encoder_out[t, b] is valid (=0) or not (=1) return encoder_out def _current_postion_info(): cf = currentframe() frameinfo = " (at {}:{})".format( os.path.basename(getframeinfo(cf).filename), cf.f_back.f_lineno ) return frameinfo def check_encoder_output(encoder_output, batch_size=None): """we expect encoder_output to be a dict with the following key/value pairs: - encoder_out: a Torch.Tensor - encoder_padding_mask: a binary Torch.Tensor """ if not isinstance(encoder_output, dict): msg = ( "FairseqEncoderModel.forward(...) must be a dict" + _current_postion_info() ) return False, msg if "encoder_out" not in encoder_output: msg = ( "FairseqEncoderModel.forward(...) must contain encoder_out" + _current_postion_info() ) return False, msg if "encoder_padding_mask" not in encoder_output: msg = ( "FairseqEncoderModel.forward(...) must contain encoder_padding_mask" + _current_postion_info() ) return False, msg if not isinstance(encoder_output["encoder_out"], torch.Tensor): msg = "encoder_out must be a torch.Tensor" + _current_postion_info() return False, msg if encoder_output["encoder_out"].dtype != torch.float32: msg = "encoder_out must have float32 dtype" + _current_postion_info() return False, msg mask = encoder_output["encoder_padding_mask"] if mask is not None: if not isinstance(mask, torch.Tensor): msg = ( "encoder_padding_mask must be a torch.Tensor" + _current_postion_info() ) return False, msg if mask.dtype != torch.uint8 and ( not hasattr(torch, "bool") or mask.dtype != torch.bool ): msg = ( "encoder_padding_mask must have dtype of uint8" + _current_postion_info() ) return False, msg if mask.dim() != 2: msg = ( "we expect encoder_padding_mask to be a 2-d tensor, in shape (T, B)" + _current_postion_info() ) return False, msg if batch_size is not None and mask.size(1) != batch_size: msg = ( "we expect encoder_padding_mask to be a 2-d tensor, with size(1)" + " being the batch size" + _current_postion_info() ) return False, msg return True, None def 
check_decoder_output(decoder_output): """we expect output from a decoder is a tuple with the following constraint: - the first element is a torch.Tensor - the second element can be anything (reserved for future use) """ if not isinstance(decoder_output, tuple): msg = "FariseqDecoder output must be a tuple" + _current_postion_info() return False, msg if len(decoder_output) != 2: msg = "FairseqDecoder output must be 2-elem tuple" + _current_postion_info() return False, msg if not isinstance(decoder_output[0], torch.Tensor): msg = ( "FariseqDecoder output[0] must be a torch.Tensor" + _current_postion_info() ) return False, msg return True, None # /////////////////////////////////////////////////////////////////////////// # Base Test class # /////////////////////////////////////////////////////////////////////////// class TestBaseFairseqModelBase(unittest.TestCase): """ This class is used to facilitate writing unittest for any class derived from `BaseFairseqModel`. """ @classmethod def setUpClass(cls): if cls is TestBaseFairseqModelBase: raise unittest.SkipTest("Skipping test case in base") super().setUpClass() def setUpModel(self, model): self.assertTrue(isinstance(model, BaseFairseqModel)) self.model = model def setupInput(self): pass def setUp(self): self.model = None self.forward_input = None pass class TestFairseqEncoderDecoderModelBase(TestBaseFairseqModelBase): """ base code to test FairseqEncoderDecoderModel (formally known as `FairseqModel`) must be derived from this base class """ @classmethod def setUpClass(cls): if cls is TestFairseqEncoderDecoderModelBase: raise unittest.SkipTest("Skipping test case in base") super().setUpClass() def setUpModel(self, model_cls, extra_args_setters=None): self.assertTrue( issubclass(model_cls, (FairseqEncoderDecoderModel, FairseqModel)), msg="This class only tests for FairseqModel subclasses", ) task, parser = get_dummy_task_and_parser() model_cls.add_args(parser) args = parser.parse_args([]) if extra_args_setters is not 
None: for args_setter in extra_args_setters: args_setter(args) model = model_cls.build_model(args, task) self.model = model def setUpInput(self, input=None): self.forward_input = get_dummy_input() if input is None else input def setUp(self): super().setUp() def test_forward(self): if self.model and self.forward_input: forward_output = self.model.forward(**self.forward_input) # for FairseqEncoderDecoderModel, forward returns a tuple of two # elements, the first one is a Torch.Tensor succ, msg = check_decoder_output(forward_output) if not succ: self.assertTrue(succ, msg=msg) self.forward_output = forward_output def test_get_normalized_probs(self): if self.model and self.forward_input: forward_output = self.model.forward(**self.forward_input) logprob = self.model.get_normalized_probs(forward_output, log_probs=True) prob = self.model.get_normalized_probs(forward_output, log_probs=False) # in order for different models/criterion to play with each other # we need to know whether the logprob or prob output is batch_first # or not. We assume an additional attribute will be attached to logprob # or prob. 
If you find your code failed here, simply override # FairseqModel.get_normalized_probs, see example at # https://fburl.com/batch_first_example self.assertTrue(hasattr(logprob, "batch_first")) self.assertTrue(hasattr(prob, "batch_first")) self.assertTrue(torch.is_tensor(logprob)) self.assertTrue(torch.is_tensor(prob)) class TestFairseqEncoderModelBase(TestBaseFairseqModelBase): """ base class to test FairseqEncoderModel """ @classmethod def setUpClass(cls): if cls is TestFairseqEncoderModelBase: raise unittest.SkipTest("Skipping test case in base") super().setUpClass() def setUpModel(self, model_cls, extra_args_setters=None): self.assertTrue( issubclass(model_cls, FairseqEncoderModel), msg="This class is only used for testing FairseqEncoderModel", ) task, parser = get_dummy_task_and_parser() model_cls.add_args(parser) args = parser.parse_args([]) if extra_args_setters is not None: for args_setter in extra_args_setters: args_setter(args) model = model_cls.build_model(args, task) self.model = model def setUpInput(self, input=None): self.forward_input = get_dummy_input() if input is None else input # get_dummy_input() is originally for s2s, here we delete extra dict # items, so it can be used for EncoderModel / Encoder as well self.forward_input.pop("prev_output_tokens", None) def setUp(self): super().setUp() def test_forward(self): if self.forward_input and self.model: bsz = self.forward_input["src_tokens"].size(0) forward_output = self.model.forward(**self.forward_input) # we expect forward_output to be a dict with the following # key/value pairs: # - encoder_out: a Torch.Tensor # - encoder_padding_mask: a binary Torch.Tensor succ, msg = check_encoder_output(forward_output, batch_size=bsz) if not succ: self.assertTrue(succ, msg=msg) self.forward_output = forward_output def test_get_normalized_probs(self): if self.model and self.forward_input: forward_output = self.model.forward(**self.forward_input) logprob = self.model.get_normalized_probs(forward_output, 
log_probs=True) prob = self.model.get_normalized_probs(forward_output, log_probs=False) # in order for different models/criterion to play with each other # we need to know whether the logprob or prob output is batch_first # or not. We assume an additional attribute will be attached to logprob # or prob. If you find your code failed here, simply override # FairseqModel.get_normalized_probs, see example at # https://fburl.com/batch_first_example self.assertTrue(hasattr(logprob, "batch_first")) self.assertTrue(hasattr(prob, "batch_first")) self.assertTrue(torch.is_tensor(logprob)) self.assertTrue(torch.is_tensor(prob)) class TestFairseqEncoderBase(unittest.TestCase): """ base class to test FairseqEncoder """ @classmethod def setUpClass(cls): if cls is TestFairseqEncoderBase: raise unittest.SkipTest("Skipping test case in base") super().setUpClass() def setUpEncoder(self, encoder): self.assertTrue( isinstance(encoder, FairseqEncoder), msg="This class is only used for test FairseqEncoder", ) self.encoder = encoder def setUpInput(self, input=None): self.forward_input = get_dummy_input() if input is None else input # get_dummy_input() is originally for s2s, here we delete extra dict # items, so it can be used for EncoderModel / Encoder as well self.forward_input.pop("prev_output_tokens", None) def setUp(self): self.encoder = None self.forward_input = None def test_forward(self): if self.encoder and self.forward_input: bsz = self.forward_input["src_tokens"].size(0) forward_output = self.encoder.forward(**self.forward_input) succ, msg = check_encoder_output(forward_output, batch_size=bsz) if not succ: self.assertTrue(succ, msg=msg) self.forward_output = forward_output class TestFairseqDecoderBase(unittest.TestCase): """ base class to test FairseqDecoder """ @classmethod def setUpClass(cls): if cls is TestFairseqDecoderBase: raise unittest.SkipTest("Skipping test case in base") super().setUpClass() def setUpDecoder(self, decoder): self.assertTrue( isinstance(decoder, 
FairseqDecoder), msg="This class is only used for test FairseqDecoder", ) self.decoder = decoder def setUpInput(self, input=None): self.forward_input = get_dummy_encoder_output() if input is None else input def setUpPrevOutputTokens(self, tokens=None): if tokens is None: self.encoder_input = get_dummy_input() self.prev_output_tokens = self.encoder_input["prev_output_tokens"] else: self.prev_output_tokens = tokens def setUp(self): self.decoder = None self.forward_input = None self.prev_output_tokens = None def test_forward(self): if ( self.decoder is not None and self.forward_input is not None and self.prev_output_tokens is not None ): forward_output = self.decoder.forward( prev_output_tokens=self.prev_output_tokens, encoder_out=self.forward_input, ) succ, msg = check_decoder_output(forward_output) if not succ: self.assertTrue(succ, msg=msg) self.forward_input = forward_output class DummyEncoderModel(FairseqEncoderModel): def __init__(self, encoder): super().__init__(encoder) @classmethod def build_model(cls, args, task): return cls(DummyEncoder()) def get_logits(self, net_output): # Inverse of sigmoid to use with BinaryCrossEntropyWithLogitsCriterion as # F.binary_cross_entropy_with_logits combines sigmoid and CE return torch.log( torch.div(net_output["encoder_out"], 1 - net_output["encoder_out"]) ) def get_normalized_probs(self, net_output, log_probs, sample=None): lprobs = super().get_normalized_probs(net_output, log_probs, sample=sample) lprobs.batch_first = True return lprobs class DummyEncoder(FairseqEncoder): def __init__(self): super().__init__(None) def forward(self, src_tokens, src_lengths): mask, max_len = lengths_to_encoder_padding_mask(src_lengths) return {"encoder_out": src_tokens, "encoder_padding_mask": mask} class CrossEntropyCriterionTestBase(unittest.TestCase): @classmethod def setUpClass(cls): if cls is CrossEntropyCriterionTestBase: raise unittest.SkipTest("Skipping base class test case") super().setUpClass() def setUpArgs(self): args = 
argparse.Namespace() args.sentence_avg = False args.threshold = 0.1 # to use with BinaryCrossEntropyWithLogitsCriterion return args def setUp(self): args = self.setUpArgs() self.model = DummyEncoderModel(encoder=DummyEncoder()) self.criterion = self.criterion_cls.build_criterion(args, task=DummyTask(args)) def get_src_tokens(self, correct_prediction, aggregate): """ correct_prediction: True if the net_output (src_tokens) should predict the correct target aggregate: True if the criterion expects net_output (src_tokens) aggregated across time axis """ predicted_idx = 0 if correct_prediction else 1 if aggregate: src_tokens = torch.zeros((2, 2), dtype=torch.float) for b in range(2): src_tokens[b][predicted_idx] = 1.0 else: src_tokens = torch.zeros((2, 10, 2), dtype=torch.float) for b in range(2): for t in range(10): src_tokens[b][t][predicted_idx] = 1.0 return src_tokens def get_target(self, soft_target): if soft_target: target = torch.zeros((2, 2), dtype=torch.float) for b in range(2): target[b][0] = 1.0 else: target = torch.zeros((2, 10), dtype=torch.long) return target def get_test_sample(self, correct, soft_target, aggregate): src_tokens = self.get_src_tokens(correct, aggregate) target = self.get_target(soft_target) L = src_tokens.size(1) return { "net_input": {"src_tokens": src_tokens, "src_lengths": torch.tensor([L])}, "target": target, "ntokens": src_tokens.size(0) * src_tokens.size(1), } ================================================ FILE: tests/speech_recognition/test_collaters.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest import numpy as np import torch from examples.speech_recognition.data.collaters import Seq2SeqCollater class TestSeq2SeqCollator(unittest.TestCase): def test_collate(self): eos_idx = 1 pad_idx = 0 collater = Seq2SeqCollater( feature_index=0, label_index=1, pad_index=pad_idx, eos_index=eos_idx ) # 2 frames in the first sample and 3 frames in the second one frames1 = np.array([[7, 8], [9, 10]]) frames2 = np.array([[1, 2], [3, 4], [5, 6]]) target1 = np.array([4, 2, 3, eos_idx]) target2 = np.array([3, 2, eos_idx]) sample1 = {"id": 0, "data": [frames1, target1]} sample2 = {"id": 1, "data": [frames2, target2]} batch = collater.collate([sample1, sample2]) # collate sort inputs by frame's length before creating the batch self.assertTensorEqual(batch["id"], torch.tensor([1, 0])) self.assertEqual(batch["ntokens"], 7) self.assertTensorEqual( batch["net_input"]["src_tokens"], torch.tensor( [[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [pad_idx, pad_idx]]] ), ) self.assertTensorEqual( batch["net_input"]["prev_output_tokens"], torch.tensor([[eos_idx, 3, 2, pad_idx], [eos_idx, 4, 2, 3]]), ) self.assertTensorEqual(batch["net_input"]["src_lengths"], torch.tensor([3, 2])) self.assertTensorEqual( batch["target"], torch.tensor([[3, 2, eos_idx, pad_idx], [4, 2, 3, eos_idx]]), ) self.assertEqual(batch["nsentences"], 2) def assertTensorEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertEqual(t1.ne(t2).long().sum(), 0) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/speech_recognition/test_cross_entropy.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
from examples.speech_recognition.criterions.cross_entropy_acc import ( CrossEntropyWithAccCriterion, ) from .asr_test_base import CrossEntropyCriterionTestBase class CrossEntropyWithAccCriterionTest(CrossEntropyCriterionTestBase): def setUp(self): self.criterion_cls = CrossEntropyWithAccCriterion super().setUp() def test_cross_entropy_all_correct(self): sample = self.get_test_sample(correct=True, soft_target=False, aggregate=False) loss, sample_size, logging_output = self.criterion( self.model, sample, "sum", log_probs=True ) assert logging_output["correct"] == 20 assert logging_output["total"] == 20 assert logging_output["sample_size"] == 20 assert logging_output["ntokens"] == 20 def test_cross_entropy_all_wrong(self): sample = self.get_test_sample(correct=False, soft_target=False, aggregate=False) loss, sample_size, logging_output = self.criterion( self.model, sample, "sum", log_probs=True ) assert logging_output["correct"] == 0 assert logging_output["total"] == 20 assert logging_output["sample_size"] == 20 assert logging_output["ntokens"] == 20 ================================================ FILE: tests/speech_recognition/test_data_utils.py ================================================ #!/usr/bin/env python3 # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest import torch from examples.speech_recognition.data import data_utils class DataUtilsTest(unittest.TestCase): def test_normalization(self): sample_len1 = torch.tensor( [ [ -0.7661, -1.3889, -2.0972, -0.9134, -0.7071, -0.9765, -0.8700, -0.8283, 0.7512, 1.3211, 2.1532, 2.1174, 1.2800, 1.2633, 1.6147, 1.6322, 2.0723, 3.1522, 3.2852, 2.2309, 2.5569, 2.2183, 2.2862, 1.5886, 0.8773, 0.8725, 1.2662, 0.9899, 1.1069, 1.3926, 1.2795, 1.1199, 1.1477, 1.2687, 1.3843, 1.1903, 0.8355, 1.1367, 1.2639, 1.4707, ] ] ) out = data_utils.apply_mv_norm(sample_len1) assert not torch.isnan(out).any() assert (out == sample_len1).all() ================================================ FILE: tests/speech_recognition/test_vggtransformer.py ================================================ #!/usr/bin/env python3 # import models/encoder/decoder to be tested from examples.speech_recognition.models.vggtransformer import ( TransformerDecoder, VGGTransformerEncoder, VGGTransformerModel, vggtransformer_1, vggtransformer_2, vggtransformer_base, ) # import base test class from .asr_test_base import ( DEFAULT_TEST_VOCAB_SIZE, TestFairseqDecoderBase, TestFairseqEncoderBase, TestFairseqEncoderDecoderModelBase, get_dummy_dictionary, get_dummy_encoder_output, get_dummy_input, ) class VGGTransformerModelTest_mid(TestFairseqEncoderDecoderModelBase): def setUp(self): def override_config(args): """ vggtrasformer_1 use 14 layers of transformer, for testing purpose, it is too expensive. For fast turn-around test, reduce the number of layers to 3. 
""" args.transformer_enc_config = ( "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" ) super().setUp() extra_args_setter = [vggtransformer_1, override_config] self.setUpModel(VGGTransformerModel, extra_args_setter) self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) class VGGTransformerModelTest_big(TestFairseqEncoderDecoderModelBase): def setUp(self): def override_config(args): """ vggtrasformer_2 use 16 layers of transformer, for testing purpose, it is too expensive. For fast turn-around test, reduce the number of layers to 3. """ args.transformer_enc_config = ( "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 3" ) super().setUp() extra_args_setter = [vggtransformer_2, override_config] self.setUpModel(VGGTransformerModel, extra_args_setter) self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) class VGGTransformerModelTest_base(TestFairseqEncoderDecoderModelBase): def setUp(self): def override_config(args): """ vggtrasformer_base use 12 layers of transformer, for testing purpose, it is too expensive. For fast turn-around test, reduce the number of layers to 3. """ args.transformer_enc_config = ( "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 3" ) super().setUp() extra_args_setter = [vggtransformer_base, override_config] self.setUpModel(VGGTransformerModel, extra_args_setter) self.setUpInput(get_dummy_input(T=50, D=80, B=5, K=DEFAULT_TEST_VOCAB_SIZE)) class VGGTransformerEncoderTest(TestFairseqEncoderBase): def setUp(self): super().setUp() self.setUpInput(get_dummy_input(T=50, D=80, B=5)) def test_forward(self): print("1. test standard vggtransformer") self.setUpEncoder(VGGTransformerEncoder(input_feat_per_channel=80)) super().test_forward() print("2. test vggtransformer with limited right context") self.setUpEncoder( VGGTransformerEncoder( input_feat_per_channel=80, transformer_context=(-1, 5) ) ) super().test_forward() print("3. 
test vggtransformer with limited left context") self.setUpEncoder( VGGTransformerEncoder( input_feat_per_channel=80, transformer_context=(5, -1) ) ) super().test_forward() print("4. test vggtransformer with limited right context and sampling") self.setUpEncoder( VGGTransformerEncoder( input_feat_per_channel=80, transformer_context=(-1, 12), transformer_sampling=(2, 2), ) ) super().test_forward() print("5. test vggtransformer with windowed context and sampling") self.setUpEncoder( VGGTransformerEncoder( input_feat_per_channel=80, transformer_context=(12, 12), transformer_sampling=(2, 2), ) ) class TransformerDecoderTest(TestFairseqDecoderBase): def setUp(self): super().setUp() dict = get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE) decoder = TransformerDecoder(dict) dummy_encoder_output = get_dummy_encoder_output(encoder_out_shape=(50, 5, 256)) self.setUpDecoder(decoder) self.setUpInput(dummy_encoder_output) self.setUpPrevOutputTokens() ================================================ FILE: tests/tasks/test_denoising.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import os import unittest from tempfile import TemporaryDirectory from fairseq import options from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.tasks.denoising import DenoisingTask from tests.utils import build_vocab, make_data class TestDenoising(unittest.TestCase): def test_denoising(self): with TemporaryDirectory() as dirname: # prep input file raw_file = os.path.join(dirname, "raw") data = make_data(out_file=raw_file) vocab = build_vocab(data) # binarize binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) split = "train" bin_file = os.path.join(dirname, split) dataset_impl = "mmap" FileBinarizer.multiprocess_dataset( input_file=raw_file, binarizer=binarizer, dataset_impl=dataset_impl, vocab_size=len(vocab), output_prefix=bin_file, ) # setup task train_args = options.parse_args_and_arch( options.get_training_parser(), [ "--task", "denoising", "--arch", "bart_base", "--seed", "42", "--mask-length", "word", "--permute-sentences", "1", "--rotate", "0", "--replace-length", "-1", "--mask", "0.2", dirname, ], ) cfg = convert_namespace_to_omegaconf(train_args) task = DenoisingTask(cfg.task, binarizer.dict) # load datasets original_dataset = task._load_dataset_split(bin_file, 1, False) task.load_dataset(split) masked_dataset = task.dataset(split) iterator = task.get_batch_iterator( dataset=masked_dataset, max_tokens=65_536, max_positions=4_096, ).next_epoch_itr(shuffle=False) mask_index = task.source_dictionary.index("<mask>") for batch in iterator: for sample in range(len(batch)): net_input = batch["net_input"] masked_src_tokens = net_input["src_tokens"][sample] masked_src_length = net_input["src_lengths"][sample] masked_tgt_tokens = batch["target"][sample] sample_id = batch["id"][sample] original_tokens = original_dataset[sample_id] original_tokens = original_tokens.masked_select( masked_src_tokens[:masked_src_length] == mask_index ) masked_tokens = 
masked_tgt_tokens.masked_select( masked_src_tokens == mask_index ) assert masked_tokens.equal(original_tokens) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/tasks/test_masked_lm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import unittest from tempfile import TemporaryDirectory from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer from fairseq.tasks.masked_lm import MaskedLMConfig, MaskedLMTask from tests.utils import build_vocab, make_data class TestMaskedLM(unittest.TestCase): def test_masks_tokens(self): with TemporaryDirectory() as dirname: # prep input file raw_file = os.path.join(dirname, "raw") data = make_data(out_file=raw_file) vocab = build_vocab(data) # binarize binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) split = "train" bin_file = os.path.join(dirname, split) FileBinarizer.multiprocess_dataset( input_file=raw_file, binarizer=binarizer, dataset_impl="mmap", vocab_size=len(vocab), output_prefix=bin_file, ) # setup task cfg = MaskedLMConfig( data=dirname, seed=42, mask_prob=0.5, # increasing the odds of masking random_token_prob=0, # avoiding random tokens for exact match leave_unmasked_prob=0, # always masking for exact match ) task = MaskedLMTask(cfg, binarizer.dict) original_dataset = task._load_dataset_split(bin_file, 1, False) # load datasets task.load_dataset(split) masked_dataset = task.dataset(split) mask_index = task.source_dictionary.index("<mask>") iterator = task.get_batch_iterator( dataset=masked_dataset, max_tokens=65_536, max_positions=4_096, ).next_epoch_itr(shuffle=False) for batch in iterator: for sample in range(len(batch)): net_input = batch["net_input"] masked_src_tokens = net_input["src_tokens"][sample] masked_src_length = 
net_input["src_lengths"][sample] masked_tgt_tokens = batch["target"][sample] sample_id = batch["id"][sample] original_tokens = original_dataset[sample_id] original_tokens = original_tokens.masked_select( masked_src_tokens[:masked_src_length] == mask_index ) masked_tokens = masked_tgt_tokens.masked_select( masked_tgt_tokens != task.source_dictionary.pad() ) assert masked_tokens.equal(original_tokens) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/tasks/test_multilingual_denoising.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import unittest from tempfile import TemporaryDirectory from fairseq import options from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.tasks.multilingual_denoising import MultilingualDenoisingTask from tests.utils import build_vocab, make_data class TestMultilingualDenoising(unittest.TestCase): def test_multilingual_denoising(self): with TemporaryDirectory() as dirname: # prep input file lang_dir = os.path.join(dirname, "en") os.mkdir(lang_dir) raw_file = os.path.join(lang_dir, "raw") data = make_data(out_file=raw_file) vocab = build_vocab(data) # binarize binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) split = "train" bin_file = os.path.join(lang_dir, split) dataset_impl = "mmap" FileBinarizer.multiprocess_dataset( input_file=raw_file, binarizer=binarizer, dataset_impl=dataset_impl, vocab_size=len(vocab), output_prefix=bin_file, ) # setup task train_args = options.parse_args_and_arch( options.get_training_parser(), [ "--task", "multilingual_denoising", "--arch", "bart_base", "--seed", "42", "--mask-length", "word", "--permute-sentences", "1", "--rotate", "0", "--replace-length", "-1", 
"--mask", "0.2", dirname, ], ) cfg = convert_namespace_to_omegaconf(train_args) task = MultilingualDenoisingTask(cfg.task, binarizer.dict) # load datasets original_dataset = task._load_dataset_split(bin_file, 1, False) task.load_dataset(split) masked_dataset = task.dataset(split) iterator = task.get_batch_iterator( dataset=masked_dataset, max_tokens=65_536, max_positions=4_096, ).next_epoch_itr(shuffle=False) mask_index = task.source_dictionary.index("<mask>") for batch in iterator: for sample in range(len(batch)): net_input = batch["net_input"] masked_src_tokens = net_input["src_tokens"][sample] masked_src_length = net_input["src_lengths"][sample] masked_tgt_tokens = batch["target"][sample] sample_id = batch["id"][sample] original_tokens = original_dataset[sample_id] original_tokens = original_tokens.masked_select( masked_src_tokens[:masked_src_length] == mask_index ) masked_tokens = masked_tgt_tokens.masked_select( masked_src_tokens == mask_index ) assert masked_tokens.equal(original_tokens) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/tasks/test_span_masked_lm.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import os import unittest from tempfile import TemporaryDirectory from fairseq import options from fairseq.binarizer import FileBinarizer, VocabularyDatasetBinarizer from fairseq.dataclass.utils import convert_namespace_to_omegaconf from fairseq.tasks.span_masked_lm import SpanMaskedLMTask from tests.utils import build_vocab, make_data class TestSpanMaskedLM(unittest.TestCase): def test_masks_token_spans(self): with TemporaryDirectory() as dirname: # prep input file raw_file = os.path.join(dirname, "raw") data = make_data(out_file=raw_file) vocab = build_vocab(data) # binarize binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False) split = "train" bin_file = os.path.join(dirname, split) dataset_impl = "mmap" FileBinarizer.multiprocess_dataset( input_file=raw_file, binarizer=binarizer, dataset_impl=dataset_impl, vocab_size=len(vocab), output_prefix=bin_file, ) # adding sentinel tokens for i in range(100): vocab.add_symbol(f"<extra_id_{i}>") # setup task train_args = options.parse_args_and_arch( options.get_training_parser(), [ "--task", "span_masked_lm", "--arch", "bart_base", "--seed", "42", dirname, ], ) cfg = convert_namespace_to_omegaconf(train_args) task = SpanMaskedLMTask(cfg.task, binarizer.dict) # load datasets original_dataset = task._load_dataset_split(bin_file, 1, False) task.load_dataset(split) masked_dataset = task.dataset(split) iterator = task.get_batch_iterator( dataset=masked_dataset, max_tokens=65_536, max_positions=4_096, ).next_epoch_itr(shuffle=False) num_tokens = len(vocab) for batch in iterator: for sample in range(len(batch)): sample_id = batch["id"][sample] original_tokens = original_dataset[sample_id] masked_src_tokens = batch["net_input"]["src_tokens"][sample] masked_src_length = batch["net_input"]["src_lengths"][sample] masked_tgt_tokens = batch["target"][sample] original_offset = 0 masked_tgt_offset = 0 extra_id_token = len(vocab) - 1 for masked_src_token in masked_src_tokens[:masked_src_length]: if masked_src_token == 
extra_id_token: assert ( masked_src_token == masked_tgt_tokens[masked_tgt_offset] ) extra_id_token -= 1 masked_tgt_offset += 1 while ( original_offset < len(original_tokens) and masked_tgt_tokens[masked_tgt_offset] != extra_id_token ): assert ( original_tokens[original_offset] == masked_tgt_tokens[masked_tgt_offset] ) original_offset += 1 masked_tgt_offset += 1 else: assert original_tokens[original_offset] == masked_src_token original_offset += 1 if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_activation_checkpointing.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest import torch import torch.nn as nn from fairseq.modules.checkpoint_activations import checkpoint_wrapper from torch.utils.checkpoint import checkpoint class Model(nn.Module): def __init__( self, use_pytorch_checkpoint=False, use_fairseq_checkpoint=False, **kwargs ): super().__init__() torch.manual_seed(0) self.use_pytorch_checkpoint = use_pytorch_checkpoint self.ffn = nn.Sequential( nn.Linear(32, 128), # add a Dropout layer to test RNG save/restore nn.Dropout(p=0.5), nn.Linear(128, 32), ) if use_fairseq_checkpoint: self.ffn = checkpoint_wrapper(self.ffn, **kwargs) self.out = nn.Linear(32, 1) def forward(self, x): if self.use_pytorch_checkpoint: x = checkpoint(self.ffn, x) else: x = self.ffn(x) return self.out(x) class TestActivationCheckpointing(unittest.TestCase): def _test_checkpoint_wrapper(self, device, log_memory_usage=False): def get_loss_and_gnorm(model): torch.manual_seed(1) input = torch.rand(2, 16, 32).requires_grad_(True).to(device) model.zero_grad() loss = model(input).sum() loss.backward() gnorm = torch.norm( torch.stack([torch.norm(p.grad.detach()) for p in model.parameters()]) ) return {"loss": loss, "gnorm": gnorm} model = 
Model().to(device) no_cpt = get_loss_and_gnorm(model) model = Model(use_pytorch_checkpoint=True).to(device) pyt_cpt = get_loss_and_gnorm(model) torch.testing.assert_allclose(no_cpt["loss"], pyt_cpt["loss"]) torch.testing.assert_allclose(no_cpt["gnorm"], pyt_cpt["gnorm"]) model = Model(use_fairseq_checkpoint=True).to(device) fairseq_cpt = get_loss_and_gnorm(model) torch.testing.assert_allclose(no_cpt["loss"], fairseq_cpt["loss"]) torch.testing.assert_allclose(no_cpt["gnorm"], fairseq_cpt["gnorm"]) model = Model(use_fairseq_checkpoint=True, offload_to_cpu=True).to(device) fairseq_cpt_offload = get_loss_and_gnorm(model) torch.testing.assert_allclose(no_cpt["loss"], fairseq_cpt_offload["loss"]) torch.testing.assert_allclose(no_cpt["gnorm"], fairseq_cpt_offload["gnorm"]) def test_checkpoint_wrapper_cpu(self): self._test_checkpoint_wrapper(device=torch.device("cpu")) @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_checkpoint_wrapper_cuda(self): self._test_checkpoint_wrapper(device=torch.device("cuda")) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_amp_optimizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse
import copy
import unittest

import torch
from torch.cuda.amp import GradScaler, autocast
from fairseq.optim import build_optimizer


@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestGradientScalingAMP(unittest.TestCase):
    """One optimizer step under autocast + GradScaler on a 1x1 linear model."""

    def setUp(self):
        """Build a model whose loss and gradients are known in closed form.

        With x = 2, weight = 3 and bias = 5 the prediction is 11; the target
        is offset from it by ``self.error`` so the L1 loss equals 1.0.
        """
        self.x = torch.tensor([2.0]).cuda().half()
        weight = 3.0
        bias = 5.0
        self.error = 1.0
        self.target = torch.tensor([self.x * weight + bias + self.error]).cuda()
        self.loss_fn = torch.nn.L1Loss()

        self.model = torch.nn.Linear(1, 1)
        self.model.weight.data = torch.tensor([[weight]])
        self.model.bias.data = torch.tensor([bias])
        self.model.cuda()
        self.params = list(self.model.parameters())

        # Optimizer configuration handed to fairseq's build_optimizer.
        self.namespace_dls = argparse.Namespace(
            optimizer="adam",
            lr=[0.1],
            adam_betas="(0.9, 0.999)",
            adam_eps=1e-8,
            weight_decay=0.0,
            threshold_loss_scale=1,
            min_loss_scale=1e-4,
        )
        # Scale starts at 1; run_iter asserts it reads 2.0 after one update().
        self.scaler = GradScaler(
            init_scale=1,
            growth_interval=1,
        )

    def run_iter(self, model, params, optimizer):
        """Run one scaled forward/backward/step and check every intermediate value.

        ``params`` is accepted but not read here.
        """
        optimizer.zero_grad()
        with autocast():
            y = model(self.x)
            loss = self.loss_fn(y, self.target)
        self.scaler.scale(loss).backward()
        self.assertEqual(loss, torch.tensor(1.0, device="cuda:0", dtype=torch.float16))

        self.scaler.unscale_(optimizer)
        # presumably a max-norm of 0 disables clipping and only returns the
        # norm -- confirm against the fairseq optimizer implementation.
        # Expected value: gradients are (-2, -1), so the norm is sqrt(5) ~= 2.2361.
        grad_norm = optimizer.clip_grad_norm(0)
        self.assertAlmostEqual(grad_norm.item(), 2.2361, 4)

        self.scaler.step(optimizer)
        self.scaler.update()
        # Both parameters are expected to have moved by the learning rate (0.1).
        self.assertEqual(
            model.weight,
            torch.tensor([[3.1]], device="cuda:0", requires_grad=True),
        )
        self.assertEqual(
            model.bias,
            torch.tensor([5.1], device="cuda:0", requires_grad=True),
        )
        self.assertEqual(self.scaler.get_scale(), 2.0)

    def test_automatic_mixed_precision(self):
        # Work on a copy so the model built in setUp stays untouched.
        model = copy.deepcopy(self.model)
        params = list(model.parameters())
        optimizer = build_optimizer(self.namespace_dls, params)

        self.run_iter(model, params, optimizer)

================================================
FILE: tests/test_average_checkpoints.py
================================================
# Copyright (c) Facebook, Inc. and its affiliates.
# # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import collections import os import shutil import tempfile import unittest import numpy as np import torch from scripts.average_checkpoints import average_checkpoints from torch import nn class ModelWithSharedParameter(nn.Module): def __init__(self): super(ModelWithSharedParameter, self).__init__() self.embedding = nn.Embedding(1000, 200) self.FC1 = nn.Linear(200, 200) self.FC2 = nn.Linear(200, 200) # tie weight in FC2 to FC1 self.FC2.weight = nn.Parameter(self.FC1.weight) self.FC2.bias = nn.Parameter(self.FC1.bias) self.relu = nn.ReLU() def forward(self, input): return self.FC2(self.ReLU(self.FC1(input))) + self.FC1(input) class TestAverageCheckpoints(unittest.TestCase): def test_average_checkpoints(self): params_0 = collections.OrderedDict( [ ("a", torch.DoubleTensor([100.0])), ("b", torch.FloatTensor([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])), ("c", torch.IntTensor([7, 8, 9])), ] ) params_1 = collections.OrderedDict( [ ("a", torch.DoubleTensor([1.0])), ("b", torch.FloatTensor([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]])), ("c", torch.IntTensor([2, 2, 2])), ] ) params_avg = collections.OrderedDict( [ ("a", torch.DoubleTensor([50.5])), ("b", torch.FloatTensor([[1.0, 1.5, 2.0], [2.5, 3.0, 3.5]])), # We expect truncation for integer division ("c", torch.IntTensor([4, 5, 5])), ] ) fd_0, path_0 = tempfile.mkstemp() fd_1, path_1 = tempfile.mkstemp() torch.save(collections.OrderedDict([("model", params_0)]), path_0) torch.save(collections.OrderedDict([("model", params_1)]), path_1) output = average_checkpoints([path_0, path_1])["model"] os.close(fd_0) os.remove(path_0) os.close(fd_1) os.remove(path_1) for (k_expected, v_expected), (k_out, v_out) in zip( params_avg.items(), output.items() ): self.assertEqual( k_expected, k_out, "Key mismatch - expected {} but found {}. 
" "(Expected list of keys: {} vs actual list of keys: {})".format( k_expected, k_out, params_avg.keys(), output.keys() ), ) np.testing.assert_allclose( v_expected.numpy(), v_out.numpy(), err_msg="Tensor value mismatch for key {}".format(k_expected), ) def test_average_checkpoints_with_shared_parameters(self): def _construct_model_with_shared_parameters(path, value): m = ModelWithSharedParameter() nn.init.constant_(m.FC1.weight, value) torch.save({"model": m.state_dict()}, path) return m tmpdir = tempfile.mkdtemp() paths = [] path = os.path.join(tmpdir, "m1.pt") m1 = _construct_model_with_shared_parameters(path, 1.0) paths.append(path) path = os.path.join(tmpdir, "m2.pt") m2 = _construct_model_with_shared_parameters(path, 2.0) paths.append(path) path = os.path.join(tmpdir, "m3.pt") m3 = _construct_model_with_shared_parameters(path, 3.0) paths.append(path) new_model = average_checkpoints(paths) self.assertTrue( torch.equal( new_model["model"]["embedding.weight"], (m1.embedding.weight + m2.embedding.weight + m3.embedding.weight) / 3.0, ) ) self.assertTrue( torch.equal( new_model["model"]["FC1.weight"], (m1.FC1.weight + m2.FC1.weight + m3.FC1.weight) / 3.0, ) ) self.assertTrue( torch.equal( new_model["model"]["FC2.weight"], (m1.FC2.weight + m2.FC2.weight + m3.FC2.weight) / 3.0, ) ) shutil.rmtree(tmpdir) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_backtranslation_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import unittest import tests.utils as test_utils import torch from fairseq.data import ( BacktranslationDataset, LanguagePairDataset, TransformEosDataset, ) from fairseq.sequence_generator import SequenceGenerator class TestBacktranslationDataset(unittest.TestCase): def setUp(self): ( self.tgt_dict, self.w1, self.w2, self.src_tokens, self.src_lengths, self.model, ) = test_utils.sequence_generator_setup() dummy_src_samples = self.src_tokens self.tgt_dataset = test_utils.TestDataset(data=dummy_src_samples) self.cuda = torch.cuda.is_available() def _backtranslation_dataset_helper( self, remove_eos_from_input_src, remove_eos_from_output_src, ): tgt_dataset = LanguagePairDataset( src=self.tgt_dataset, src_sizes=self.tgt_dataset.sizes, src_dict=self.tgt_dict, tgt=None, tgt_sizes=None, tgt_dict=None, ) generator = SequenceGenerator( [self.model], tgt_dict=self.tgt_dict, max_len_a=0, max_len_b=200, beam_size=2, unk_penalty=0, ) backtranslation_dataset = BacktranslationDataset( tgt_dataset=TransformEosDataset( dataset=tgt_dataset, eos=self.tgt_dict.eos(), # remove eos from the input src remove_eos_from_src=remove_eos_from_input_src, ), src_dict=self.tgt_dict, backtranslation_fn=( lambda sample: generator.generate([self.model], sample) ), output_collater=TransformEosDataset( dataset=tgt_dataset, eos=self.tgt_dict.eos(), # if we remove eos from the input src, then we need to add it # back to the output tgt append_eos_to_tgt=remove_eos_from_input_src, remove_eos_from_src=remove_eos_from_output_src, ).collater, cuda=self.cuda, ) dataloader = torch.utils.data.DataLoader( backtranslation_dataset, batch_size=2, collate_fn=backtranslation_dataset.collater, ) backtranslation_batch_result = next(iter(dataloader)) eos, pad, w1, w2 = self.tgt_dict.eos(), self.tgt_dict.pad(), self.w1, self.w2 # Note that we sort by src_lengths and add left padding, so actually # ids will look like: [1, 0] expected_src = torch.LongTensor([[w1, w2, w1, eos], [pad, pad, w1, eos]]) if 
remove_eos_from_output_src: expected_src = expected_src[:, :-1] expected_tgt = torch.LongTensor([[w1, w2, eos], [w1, w2, eos]]) generated_src = backtranslation_batch_result["net_input"]["src_tokens"] tgt_tokens = backtranslation_batch_result["target"] self.assertTensorEqual(expected_src, generated_src) self.assertTensorEqual(expected_tgt, tgt_tokens) def test_backtranslation_dataset_no_eos_in_output_src(self): self._backtranslation_dataset_helper( remove_eos_from_input_src=False, remove_eos_from_output_src=True, ) def test_backtranslation_dataset_with_eos_in_output_src(self): self._backtranslation_dataset_helper( remove_eos_from_input_src=False, remove_eos_from_output_src=False, ) def test_backtranslation_dataset_no_eos_in_input_src(self): self._backtranslation_dataset_helper( remove_eos_from_input_src=True, remove_eos_from_output_src=False, ) def assertTensorEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertEqual(t1.ne(t2).long().sum(), 0) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_binaries.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import contextlib import json import logging import os import random import sys import tempfile import unittest from packaging import version from io import StringIO from typing import Dict, List import torch from fairseq import options from fairseq_cli import eval_lm, train from tests.utils import ( create_dummy_data, create_laser_data_and_config_json, generate_main, preprocess_lm_data, preprocess_summarization_data, preprocess_translation_data, train_language_model, train_translation_model, ) try: import transformers # noqa has_hf_transformers = True except ImportError: has_hf_transformers = False class TestTranslation(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) def tearDown(self): logging.disable(logging.NOTSET) def test_fconv(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_fconv") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model(data_dir, "fconv_iwslt_de_en") generate_main(data_dir) def test_raw(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_fconv_raw") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, ["--dataset-impl", "raw"]) train_translation_model( data_dir, "fconv_iwslt_de_en", ["--dataset-impl", "raw"] ) generate_main(data_dir, ["--dataset-impl", "raw"]) def test_update_freq(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_update_freq") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "fconv_iwslt_de_en", ["--update-freq", "3"] ) generate_main(data_dir) def test_max_positions(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_max_positions") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) with self.assertRaises(Exception) as context: train_translation_model( data_dir, 
"fconv_iwslt_de_en", ["--max-target-positions", "5"], ) self.assertTrue( "skip this example with --skip-invalid-size-inputs-valid-test" in str(context.exception) ) train_translation_model( data_dir, "fconv_iwslt_de_en", [ "--max-target-positions", "5", "--skip-invalid-size-inputs-valid-test", ], ) with self.assertRaises(Exception) as context: generate_main(data_dir) generate_main(data_dir, ["--skip-invalid-size-inputs-valid-test"]) def test_generation(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_sampling") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model(data_dir, "fconv_iwslt_de_en") generate_main( data_dir, [ "--sampling", "--temperature", "2", "--beam", "2", "--nbest", "2", ], ) generate_main( data_dir, [ "--sampling", "--sampling-topk", "3", "--beam", "2", "--nbest", "2", ], ) generate_main( data_dir, [ "--sampling", "--sampling-topp", "0.2", "--beam", "2", "--nbest", "2", ], ) generate_main( data_dir, [ "--diversity-rate", "0.5", "--beam", "6", ], ) with self.assertRaises(ValueError): generate_main( data_dir, [ "--diverse-beam-groups", "4", "--match-source-len", ], ) generate_main(data_dir, ["--prefix-size", "2"]) generate_main(data_dir, ["--retain-dropout"]) def test_eval_bleu(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_eval_bleu") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "fconv_iwslt_de_en", [ "--eval-bleu", "--eval-bleu-print-samples", "--eval-bleu-remove-bpe", "--eval-bleu-detok", "space", "--eval-bleu-args", '{"beam": 4, "min_len": 10}', ], ) def test_lstm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_lstm") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "lstm_wiseman_iwslt_de_en", [ "--encoder-layers", "2", "--decoder-layers", 
"2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--decoder-out-embed-dim", "8", ], ) generate_main(data_dir) def test_lstm_bidirectional(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_lstm_bidirectional") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "lstm", [ "--encoder-layers", "2", "--encoder-bidirectional", "--encoder-hidden-size", "16", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--decoder-out-embed-dim", "8", "--decoder-layers", "2", ], ) generate_main(data_dir) def test_transformer(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "transformer_iwslt_de_en", [ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", ], run_validation=True, ) generate_main(data_dir) def test_multilingual_transformer(self): # test with all combinations of encoder/decoder lang tokens encoder_langtok_flags = [ [], ["--encoder-langtok", "src"], ["--encoder-langtok", "tgt"], ] decoder_langtok_flags = [[], ["--decoder-langtok"]] with contextlib.redirect_stdout(StringIO()): for i in range(len(encoder_langtok_flags)): for j in range(len(decoder_langtok_flags)): enc_ltok_flag = encoder_langtok_flags[i] dec_ltok_flag = decoder_langtok_flags[j] with tempfile.TemporaryDirectory( f"test_multilingual_transformer_{i}_{j}" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, arch="multilingual_transformer", task="multilingual_translation", extra_flags=[ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", ] + enc_ltok_flag + dec_ltok_flag, lang_flags=["--lang-pairs", "in-out,out-in"], run_validation=True, 
extra_valid_flags=enc_ltok_flag + dec_ltok_flag, ) generate_main( data_dir, extra_flags=[ "--task", "multilingual_translation", "--lang-pairs", "in-out,out-in", "--source-lang", "in", "--target-lang", "out", ] + enc_ltok_flag + dec_ltok_flag, ) @unittest.skipIf( sys.platform.lower() == "darwin", "skip latent depth test on MacOS" ) def test_multilingual_translation_latent_depth(self): # test with latent depth in encoder, decoder, or both encoder_latent_layer = [[], ["--encoder-latent-layer"]] decoder_latent_layer = [[], ["--decoder-latent-layer"]] with contextlib.redirect_stdout(StringIO()): for i in range(len(encoder_latent_layer)): for j in range(len(decoder_latent_layer)): if i == 0 and j == 0: continue enc_ll_flag = encoder_latent_layer[i] dec_ll_flag = decoder_latent_layer[j] with tempfile.TemporaryDirectory( f"test_multilingual_translation_latent_depth_{i}_{j}" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data( data_dir, extra_flags=["--joined-dictionary"] ) train_translation_model( data_dir, arch="latent_multilingual_transformer", task="multilingual_translation_latent_depth", extra_flags=[ "--user-dir", "examples/latent_depth/latent_depth_src", "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--share-encoders", "--share-decoders", "--sparsity-weight", "0.1", ] + enc_ll_flag + dec_ll_flag, lang_flags=["--lang-pairs", "in-out,out-in"], run_validation=True, extra_valid_flags=[ "--user-dir", "examples/latent_depth/latent_depth_src", ] + enc_ll_flag + dec_ll_flag, ) generate_main( data_dir, extra_flags=[ "--user-dir", "examples/latent_depth/latent_depth_src", "--task", "multilingual_translation_latent_depth", "--lang-pairs", "in-out,out-in", "--source-lang", "in", "--target-lang", "out", ] + enc_ll_flag + dec_ll_flag, ) def test_translation_multi_simple_epoch(self): # test with all combinations of encoder/decoder lang tokens encoder_langtok_flags = [ [], ["--encoder-langtok", "src"], 
["--encoder-langtok", "tgt"], ] decoder_langtok_flags = [[], ["--decoder-langtok"]] with contextlib.redirect_stdout(StringIO()): for i in range(len(encoder_langtok_flags)): for j in range(len(decoder_langtok_flags)): enc_ltok_flag = encoder_langtok_flags[i] dec_ltok_flag = decoder_langtok_flags[j] with tempfile.TemporaryDirectory( f"test_translation_multi_simple_epoch_{i}_{j}" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data( data_dir, extra_flags=["--joined-dictionary"] ) train_translation_model( data_dir, arch="transformer", task="translation_multi_simple_epoch", extra_flags=[ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--sampling-method", "temperature", "--sampling-temperature", "1.5", "--virtual-epoch-size", "1000", ] + enc_ltok_flag + dec_ltok_flag, lang_flags=["--lang-pairs", "in-out,out-in"], run_validation=True, extra_valid_flags=enc_ltok_flag + dec_ltok_flag, ) generate_main( data_dir, extra_flags=[ "--task", "translation_multi_simple_epoch", "--lang-pairs", "in-out,out-in", "--source-lang", "in", "--target-lang", "out", ] + enc_ltok_flag + dec_ltok_flag, ) def test_translation_multi_simple_epoch_no_vepoch(self): # test with all combinations of encoder/decoder lang tokens with contextlib.redirect_stdout(StringIO()): enc_ltok_flag = ["--encoder-langtok", "src"] dec_ltok_flag = ["--decoder-langtok"] with tempfile.TemporaryDirectory( "test_translation_multi_simple_epoch_dict" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, extra_flags=[]) train_translation_model( data_dir, arch="transformer", task="translation_multi_simple_epoch", extra_flags=[ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--sampling-method", "temperature", "--sampling-temperature", "1.5", ] + enc_ltok_flag + dec_ltok_flag, lang_flags=["--lang-pairs", "in-out"], run_validation=True, extra_valid_flags=enc_ltok_flag + 
dec_ltok_flag, ) generate_main( data_dir, extra_flags=[ "--task", "translation_multi_simple_epoch", "--lang-pairs", "in-out", "--source-lang", "in", "--target-lang", "out", ] + enc_ltok_flag + dec_ltok_flag, ) def test_translation_multi_simple_epoch_dicts(self): # test with all combinations of encoder/decoder lang tokens with contextlib.redirect_stdout(StringIO()): enc_ltok_flag = ["--encoder-langtok", "src"] dec_ltok_flag = ["--decoder-langtok"] with tempfile.TemporaryDirectory( "test_translation_multi_simple_epoch_dict" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, extra_flags=[]) train_translation_model( data_dir, arch="transformer", task="translation_multi_simple_epoch", extra_flags=[ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--sampling-method", "temperature", "--sampling-temperature", "1.5", "--virtual-epoch-size", "1000", ] + enc_ltok_flag + dec_ltok_flag, lang_flags=["--lang-pairs", "in-out"], run_validation=True, extra_valid_flags=enc_ltok_flag + dec_ltok_flag, ) generate_main( data_dir, extra_flags=[ "--task", "translation_multi_simple_epoch", "--lang-pairs", "in-out", "--source-lang", "in", "--target-lang", "out", ] + enc_ltok_flag + dec_ltok_flag, ) def test_translation_multi_simple_epoch_src_tgt_dict_spec(self): # test the specification of explicit --src-dict and --tgt-dict with contextlib.redirect_stdout(StringIO()): enc_ltok_flag = ["--encoder-langtok", "src"] dec_ltok_flag = ["--decoder-langtok"] with tempfile.TemporaryDirectory( "test_translation_multi_simple_epoch_dict" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, extra_flags=[]) train_translation_model( data_dir, arch="transformer", task="translation_multi_simple_epoch", extra_flags=[ "--source-dict", f"{data_dir}/dict.in.txt", "--target-dict", f"{data_dir}/dict.out.txt", "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", 
"--decoder-embed-dim", "8", "--sampling-method", "temperature", "--sampling-temperature", "1.5", "--virtual-epoch-size", "1000", ] + enc_ltok_flag + dec_ltok_flag, lang_flags=["--lang-pairs", "in-out"], run_validation=True, extra_valid_flags=enc_ltok_flag + dec_ltok_flag, ) generate_main( data_dir, extra_flags=[ "--task", "translation_multi_simple_epoch", "--lang-pairs", "in-out", "--source-lang", "in", "--target-lang", "out", ] + enc_ltok_flag + dec_ltok_flag, ) def test_transformer_cross_self_attention(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_transformer_cross_self_attention" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "transformer_iwslt_de_en", [ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--decoder-embed-dim", "8", "--no-cross-attention", "--cross-self-attention", ], run_validation=True, ) generate_main(data_dir, extra_flags=[]) @unittest.skipIf( version.parse(torch.__version__) > version.parse("1.8"), "skip for latest torch versions", ) def test_transformer_pointer_generator(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_transformer_pointer_generator" ) as data_dir: create_dummy_data(data_dir) preprocess_summarization_data(data_dir) train_translation_model( data_dir, "transformer_pointer_generator", extra_flags=[ "--user-dir", "examples/pointer_generator/pointer_generator_src", "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--alignment-layer", "-1", "--alignment-heads", "1", "--source-position-markers", "0", ], run_validation=True, extra_valid_flags=[ "--user-dir", "examples/pointer_generator/pointer_generator_src", ], ) generate_main( data_dir, extra_flags=[ "--user-dir", "examples/pointer_generator/pointer_generator_src", ], ) def test_lightconv(self): with 
contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_lightconv") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "lightconv_iwslt_de_en", [ "--encoder-conv-type", "lightweight", "--decoder-conv-type", "lightweight", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", ], ) generate_main(data_dir) def test_dynamicconv(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_dynamicconv") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "lightconv_iwslt_de_en", [ "--encoder-conv-type", "dynamic", "--decoder-conv-type", "dynamic", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", ], ) generate_main(data_dir) def test_cmlm_transformer(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_cmlm_transformer") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, ["--joined-dictionary"]) train_translation_model( data_dir, "cmlm_transformer", [ "--apply-bert-init", "--criterion", "nat_loss", "--noise", "full_mask", "--pred-length-offset", "--length-loss-factor", "0.1", ], task="translation_lev", ) generate_main( data_dir, [ "--task", "translation_lev", "--iter-decode-max-iter", "9", "--iter-decode-eos-penalty", "0", "--print-step", ], ) def test_nonautoregressive_transformer(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_nonautoregressive_transformer" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, ["--joined-dictionary"]) train_translation_model( data_dir, "nonautoregressive_transformer", [ "--apply-bert-init", "--src-embedding-copy", "--criterion", "nat_loss", "--noise", "full_mask", "--pred-length-offset", "--length-loss-factor", "0.1", ], task="translation_lev", ) generate_main( data_dir, [ "--task", "translation_lev", 
"--iter-decode-max-iter", "0", "--iter-decode-eos-penalty", "0", "--print-step", ], ) # def test_nat_crf_transformer(self): # with contextlib.redirect_stdout(StringIO()): # with tempfile.TemporaryDirectory('test_nat_crf_transformer') as data_dir: # create_dummy_data(data_dir) # preprocess_translation_data(data_dir, ['--joined-dictionary']) # train_translation_model(data_dir, 'nacrf_transformer', [ # '--apply-bert-init', '--criterion', # 'nat_loss', '--noise', 'full_mask', '--pred-length-offset', # '--length-loss-factor', '0.1', # '--word-ins-loss-factor', '0.5', # '--crf-lowrank-approx', '1', # '--crf-beam-approx', '1' # ], task='translation_lev') # generate_main(data_dir, [ # '--task', 'translation_lev', # '--iter-decode-max-iter', '0', # '--iter-decode-eos-penalty', '0', # '--print-step', # ]) def test_iterative_nonautoregressive_transformer(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_iterative_nonautoregressive_transformer" ) as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, ["--joined-dictionary"]) train_translation_model( data_dir, "iterative_nonautoregressive_transformer", [ "--apply-bert-init", "--src-embedding-copy", "--criterion", "nat_loss", "--noise", "full_mask", "--stochastic-approx", "--dae-ratio", "0.5", "--train-step", "3", ], task="translation_lev", ) generate_main( data_dir, [ "--task", "translation_lev", "--iter-decode-max-iter", "9", "--iter-decode-eos-penalty", "0", "--print-step", ], ) def test_insertion_transformer(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_insertion_transformer") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir, ["--joined-dictionary"]) train_translation_model( data_dir, "insertion_transformer", [ "--apply-bert-init", "--criterion", "nat_loss", "--noise", "random_mask", ], task="translation_lev", ) generate_main( data_dir, [ "--task", "translation_lev", 
"--iter-decode-max-iter", "9", "--iter-decode-eos-penalty", "0", "--print-step", ], ) def test_mixture_of_experts(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_moe") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "transformer_iwslt_de_en", [ "--task", "translation_moe", "--user-dir", "examples/translation_moe/translation_moe_src", "--method", "hMoElp", "--mean-pool-gating-network", "--num-experts", "3", "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", ], ) generate_main( data_dir, [ "--task", "translation_moe", "--user-dir", "examples/translation_moe/translation_moe_src", "--method", "hMoElp", "--mean-pool-gating-network", "--num-experts", "3", "--gen-expert", "0", ], ) def test_alignment(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_alignment") as data_dir: create_dummy_data(data_dir, alignment=True) preprocess_translation_data(data_dir, ["--align-suffix", "align"]) train_translation_model( data_dir, "transformer_align", [ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--load-alignments", "--alignment-layer", "1", "--criterion", "label_smoothed_cross_entropy_with_alignment", ], run_validation=True, ) generate_main(data_dir) def test_laser_lstm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_laser_lstm") as data_dir: laser_config_file = create_laser_data_and_config_json(data_dir) train_translation_model( laser_config_file.name, "laser_lstm", [ "--user-dir", "examples/laser/laser_src", "--weighting-alpha", "0.3", "--encoder-bidirectional", "--encoder-hidden-size", "512", "--encoder-layers", "5", "--decoder-layers", "1", "--encoder-embed-dim", "320", "--decoder-embed-dim", "320", "--decoder-lang-embed-dim", "32", "--save-dir", data_dir, 
"--disable-validation", ], task="laser", lang_flags=[], ) def test_laser_transformer(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_laser_transformer") as data_dir: laser_config_file = create_laser_data_and_config_json(data_dir) train_translation_model( laser_config_file.name, "laser_transformer", [ "--user-dir", "examples/laser/laser_src", "--weighting-alpha", "0.3", "--encoder-embed-dim", "320", "--decoder-embed-dim", "320", "--decoder-lang-embed-dim", "32", "--save-dir", data_dir, "--disable-validation", ], task="laser", lang_flags=[], ) def test_alignment_full_context(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_alignment") as data_dir: create_dummy_data(data_dir, alignment=True) preprocess_translation_data(data_dir, ["--align-suffix", "align"]) train_translation_model( data_dir, "transformer_align", [ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--load-alignments", "--alignment-layer", "1", "--criterion", "label_smoothed_cross_entropy_with_alignment", "--full-context-alignment", ], run_validation=True, ) generate_main(data_dir) def test_transformer_layerdrop(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer_layerdrop") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "transformer_iwslt_de_en", [ "--encoder-layers", "3", "--decoder-layers", "3", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--encoder-layerdrop", "0.01", "--decoder-layerdrop", "0.01", ], ) generate_main(data_dir) generate_main( data_dir, [ "--model-overrides", "{'encoder_layers_to_keep':'0,2','decoder_layers_to_keep':'1'}", ], ) class TestStories(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) def tearDown(self): logging.disable(logging.NOTSET) def test_fconv_self_att_wp(self): with 
contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_fconv_self_att_wp") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) config = [ "--encoder-layers", "[(128, 3)] * 2", "--decoder-layers", "[(128, 3)] * 2", "--decoder-attention", "True", "--encoder-attention", "False", "--gated-attention", "True", "--self-attention", "True", "--project-input", "True", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--decoder-out-embed-dim", "8", "--multihead-self-attention-nheads", "2", ] train_translation_model(data_dir, "fconv_self_att_wp", config) generate_main(data_dir) # fusion model os.rename( os.path.join(data_dir, "checkpoint_last.pt"), os.path.join(data_dir, "pretrained.pt"), ) config.extend( [ "--pretrained", "True", "--pretrained-checkpoint", os.path.join(data_dir, "pretrained.pt"), "--save-dir", os.path.join(data_dir, "fusion_model"), ] ) train_translation_model(data_dir, "fconv_self_att_wp", config) class TestLanguageModeling(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) def tearDown(self): logging.disable(logging.NOTSET) def test_fconv_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_fconv_lm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "fconv_lm", [ "--decoder-layers", "[(850, 3)] * 2 + [(1024,4)]", "--decoder-embed-dim", "280", "--optimizer", "nag", "--lr", "0.1", ], ) eval_lm_main(data_dir) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) def test_transformer_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "transformer_lm", ["--add-bos-token", "--nval", "1"], run_validation=True, ) eval_lm_main(data_dir) eval_lm_main(data_dir, 
extra_flags=["--context-window", "25"]) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) def test_normformer_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer_lm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "transformer_lm", [ "--add-bos-token", "--nval", "1", "--scale-fc", "--scale-heads", "--scale-attn", "--scale-fc", ], run_validation=True, ) eval_lm_main(data_dir) eval_lm_main(data_dir, extra_flags=["--context-window", "25"]) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) def test_transformer_lm_with_adaptive_softmax(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_transformer_lm_with_adaptive_softmax" ) as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "transformer_lm", [ "--add-bos-token", "--criterion", "adaptive_loss", "--adaptive-softmax-cutoff", "5,10,15", ], run_validation=True, ) eval_lm_main(data_dir) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) def test_lightconv_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_lightconv_lm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "lightconv_lm", ["--add-bos-token"], run_validation=True, ) eval_lm_main(data_dir) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) def test_lstm_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_lstm_lm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "lstm_lm", ["--add-bos-token"], 
run_validation=True, ) eval_lm_main(data_dir) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) def test_lstm_lm_residuals(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_lstm_lm_residuals") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_language_model( data_dir, "lstm_lm", ["--add-bos-token", "--residuals"], run_validation=True, ) eval_lm_main(data_dir) generate_main( data_dir, [ "--task", "language_modeling", "--sample-break-mode", "eos", "--tokens-per-sample", "500", ], ) @unittest.skipIf(not has_hf_transformers, "skip test if transformers is missing") def test_transformer_xl_bptt_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_transformer_xl_bptt_lm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) task_flags = [ "--user-dir", "examples/truncated_bptt", "--task", "truncated_bptt_lm", "--batch-size", "2", "--tokens-per-sample", "50", ] train_language_model( data_dir=data_dir, arch="transformer_xl", extra_flags=task_flags + [ "--n-layer", "2", ], task="truncated_bptt_lm", run_validation=True, extra_valid_flags=task_flags, ) eval_lm_main(data_dir, extra_flags=task_flags) # Train with activation offloading train_language_model( data_dir=data_dir, arch="transformer_xl", extra_flags=task_flags + [ "--n-layer", "2", "--offload-activations", ], task="truncated_bptt_lm", run_validation=True, extra_valid_flags=task_flags, ) class TestMaskedLanguageModel(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) def tearDown(self): logging.disable(logging.NOTSET) def test_legacy_masked_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_legacy_mlm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_legacy_masked_language_model(data_dir, "masked_lm") def test_roberta_masked_lm(self): 
with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_roberta_mlm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_masked_lm( data_dir, "roberta_base", extra_flags=["--encoder-layers", "2"] ) def test_roberta_sentence_prediction(self): num_classes = 3 with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_roberta_head") as data_dir: create_dummy_roberta_head_data(data_dir, num_classes=num_classes) preprocess_lm_data(os.path.join(data_dir, "input0")) preprocess_lm_data(os.path.join(data_dir, "label")) train_roberta_head(data_dir, "roberta_base", num_classes=num_classes) def test_roberta_regression_single(self): num_classes = 1 with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_roberta_regression_single" ) as data_dir: create_dummy_roberta_head_data( data_dir, num_classes=num_classes, regression=True ) preprocess_lm_data(os.path.join(data_dir, "input0")) train_roberta_head( data_dir, "roberta_base", num_classes=num_classes, extra_flags=["--regression-target"], ) def test_roberta_regression_multiple(self): num_classes = 3 with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_roberta_regression_multiple" ) as data_dir: create_dummy_roberta_head_data( data_dir, num_classes=num_classes, regression=True ) preprocess_lm_data(os.path.join(data_dir, "input0")) train_roberta_head( data_dir, "roberta_base", num_classes=num_classes, extra_flags=["--regression-target"], ) def test_linformer_roberta_masked_lm(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_linformer_roberta_mlm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_masked_lm( data_dir, "linformer_roberta_base", extra_flags=[ "--user-dir", "examples/linformer/linformer_src", "--encoder-layers", "2", ], ) def test_linformer_roberta_sentence_prediction(self): num_classes = 3 with 
contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_linformer_roberta_head") as data_dir: create_dummy_roberta_head_data(data_dir, num_classes=num_classes) preprocess_lm_data(os.path.join(data_dir, "input0")) preprocess_lm_data(os.path.join(data_dir, "label")) train_roberta_head( data_dir, "linformer_roberta_base", num_classes=num_classes, extra_flags=["--user-dir", "examples/linformer/linformer_src"], ) def test_linformer_roberta_regression_single(self): num_classes = 1 with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_linformer_roberta_regression_single" ) as data_dir: create_dummy_roberta_head_data( data_dir, num_classes=num_classes, regression=True ) preprocess_lm_data(os.path.join(data_dir, "input0")) train_roberta_head( data_dir, "linformer_roberta_base", num_classes=num_classes, extra_flags=[ "--regression-target", "--user-dir", "examples/linformer/linformer_src", ], ) def test_linformer_roberta_regression_multiple(self): num_classes = 3 with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory( "test_linformer_roberta_regression_multiple" ) as data_dir: create_dummy_roberta_head_data( data_dir, num_classes=num_classes, regression=True ) preprocess_lm_data(os.path.join(data_dir, "input0")) train_roberta_head( data_dir, "linformer_roberta_base", num_classes=num_classes, extra_flags=[ "--regression-target", "--user-dir", "examples/linformer/linformer_src", ], ) def _test_pretrained_masked_lm_for_translation(self, learned_pos_emb, encoder_only): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_mlm") as data_dir: create_dummy_data(data_dir) preprocess_lm_data(data_dir) train_legacy_masked_language_model( data_dir, arch="masked_lm", extra_args=("--encoder-learned-pos",) if learned_pos_emb else (), ) with tempfile.TemporaryDirectory( "test_mlm_translation" ) as translation_dir: create_dummy_data(translation_dir) preprocess_translation_data( 
translation_dir, extra_flags=["--joined-dictionary"] ) # Train transformer with data_dir/checkpoint_last.pt train_translation_model( translation_dir, arch="transformer_from_pretrained_xlm", extra_flags=[ "--decoder-layers", "1", "--decoder-embed-dim", "32", "--decoder-attention-heads", "1", "--decoder-ffn-embed-dim", "32", "--encoder-layers", "1", "--encoder-embed-dim", "32", "--encoder-attention-heads", "1", "--encoder-ffn-embed-dim", "32", "--pretrained-xlm-checkpoint", "{}/checkpoint_last.pt".format(data_dir), "--activation-fn", "gelu", "--max-source-positions", "500", "--max-target-positions", "500", ] + ( ["--encoder-learned-pos", "--decoder-learned-pos"] if learned_pos_emb else [] ) + (["--init-encoder-only"] if encoder_only else []), task="translation_from_pretrained_xlm", ) def test_pretrained_masked_lm_for_translation_learned_pos_emb(self): self._test_pretrained_masked_lm_for_translation(True, False) def test_pretrained_masked_lm_for_translation_sinusoidal_pos_emb(self): self._test_pretrained_masked_lm_for_translation(False, False) def test_pretrained_masked_lm_for_translation_encoder_only(self): self._test_pretrained_masked_lm_for_translation(True, True) def test_r4f_roberta(self): num_classes = 3 with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_r4f_roberta_head") as data_dir: create_dummy_roberta_head_data(data_dir, num_classes=num_classes) preprocess_lm_data(os.path.join(data_dir, "input0")) preprocess_lm_data(os.path.join(data_dir, "label")) train_roberta_head( data_dir, "roberta_base", num_classes=num_classes, extra_flags=[ "--user-dir", "examples/rxf/rxf_src", "--criterion", "sentence_prediction_r3f", "--spectral-norm-classification-head", ], ) def train_legacy_masked_language_model(data_dir, arch, extra_args=()): train_parser = options.get_training_parser() # TODO: langs should be in and out right? 
train_args = options.parse_args_and_arch( train_parser, [ "--task", "cross_lingual_lm", data_dir, "--arch", arch, # Optimizer args "--optimizer", "adam", "--lr-scheduler", "reduce_lr_on_plateau", "--lr-shrink", "0.5", "--lr", "0.0001", "--stop-min-lr", "1e-09", # dropout, attention args "--dropout", "0.1", "--attention-dropout", "0.1", # MLM args "--criterion", "legacy_masked_lm_loss", "--masked-lm-only", "--monolingual-langs", "in,out", "--num-segment", "5", # Transformer args: use a small transformer model for fast training "--encoder-layers", "1", "--encoder-embed-dim", "32", "--encoder-attention-heads", "1", "--encoder-ffn-embed-dim", "32", # Other training args "--max-tokens", "500", "--tokens-per-sample", "500", "--save-dir", data_dir, "--max-epoch", "1", "--no-progress-bar", "--distributed-world-size", "1", "--dataset-impl", "raw", "--num-workers", "0", ] + list(extra_args), ) train.main(train_args) class TestOptimizers(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) def tearDown(self): logging.disable(logging.NOTSET) def test_optimizers(self): with contextlib.redirect_stdout(StringIO()): with tempfile.TemporaryDirectory("test_optimizers") as data_dir: # Use just a bit of data and tiny model to keep this test runtime reasonable create_dummy_data(data_dir, num_examples=10, maxlen=5) preprocess_translation_data(data_dir) optimizers = ["adafactor", "adam", "nag", "adagrad", "sgd", "adadelta"] last_checkpoint = os.path.join(data_dir, "checkpoint_last.pt") for optimizer in optimizers: if os.path.exists(last_checkpoint): os.remove(last_checkpoint) train_translation_model( data_dir, "lstm", [ "--required-batch-size-multiple", "1", "--encoder-layers", "1", "--encoder-hidden-size", "32", "--decoder-layers", "1", "--optimizer", optimizer, ], ) generate_main(data_dir) def read_last_log_entry( logs: List[logging.LogRecord], logger_name: str ) -> Dict[str, float]: for x in reversed(logs): if x.name == logger_name: return json.loads(x.message) raise 
ValueError(f"No entries from {logger_name} found in captured logs") class TestActivationCheckpointing(unittest.TestCase): base_flags = [ "--encoder-layers", "2", "--decoder-layers", "2", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--restore-file", "x.pt", "--log-format", "json", "--log-interval", "1", "--max-update", "2", ] def _train(self, data_dir, extra_flags): with self.assertLogs() as logs: train_translation_model( data_dir, "transformer_iwslt_de_en", self.base_flags + extra_flags, run_validation=True, extra_valid_flags=["--log-format", "json"], ) return logs.records def test_activation_offloading_does_not_change_metrics(self): """Neither ----checkpoint-activations nor --offload-activations should change loss""" with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir: with self.assertLogs(): create_dummy_data(data_dir, num_examples=20) preprocess_translation_data(data_dir) offload_logs = self._train(data_dir, ["--offload-activations"]) baseline_logs = self._train(data_dir, []) assert len(baseline_logs) == len(offload_logs) baseline_valid_stats = read_last_log_entry(baseline_logs, "valid") offload_valid_stats = read_last_log_entry(offload_logs, "valid") baseline_train_stats = read_last_log_entry(baseline_logs, "train") offload_train_stats = read_last_log_entry(offload_logs, "train") assert ( baseline_train_stats["train_loss"] == offload_train_stats["train_loss"] ) assert ( baseline_valid_stats["valid_loss"] == offload_valid_stats["valid_loss"] ) def test_activation_checkpointing_does_not_change_metrics(self): """--checkpoint-activations should not change loss""" with tempfile.TemporaryDirectory("test_transformer_with_act_cpt") as data_dir: with self.assertLogs(): create_dummy_data(data_dir, num_examples=20) preprocess_translation_data(data_dir) ckpt_logs = self._train(data_dir, ["--checkpoint-activations"]) baseline_logs = self._train(data_dir, []) assert len(baseline_logs) == len(ckpt_logs) baseline_train_stats = 
read_last_log_entry(baseline_logs, "train") ckpt_train_stats = read_last_log_entry(ckpt_logs, "train") assert baseline_train_stats["train_loss"] == ckpt_train_stats["train_loss"] baseline_valid_stats = read_last_log_entry(baseline_logs, "valid") ckpt_valid_stats = read_last_log_entry(ckpt_logs, "valid") assert baseline_valid_stats["valid_loss"] == ckpt_valid_stats["valid_loss"] def create_dummy_roberta_head_data( data_dir, num_examples=100, maxlen=10, num_classes=2, regression=False ): input_dir = "input0" def _create_dummy_data(filename): random_data = torch.rand(num_examples * maxlen) input_data = 97 + torch.floor(26 * random_data).int() if regression: output_data = torch.rand((num_examples, num_classes)) else: output_data = 1 + torch.floor(num_classes * torch.rand(num_examples)).int() with open(os.path.join(data_dir, input_dir, filename + ".out"), "w") as f_in: label_filename = filename + ".label" if regression else filename + ".out" with open(os.path.join(data_dir, "label", label_filename), "w") as f_out: offset = 0 for i in range(num_examples): # write example input ex_len = random.randint(1, maxlen) ex_str = " ".join(map(chr, input_data[offset : offset + ex_len])) print(ex_str, file=f_in) # write example label if regression: class_str = " ".join(map(str, output_data[i].numpy())) print(class_str, file=f_out) else: class_str = "class{}".format(output_data[i]) print(class_str, file=f_out) offset += ex_len os.mkdir(os.path.join(data_dir, input_dir)) os.mkdir(os.path.join(data_dir, "label")) _create_dummy_data("train") _create_dummy_data("valid") _create_dummy_data("test") def train_masked_lm(data_dir, arch, extra_flags=None): train_parser = options.get_training_parser() train_args = options.parse_args_and_arch( train_parser, [ "--task", "masked_lm", data_dir, "--arch", arch, "--optimizer", "adam", "--lr", "0.0001", "--criterion", "masked_lm", "--batch-size", "500", "--required-batch-size-multiple", "1", "--save-dir", data_dir, "--max-epoch", "1", 
"--no-progress-bar", "--distributed-world-size", "1", "--ddp-backend", "no_c10d", "--num-workers", "0", ] + (extra_flags or []), ) train.main(train_args) def train_roberta_head(data_dir, arch, num_classes=2, extra_flags=None): train_parser = options.get_training_parser() train_args = options.parse_args_and_arch( train_parser, [ "--task", "sentence_prediction", data_dir, "--arch", arch, "--encoder-layers", "2", "--num-classes", str(num_classes), "--optimizer", "adam", "--lr", "0.0001", "--criterion", "sentence_prediction", "--max-tokens", "500", "--max-positions", "500", "--batch-size", "500", "--save-dir", data_dir, "--max-epoch", "1", "--no-progress-bar", "--distributed-world-size", "1", "--ddp-backend", "no_c10d", "--num-workers", "0", ] + (extra_flags or []), ) train.main(train_args) def eval_lm_main(data_dir, extra_flags=None): eval_lm_parser = options.get_eval_lm_parser() eval_lm_args = options.parse_args_and_arch( eval_lm_parser, [ data_dir, "--path", os.path.join(data_dir, "checkpoint_last.pt"), "--no-progress-bar", "--num-workers", "0", ] + (extra_flags or []), ) eval_lm.main(eval_lm_args) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_binarizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import os import typing as tp import unittest from tempfile import TemporaryDirectory from fairseq.binarizer import BinarizeSummary, FileBinarizer, VocabularyDatasetBinarizer from fairseq.data import Dictionary, indexed_dataset from tests.utils import make_data, sizes def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary: d = Dictionary() for s in data: for token in s: d.add_symbol(token) d.finalize() return d class TestBinarizer(unittest.TestCase): def compare_ds_data(self, summary, data, prefix, impl, vocab): self.assertEqual(summary.num_seq, len(data)) self.assertEqual(summary.num_tok, sum([len(s) for s in data])) dataset = indexed_dataset.make_dataset(prefix, impl) self.assertEqual(len(dataset), len(data)) decoded = [vocab.string(dataset[i]).split() for i in range(0, len(dataset))] self.assertEqual(decoded, data) data_sizes = [i.item() for i in dataset.sizes] self.assertEqual(data_sizes, sizes(data)) def test_can_binarize_line(self): data = make_data(length=1) vocab = build_vocab(data) binarizer = VocabularyDatasetBinarizer( vocab, ) sentence = data[0] summary = BinarizeSummary() tensor = binarizer.binarize_line( " ".join(sentence), summary, ) self.assertEqual(len(tensor), len(sentence) + 1) self.assertEqual(summary.num_tok, len(sentence) + 1) self.assertEqual(summary.num_seq, 1) def test_can_binarize_file_chunk(self): # test without multiprocess logic with TemporaryDirectory() as dirname: raw_file = os.path.join(dirname, "raw1") prefix = os.path.join(dirname, "test1") impl = "mmap" data = make_data(out_file=raw_file) vocab = build_vocab(data) binarizer = VocabularyDatasetBinarizer( vocab, append_eos=False, ) summary = FileBinarizer._binarize_chunk_and_finalize( binarizer, raw_file, offset_start=0, offset_end=-1, output_prefix=prefix, dataset_impl=impl, vocab_size=len(vocab), ) self.compare_ds_data(summary, data, prefix, impl, vocab) def test_can_multiprocess(self): with TemporaryDirectory() as dirname: raw_file = os.path.join(dirname, "raw1") prefix = 
os.path.join(dirname, "test1") impl = "mmap" data = make_data(out_file=raw_file) vocab = build_vocab(data) binarizer = VocabularyDatasetBinarizer( vocab, append_eos=False, ) # with one worker summary = FileBinarizer.multiprocess_dataset( raw_file, impl, binarizer, output_prefix=prefix, vocab_size=len(vocab), num_workers=1, ) self.compare_ds_data(summary, data, prefix, impl, vocab) # with multiple worker prefix_multi = os.path.join(dirname, "test2") summary = FileBinarizer.multiprocess_dataset( raw_file, impl, binarizer, output_prefix=prefix_multi, vocab_size=len(vocab), num_workers=3, ) self.compare_ds_data(summary, data, prefix_multi, impl, vocab) ================================================ FILE: tests/test_character_token_embedder.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import unittest import torch from fairseq.data import Dictionary from fairseq.modules import CharacterTokenEmbedder class TestCharacterTokenEmbedder(unittest.TestCase): def test_character_token_embedder(self): vocab = Dictionary() vocab.add_symbol("hello") vocab.add_symbol("there") embedder = CharacterTokenEmbedder( vocab, [(2, 16), (4, 32), (8, 64), (16, 2)], 64, 5, 2 ) test_sents = [["hello", "unk", "there"], ["there"], ["hello", "there"]] max_len = max(len(s) for s in test_sents) input = torch.LongTensor(len(test_sents), max_len + 2).fill_(vocab.pad()) for i in range(len(test_sents)): input[i][0] = vocab.eos() for j in range(len(test_sents[i])): input[i][j + 1] = vocab.index(test_sents[i][j]) input[i][j + 2] = vocab.eos() embs = embedder(input) assert embs.size() == (len(test_sents), max_len + 2, 5) self.assertAlmostEqual(embs[0][0], embs[1][0]) self.assertAlmostEqual(embs[0][0], embs[0][-1]) self.assertAlmostEqual(embs[0][1], embs[2][1]) self.assertAlmostEqual(embs[0][3], embs[1][1]) 
embs.sum().backward() assert embedder.char_embeddings.weight.grad is not None def assertAlmostEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertLess((t1 - t2).abs().max(), 1e-6) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_checkpoint_utils.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import contextlib import logging import os import tempfile import unittest from io import StringIO from unittest.mock import patch from fairseq import checkpoint_utils from tests.utils import ( create_dummy_data, preprocess_translation_data, train_translation_model, ) import torch class TestCheckpointUtils(unittest.TestCase): def setUp(self): logging.disable(logging.CRITICAL) def tearDown(self): logging.disable(logging.NOTSET) @contextlib.contextmanager def _train_transformer(self, seed, extra_args=None): if extra_args is None: extra_args = [] with tempfile.TemporaryDirectory(f"_train_transformer_seed{seed}") as data_dir: create_dummy_data(data_dir) preprocess_translation_data(data_dir) train_translation_model( data_dir, "transformer_iwslt_de_en", [ "--encoder-layers", "3", "--decoder-layers", "3", "--encoder-embed-dim", "8", "--decoder-embed-dim", "8", "--seed", str(seed), ] + extra_args, ) yield os.path.join(data_dir, "checkpoint_last.pt") def test_load_model_ensemble_and_task(self): # with contextlib.redirect_stdout(StringIO()): with self._train_transformer(seed=123) as model1: with self._train_transformer(seed=456) as model2: ensemble, cfg, task = checkpoint_utils.load_model_ensemble_and_task( filenames=[model1, model2] ) self.assertEqual(len(ensemble), 2) # after Transformer has been migrated to Hydra, this will probably # become cfg.common.seed self.assertEqual(ensemble[0].args.seed, 123) 
self.assertEqual(ensemble[1].args.seed, 456) # the task from the first model should be returned self.assertTrue("seed123" in task.cfg.data) # last cfg is saved self.assertEqual(cfg.common.seed, 456) def test_prune_state_dict(self): with contextlib.redirect_stdout(StringIO()): extra_args = ["--encoder-layerdrop", "0.01", "--decoder-layerdrop", "0.01"] with self._train_transformer(seed=1, extra_args=extra_args) as model: ensemble, cfg, task = checkpoint_utils.load_model_ensemble_and_task( filenames=[model], arg_overrides={ "encoder_layers_to_keep": "0,2", "decoder_layers_to_keep": "1", }, ) self.assertEqual(len(ensemble), 1) self.assertEqual(len(ensemble[0].encoder.layers), 2) self.assertEqual(len(ensemble[0].decoder.layers), 1) def test_torch_persistent_save_async(self): state_dict = {} filename = "async_checkpoint.pt" with patch(f"{checkpoint_utils.__name__}.PathManager.opena") as mock_opena: with patch( f"{checkpoint_utils.__name__}._torch_persistent_save" ) as mock_save: checkpoint_utils.torch_persistent_save( state_dict, filename, async_write=True ) mock_opena.assert_called_with(filename, "wb") mock_save.assert_called() def test_load_ema_from_checkpoint(self): dummy_state = {"a": torch.tensor([1]), "b": torch.tensor([0.1])} with patch(f"{checkpoint_utils.__name__}.PathManager.open") as mock_open, patch( f"{checkpoint_utils.__name__}.torch.load" ) as mock_load: mock_load.return_value = {"extra_state": {"ema": dummy_state}} filename = "ema_checkpoint.pt" state = checkpoint_utils.load_ema_from_checkpoint(filename) mock_open.assert_called_with(filename, "rb") mock_load.assert_called() self.assertIn("a", state["model"]) self.assertIn("b", state["model"]) self.assertTrue(torch.allclose(dummy_state["a"], state["model"]["a"])) self.assertTrue(torch.allclose(dummy_state["b"], state["model"]["b"])) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_checkpoint_utils_for_task_level_attributes.py 
================================================ #!/usr/bin/env fbpython # (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. import contextlib import logging import unittest from io import StringIO from unittest.mock import MagicMock, patch import torch from fairseq import checkpoint_utils, data from omegaconf import OmegaConf def mock_trainer(epoch, num_updates, iterations_in_epoch): trainer = MagicMock() trainer.load_checkpoint.return_value = { "train_iterator": { "epoch": epoch, "iterations_in_epoch": iterations_in_epoch, "shuffle": False, }, "FakeTask": checkpoint_dict()["FakeTask"], } trainer.get_num_updates.return_value = num_updates trainer.task.__class__.__name__ = "FakeTask" trainer.task.get_checkpoint_dict.return_value = checkpoint_dict() trainer.task.set_checkpoint_dict = MagicMock() return trainer def checkpoint_dict(): return { "FakeTask": { "observer_stats": { ( 4, 16, "MovingAveragePerChannelMinMax", "MovingAveragePerChannelMinMax", ): {"mod1": 1, "mod2": 2, "mod3": 3} } } } def mock_dict(): d = MagicMock() d.pad.return_value = 1 d.eos.return_value = 2 d.unk.return_value = 3 return d def get_trainer_and_epoch_itr(epoch, epoch_size, num_updates, iterations_in_epoch): tokens = torch.LongTensor(list(range(epoch_size))).view(1, -1) tokens_ds = data.TokenBlockDataset( tokens, sizes=[tokens.size(-1)], block_size=1, pad=0, eos=1, include_targets=False, ) trainer = mock_trainer(epoch, num_updates, iterations_in_epoch) dataset = data.LanguagePairDataset( tokens_ds, tokens_ds.sizes, mock_dict(), shuffle=False ) epoch_itr = data.EpochBatchIterator( dataset=dataset, collate_fn=dataset.collater, batch_sampler=[[i] for i in range(epoch_size)], ) return trainer, epoch_itr def get_mock_cfg(finetune_from_model): cfg_mock = OmegaConf.create( { "checkpoint": { "save_dir": None, "optimizer_overrides": "{}", "reset_dataloader": False, "reset_meters": False, "reset_optimizer": False, "reset_lr_scheduler": False, "finetune_from_model": 
class TestCheckpointsForTaskLevelAttributes(unittest.TestCase):
    """Save and load a checkpoint with a mocked trainer and check the task-level dict."""

    def setUp(self) -> None:
        """Patch filesystem helpers, silence logging and save one checkpoint."""
        self.cfg_mock = get_mock_cfg(None)
        self.patches = {
            "os.makedirs": MagicMock(),
            "os.path.join": MagicMock(),
            "os.path.isfile": MagicMock(return_value=True),
            "os.path.isabs": MagicMock(return_value=False),
            "fairseq.file_io.PathManager.exists": MagicMock(return_value=False),
        }
        self.applied_patches = [patch(p, d) for p, d in self.patches.items()]

        # Register the undo steps *before* starting anything: cleanups run even
        # when setUp itself raises, whereas tearDown does not, so a failure
        # below can no longer leak the os.* patches into other tests.
        # Cleanups run LIFO: patches are stopped first, then logging is restored.
        self.addCleanup(logging.disable, logging.NOTSET)
        self.addCleanup(patch.stopall)
        for applied in self.applied_patches:
            applied.start()
        logging.disable(logging.CRITICAL)

        self.trainer, self.epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50)
        self.trainer.get_train_iterator = MagicMock(return_value=self.epoch_itr)
        self.epoch_itr.next_epoch_itr(shuffle=False)

        checkpoint_utils.save_checkpoint(
            self.cfg_mock.checkpoint, self.trainer, self.epoch_itr, None
        )

    def test_verify_checkpoint(self) -> None:
        """The mocked task must expose exactly the expected observer stats."""
        observer_key = (
            4,
            16,
            "MovingAveragePerChannelMinMax",
            "MovingAveragePerChannelMinMax",
        )
        cp_dict = self.trainer.task.get_checkpoint_dict()
        self.assertEqual(len(cp_dict), 1)
        self.assertIn("FakeTask", cp_dict)
        self.assertIn("observer_stats", cp_dict["FakeTask"])

        observer_stats = cp_dict["FakeTask"]["observer_stats"]
        self.assertEqual(len(observer_stats), 1)
        self.assertIn(observer_key, observer_stats)
        self.assertEqual(
            observer_stats[observer_key], {"mod1": 1, "mod2": 2, "mod3": 3}
        )

    def test_load_checkpoint(self) -> None:
        """Loading must hand the task's saved dict back to ``set_checkpoint_dict``."""
        with contextlib.redirect_stdout(StringIO()):
            # Now, load checkpoint to ensure the respective logic works as expected
            checkpoint_utils.load_checkpoint(self.cfg_mock.checkpoint, self.trainer)

            self.trainer.task.set_checkpoint_dict.assert_called_once_with(
                checkpoint_dict()["FakeTask"]
            )
def tensorize(constraints: List[List[int]]) -> List[torch.Tensor]:
    """Convert each constraint (a list of token ids) into its own tensor.

    Args:
        constraints: one list of integer token ids per constraint.

    Returns:
        A list with one tensor per constraint, in the same order. The
        constraints are not stacked, so they may have different lengths.
    """
    return [torch.tensor(x) for x in constraints]
([1].False#1 [2].True#1) ([3].False#1 [4].True#1))", {1: 1, 2: 1, 3: 1, 4: 1}, ), ] self.sequences = [ ( self.examples[0][0], [], {"bank": 0, "num_completed": 0, "finished": False, "is_root": True}, ), ( self.examples[0][0], [1, 2], {"bank": 2, "num_completed": 0, "finished": False, "is_root": False}, ), ( self.examples[0][0], [1, 2, 94], {"bank": 1, "num_completed": 1, "finished": False, "is_root": True}, ), ( self.examples[0][0], [1, 3, 999, 1, 4], {"bank": 4, "num_completed": 2, "finished": False, "is_root": False}, ), ( self.examples[0][0], [1, 3, 999, 1, 4, 999], {"bank": 4, "num_completed": 2, "finished": False, "is_root": True}, ), ( self.examples[0][0], [4, 5, 6, 8], {"bank": 2, "num_completed": 1, "finished": False, "is_root": True}, ), ( self.examples[0][0], # Tricky, because in last three, goes down [1->4] branch, could miss [1] and [4->5] # [[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]], [1, 2, 3, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5], {"bank": 14, "num_completed": 6, "finished": True, "is_root": False}, ), ( self.examples[0][0], [1, 2, 3, 999, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5, 117], {"bank": 14, "num_completed": 6, "finished": True, "is_root": True}, ), ( tensorize([[1], [2, 3]]), # Should not be able to get credit for entering 1 a second time [1, 1], {"bank": 1, "num_completed": 1, "finished": False, "is_root": True}, ), ( self.examples[4][0], [1, 2, 1, 2], {"bank": 4, "num_completed": 2, "finished": True, "is_root": False}, ), ( self.examples[4][0], [1, 2, 1, 2, 1], {"bank": 4, "num_completed": 2, "finished": True, "is_root": True}, ), ( self.examples[5][0], [1, 2, 3, 4, 5], {"bank": 4, "num_completed": 2, "finished": True, "is_root": True}, ), ] def test_graphs(self): """ Test whether unordered graph systems are created correctly. 
class TestOrderedConstraintState(unittest.TestCase):
    """Feeds token sequences to ``OrderedConstraintState`` and checks the resulting state."""

    def setUp(self):
        def six_constraints():
            # Rebuilt for every case so that no two cases share tensor objects.
            return tensorize([[1, 2, 3], [1, 3], [1, 4], [4, 5, 6, 7], [1], [4, 5]])

        # Tuples of (constraint set, generated tokens, expected state attributes)
        self.sequences = [
            (
                six_constraints(),
                [],
                {"bank": 0, "num_completed": 0, "finished": False, "is_root": True},
            ),
            (
                six_constraints(),
                [1, 2],
                {"bank": 2, "num_completed": 0, "finished": False, "is_root": False},
            ),
            (
                six_constraints(),
                [1, 2, 94],
                {"bank": 0, "num_completed": 0, "finished": False, "is_root": True},
            ),
            (
                six_constraints(),
                [1, 3, 999, 1, 4],
                {"bank": 0, "num_completed": 0, "finished": False, "is_root": True},
            ),
            (
                six_constraints(),
                [1, 2, 3, 999, 999],
                {"bank": 3, "num_completed": 1, "finished": False, "is_root": False},
            ),
            (
                six_constraints(),
                [1, 2, 3, 77, 1, 3, 1],
                {"bank": 6, "num_completed": 2, "finished": False, "is_root": False},
            ),
            (
                six_constraints(),
                [1, 2, 3, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5],
                {"bank": 14, "num_completed": 6, "finished": True, "is_root": False},
            ),
            (
                six_constraints(),
                [1, 2, 999, 1, 2, 3, 999, 1, 3, 1, 4, 4, 5, 6, 7, 1, 4, 5, 117],
                {"bank": 14, "num_completed": 6, "finished": True, "is_root": False},
            ),
            (
                tensorize([[1], [2, 3]]),
                [1, 1],
                {"bank": 1, "num_completed": 1, "finished": False, "is_root": False},
            ),
            (
                tensorize([[1, 2], [1, 2]]),
                [1, 2, 1, 2],
                {"bank": 4, "num_completed": 2, "finished": True, "is_root": False},
            ),
            (
                tensorize([[1, 2], [1, 2]]),
                [1, 2, 1, 2, 1],
                {"bank": 4, "num_completed": 2, "finished": True, "is_root": False},
            ),
            (
                tensorize([[1, 2], [3, 4]]),
                [1, 2, 3, 4, 5],
                {"bank": 4, "num_completed": 2, "finished": True, "is_root": False},
            ),
        ]

    def test_sequences(self):
        """Advance a fresh state through each token list and compare its attributes."""
        for constraints, tokens, expected in self.sequences:
            # subTest keeps going after a failing case and names it in the report.
            with self.subTest(tokens=tokens):
                state = OrderedConstraintState.create(
                    pack_constraints([constraints])[0]
                )
                for token in tokens:
                    state = state.advance(token)
                result = {attr: getattr(state, attr) for attr in expected}
                # assertEqual instead of a bare assert: it still runs under `python -O`.
                self.assertEqual(
                    result,
                    expected,
                    f"TEST({tokens}) GOT: {result} WANTED: {expected}",
                )
class TestConvTBC(unittest.TestCase):
    """Compares ``ConvTBC`` with ``nn.Conv1d`` after copying the Conv1d parameters over."""

    def test_convtbc(self):
        """Outputs and weight/bias/input gradients must match the Conv1d reference."""

        def swap_axes(tensor):
            # Moves dim 0 to the end: (d0, d1, d2) -> (d1, d2, d0).
            return tensor.transpose(0, 1).transpose(1, 2)

        # ksz, in_channels, out_channels
        tbc_layer = ConvTBC(4, 5, kernel_size=3, padding=1)
        # out_channels, in_channels, ksz
        reference = nn.Conv1d(4, 5, kernel_size=3, padding=1)

        # Share parameters so both layers compute the same function.
        tbc_layer.weight.data.copy_(reference.weight.data.transpose(0, 2))
        tbc_layer.bias.data.copy_(reference.bias.data)

        input_tbc = torch.randn(7, 2, 4, requires_grad=True)
        input1d = swap_axes(input_tbc.data)
        input1d.requires_grad = True

        output_tbc = tbc_layer(input_tbc)
        output1d = reference(input1d)
        self.assertAlmostEqual(swap_axes(output_tbc.data), output1d.data)

        # Back-propagate the same upstream gradient through both layers.
        grad_tbc = torch.randn(output_tbc.size())
        grad1d = swap_axes(grad_tbc).contiguous()
        output_tbc.backward(grad_tbc)
        output1d.backward(grad1d)

        self.assertAlmostEqual(
            tbc_layer.weight.grad.data.transpose(0, 2), reference.weight.grad.data
        )
        self.assertAlmostEqual(tbc_layer.bias.grad.data, reference.bias.grad.data)
        self.assertAlmostEqual(swap_axes(input_tbc.grad.data), input1d.grad.data)

    def assertAlmostEqual(self, t1, t2):
        """Assert equal sizes and a maximum absolute difference below 1e-4.

        Note: this replaces ``unittest.TestCase.assertAlmostEqual`` for this
        class with a tensor-specific two-argument version.
        """
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        largest_gap = (t1 - t2).abs().max()
        self.assertLess(largest_gap, 1e-4)
class TestBatchBySize(unittest.TestCase):
    """Shared helpers that compare a batch_by_size implementation with a slow baseline.

    Subclasses supply the implementation under test and call
    ``_run_compare_with_baseline_sweep``.
    """

    @classmethod
    def batch_by_size_baseline(
        cls,
        indices,
        num_tokens_vec,
        max_tokens,
        max_sentences,
        bsz_mult,
    ):
        """Simple, reliable and slow implementation of batch by size

        Args:
            indices: sequence of sample indices; batches are slices of it.
            num_tokens_vec: token count per sample, indexed by position in
                ``indices``.
            max_tokens: cap on ``max sample size * batch size``; ``0`` disables it.
            max_sentences: cap on samples per batch; ``0`` disables it.
            bsz_mult: batches larger than this are trimmed to a multiple of it.

        Returns:
            A list of consecutive slices of ``indices``.

        Raises:
            ValueError: if a single sample already exceeds ``max_tokens``.
                No batch can hold it, and without this check the loop would
                never advance.
        """
        batches = []
        start = 0
        while start < len(indices):
            for end in range(start + 1, len(indices) + 1):
                max_val = max(num_tokens_vec[pos] for pos in range(start, end))
                sent_count = end - start
                num_tokens = max_val * sent_count
                overflow = num_tokens > max_tokens > 0 or sent_count > max_sentences > 0
                terminate = overflow or end == len(indices)
                if overflow:
                    # The sample that caused the overflow starts the next batch.
                    sent_count -= 1
                if terminate:
                    if sent_count > bsz_mult:
                        sent_count = sent_count - sent_count % bsz_mult
                    if sent_count < 1:
                        raise ValueError(
                            f"sample at position {start} has "
                            f"{num_tokens_vec[start]} tokens, which exceeds "
                            f"max_tokens={max_tokens}"
                        )
                    batches.append(indices[start : start + sent_count])
                    start = start + sent_count
                    break
        return batches

    @classmethod
    def _get_error_message(
        cls, max_sentences, max_tokens, bsz_mult, num_tokens_vec, validation, results
    ):
        """Format the failure message listing the parameters and both batch lists."""
        return f"""Reference batch_by_size implementation should produce
                    same output as the baseline method.
                Params:
                    max_sentences={max_sentences},
                    max_tokens={max_tokens},
                    bsz_mult={bsz_mult},
                    num_tokens_vec={num_tokens_vec},
                    expected_batches={validation},
                    returned_batches={results}"""

    def _compare_results(
        self,
        indices_len,
        batch_by_size_impl,
        max_sentences,
        max_tokens,
        bsz_mult,
        num_tokens_vec,
    ):
        """Run baseline and ``batch_by_size_impl`` on ``range(indices_len)`` and compare."""
        indices = np.array(list(range(indices_len)))

        validation = self.batch_by_size_baseline(
            indices,
            num_tokens_vec,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            bsz_mult=bsz_mult,
        )
        results = batch_by_size_impl(
            indices,
            num_tokens_vec,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            bsz_mult=bsz_mult,
        )

        error_msg = self._get_error_message(
            max_sentences, max_tokens, bsz_mult, num_tokens_vec, validation, results
        )
        self.assertEqual(len(validation), len(results), error_msg)
        for first, second in zip(validation, results):
            self.assertTrue(np.array_equal(first, second), error_msg)

    def _run_compare_with_baseline_sweep(self, batch_by_size_impl):
        """Compare reference batch_by_size implementation with batch_by_size_baseline
        across a dense grid of hyperparam values"""
        MAX_MAX_TOKENS = 10
        NUM_TOKENS_VECS_COUNT = 5
        for indices_len in [10, 11]:  # try odd and even len of indices
            for max_sentences in range(0, indices_len + 2):
                for max_tokens in range(0, MAX_MAX_TOKENS):
                    for bsz_mult in range(1, max(MAX_MAX_TOKENS, indices_len) + 2):
                        for _ in range(NUM_TOKENS_VECS_COUNT):
                            # Sizes never exceed max_tokens, so every sample fits alone.
                            num_tokens_vec = np.random.randint(
                                0, max_tokens + 1, size=indices_len
                            )
                            self._compare_results(
                                indices_len,
                                batch_by_size_impl,
                                max_sentences,
                                max_tokens,
                                bsz_mult,
                                num_tokens_vec,
                            )
@dataclass
class A(FairseqDataclass):
    """Leaf config with two scalar options."""

    data: str = field(default="test", metadata={"help": "the data input"})
    num_layers: int = field(default=200, metadata={"help": "more layers is better?"})


@dataclass
class B(FairseqDataclass):
    """Config that nests ``A`` one level deep."""

    # default_factory instead of default=A(): a dataclass instance is unhashable
    # (eq=True without frozen), and Python 3.11+ rejects unhashable defaults
    # with "mutable default ... is not allowed". It also gives every B its own A.
    bar: A = field(default_factory=A)
    foo: int = field(default=0, metadata={"help": "not a bar"})


@dataclass
class D(FairseqDataclass):
    """Config that nests ``A`` under the name ``arch``."""

    arch: A = field(default_factory=A)
    foo: int = field(default=0, metadata={"help": "not a bar"})


@dataclass
class C(FairseqDataclass):
    """Root config nesting ``D`` (which nests ``A``) and ``A`` side by side."""

    data: str = field(default="test", metadata={"help": "root level data input"})
    encoder: D = field(default_factory=D)
    decoder: A = field(default_factory=A)
    lr: int = field(default=0, metadata={"help": "learning rate"})
class TestDataset(unittest.TestCase):
    """Tests for ``RoundRobinZipDatasets`` built from two language-pair datasets."""

    def setUp(self):
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def test_round_robin_zip_datasets(self):
        """Items of the shorter dataset are reused while iterating the longer one."""
        long_dataset = lang_pair_dataset([10, 9, 8, 11])
        short_dataset = lang_pair_dataset([11, 9])

        dataset = RoundRobinZipDatasets({"a": long_dataset, "b": short_dataset})
        # Dataset is now sorted by sentence length
        dataset.ordered_indices()
        # assertIs rather than a bare assert, which `python -O` would strip.
        self.assertIs(dataset.longest_dataset, long_dataset)
        self.assertEqual(dict(dataset[0]), {"a": sample(2, 8), "b": sample(1, 9)})
        # The item 2 of dataset 'a' is with item (2 % 2 = 0) of dataset 'b'
        self.assertEqual(dict(dataset[2]), {"a": sample(0, 10), "b": sample(1, 9)})

    def test_round_robin_zip_datasets_filtered(self):
        """Filter with a separate size limit per sub-dataset."""
        long_dataset = lang_pair_dataset([10, 20, 8, 11, 1000, 7, 12])
        short_dataset = lang_pair_dataset([11, 20, 9, 1000])

        dataset = RoundRobinZipDatasets({"a": long_dataset, "b": short_dataset})
        # Dataset is now sorted by sentence length
        idx = dataset.ordered_indices()
        idx, _ = dataset.filter_indices_by_size(idx, {"a": 19, "b": 900})
        self.assertEqual(list(idx), [0, 1, 2, 3, 4])
        self.assertEqual(dict(dataset[0]), {"a": sample(5, 7), "b": sample(2, 9)})
        self.assertEqual(dict(dataset[2]), {"a": sample(0, 10), "b": sample(1, 20)})
        self.assertEqual(dict(dataset[4]), {"a": sample(6, 12), "b": sample(0, 11)})

    def test_round_robin_zip_datasets_filtered_with_tuple(self):
        """Filter with one size limit (19) passed for all sub-datasets."""
        long_dataset = lang_pair_dataset([10, 20, 8, 11, 1000, 7, 12])
        short_dataset = lang_pair_dataset([11, 20, 9, 1000])

        dataset = RoundRobinZipDatasets({"a": long_dataset, "b": short_dataset})
        # Dataset is now sorted by sentence length
        idx = dataset.ordered_indices()
        idx, _ = dataset.filter_indices_by_size(idx, 19)
        self.assertEqual(list(idx), [0, 1, 2, 3, 4])
        self.assertEqual(dict(dataset[0]), {"a": sample(5, 7), "b": sample(2, 9)})
        self.assertEqual(dict(dataset[2]), {"a": sample(0, 10), "b": sample(2, 9)})
        self.assertEqual(dict(dataset[4]), {"a": sample(6, 12), "b": sample(2, 9)})
class TestDictionary(unittest.TestCase):
    """Tests for building, finalizing, saving and loading a ``Dictionary``."""

    def test_finalize(self):
        """Ids before and after ``finalize()`` and after a save/load round trip."""
        txt = [
            "A B C D",
            "B C D",
            "C D",
            "D",
        ]
        ref_ids1 = list(
            map(
                torch.IntTensor,
                [
                    [4, 5, 6, 7, 2],
                    [5, 6, 7, 2],
                    [6, 7, 2],
                    [7, 2],
                ],
            )
        )
        ref_ids2 = list(
            map(
                torch.IntTensor,
                [
                    [7, 6, 5, 4, 2],
                    [6, 5, 4, 2],
                    [5, 4, 2],
                    [4, 2],
                ],
            )
        )

        # build dictionary
        d = Dictionary()
        for line in txt:
            d.encode_line(line, add_if_not_exist=True)

        def get_ids(dictionary):
            return [
                dictionary.encode_line(line, add_if_not_exist=False) for line in txt
            ]

        def assertMatch(ids, ref_ids):
            # zip() stops at the shorter input, so check the lengths explicitly.
            self.assertEqual(len(ids), len(ref_ids))
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode="w") as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)

    def test_overwrite(self):
        """Entries flagged ``#fairseq:overwrite`` may redefine existing symbols."""
        # for example, Camembert overwrites <unk>, <s> and </s>
        dict_file = io.StringIO(
            "<unk> 999 #fairseq:overwrite\n"
            "<s> 999 #fairseq:overwrite\n"
            "</s> 999 #fairseq:overwrite\n"
            ", 999\n"
            "▁de 999\n"
        )
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index("<pad>"), 1)
        self.assertEqual(d.index("foo"), 3)
        self.assertEqual(d.index("<unk>"), 4)
        self.assertEqual(d.index("<s>"), 5)
        self.assertEqual(d.index("</s>"), 6)
        self.assertEqual(d.index(","), 7)
        self.assertEqual(d.index("▁de"), 8)

    def test_no_overwrite(self):
        """Re-declaring existing symbols without the overwrite flag must fail."""
        # same entries as in test_overwrite, but without "#fairseq:overwrite"
        dict_file = io.StringIO(
            "<unk> 999\n" "<s> 999\n" "</s> 999\n" ", 999\n" "▁de 999\n"
        )
        d = Dictionary()
        with self.assertRaisesRegex(RuntimeError, "Duplicate"):
            d.add_from_file(dict_file)

    def test_space(self):
        """A single space can itself be a dictionary symbol."""
        # for example, character models treat space as a symbol
        # The first entry is the symbol " " followed by the " " separator and the
        # count, hence two spaces. NOTE(review): this assumes the loader splits
        # each line on its last space - confirm against Dictionary.add_from_file.
        dict_file = io.StringIO("  999\n" "a 999\n" "b 999\n")
        d = Dictionary()
        d.add_from_file(dict_file)
        self.assertEqual(d.index(" "), 4)
        self.assertEqual(d.index("a"), 5)
        self.assertEqual(d.index("b"), 6)

    def test_add_file_to_dict(self):
        """Counts gathered from a text file must match what was written to it."""
        counts = {}
        num_lines = 100
        per_line = 10
        with tempfile.TemporaryDirectory("test_sampling") as data_dir:
            filename = os.path.join(data_dir, "dummy.txt")
            with open(filename, "w", encoding="utf-8") as data:
                for c in string.ascii_letters:
                    line = f"{c} " * per_line
                    for _ in range(num_lines):
                        data.write(f"{line}\n")
                    counts[c] = per_line * num_lines
                    per_line += 5

            # Named `dictionary` so the builtin `dict` is not shadowed.
            dictionary = Dictionary()
            Dictionary.add_file_to_dictionary(
                filename, dictionary, tokenizer.tokenize_line, 10
            )
            dictionary.finalize(threshold=0, nwords=-1, padding_factor=8)

            for c in string.ascii_letters:
                count = dictionary.get_count(dictionary.index(c))
                self.assertEqual(
                    counts[c], count, f"{c} count is {count} but should be {counts[c]}"
                )
class TestEMA(unittest.TestCase):
    """Tests for the ``EMA`` wrapper, driven by one SGD step on ``DummyModule``."""

    def assertTorchAllClose(self, x, y, atol=1e-8, rtol=1e-5, msg=None):
        """Assert ``|x - y| <= atol + rtol * |y|`` using tensor norms.

        Both tensors are cast to float first. When ``msg`` is None a default
        message containing ``atol`` and ``rtol`` is used.
        """
        diff = x.float() - y.float()
        diff_norm = torch.norm(diff)
        other_norm = torch.norm(y.float())

        if msg is None:
            msg = "|input - other| > {} + {} * |other|".format(atol, rtol)

        self.assertLessEqual(
            diff_norm,
            atol + rtol * other_norm,
            msg=msg,
        )

    def test_ema(self):
        """One EMA step must blend old and new params with ``ema_decay``."""
        model = DummyModule()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        # Snapshot of the parameters before the optimizer step.
        state = deepcopy(model.state_dict())
        config = EMAConfig()
        ema = EMA(model, config)

        # set decay
        ema._set_decay(config.ema_decay)
        self.assertEqual(ema.get_decay(), config.ema_decay)

        # get model
        self.assertEqual(ema.get_model(), ema.model)

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)

        # EMA step
        x = torch.randn(32)
        y = model(x)
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model)

        ema_state_dict = ema.get_model().state_dict()

        for key, param in model.state_dict().items():
            prev_param = state[key]
            ema_param = ema_state_dict[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue
            self.assertTorchAllClose(
                ema_param,
                config.ema_decay * prev_param + (1 - config.ema_decay) * param,
            )

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)

        # Load EMA into model
        model2 = DummyModule()
        ema.reverse(model2)

        for key, param in model2.state_dict().items():
            ema_param = ema_state_dict[key]
            self.assertTrue(torch.allclose(ema_param, param))

        # Check that step_internal is called once
        with patch.object(ema, "_step_internal", return_value=None) as mock_method:
            ema.step(model)
            mock_method.assert_called_once_with(model, None)

    def _test_ema_start_update(self, updates):
        """Shared body for the ``ema_start_update=1`` tests.

        With ``updates == 0`` the test expects a decay of 0 and EMA params
        equal to the model params; otherwise it expects the configured decay.
        """
        model = DummyModule()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        # Snapshot of the parameters before the optimizer step.
        state = deepcopy(model.state_dict())
        config = EMAConfig(ema_start_update=1)
        ema = EMA(model, config)

        # EMA step
        x = torch.randn(32)
        y = model(x)
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model, updates=updates)
        ema_state_dict = ema.get_model().state_dict()

        self.assertEqual(ema.get_decay(), 0 if updates == 0 else config.ema_decay)

        for key, param in model.state_dict().items():
            ema_param = ema_state_dict[key]
            prev_param = state[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue
            if updates == 0:
                self.assertTorchAllClose(
                    ema_param,
                    param,
                )
            else:
                self.assertTorchAllClose(
                    ema_param,
                    config.ema_decay * prev_param + (1 - config.ema_decay) * param,
                )

        # Check that step_internal is called once
        with patch.object(ema, "_step_internal", return_value=None) as mock_method:
            ema.step(model, updates=updates)
            mock_method.assert_called_once_with(model, updates)

    def test_ema_before_start_update(self):
        """``updates=0`` is below ``ema_start_update=1``."""
        self._test_ema_start_update(updates=0)

    def test_ema_after_start_update(self):
        """``updates=1`` reaches ``ema_start_update=1``."""
        self._test_ema_start_update(updates=1)

    def test_ema_fp32(self):
        """With ``ema_fp32=True`` every parameter must have an entry in ``fp32_params``."""
        # NOTE(review): dtype is torch.float here, so the ".to(dtype)" casts
        # below do not change precision and both norms compare fp32 results.
        dtype = torch.float

        model = DummyModule().to(dtype)
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        state = deepcopy(model.state_dict())
        config = EMAConfig(ema_fp32=True)
        ema = EMA(model, config)

        x = torch.randn(32)
        y = model(x.to(dtype))
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model)

        for key, param in model.state_dict().items():
            prev_param = state[key]
            ema_param = ema.get_model().state_dict()[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue
            self.assertIn(key, ema.fp32_params)

            # EMA update is done in fp32, and hence the EMA param must be
            # closer to the EMA update done in fp32 than in fp16.
            self.assertLessEqual(
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param.float()
                        + (1 - config.ema_decay) * param.float()
                    )
                    .to(dtype)
                    .float()
                ),
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param + (1 - config.ema_decay) * param
                    ).float()
                ),
            )
            self.assertTorchAllClose(
                ema_param,
                (
                    config.ema_decay * prev_param.float()
                    + (1 - config.ema_decay) * param.float()
                ).to(dtype),
            )

    @pytest.mark.skipif(
        not torch.cuda.is_available(),
        reason="CPU no longer supports Linear in half precision",
    )
    def test_ema_fp16(self):
        """Half-precision model on CUDA with ``ema_fp32=False``: no fp32 copies are kept."""
        model = DummyModule().cuda().half()
        optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
        state = deepcopy(model.state_dict())
        config = EMAConfig(ema_fp32=False)
        ema = EMA(model, config)

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)

        x = torch.randn(32).cuda()
        y = model(x.half())
        loss = y.sum()
        loss.backward()
        optimizer.step()

        ema.step(model)

        for key, param in model.state_dict().items():
            prev_param = state[key]
            ema_param = ema.get_model().state_dict()[key]

            if "version" in key:
                # Do not decay a model.version pytorch param
                continue

            # EMA update is done in fp16, and hence the EMA param must be
            # closer to the EMA update done in fp16 than in fp32.
            self.assertLessEqual(
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param + (1 - config.ema_decay) * param
                    ).float()
                ),
                torch.norm(
                    ema_param.float()
                    - (
                        config.ema_decay * prev_param.float()
                        + (1 - config.ema_decay) * param.float()
                    )
                    .half()
                    .float()
                ),
            )
            self.assertTorchAllClose(
                ema_param,
                config.ema_decay * prev_param + (1 - config.ema_decay) * param,
            )

        # Since fp32 params is not used, it should be of size 0
        self.assertEqual(len(ema.fp32_params), 0)
class TestRelPositionMultiHeadedAttention(unittest.TestCase):
    """Regression tests comparing ``RelPositionMultiHeadedAttention`` outputs
    with hard-coded reference values obtained under ``torch.manual_seed(0)``."""

    # Absolute tolerance used for every comparison against the references.
    _ATOL = 1e-4

    def setUp(self) -> None:
        self.T, self.B, self.C = 3, 1, 2
        torch.manual_seed(0)
        # The reference values depend on the order of these random draws and
        # of the module construction, so keep the sequence as is.
        self.sample = torch.randn(self.T, self.B, self.C)  # TBC
        rel_len = self.T * 2 - 1
        self.sample_x = torch.randn(self.B, 1, self.T, rel_len)
        self.sample_pos = torch.randn(self.B, rel_len, self.C)
        self.MHA = RelPositionMultiHeadedAttention(self.C, 1, dropout=0)

    def _assert_matches(self, expected, actual):
        """Assert that two tensors agree element-wise within ``_ATOL``."""
        expected_np = expected.cpu().detach().numpy()
        actual_np = actual.cpu().detach().numpy()
        self.assertTrue(np.allclose(expected_np, actual_np, atol=self._ATOL))

    def test_rel_shift(self):
        """``rel_shift`` applied to ``sample_x`` must reproduce the stored values."""
        shifted = self.MHA.rel_shift(self.sample_x)
        reference = torch.tensor(
            [
                [
                    [
                        [-0.7193, -0.4033, -0.5966],
                        [-0.8567, 1.1006, -1.0712],
                        [-0.5663, 0.3731, -0.8920],
                    ]
                ]
            ]
        )
        self._assert_matches(reference, shifted)

    def test_forward(self):
        """The forward pass with positional input must reproduce the stored scores."""
        # The reference is the same three rows repeated five times (15 rows).
        reference_rows = torch.tensor(
            [[[-0.9609, -0.5020]], [[-0.9308, -0.4890]], [[-0.9473, -0.4948]]]
        )
        reference = reference_rows.repeat(5, 1, 1)
        scores, _ = self.MHA(self.sample, self.sample, self.sample, self.sample_pos)
        self._assert_matches(reference, scores)
def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
    """Return a ``Dictionary`` with ``vocab_size`` dummy symbols added.

    The added symbols are the decimal strings ``"0"`` .. ``str(vocab_size - 1)``,
    each registered through ``add_symbol`` with a second argument of 1000.

    Args:
        vocab_size: number of dummy symbols to add.
    """
    dummy_dict = Dictionary()
    # add dummy symbol to satisfy vocab size
    # (iterate the range directly; the old loop shadowed the builtin `id`
    # and wrapped the range in a redundant enumerate)
    for index in range(vocab_size):
        dummy_dict.add_symbol(str(index), 1000)
    return dummy_dict
class TestExportModels(unittest.TestCase):
    """TorchScript export tests: each module is scripted with
    ``torch.jit.script`` and passed through ``_test_save_and_load``."""

    @staticmethod
    def _new_scripted_attention():
        """Return a freshly built and scripted 2-head, 8-dim ``MultiheadAttention``."""
        attention = multihead_attention.MultiheadAttention(embed_dim=8, num_heads=2)
        return torch.jit.script(attention)

    def _export_transformer(self, **arg_overrides):
        """Build a ``TransformerModel`` from parser defaults, script and save/load it.

        Keyword arguments are set as attributes on the parsed args before the
        model is built.
        """
        task, parser = get_dummy_task_and_parser()
        TransformerModel.add_args(parser)
        args = parser.parse_args([])
        for attr_name, attr_value in arg_overrides.items():
            setattr(args, attr_name, attr_value)
        model = TransformerModel.build_model(args, task)
        _test_save_and_load(torch.jit.script(model))

    def test_export_multihead_attention(self):
        _test_save_and_load(self._new_scripted_attention())

    def test_incremental_state_multihead_attention(self):
        """Two scripted modules writing under the same key into one shared
        state dict must each read back their own value."""
        first = self._new_scripted_attention()
        second = self._new_scripted_attention()

        state = {}
        state = first.set_incremental_state(state, "key", {"a": torch.tensor([1])})
        state = second.set_incremental_state(state, "key", {"a": torch.tensor([2])})
        stored = [
            module.get_incremental_state(state, "key")["a"]
            for module in (first, second)
        ]

        self.assertEqual(stored[0], 1)
        self.assertEqual(stored[1], 2)

    def test_positional_embedding(self):
        embedding = sinusoidal_positional_embedding.SinusoidalPositionalEmbedding(
            embedding_dim=8, padding_idx=1
        )
        _test_save_and_load(torch.jit.script(embedding))

    @unittest.skipIf(
        torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release"
    )
    def test_export_transformer(self):
        self._export_transformer()

    @unittest.skipIf(
        torch.__version__ < "1.6.0", "Targeting OSS scriptability for the 1.6 release"
    )
    def test_export_transformer_no_token_pos_emb(self):
        self._export_transformer(no_token_positional_embeddings=True)
if cls._tmpdir is not None: shutil.rmtree(cls._tmpdir) # type: ignore def test_find_offsets(self): from fairseq.file_chunker_utils import find_offsets offsets = find_offsets(self._tmpfile, self._num_splits) self.assertEqual(len(offsets), self._num_splits + 1) (zero, *real_offsets, last) = offsets self.assertEqual(zero, 0) for i, o in enumerate(real_offsets): self.assertEqual( o, self._num_bytes + ((i + 1) * self._num_bytes * self._num_lines / self._num_splits), ) self.assertEqual(last, self._num_bytes * self._num_lines) def test_readchunks(self): from fairseq.file_chunker_utils import Chunker, find_offsets offsets = find_offsets(self._tmpfile, self._num_splits) for start, end in zip(offsets, offsets[1:]): with Chunker(self._tmpfile, start, end) as lines: all_lines = list(lines) num_lines = self._num_lines / self._num_splits self.assertAlmostEqual( len(all_lines), num_lines, delta=1 ) # because we split on the bites, we might end up with one more/less line in a chunk self.assertListEqual( all_lines, [self._line_content for _ in range(len(all_lines))] ) ================================================ FILE: tests/test_file_io.py ================================================ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import os import shutil import sys import tempfile import unittest from typing import Optional from unittest.mock import MagicMock class TestFileIO(unittest.TestCase): _tmpdir: Optional[str] = None _tmpfile: Optional[str] = None _tmpfile_contents = "Hello, World" @classmethod def setUpClass(cls) -> None: cls._tmpdir = tempfile.mkdtemp() with open(os.path.join(cls._tmpdir, "test.txt"), "w") as f: cls._tmpfile = f.name f.write(cls._tmpfile_contents) f.flush() @classmethod def tearDownClass(cls) -> None: # Cleanup temp working dir. 
if cls._tmpdir is not None: shutil.rmtree(cls._tmpdir) # type: ignore def test_file_io(self): from fairseq.file_io import PathManager with PathManager.open(os.path.join(self._tmpdir, "test.txt"), "r") as f: s = f.read() self.assertEqual(s, self._tmpfile_contents) def test_file_io_oss(self): # Mock iopath to simulate oss environment. sys.modules["iopath"] = MagicMock() from fairseq.file_io import PathManager with PathManager.open(os.path.join(self._tmpdir, "test.txt"), "r") as f: s = f.read() self.assertEqual(s, self._tmpfile_contents) def test_file_io_async(self): # ioPath `PathManager` is initialized after the first `opena` call. try: from fairseq.file_io import PathManager _asyncfile = os.path.join(self._tmpdir, "async.txt") f = PathManager.opena(_asyncfile, "wb") f.close() finally: self.assertTrue(PathManager.async_close()) ================================================ FILE: tests/test_fp16_optimizer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestGradientScaling(unittest.TestCase):
    """Run one optimisation step of a 1x1 half-precision linear model through
    the fp16 optimizer wrappers and compare against fixed expected values."""

    def setUp(self):
        init_weight, init_bias = 3.0, 5.0

        self.x = torch.tensor([2.0]).cuda().half()
        self.error = 1.0
        # Target is the initial model output shifted by `self.error`.
        self.target = (
            torch.tensor([self.x * init_weight + init_bias + self.error])
            .cuda()
            .half()
        )
        self.loss_fn = torch.nn.L1Loss()

        linear = torch.nn.Linear(1, 1)
        linear.weight.data = torch.tensor([[init_weight]])
        linear.bias.data = torch.tensor([init_bias])
        linear.cuda().half()
        self.model = linear
        self.params = list(linear.parameters())

        optimization_cfg = {"lr": [0.1]}
        optimizer_cfg = {
            "_name": "adam",
            "lr": [0.1],
            "adam_betas": "(0.9, 0.999)",
            "adam_eps": 1e-8,
            "weight_decay": 0.0,
        }
        common_cfg = {
            "fp16_init_scale": 1,
            "fp16_scale_window": 1,
            "fp16_scale_tolerance": 1,
            "threshold_loss_scale": 1,
            "min_loss_scale": 1e-4,
            "tpu": False,
        }
        self.cfg_dls = OmegaConf.create(
            {
                "optimization": optimization_cfg,
                "optimizer": optimizer_cfg,
                "common": common_cfg,
            }
        )
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def run_iter(self, model, params, optimizer):
        """Run a single backward/step cycle and check loss, gradient norm,
        updated parameters and the resulting loss scale.

        ``params`` is accepted for interface compatibility and is not used.
        """
        half_on_gpu = {"device": "cuda:0", "dtype": torch.float16}

        optimizer.zero_grad()
        prediction = model(self.x)
        loss = self.loss_fn(prediction, self.target)
        optimizer.backward(loss)
        self.assertEqual(loss, torch.tensor(1.0, **half_on_gpu))

        grad_norm = optimizer.clip_grad_norm(0)
        self.assertAlmostEqual(grad_norm.item(), 2.2361, 4)

        optimizer.step()
        expected_weight = torch.tensor([[3.0996]], requires_grad=True, **half_on_gpu)
        self.assertEqual(model.weight, expected_weight)
        expected_bias = torch.tensor([5.1016], requires_grad=True, **half_on_gpu)
        self.assertEqual(model.bias, expected_bias)
        self.assertEqual(optimizer.scaler.loss_scale, 2.0)

    def _build_and_run(self, optimizer_cls):
        """Copy the model, build ``optimizer_cls`` for it, run one iteration
        and return the optimizer."""
        model = copy.deepcopy(self.model)
        params = list(model.parameters())
        optimizer = optimizer_cls.build_optimizer(self.cfg_dls, params)
        self.run_iter(model, params, optimizer)
        return optimizer

    def test_mixed_precision(self):
        optimizer = self._build_and_run(FP16Optimizer)
        # Every fp32 parameter copy held by the optimizer must hold the
        # expected (weight, bias) pair.
        expected_fp32 = torch.tensor(
            [3.1000, 5.1000], device="cuda:0", requires_grad=True
        )
        for fp32_params in optimizer.fp32_params.values():
            self.assertTrue(torch.all(fp32_params.eq(expected_fp32)))

    def test_memory_efficient(self):
        self._build_and_run(MemoryEfficientFP16Optimizer)
def make_counts(data: tp.List[tp.List[str]]) -> Counter:
    """Count how often each symbol occurs across all sentences in ``data``."""
    counts: Counter = Counter()
    for sentence in data:
        for symbol in sentence:
            counts[symbol] += 1
    return counts
class TestHuffmanDataset(unittest.TestCase):
    """Round-trip, size and append tests for the Huffman mmap indexed dataset."""

    @staticmethod
    def _make_data_and_coder():
        """Return freshly generated data and a coder built from that data."""
        data = make_data()
        coder = make_code_builder(data).build_code()
        return data, coder

    @staticmethod
    def _decode(dataset, start, stop):
        """Decode items ``start`` .. ``stop - 1`` of ``dataset`` into symbol lists."""
        return [list(dataset.get_symbols(index)) for index in range(start, stop)]

    def test_huffman_can_encode_decode(self):
        data, coder = self._make_data_and_coder()

        with TemporaryDirectory() as workdir:
            prefix = os.path.join(workdir, "test1")
            build_dataset(prefix, data, coder)
            dataset = HuffmanMMapIndexedDataset(prefix)

            self.assertEqual(len(dataset), len(data))
            self.assertEqual(self._decode(dataset, 0, len(dataset)), data)

            stored_sizes = [size.item() for size in dataset.sizes]
            self.assertEqual(stored_sizes, sizes(data))

    def test_huffman_compresses(self):
        """The Huffman data file must be smaller than the plain mmap data file
        written for the same sentences."""
        data, coder = self._make_data_and_coder()

        with TemporaryDirectory() as workdir:
            huffman_prefix = os.path.join(workdir, "huffman")
            build_dataset(huffman_prefix, data, coder)

            mmap_prefix = os.path.join(workdir, "mmap")
            mmap_builder = indexed_dataset.make_builder(
                indexed_dataset.data_file_path(mmap_prefix),
                "mmap",
                vocab_size=len(POPULATION),
            )

            dictionary = Dictionary()
            for symbol in POPULATION:
                dictionary.add_symbol(symbol)
            dictionary.finalize()

            for sentence in data:
                line = " ".join(sentence)
                mmap_builder.add_item(dictionary.encode_line(line))
            mmap_builder.finalize(indexed_dataset.index_file_path(mmap_prefix))

            huffman_bytes = os.path.getsize(
                indexed_dataset.data_file_path(huffman_prefix)
            )
            mmap_bytes = os.path.getsize(indexed_dataset.data_file_path(mmap_prefix))
            self.assertLess(huffman_bytes, mmap_bytes)

    def test_huffman_can_append(self):
        """Appending two datasets into a third must yield the first dataset's
        items followed by the second's, with matching sizes."""
        first_data, coder = self._make_data_and_coder()

        with TemporaryDirectory() as workdir:
            first_prefix = os.path.join(workdir, "test1")
            build_dataset(first_prefix, first_data, coder)

            second_data = make_data()
            second_prefix = os.path.join(workdir, "test2")
            build_dataset(second_prefix, second_data, coder)

            merged_prefix = os.path.join(workdir, "test3")
            with HuffmanMMapIndexedDatasetBuilder(merged_prefix, coder) as merged:
                merged.append(first_prefix)
                merged.append(second_prefix)

            dataset = HuffmanMMapIndexedDataset(merged_prefix)
            split = len(first_data)

            self.assertEqual(len(dataset), split + len(second_data))
            self.assertEqual(self._decode(dataset, 0, split), first_data)
            self.assertEqual(self._decode(dataset, split, len(dataset)), second_data)

            stored_sizes = [size.item() for size in dataset.sizes]
            self.assertEqual(stored_sizes[:split], sizes(first_data))
            self.assertEqual(stored_sizes[split : len(dataset)], sizes(second_data))
class TestInferenceDropout(unittest.TestCase):
    """Checks on the ``dropout_module`` flags of a ``TransformerModel`` after
    ``prepare_for_inference_`` and after switching to eval mode.

    The checks use unittest assertion methods rather than bare ``assert``
    statements, so they still run when Python is started with ``-O``.
    """

    def setUp(self):
        self.task, self.parser = get_dummy_task_and_parser()
        TransformerModel.add_args(self.parser)
        self.args = self.parser.parse_args([])
        self.args.encoder_layers = 2
        self.args.decoder_layers = 1
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        logging.disable(logging.NOTSET)

    def _build_model(self, prepare_for_inference=True):
        """Build the model from ``self.args`` and store it on ``self.transformer_model``.

        When ``prepare_for_inference`` is true, ``prepare_for_inference_`` is
        also called with the config converted from ``self.args``.
        """
        self.transformer_model = TransformerModel.build_model(self.args, self.task)
        if prepare_for_inference:
            cfg = convert_namespace_to_omegaconf(self.args)
            self.transformer_model.prepare_for_inference_(cfg)
        return self.transformer_model

    def test_sets_inference_dropout_to_true(self):
        self.args.retain_dropout = True
        model = self._build_model()
        self.assertTrue(model.encoder.dropout_module.apply_during_inference)
        self.assertTrue(model.decoder.dropout_module.apply_during_inference)
        for layer in model.encoder.layers:
            self.assertTrue(layer.dropout_module.apply_during_inference)

    def test_inference_dropout_false_by_default(self):
        model = self._build_model()
        self.assertFalse(model.encoder.dropout_module.apply_during_inference)
        self.assertFalse(model.decoder.dropout_module.apply_during_inference)
        for layer in model.encoder.layers:
            self.assertFalse(layer.dropout_module.apply_during_inference)
        for layer in model.decoder.layers:
            self.assertFalse(layer.dropout_module.apply_during_inference)

    def test_applies_training_mode(self):
        model = self._build_model(prepare_for_inference=False)
        self.assertTrue(model.encoder.dropout_module.training)
        for layer in model.encoder.layers:
            self.assertTrue(layer.dropout_module.training)

        model.eval()

        # NOTE(review): the check before eval() looks at the encoder, the one
        # after looks at the decoder; kept as in the original test.
        self.assertFalse(model.decoder.dropout_module.training)
        for layer in model.encoder.layers:
            self.assertFalse(layer.dropout_module.training)

    def test_retain_modules(self):
        self.args.retain_dropout = True
        self.args.retain_dropout_modules = [
            "TransformerEncoder",
            "TransformerEncoderLayer",
        ]
        model = self._build_model()
        self.assertTrue(model.encoder.dropout_module.apply_during_inference)
        self.assertFalse(model.decoder.dropout_module.apply_during_inference)
        for layer in model.decoder.layers:
            self.assertFalse(layer.dropout_module.apply_during_inference)
class TestIterators(unittest.TestCase):
    """Tests for the iterator wrappers in ``fairseq.data.iterators``.

    Most checks are stateful: each assertion depends on how far the iterator
    has already been advanced, so the statement order inside a test matters.
    """

    def test_counting_iterator_index(self, ref=None, itr=None):
        """Check position tracking (``n``), ``skip`` and ``has_next``.

        Run without arguments it tests a ``CountingIterator`` over
        ``range(10)``. ``test_grouped_iterator`` and ``test_sharded_iterator``
        also call it as a helper, passing a 10-element reference list and the
        iterator that should produce it.
        """
        # Test the indexing functionality of CountingIterator
        if ref is None:
            assert itr is None
            ref = list(range(10))
            itr = iterators.CountingIterator(ref)
        else:
            assert len(ref) == 10
            assert itr is not None

        self.assertTrue(itr.has_next())
        self.assertEqual(itr.n, 0)
        self.assertEqual(next(itr), ref[0])
        self.assertEqual(itr.n, 1)
        self.assertEqual(next(itr), ref[1])
        self.assertEqual(itr.n, 2)
        # Skipping 3 items is expected to move the position from 2 to 5.
        itr.skip(3)
        self.assertEqual(itr.n, 5)
        self.assertEqual(next(itr), ref[5])
        # One item was consumed above (n == 6), so skipping 2 lands on 8.
        itr.skip(2)
        self.assertEqual(itr.n, 8)
        self.assertEqual(list(itr), [ref[8], ref[9]])
        self.assertFalse(itr.has_next())

    def test_counting_iterator_length_mismatch(self):
        """Check behaviour when ``total`` differs from the iterable's length."""
        ref = list(range(10))
        # When the underlying iterable is longer than the CountingIterator,
        # the remaining items in the iterable should be ignored
        itr = iterators.CountingIterator(ref, total=8)
        self.assertEqual(list(itr), ref[:8])
        # When the underlying iterable is shorter than the CountingIterator,
        # raise an IndexError when the underlying iterable is exhausted
        itr = iterators.CountingIterator(ref, total=12)
        self.assertRaises(IndexError, list, itr)

    def test_counting_iterator_take(self):
        """Check that ``take(5)`` limits both ``len`` and iteration to 5 items."""
        # Test the "take" method of CountingIterator
        ref = list(range(10))
        itr = iterators.CountingIterator(ref)
        itr.take(5)
        self.assertEqual(len(itr), len(list(iter(itr))))
        self.assertEqual(len(itr), 5)

        # take() combined with next()/skip(): two items read, two skipped,
        # so the fifth item (ref[4]) is the last one available.
        itr = iterators.CountingIterator(ref)
        itr.take(5)
        self.assertEqual(next(itr), ref[0])
        self.assertEqual(next(itr), ref[1])
        itr.skip(2)
        self.assertEqual(next(itr), ref[4])
        self.assertFalse(itr.has_next())

    def test_grouped_iterator(self):
        """Check chunking by ``GroupedIterator``, including a short last chunk."""
        # test correctness
        x = list(range(10))
        itr = iterators.GroupedIterator(x, 1)
        self.assertEqual(list(itr), [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9]])
        itr = iterators.GroupedIterator(x, 4)
        self.assertEqual(list(itr), [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]])
        itr = iterators.GroupedIterator(x, 5)
        self.assertEqual(list(itr), [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9]])

        # test the GroupIterator also works correctly as a CountingIterator
        # (30 items in groups of 3 give the 10 elements the helper requires)
        x = list(range(30))
        ref = list(iterators.GroupedIterator(x, 3))
        itr = iterators.GroupedIterator(x, 3)
        self.test_counting_iterator_index(ref, itr)

    def test_sharded_iterator(self):
        """Check which items each shard of ``ShardedIterator`` yields."""
        # test correctness
        x = list(range(10))
        itr = iterators.ShardedIterator(x, num_shards=1, shard_id=0)
        self.assertEqual(list(itr), x)
        itr = iterators.ShardedIterator(x, num_shards=2, shard_id=0)
        self.assertEqual(list(itr), [0, 2, 4, 6, 8])
        itr = iterators.ShardedIterator(x, num_shards=2, shard_id=1)
        self.assertEqual(list(itr), [1, 3, 5, 7, 9])
        # With 3 shards over 10 items, shards 1 and 2 are expected to end
        # with a None entry so that all shards have the same length.
        itr = iterators.ShardedIterator(x, num_shards=3, shard_id=0)
        self.assertEqual(list(itr), [0, 3, 6, 9])
        itr = iterators.ShardedIterator(x, num_shards=3, shard_id=1)
        self.assertEqual(list(itr), [1, 4, 7, None])
        itr = iterators.ShardedIterator(x, num_shards=3, shard_id=2)
        self.assertEqual(list(itr), [2, 5, 8, None])

        # test CountingIterator functionality
        x = list(range(30))
        ref = list(iterators.ShardedIterator(x, num_shards=3, shard_id=0))
        itr = iterators.ShardedIterator(x, num_shards=3, shard_id=0)
        self.test_counting_iterator_index(ref, itr)

    def test_counting_iterator_buffered_iterator_take(self):
        """Check ``take`` on a ``CountingIterator`` wrapping a ``BufferedIterator``.

        The assertions on ``buffered_itr`` expect the limit set on the outer
        iterator to be reflected by the wrapped one as well.
        """
        ref = list(range(10))
        buffered_itr = iterators.BufferedIterator(2, ref)
        itr = iterators.CountingIterator(buffered_itr)
        itr.take(5)
        self.assertEqual(len(itr), len(list(iter(itr))))
        self.assertEqual(len(itr), 5)

        # Same setup, this time inspecting the wrapped iterator directly.
        buffered_itr = iterators.BufferedIterator(2, ref)
        itr = iterators.CountingIterator(buffered_itr)
        itr.take(5)
        self.assertEqual(len(buffered_itr), 5)
        self.assertEqual(len(list(iter(buffered_itr))), 5)

        # take() combined with next()/skip(); once the outer iterator is
        # exhausted the wrapped one must raise StopIteration too.
        buffered_itr = iterators.BufferedIterator(2, ref)
        itr = iterators.CountingIterator(buffered_itr)
        itr.take(5)
        self.assertEqual(next(itr), ref[0])
        self.assertEqual(next(itr), ref[1])
        itr.skip(2)
        self.assertEqual(next(itr), ref[4])
        self.assertFalse(itr.has_next())
        self.assertRaises(StopIteration, next, buffered_itr)

        # Counting starts at 4, so after take(5) only a single item is
        # expected to remain (len(buffered_itr) == 1).
        ref = list(range(4, 10))
        buffered_itr = iterators.BufferedIterator(2, ref)
        itr = iterators.CountingIterator(buffered_itr, start=4)
        itr.take(5)
        self.assertEqual(len(itr), 5)
        self.assertEqual(len(buffered_itr), 1)
        self.assertEqual(next(itr), ref[0])
        self.assertFalse(itr.has_next())
        self.assertRaises(StopIteration, next, buffered_itr)

    def test_epoch_batch_iterator_skip_remainder_batch(self):
        """Check the number of batches for 3 items at batch sizes 2, 1 and 4,
        with and without ``skip_remainder_batch``."""
        reference = [1, 2, 3]
        itr1 = _get_epoch_batch_itr(reference, 2, True)
        self.assertEqual(len(itr1), 1)
        itr2 = _get_epoch_batch_itr(reference, 2, False)
        self.assertEqual(len(itr2), 2)
        itr3 = _get_epoch_batch_itr(reference, 1, True)
        self.assertEqual(len(itr3), 2)
        itr4 = _get_epoch_batch_itr(reference, 1, False)
        self.assertEqual(len(itr4), 3)
        # Batch size larger than the dataset: nothing is left when skipping.
        itr5 = _get_epoch_batch_itr(reference, 4, True)
        self.assertEqual(len(itr5), 0)
        self.assertFalse(itr5.has_next())
        itr6 = _get_epoch_batch_itr(reference, 4, False)
        self.assertEqual(len(itr6), 1)

    def test_grouped_iterator_skip_remainder_batch(self):
        """Check the length of a ``GroupedIterator`` (chunk size 2) wrapped
        around an epoch batch iterator, for each combination of the inner and
        outer skip-remainder flags."""
        reference = [1, 2, 3, 4, 5, 6, 7, 8, 9]
        itr1 = _get_epoch_batch_itr(reference, 3, False)
        grouped_itr1 = iterators.GroupedIterator(itr1, 2, True)
        self.assertEqual(len(grouped_itr1), 1)

        itr2 = _get_epoch_batch_itr(reference, 3, False)
        grouped_itr2 = iterators.GroupedIterator(itr2, 2, False)
        self.assertEqual(len(grouped_itr2), 2)

        itr3 = _get_epoch_batch_itr(reference, 3, True)
        grouped_itr3 = iterators.GroupedIterator(itr3, 2, True)
        self.assertEqual(len(grouped_itr3), 1)

        itr4 = _get_epoch_batch_itr(reference, 3, True)
        grouped_itr4 = iterators.GroupedIterator(itr4, 2, False)
        self.assertEqual(len(grouped_itr4), 1)

        itr5 = _get_epoch_batch_itr(reference, 5, True)
        grouped_itr5 = iterators.GroupedIterator(itr5, 2, True)
        self.assertEqual(len(grouped_itr5), 0)

        itr6 = _get_epoch_batch_itr(reference, 5, True)
        grouped_itr6 = iterators.GroupedIterator(itr6, 2, False)
        self.assertEqual(len(grouped_itr6), 1)
.unsqueeze(0) .expand(2, 3, 7) ) # add batch dimension self.task = test_utils.TestTranslationTask.setup_task(self.args, self.d, self.d) self.model = self.task.build_model(self.args) def test_nll_loss(self): self.args.label_smoothing = 0.1 nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( self.args, self.task ) nll_loss, nll_sample_size, nll_logging_output = nll_crit( self.model, self.sample ) smooth_loss, smooth_sample_size, smooth_logging_output = smooth_crit( self.model, self.sample ) self.assertLess(abs(nll_loss - nll_logging_output["loss"]), 1e-6) self.assertLess(abs(nll_loss - smooth_logging_output["nll_loss"]), 1e-6) def test_padding(self): self.args.label_smoothing = 0.1 crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task) loss, _, logging_output = crit(self.model, self.sample) def get_one_no_padding(idx): # create a new sample with just a single batch item so that there's # no padding sample1 = next(test_utils.dummy_dataloader([self.data[idx]])) args1 = copy.copy(self.args) args1.probs = args1.probs[idx, :, :].unsqueeze(0) model1 = self.task.build_model(args1) loss1, _, _ = crit(model1, sample1) return loss1 loss1 = get_one_no_padding(0) loss2 = get_one_no_padding(1) self.assertAlmostEqual(loss, loss1 + loss2) def test_reduction(self): self.args.label_smoothing = 0.1 crit = LabelSmoothedCrossEntropyCriterion.build_criterion(self.args, self.task) loss, _, logging_output = crit(self.model, self.sample, reduce=True) unreduced_loss, _, _ = crit(self.model, self.sample, reduce=False) self.assertAlmostEqual(loss, unreduced_loss.sum()) def test_zero_eps(self): self.args.label_smoothing = 0.0 nll_crit = CrossEntropyCriterion.build_criterion(self.args, self.task) smooth_crit = LabelSmoothedCrossEntropyCriterion.build_criterion( self.args, self.task ) nll_loss, nll_sample_size, nll_logging_output = nll_crit( self.model, self.sample ) smooth_loss, 
def assertAlmostEqual(self, t1, t2):
    """Assert that two tensors have equal size and differ by less than 1e-6.

    The size check runs first and fails with "size mismatch"; afterwards the
    largest absolute element-wise difference must be below 1e-6.
    """
    self.assertEqual(t1.size(), t2.size(), "size mismatch")
    largest_gap = torch.max(torch.abs(t1 - t2))
    self.assertLess(largest_gap, 1e-6)
def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
    """Build a ``Dictionary`` filled with ``vocab_size`` placeholder symbols.

    Args:
        vocab_size: number of dummy symbols to add. Each symbol is the
            decimal string of its index ("0", "1", ...).

    Returns:
        The populated ``Dictionary``.
    """
    dummy_dict = Dictionary()
    # Add dummy symbols to satisfy the vocab size. The second positional
    # argument (1000) is presumably the symbol count -- TODO confirm against
    # Dictionary.add_symbol.
    for index in range(vocab_size):
        dummy_dict.add_symbol(str(index), 1000)
    return dummy_dict
def assertTensorEqual(self, t1, t2):
    """Assert that two tensors match once their NaN entries are removed.

    Each tensor is filtered to its non-NaN elements (which flattens it), the
    filtered sizes must be equal, and no remaining pair of elements may
    differ.
    """
    # Dropping NaNs can cause size mismatch errors if the NaN counts differ.
    kept1 = t1[torch.isnan(t1).logical_not()]
    kept2 = t2[torch.isnan(t2).logical_not()]
    self.assertEqual(kept1.size(), kept2.size(), "size mismatch")
    differing = kept1.ne(kept2).long().sum()
    self.assertEqual(differing, 0)
@unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU")
class TestMemoryEfficientFP16(unittest.TestCase):
    """Round-trips the state dict of a ``MemoryEfficientFP16Optimizer``."""

    def setUp(self):
        # Silence all logging for the duration of each test.
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        # Restore normal logging.
        logging.disable(logging.NOTSET)

    def test_load_state_dict(self):
        """After reloading its own state, keys are FP16 and tensor values FP32."""
        # A simple FP16 model living on the GPU.
        fp16_model = torch.nn.Linear(5, 5).cuda().half()
        params = list(fp16_model.parameters())

        # Pseudo DictConfigs built from plain namespaces.
        adam_args = argparse.Namespace(
            adam_betas="(0.9, 0.999)",
            adam_eps=1e-8,
            weight_decay=0.0,
            lr=[0.00001],
        )
        adam = FairseqAdam(cfg=OmegaConf.create(vars(adam_args)), params=params)

        fp16_args = argparse.Namespace(
            fp16_init_scale=1,
            fp16_scale_window=1,
            fp16_scale_tolerance=1,
            threshold_loss_scale=1,
            min_loss_scale=1e-4,
        )
        me_optimizer = MemoryEfficientFP16Optimizer(
            cfg=OmegaConf.create({"common": vars(fp16_args)}),
            params=params,
            optimizer=adam,
        )

        # Optimizer state is created in the first step.
        loss = fp16_model(torch.rand(5).cuda().half()).sum()
        me_optimizer.backward(loss)
        me_optimizer.step()

        # Reload the state the optimizer just produced.
        snapshot = me_optimizer.state_dict()
        me_optimizer.load_state_dict(snapshot)

        for param, param_state in me_optimizer.optimizer.state.items():
            self.assertTrue(param.dtype == torch.float16)
            tensor_entries = [
                entry for entry in param_state.values() if torch.is_tensor(entry)
            ]
            for entry in tensor_entries:
                self.assertTrue(entry.dtype == torch.float32)
def test_nesting(self):
    """Log inside two nested aggregation contexts and check both totals.

    The outer context is expected to report a smoothed "loss" of 1.5 and the
    inner context a smoothed "loss" of 2.
    """
    with metrics.aggregate() as outer:
        metrics.log_scalar("loss", 1)
        with metrics.aggregate() as inner:
            metrics.log_scalar("loss", 2)

    outer_loss = outer.get_smoothed_values()["loss"]
    self.assertEqual(outer_loss, 1.5)
    inner_loss = inner.get_smoothed_values()["loss"]
    self.assertEqual(inner_loss, 2)
def _test_sample_helper(
    self,
    distribution,
):
    """Sample from both corpora and check the mix matches ``distribution``.

    Args:
        distribution: two sampling weights, for ``dataset_1`` and
            ``dataset_2`` respectively.

    Checks that the fraction of samples drawn from ``dataset_1`` is within
    0.01 of ``distribution[0]``, and that the number of distinct items equals
    the per-dataset expected counts, each capped at that dataset's size.
    """
    m = MultiCorpusDataset(
        OrderedDict({0: self.dataset_1, 1: self.dataset_2}),
        distribution=distribution,
        seed=0,
        sort_indices=True,
    )
    m.set_epoch(1)
    indices = m.ordered_indices()
    count_sample_from_first_dataset = 0
    items = set()
    for i in indices:
        item = m[i]["source"].item()
        # setUp fills dataset_1 with odd values and dataset_2 with even ones,
        # so parity identifies the source corpus.
        if item % 2 == 1:
            count_sample_from_first_dataset += 1

        items.add(item)
    sample_from_first_ds_percentage = (
        1.0 * count_sample_from_first_dataset / len(indices)
    )
    self.assertLess(
        abs(sample_from_first_ds_percentage - distribution[0]),
        0.01,
    )
    # Each corpus can contribute at most its own size in distinct items; the
    # second bound must therefore use dataset_2 (it previously reused
    # dataset_1, which only worked because both have the same length).
    self.assertEqual(
        len(items),
        int(
            min(len(self.dataset_1), len(indices) * distribution[0])
            + min(len(self.dataset_2), len(indices) * distribution[1])
        ),
    )
def test_multi_corpus_sampled_dataset_weighted_sample(self):
    """Run the sampling helper with a 0.9 / 0.1 weighted sampling function."""

    def naive_weighted_sample(weights):
        """Return a sampler that picks an index according to ``weights``."""

        def pick(input):
            # ``input`` is accepted but not used; the name is kept because
            # the caller of this callback is not visible here.
            draw = np.random.random()
            running_total = 0
            for position, weight in enumerate(weights):
                running_total += weight
                if running_total > draw:
                    return position
            # The cumulative weight never exceeded the draw.
            return None

        return pick

    sampler = naive_weighted_sample(weights=[0.9, 0.1])
    self._test_sample_helper(
        expected_sample_from_first_ds_percentage=0.9,
        sampling_func=sampler,
    )
# FIXME: some tests fail when decimal=2, fix this and set decimal to 2
def assert_almost_equal(x, y, decimal=1, err_msg=""):
    """Assert that ``x`` and ``y`` agree to ``decimal`` decimal places.

    Tensors are moved to the CPU, detached and converted to numpy arrays;
    any other value is passed through unchanged. The comparison itself is
    ``numpy.testing.assert_array_almost_equal``.

    Args:
        x: tensor or array-like.
        y: tensor or array-like.
        decimal: precision forwarded to numpy.
        err_msg: message forwarded to numpy.
    """
    import numpy.testing as npt

    def to_array(value):
        if not isinstance(value, torch.Tensor):
            return value
        return value.cpu().detach().numpy()

    npt.assert_array_almost_equal(
        to_array(x), to_array(y), err_msg=err_msg, decimal=decimal
    )
@pytest.mark.skipif(not torch.cuda.is_available(), reason="blocksparse requires gpu")
@pytest.mark.skip(reason="not part of latest xformers")
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("add_zero_attn", [False])
@pytest.mark.parametrize("batch_size", [20])
@pytest.mark.parametrize("embedding", [64])
@pytest.mark.parametrize("seq_len", [64])
@pytest.mark.parametrize("num_heads", [4])
def test_xformers_blocksparse_parity(
    device,
    add_zero_attn,
    batch_size,
    embedding,
    seq_len,
    num_heads,
):
    """Compare blocksparse attention with plain xformers attention.

    Two half-precision ``MultiheadAttention`` modules are built after
    resetting the seeds, one with an all-ones blocksparse layout and one
    without a layout. Their outputs and the gradients with respect to
    q, k and v must agree within ``assert_almost_equal``'s tolerance.
    """
    xformers_att_config = '{"name": "scaled_dot_product"}'
    xformers_blocksparse_blocksize = 16
    xformers_blocksparse_layout = torch.ones(
        seq_len // xformers_blocksparse_blocksize,
        seq_len // xformers_blocksparse_blocksize,
        dtype=torch.int32,
    )

    q = torch.rand(seq_len, batch_size, embedding).to(device).half()
    q.requires_grad = True
    k = torch.rand(seq_len, batch_size, embedding).to(device).half()
    k.requires_grad = True
    v = torch.rand(seq_len, batch_size, embedding).to(device).half()
    v.requires_grad = True

    # Independent copies so each module accumulates its own gradients.
    q_ = q.detach().clone().half()
    q_.requires_grad = True
    k_ = k.detach().clone().half()
    k_.requires_grad = True
    v_ = v.detach().clone().half()
    v_.requires_grad = True

    _reset_seeds()
    xf_blocksparse_mha = (
        MultiheadAttention(
            embedding,
            num_heads,
            dropout=0.0,
            add_zero_attn=add_zero_attn,
            xformers_att_config=xformers_att_config,
            xformers_blocksparse_layout=xformers_blocksparse_layout,
            xformers_blocksparse_blocksize=xformers_blocksparse_blocksize,
        )
        .to(device)
        .half()
    )

    xf_blocksparse_output, _ = xf_blocksparse_mha(
        q,
        k,
        v,
    )

    _reset_seeds()
    xformers_mha = (
        MultiheadAttention(
            embedding,
            num_heads,
            dropout=0.0,
            add_zero_attn=add_zero_attn,
            xformers_att_config=xformers_att_config,
            xformers_blocksparse_layout=None,
        )
        .to(device)
        .half()
    )

    xformers_output, _ = xformers_mha(
        q_,
        k_,
        v_,
    )

    # account for when nan != nan
    rand = random.uniform(0, 1)
    xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand)
    xf_blocksparse_output = xf_blocksparse_output.masked_fill(
        xf_blocksparse_output.isnan(), rand
    )

    assert_almost_equal(xformers_output, xf_blocksparse_output)

    # Each loss is named after the output it is computed from.
    loss_original = torch.norm(xformers_output)
    loss_blocksparse = torch.norm(xf_blocksparse_output)
    loss_original.backward()
    loss_blocksparse.backward()

    # The former `q.masked_fill(q.isnan(), rand)` statements were removed:
    # masked_fill is out-of-place and its result was discarded, so they had
    # no effect. The gradient comparison below is unchanged.
    assert_almost_equal(q.grad, q_.grad)
    assert_almost_equal(k.grad, k_.grad)
    assert_almost_equal(v.grad, v_.grad)
@pytest.mark.parametrize("add_bias_kv", [True, False]) @pytest.mark.parametrize("add_zero_attn", [True, False]) # TODO: test with static_kv True @pytest.mark.parametrize("static_kv", [False]) @pytest.mark.parametrize("batch_size", BATCH) @pytest.mark.parametrize("embedding", EMB) @pytest.mark.parametrize("seq_len", SEQ) @pytest.mark.parametrize("num_heads", HEADS) def test_xformers_single_forward_parity( device, attn_dtype, key_padding_dtype, add_bias_kv, add_zero_attn, static_kv, batch_size, embedding, seq_len, num_heads, ): xformers_att_config = '{"name": "scaled_dot_product"}' attn_mask = ( None if attn_dtype is None else _get_mask(to_dtype=attn_dtype, dim0=seq_len, dim1=seq_len).to(device) ) key_padding_mask = ( None if key_padding_dtype is None else _get_mask(to_dtype=key_padding_dtype, dim0=batch_size, dim1=seq_len).to( device ) ) q = torch.rand(seq_len, batch_size, embedding).to(device) q.requires_grad = True k = torch.rand(seq_len, batch_size, embedding).to(device) k.requires_grad = True v = torch.rand(seq_len, batch_size, embedding).to(device) v.requires_grad = True q_ = q.detach().clone() q_.requires_grad = True k_ = k.detach().clone() k_.requires_grad = True v_ = v.detach().clone() v_.requires_grad = True # TODO: dropouts in the two implementations lead to different entries dropped. 
_reset_seeds() xformers_mha = MultiheadAttention( embedding, num_heads, dropout=0.0, xformers_att_config=xformers_att_config, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ).to(device) xformers_output, _ = xformers_mha( q, k, v, key_padding_mask=key_padding_mask, attn_mask=attn_mask, static_kv=static_kv, ) _reset_seeds() original_mha = MultiheadAttention( embedding, num_heads, dropout=0.0, xformers_att_config=None, add_bias_kv=add_bias_kv, add_zero_attn=add_zero_attn, ).to(device) original_output, _ = original_mha( q_, k_, v_, key_padding_mask=key_padding_mask, attn_mask=attn_mask, static_kv=static_kv, ) # account for when nan != nan if xformers_output.isnan().any() or original_output.isnan().any(): rand = random.uniform(0, 1) xformers_output = xformers_output.masked_fill(xformers_output.isnan(), rand) original_output = original_output.masked_fill(original_output.isnan(), rand) # torch.equal works for cpu, on cuda allclose is needed. assert torch.allclose( xformers_output, original_output, atol=1e-06 ), f"max diff is {torch.max(torch.abs(xformers_output - original_output))}" loss_xformers = torch.norm(xformers_output) loss_original = torch.norm(original_output) loss_xformers.backward() loss_original.backward() # torch.equal works for cpu, on cuda allclose is needed. 
def test_mask_padding_parity():
    """Check ``MultiheadAttention._pad_masks`` against the former inline code.

    The reference implementation appends one zero column to each mask that
    is not None; ``_pad_masks`` must return masks of the same size and
    content.
    """

    def old_padding_code(key_padding_mask, attn_mask):
        if attn_mask is not None:
            attn_zero_col = attn_mask.new_zeros(attn_mask.size(0), 1)
            attn_mask = torch.cat([attn_mask, attn_zero_col], dim=1)
        if key_padding_mask is not None:
            kp_zero_col = torch.zeros(key_padding_mask.size(0), 1).type_as(
                key_padding_mask
            )
            key_padding_mask = torch.cat([key_padding_mask, kp_zero_col], dim=1)
        return key_padding_mask, attn_mask

    # values don't matter for this test.
    mha = MultiheadAttention(
        embed_dim=8,
        num_heads=2,
        dropout=0.0,
        add_bias_kv=True,
        add_zero_attn=True,
    )

    key_padding_mask = torch.rand((8, 64))
    attn_mask = torch.rand((64, 64))

    expected_kp, expected_attn = old_padding_code(key_padding_mask, attn_mask)
    actual_kp, actual_attn = mha._pad_masks(key_padding_mask, attn_mask)

    assert expected_kp.size() == actual_kp.size()
    assert expected_attn.size() == actual_attn.size()
    assert torch.equal(expected_kp, actual_kp)
    assert torch.equal(expected_attn, actual_attn)
def test_append_prev_key_padding_mask(self):
    """Exercise ``_append_prev_key_padding_mask`` over mask combinations.

    Each case is ``(current mask, previous mask, expected result)``; the
    first two entries are passed positionally in that order. A non-None
    result must equal the expectation and have shape ``(bsz, src_len)``.
    """
    bsz = 1
    src_len = 4

    def row(*bits):
        # One-row boolean mask built from 0/1 flags.
        return torch.tensor([list(bits)]).bool()

    cases = [
        # no padding mask
        (None, None, None),
        # current padding mask only
        (row(1), None, row(0, 0, 0, 1)),
        # previous padding mask only
        (None, row(0, 1, 0), row(0, 1, 0, 0)),
        # both padding masks
        (row(1), row(0, 1, 0), row(0, 1, 0, 1)),
        # current padding mask already spans src_len
        (row(0, 1, 0, 1), None, row(0, 1, 0, 1)),
        # previous padding mask already spans src_len
        (None, row(0, 1, 0, 1), row(0, 1, 0, 1)),
    ]
    for c in cases:
        key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
            c[0],
            c[1],
            batch_size=bsz,
            src_len=src_len,
            static_kv=False,
        )

        if key_padding_mask is None:
            self.assertIsNone(c[2])
            continue

        self.assertTrue(
            torch.all(torch.eq(key_padding_mask, c[2])),
            f"Unexpected resultant key padding mask: {key_padding_mask}"
            f" given current: {c[0]} and previous: {c[1]}",
        )
        self.assertEqual(key_padding_mask.size(0), bsz)
        self.assertEqual(key_padding_mask.size(1), src_len)
def _get_test_data_with_bpe_end_marker(self, append_eos=True):
    """
    Args:
        append_eos: if True, each input sentence in the source tokens tensor
            will have an EOS appended to the end.

    Returns:
        vocab: BPE vocab with end-of-word markers as suffixes to denote
            tokens at the end of a word. This is an alternative to fairseq's
            standard preprocessing framework and is not generally supported
            within fairseq.
        x: input tensor containing numberized source tokens, with EOS at the
            end if append_eos is true
        src_lengths: the source lengths.
    """
    vocab = Dictionary()
    # Symbols are added in this fixed order.
    for symbol in (
        "he",
        "llo_EOW",
        "how_EOW",
        "are_EOW",
        "y",
        "ou_EOW",
        "n",
        "ew_EOW",
        "or",
        "k_EOW",
    ):
        vocab.add_symbol(symbol)

    src_tokens = [
        ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"],
        ["how_EOW", "are_EOW", "y", "ou_EOW"],
    ]
    # Single assignment; the earlier chained `x, src_lengths = x, src_lengths = ...`
    # bound the same names twice.
    x, src_lengths = self._convert_src_tokens_to_tensor(
        vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
    )
    return vocab, x, src_lengths
def assert_eos_at_end(self, x, x_len, eos):
    """Asserts last token of every sentence in x is EOS.

    Args:
        x: token ids indexed as ``x[position][sentence]``.
        x_len: length of each sentence; entry ``i`` belongs to sentence ``i``.
        eos: the expected EOS token id.
    """
    for i, length in enumerate(x_len):
        last_token = x[length - 1][i]
        # The message is formatted before the comparison runs, so it must
        # only touch the element that is actually being checked. It used to
        # read x[i][-1], which reports a different token and can go out of
        # range when there are more sentences than positions.
        self.assertEqual(
            last_token,
            eos,
            (
                "Expected eos (token id {eos}) at the end of sentence {i} "
                "but got {other} instead"
            ).format(i=i, eos=eos, other=last_token),
        )
def generate_unchanged_shuffle_map(self, length):
    """Return the identity shuffle map for a sentence of *length* tokens.

    Every position ``0 .. length - 1`` maps to itself, i.e. the map describes
    a shuffle that leaves the sentence untouched.
    """
    identity = {}
    for position in range(length):
        identity[position] = position
    return identity
def test_word_shuffle_with_eos(self):
    """Shuffle BPE (continuation-marker) data that ends with EOS.

    Distance 0 must leave every example unchanged; distance 3 must leave the
    first example unchanged and reorder the second one as pinned below.
    """
    vocab, tokens, lengths = self._get_test_data_with_bpe_cont_marker(
        append_eos=True
    )

    identity_maps = [
        self.generate_unchanged_shuffle_map(example_len) for example_len in lengths
    ]
    shuffled_maps = [
        self.generate_unchanged_shuffle_map(lengths[0]),
        {0: 0, 1: 3, 2: 1, 3: 2},
    ]

    # (max shuffle distance, expected per-example shuffle maps)
    for distance, expected_maps in ((0, identity_maps), (3, shuffled_maps)):
        self.assert_word_shuffle_matches_expected(
            x=tokens,
            x_len=lengths,
            max_shuffle_distance=distance,
            vocab=vocab,
            expected_shufle_maps=expected_maps,
            expect_eos_at_end=True,
        )
def assert_no_eos_at_end(self, x, x_len, eos):
    """Assert that no sentence in x ends with the EOS token.

    ``x`` is indexed as ``x[position][sentence]`` and ``x_len[i] - 1`` is the
    position of the last token of sentence ``i``.
    """
    template = "Expected no eos (token id {eos}) at the end of sentence {i}."
    for sentence, length in enumerate(x_len):
        final_token = x[length - 1][sentence]
        self.assertNotEqual(
            final_token,
            eos,
            template.format(eos=eos, i=sentence),
        )
def test_word_blank_without_eos(self):
    """Same result as word blank with eos except no EOS at end"""
    vocab, clean, clean_len = self._get_test_data_with_bpe_cont_marker(
        append_eos=False
    )

    # Fixed seed so that the blanked-out word is reproducible.
    with data_utils.numpy_seed(1234):
        blanker = noising.WordDropout(vocab)
        # Passing a blank index makes WordDropout replace tokens instead of
        # removing them.
        noised, noised_len = blanker.noising(clean, clean_len, 0.2, vocab.unk())
        self.assert_word_blanking_correct(
            x=clean,
            x_noised=noised,
            x_len=clean_len,
            l_noised=noised_len,
            unk=vocab.unk(),
        )
        self.assert_no_eos_at_end(x=noised, x_len=noised_len, eos=vocab.eos())
def test_noising_dataset_with_eos(self):
    """Check the noisy source / clean target batch when the data ends with EOS."""
    src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
        append_eos=True
    )

    # Format data for src_dataset: one unpadded tensor per sentence.
    src_tokens_no_pad = [
        utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
        for src_sentence in torch.t(src_tokens)
    ]
    batch = self._get_noising_dataset_batch(
        src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict
    )

    eos, pad = src_dict.eos(), src_dict.pad()

    # Generated noisy source as source
    expected_src = torch.LongTensor(
        [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]]
    )
    # Original clean source as target (right-padded)
    expected_tgt = torch.LongTensor(
        [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
    )

    self.assertTensorEqual(expected_src, batch["net_input"]["src_tokens"])
    self.assertTensorEqual(expected_tgt, batch["target"])
def assertTensorEqual(self, t1, t2):
    """Assert that two tensors have the same size and identical elements."""
    self.assertEqual(t1.size(), t2.size(), "size mismatch")
    # Count the positions where the tensors differ; there must be none.
    mismatches = (t1 != t2).long().sum()
    self.assertEqual(mismatches, 0)
def mk_sample(tokens: Sequence[int], batch_size: int = 2) -> Dict[str, Any]:
    """Build a toy batch that repeats *tokens* ``batch_size`` times.

    The same tensor is used for ``src_tokens`` and ``prev_output_tokens``;
    ``target`` is that batch with the first token of every row dropped.
    """
    row = torch.tensor(tokens, dtype=torch.long)
    batch = torch.stack([row for _ in range(batch_size)])
    lengths = torch.tensor(
        [len(tokens) for _ in range(batch_size)], dtype=torch.long
    )
    net_input = {
        "src_tokens": batch,
        "prev_output_tokens": batch,
        "src_lengths": lengths,
    }
    return {"net_input": net_input, "target": batch[:, 1:]}
def tmp_path(self, test_case: str) -> Path:
    """Create a fresh temporary directory for *test_case* under ``self.tmp_dir``.

    *test_case* is used as the suffix of the directory name.
    """
    created = tempfile.mkdtemp(suffix=test_case, dir=self.tmp_dir)
    return Path(created)
def test_valid_dataset(self):
    """Load the en-zh validation split and check ids and the leading token."""
    data = self.tmp_path("test_valid_dataset")
    for file_name in ("valid.en-zh.en.bin", "valid.en-zh.zh.bin"):
        mk_dataset(10, 21, data / file_name)

    task, _ = self.obt_task(["en", "zh"], data)
    valid = task.load_dataset("valid")
    en_bos = obt._lang_token_index(task.common_dict, "en")

    assert valid is not None
    valid.prefetch(range(10))
    # Inspect the first and the last of the 10 validation samples.
    checked = [(index, valid[index]) for index in (0, 9)]
    for index, sample in checked:
        self.assertEqual(sample["id"], index)
    for _, sample in checked:
        # The source side starts with the "en" language token.
        self.assertEqual(sample["source"][0], en_bos)
    # TODO: could we test the target side ?
def assertFnMatch(self, fn, values):
    """Assert that ``fn(x) == y`` for every ``x: y`` entry of *values*."""
    for argument in values:
        expected = values[argument]
        actual = fn(argument)
        message = f"Fn has wrong value: fn({argument}) = {actual} != {expected}"
        self.assertEqual(actual, expected, message)
def test_putting_same_array_twice(self):
    """Re-putting data under an existing hash key must not grow the store.

    A new hash key adds exactly one object, and that object is also visible
    to a client that connects afterwards.
    """
    data = np.array([4, 4, 4])
    arr1 = PlasmaView(data, dummy_path, 1, plasma_path=self.path)
    assert len(self.client.list()) == 1
    arr1b = PlasmaView(
        data, dummy_path, 1, plasma_path=self.path
    )  # should not change contents of store
    arr1c = PlasmaView(
        None, dummy_path, 1, plasma_path=self.path
    )  # should not change contents of store

    assert len(self.client.list()) == 1
    self._assert_view_equal(arr1, arr1b)
    self._assert_view_equal(arr1, arr1c)
    PlasmaView(
        data, dummy_path, 2, plasma_path=self.path
    )  # new object id, adds new entry
    assert len(self.client.list()) == 2

    new_client = plasma.connect(self.path)
    try:
        assert len(new_client.list()) == 2  # new client can access same objects
    finally:
        # Release the extra connection even if the assertion fails; only
        # self.client is disconnected by tearDown.
        new_client.disconnect()

    assert isinstance(arr1.object_id, plasma.ObjectID)
    del arr1b
    del arr1c
class TestRelPositionalEncoding(unittest.TestCase):
    """Regression tests pinning the values produced by ``RelPositionalEncoding``."""

    def setUp(self) -> None:
        # Sample dimensions: time steps, batch size, channels.
        self.T, self.B, self.C = 3, 1, 2
        torch.manual_seed(0)
        self.sample = torch.randn(self.T, self.B, self.C)  # TBC
        self.rel_pos_enc = RelPositionalEncoding(max_len=4, d_model=self.C)

    @staticmethod
    def _to_numpy(tensor):
        """Detach *tensor* and return it as a NumPy array on the CPU."""
        return tensor.cpu().detach().numpy()

    def test_extend_pe(self):
        """``extend_pe`` must fill ``pe`` with the pinned encoding table."""
        batch_first = self.sample.transpose(0, 1)
        self.rel_pos_enc.extend_pe(batch_first)
        expected_pe = torch.tensor(
            [
                [
                    [0.1411, -0.9900],
                    [0.9093, -0.4161],
                    [0.8415, 0.5403],
                    [0.0000, 1.0000],
                    [-0.8415, 0.5403],
                    [-0.9093, -0.4161],
                    [-0.1411, -0.9900],
                ]
            ]
        )
        matches = np.allclose(
            self._to_numpy(expected_pe),
            self._to_numpy(self.rel_pos_enc.pe),
            atol=1e-4,
        )
        self.assertTrue(matches)

    def test_forward(self):
        """Calling the module on the sample must return the pinned encoding."""
        pos_enc = self.rel_pos_enc(self.sample)
        expected_pos_enc = torch.tensor(
            [
                [[0.9093, -0.4161]],
                [[0.8415, 0.5403]],
                [[0.0000, 1.0000]],
                [[-0.8415, 0.5403]],
                [[-0.9093, -0.4161]],
            ]
        )
        matches = np.allclose(
            self._to_numpy(pos_enc),
            self._to_numpy(expected_pos_enc),
            atol=1e-4,
        )
        self.assertTrue(matches)
================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import json import os import tempfile import unittest import torch from . import test_binaries class TestReproducibility(unittest.TestCase): def _test_reproducibility( self, name, extra_flags=None, delta=0.0001, resume_checkpoint="checkpoint1.pt", max_epoch=3, ): def get_last_log_stats_containing_string(log_records, search_string): for log_record in logs.records[::-1]: if isinstance(log_record.msg, str) and search_string in log_record.msg: return json.loads(log_record.msg) if extra_flags is None: extra_flags = [] with tempfile.TemporaryDirectory(name) as data_dir: with self.assertLogs() as logs: test_binaries.create_dummy_data(data_dir) test_binaries.preprocess_translation_data(data_dir) # train epochs 1 and 2 together with self.assertLogs() as logs: test_binaries.train_translation_model( data_dir, "fconv_iwslt_de_en", [ "--dropout", "0.0", "--log-format", "json", "--log-interval", "1", "--max-epoch", str(max_epoch), ] + extra_flags, ) train_log = get_last_log_stats_containing_string(logs.records, "train_loss") valid_log = get_last_log_stats_containing_string(logs.records, "valid_loss") # train epoch 2, resuming from previous checkpoint 1 os.rename( os.path.join(data_dir, resume_checkpoint), os.path.join(data_dir, "checkpoint_last.pt"), ) with self.assertLogs() as logs: test_binaries.train_translation_model( data_dir, "fconv_iwslt_de_en", [ "--dropout", "0.0", "--log-format", "json", "--log-interval", "1", "--max-epoch", str(max_epoch), ] + extra_flags, ) train_res_log = get_last_log_stats_containing_string( logs.records, "train_loss" ) valid_res_log = get_last_log_stats_containing_string( logs.records, "valid_loss" ) for k in ["train_loss", "train_ppl", "train_num_updates", "train_gnorm"]: self.assertAlmostEqual( float(train_log[k]), 
float(train_res_log[k]), delta=delta ) for k in [ "valid_loss", "valid_ppl", "valid_num_updates", "valid_best_loss", ]: self.assertAlmostEqual( float(valid_log[k]), float(valid_res_log[k]), delta=delta ) def test_reproducibility(self): self._test_reproducibility("test_reproducibility") @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_reproducibility_fp16(self): self._test_reproducibility( "test_reproducibility_fp16", [ "--fp16", "--fp16-init-scale", "4096", ], delta=0.011, ) @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_reproducibility_memory_efficient_fp16(self): self._test_reproducibility( "test_reproducibility_memory_efficient_fp16", [ "--memory-efficient-fp16", "--fp16-init-scale", "4096", ], ) @unittest.skipIf(not torch.cuda.is_available(), "test requires a GPU") def test_reproducibility_amp(self): self._test_reproducibility( "test_reproducibility_amp", [ "--amp", "--fp16-init-scale", "4096", ], delta=0.011, ) def test_mid_epoch_reproducibility(self): self._test_reproducibility( "test_mid_epoch_reproducibility", ["--save-interval-updates", "3"], resume_checkpoint="checkpoint_1_3.pt", max_epoch=1, ) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_resampling_dataset.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import collections import unittest import numpy as np from fairseq.data import ListDataset, ResamplingDataset class TestResamplingDataset(unittest.TestCase): def setUp(self): self.strings = ["ab", "c", "def", "ghij"] self.weights = [4.0, 2.0, 7.0, 1.5] self.size_ratio = 2 self.dataset = ListDataset( self.strings, np.array([len(s) for s in self.strings]) ) def _test_common(self, resampling_dataset, iters): assert len(self.dataset) == len(self.strings) == len(self.weights) assert len(resampling_dataset) == self.size_ratio * len(self.strings) results = {"ordered_by_size": True, "max_distribution_diff": 0.0} totalfreqs = 0 freqs = collections.defaultdict(int) for epoch_num in range(iters): resampling_dataset.set_epoch(epoch_num) indices = resampling_dataset.ordered_indices() assert len(indices) == len(resampling_dataset) prev_size = -1 for i in indices: cur_size = resampling_dataset.size(i) # Make sure indices map to same sequences within an epoch assert resampling_dataset[i] == resampling_dataset[i] # Make sure length of sequence is correct assert cur_size == len(resampling_dataset[i]) freqs[resampling_dataset[i]] += 1 totalfreqs += 1 if prev_size > cur_size: results["ordered_by_size"] = False prev_size = cur_size assert set(freqs.keys()) == set(self.strings) for s, weight in zip(self.strings, self.weights): freq = freqs[s] / totalfreqs expected_freq = weight / sum(self.weights) results["max_distribution_diff"] = max( results["max_distribution_diff"], abs(expected_freq - freq) ) return results def test_resampling_dataset_batch_by_size_false(self): resampling_dataset = ResamplingDataset( self.dataset, self.weights, size_ratio=self.size_ratio, batch_by_size=False, seed=0, ) results = self._test_common(resampling_dataset, iters=1000) # For batch_by_size = False, the batches should be returned in # arbitrary order of size. assert not results["ordered_by_size"] # Allow tolerance in distribution error of 2%. 
assert results["max_distribution_diff"] < 0.02 def test_resampling_dataset_batch_by_size_true(self): resampling_dataset = ResamplingDataset( self.dataset, self.weights, size_ratio=self.size_ratio, batch_by_size=True, seed=0, ) results = self._test_common(resampling_dataset, iters=1000) # For batch_by_size = True, the batches should be returned in # increasing order of size. assert results["ordered_by_size"] # Allow tolerance in distribution error of 2%. assert results["max_distribution_diff"] < 0.02 if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_roberta.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import functools import unittest from typing import Any, Dict, Sequence import fairseq import fairseq.options import fairseq.tasks import torch from tests.utils import dummy_dictionary VOCAB_SIZE = 100 @fairseq.tasks.register_task("fake_task") class FakeTask(fairseq.tasks.LegacyFairseqTask): def __init__(self, args): super().__init__(args) self.dictionary = dummy_dictionary(VOCAB_SIZE - 4) assert len(self.dictionary) == VOCAB_SIZE @property def source_dictionary(self): return self.dictionary @property def target_dictionary(self): return self.dictionary @functools.lru_cache() def get_toy_model( device: str, architecture: str = "roberta_enc_dec", **extra_args: Any, ): assert device in ("gpu", "cpu") kwargs = { "arch": architecture, # Use characteristics dimensions "encoder_layers": 3, "encoder_embed_dim": 12, "encoder_ffn_embed_dim": 14, "encoder_attention_heads": 4, "decoder_layers": 3, "decoder_embed_dim": 12, "decoder_ffn_embed_dim": 14, "decoder_attention_heads": 4, # Disable dropout so we have comparable tests. 
def mk_sample(
    lang: str, device: str, tok: Sequence[int] = None, batch_size: int = 2
) -> Dict[str, Any]:
    """Build a toy batch of ``batch_size`` identical sentences.

    When *tok* is empty or ``None`` a fixed sentence is used: a 7-token one
    for ``lang == "en"`` and a 9-token one for any other language. The batch
    is moved to CUDA when ``device == "gpu"``. ``target`` is the batch with
    the first token of every row dropped.
    """
    assert device in ("gpu", "cpu")
    if not tok:
        tok = (
            [10, 11, 12, 13, 14, 15, 2]
            if lang == "en"
            else [20, 21, 22, 23, 24, 25, 26, 27, 2]
        )

    row = torch.tensor(tok, dtype=torch.long)
    batch = torch.stack([row for _ in range(batch_size)])
    if device == "gpu":
        batch = batch.cuda()

    # Lengths live on the same device as the tokens.
    lengths = torch.tensor(
        [len(tok) for _ in range(batch_size)],
        dtype=torch.long,
        device=batch.device,
    )
    net_input = {
        "src_tokens": batch,
        "prev_output_tokens": batch,
        "src_lengths": lengths,
    }
    return {"net_input": net_input, "target": batch[:, 1:]}
def test_roberta_max_positions_is_correctly_set(self):
    """The toy model must accept a sentence of exactly ``max_positions`` tokens."""
    device = "cpu"
    _, model = get_toy_model(device)

    limit = model.max_decoder_positions()
    self.assertEqual(limit, 256)
    # Decoder, encoder and the encoder's positional embedding must all agree.
    self.assertEqual(limit, model.decoder.max_positions())
    self.assertEqual(limit, model.encoder.max_positions())
    self.assertEqual(limit, model.encoder.embed_positions.max_positions)

    # A single sentence that is as long as the model allows.
    sample = mk_sample("en", device, [31] * limit, batch_size=1)
    net_input = sample["net_input"]
    self.assertEqual(list(net_input["src_lengths"]), [limit])
    self.assertEqual(len(net_input["src_tokens"][0]), limit)

    logits, _ = model.forward(**net_input)
    self.assertEqual(logits.shape, (1, limit, VOCAB_SIZE))
en_tokens.shape # Forward logits, _ = model(**sample["net_input"]) self.assertEqual(logits.shape, (bs, l, VOCAB_SIZE)) # Backward loss = logits.sum() loss.backward() @cpu_gpu def test_roberta_forward_backward_bs1(self, device: str): _, model = get_toy_model(device) sample = mk_sample("en", device, batch_size=1) o, _ = model.forward(**sample["net_input"]) loss = o.sum() sample2 = mk_sample("ro", device, batch_size=1) o, _ = model.forward(**sample2["net_input"]) loss += o.sum() loss.backward() @cpu_gpu def test_roberta_batching(self, device: str): """ Checks that the batch of size 2 give twice the same results than the batch of size 1. """ _, model = get_toy_model(device) sample = mk_sample("en", device, batch_size=1) slen = sample["net_input"]["src_lengths"][0] sample2 = mk_sample("en", device, batch_size=2) with torch.no_grad(): z = model.encoder.forward( sample["net_input"]["src_tokens"], sample["net_input"]["src_lengths"] ) z = z["encoder_out"][-1] logits, _ = model.forward(**sample["net_input"]) z2 = model.encoder.forward( sample2["net_input"]["src_tokens"], sample["net_input"]["src_lengths"] ) z2 = z2["encoder_out"][-1] logits2, _ = model.forward(**sample2["net_input"]) self.assertEqual(z.shape, (slen, 1, 12)) self.assertEqual(z2.shape, (slen, 2, 12)) self.assertTensorEqual(logits2[0], logits2[1]) self.assertTensorEqual(logits[0], logits2[0]) @cpu_gpu def test_roberta_incremental_decoder(self, device: str): """ Checks that incremental decoding yields the same result than non incremental one. 
""" task, model = get_toy_model(device) en_sample = mk_sample("en", device) en_tokens = en_sample["net_input"]["src_tokens"] ro_sample = mk_sample("ro", device) ro_tokens = ro_sample["net_input"]["src_tokens"] en_enc = model.encoder.forward( en_tokens, src_lengths=en_sample["net_input"]["src_lengths"] ) (bs, tgt_len) = ro_tokens.shape # Decode without incremental state ro_dec, _ = model.decoder.forward(ro_tokens, encoder_out=en_enc) self.assertEqual(ro_dec.shape, (bs, tgt_len, VOCAB_SIZE)) self.assertTensorEqual(ro_dec[0], ro_dec[1]) # Decode with incremental state inc_state = {} ro_dec_inc = [] for i in range(tgt_len): ro, _ = model.decoder.forward( ro_tokens[:, : i + 1], encoder_out=en_enc, incremental_state=inc_state ) self.assertEqual(ro.shape, (bs, 1, VOCAB_SIZE)) ro_dec_inc.append(ro) for i in range(tgt_len): # Intra-batch self.assertTensorEqual(ro_dec_inc[i][0], ro_dec_inc[i][1]) # Incremental vs non-incremental self.assertTensorEqual(ro_dec_inc[i][:, 0], ro_dec[:, i]) @cpu_gpu def test_regularize_for_adaprune_in_roberta(self, device: str): _, model = get_toy_model( device=device, architecture="roberta_base", mha_reg_scale_factor=0.000375, ffn_reg_scale_factor=0.000375, ) sample = mk_sample("en", device, batch_size=1) task_loss, _ = model.forward(**sample["net_input"]) head_loss = model._get_adaptive_head_loss() ffn_loss = model._get_adaptive_ffn_loss() loss = task_loss.sum() + head_loss + ffn_loss loss.backward() @cpu_gpu def test_ffn_prune_for_adaprune_in_roberta(self, device: str): _, model = get_toy_model( device=device, architecture="roberta_base", ) sample = mk_sample("en", device, batch_size=1) for layer in model.encoder.sentence_encoder.layers: fc1_original_size = layer.fc1.out_features remove_index = layer._get_fc_rank(remove_num=2) layer._prune_fc_layer(remove_index=remove_index) self.assertEqual(layer.fc1.out_features, fc1_original_size - 2) task_loss, _ = model.forward(**sample["net_input"]) def params(model, name): if "." 
not in name: return getattr(model, name) prefix, name = name.split(".", 1) return params(getattr(model, prefix), name) ================================================ FILE: tests/test_rotary_positional_embedding.py ================================================ import torch import numpy as np import unittest from fairseq.modules.rotary_positional_embedding import apply_rotary_pos_emb from fairseq.modules import RotaryPositionalEmbedding class TestRotaryPositionalEmbedding(unittest.TestCase): def setUp(self) -> None: self.T = 3 self.B = 1 self.C = 2 torch.manual_seed(0) self.sample = torch.randn(self.T, self.B, self.C) # TBC self.rope_pos_emd = RotaryPositionalEmbedding(dim=self.C) def test_forward(self): expected_cos = torch.tensor( [[[[1.0000, 1.0000]]], [[[0.5403, 0.5403]]], [[[-0.4161, -0.4161]]]] ) expected_sin = torch.tensor( [[[[0.0000, 0.0000]]], [[[0.8415, 0.8415]]], [[[0.9093, 0.9093]]]] ) cos, sin = self.rope_pos_emd(self.sample, self.T) self.assertTrue( np.allclose( expected_cos.cpu().detach().numpy(), cos.cpu().detach().numpy(), atol=1e-4, ) ) self.assertTrue( np.allclose( expected_sin.cpu().detach().numpy(), sin.cpu().detach().numpy(), atol=1e-4, ) ) def test_apply_rotary_pos_emb(self): cos, sin = self.rope_pos_emd(self.sample, self.T) query = self.sample.view(self.T, self.B, 1, self.C) expected_query = torch.tensor( [[[[1.5410, -0.2934]]], [[[-1.6555, -1.5263]]], [[[1.7231, -0.4041]]]] ) new_query, new_key = apply_rotary_pos_emb(query, query, cos, sin) self.assertTrue( np.allclose( expected_query.cpu().detach().numpy(), new_query.cpu().detach().numpy(), atol=1e-4, ) ) self.assertTrue( np.allclose( expected_query.cpu().detach().numpy(), new_key.cpu().detach().numpy(), atol=1e-4, ) ) def test_jit_compile_rope_module(self): module_scripted = torch.jit.script(self.rope_pos_emd) apply_rotary_scripted = torch.jit.script(apply_rotary_pos_emb) # Test several different lengths for T in [3, 5, 10]: sample = torch.randn(T, self.B, self.C) # Run forward pass 
with the original module cos_original, sin_original = self.rope_pos_emd(sample, T) query = sample.view(T, self.B, 1, self.C) new_query, new_key = apply_rotary_pos_emb(query, query, cos_original, sin_original) # Run forward pass with the scripted module cos_scripted, sin_scripted = module_scripted(sample, T) new_query_scripted, new_key_scripted = apply_rotary_scripted(query, query, cos_scripted, sin_scripted) # Ensure the outputs are the same self.assertTrue(torch.allclose(cos_original, cos_scripted)) self.assertTrue(torch.allclose(sin_original, sin_scripted)) self.assertTrue(torch.allclose(new_query, new_query_scripted)) self.assertTrue(torch.allclose(new_key, new_key_scripted)) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_sequence_generator.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
import argparse
import math
import tempfile
import unittest

import numpy as np
import torch

import tests.utils as test_utils
from fairseq import search
from fairseq.data.dictionary import Dictionary
from fairseq.models.transformer import TransformerModel
from fairseq.ngram_repeat_block import NGramRepeatBlock
from fairseq.sequence_generator import EnsembleModel, SequenceGenerator
from fairseq.tasks.fairseq_task import LegacyFairseqTask

DEFAULT_TEST_VOCAB_SIZE = 100


class DummyTask(LegacyFairseqTask):
    """Minimal task whose source and target dictionaries are one dummy dictionary."""

    def __init__(self, args):
        super().__init__(args)
        self.dictionary = get_dummy_dictionary()
        if getattr(self.args, "ctc", False):
            self.dictionary.add_symbol("<ctc_blank>")
        self.src_dict = self.dictionary
        self.tgt_dict = self.dictionary

    @property
    def source_dictionary(self):
        return self.src_dict

    @property
    def target_dictionary(self):
        return self.dictionary


def get_dummy_dictionary(vocab_size=DEFAULT_TEST_VOCAB_SIZE):
    """Return a Dictionary with ``vocab_size`` extra symbols named "0", "1", ..."""
    dummy_dict = Dictionary()
    # add dummy symbol to satisfy vocab size
    for index in range(vocab_size):
        dummy_dict.add_symbol(str(index), n=1000)
    return dummy_dict


def get_dummy_task_and_parser():
    """
    To build a fairseq model, we need a dummy parser and a dummy task. This
    function creates both to facilitate model/criterion tests.

    Note: DummyTask is used as the dummy task. You may want to use another
    task by providing another function.
    """
    parser = argparse.ArgumentParser(
        description="test_dummy_s2s_task", argument_default=argparse.SUPPRESS
    )
    DummyTask.add_args(parser)
    args = parser.parse_args([])
    task = DummyTask.setup_task(args)
    return task, parser


class TestJitSequenceGeneratorBase(unittest.TestCase):
    """Shared fixture and assertion helpers for the TorchScript generator tests."""

    def setUp(self):
        self.task, self.parser = get_dummy_task_and_parser()
        eos = self.task.tgt_dict.eos()
        src_tokens = torch.randint(3, 50, (2, 10)).long()
        # Terminate both source rows with EOS.
        src_tokens = torch.cat((src_tokens, torch.LongTensor([[eos], [eos]])), -1)
        src_lengths = torch.LongTensor([2, 10])
        self.sample = {
            "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths}
        }
        TransformerModel.add_args(self.parser)
        args = self.parser.parse_args([])
        args.encoder_layers = 2
        args.decoder_layers = 1
        self.transformer_model = TransformerModel.build_model(args, self.task)

    def assertOutputEqual(self, hypo, pos_probs):
        """Assert the hypothesis has one positional score per expected probability."""
        pos_scores = torch.FloatTensor(pos_probs).log()
        self.assertTensorSizeEqual(hypo["positional_scores"], pos_scores)
        # numel() returns plain ints, so compare them directly; the tensor
        # helper would call .size() on an int and raise AttributeError.
        self.assertEqual(pos_scores.numel(), hypo["tokens"].numel())

    def assertTensorSizeEqual(self, t1, t2):
        self.assertEqual(t1.size(), t2.size(), "size mismatch")

    def assertAlmostEqual(self, t1, t2):
        """Assert two tensors have the same size and differ by less than 1e-4."""
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertLess((t1 - t2).abs().max(), 1e-4)

    def assertTensorEqual(self, t1, t2):
        """Assert two tensors have the same size and identical elements."""
        self.assertEqual(t1.size(), t2.size(), "size mismatch")
        self.assertEqual(t1.ne(t2).long().sum(), 0)

    def assertHypoEqual(self, h1, h2):
        "Check two hypos are equal"
        self.assertTensorEqual(h1["tokens"], h2["tokens"])
        self.assertAlmostEqual(h1["positional_scores"], h2["positional_scores"])
        self.assertLess(abs(h1["score"] - h2["score"]), 1e-6)
        self.assertAlmostEqual(h1["attention"], h2["attention"])

    def _test_save_and_load(self, scripted_module):
        """Save the scripted module to a temporary file and load it back."""
        with tempfile.NamedTemporaryFile() as f:
            scripted_module.save(f.name)
            torch.jit.load(f.name)


JIT_MSG = "Targeting OSS scriptability for the 1.6 release"
@unittest.skipIf(torch.__version__ < "1.6.0", JIT_MSG) class TestJitSequenceGenerator(TestJitSequenceGeneratorBase): def test_export_transformer(self): model = self.transformer_model torch.jit.script(model) def test_ensemble_sequence_generator(self): model = self.transformer_model generator = SequenceGenerator( [model], self.task.tgt_dict, beam_size=2, no_repeat_ngram_size=2, max_len_b=10, ) scripted_model = torch.jit.script(generator) self._test_save_and_load(scripted_model) def test_export_ensemble_model(self): model = self.transformer_model ensemble_models = EnsembleModel([model]) torch.jit.script(ensemble_models) class TestExportSearch(unittest.TestCase): def setUp(self): task, _ = get_dummy_task_and_parser() self.tgt_dict = task.tgt_dict self.min_top1_prob = 0.4 def test_export_diverse_bs(self): search_strategy = search.DiverseBeamSearch( self.tgt_dict, num_groups=2, diversity_strength=0.0 ) torch.jit.script(search_strategy) def test_export_sampling(self): low_sampling_topp = self.min_top1_prob / 2.0 search_strategy = search.Sampling( self.tgt_dict, sampling_topp=low_sampling_topp ) torch.jit.script(search_strategy) def test_export_diverse_siblings_search(self): search_strategy = search.DiverseSiblingsSearch( self.tgt_dict, diversity_rate=0.5 ) torch.jit.script(search_strategy) class TestSequenceGeneratorBase(unittest.TestCase): def assertHypoTokens(self, hypo, tokens): self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens)) def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): pos_scores = torch.FloatTensor(pos_probs).log() self.assertAlmostEqual(hypo["positional_scores"], pos_scores) self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) score = pos_scores.sum() if normalized: score /= pos_scores.numel() ** lenpen self.assertLess(abs(score - hypo["score"]), 1e-6) def assertAlmostEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertLess((t1 - t2).abs().max(), 1e-4) def 
assertTensorEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertEqual(t1.ne(t2).long().sum(), 0) class TestSequenceGenerator(TestSequenceGeneratorBase): def setUp(self): ( self.tgt_dict, self.w1, self.w2, src_tokens, src_lengths, self.model, ) = test_utils.sequence_generator_setup() self.sample = { "net_input": {"src_tokens": src_tokens, "src_lengths": src_lengths} } def test_with_normalization(self): generator = SequenceGenerator([self.model], self.tgt_dict, beam_size=2) hypos = generator.forward(self.sample) eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 1.0]) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0]) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0]) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w2, eos]) self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6]) def test_without_normalization(self): # Sentence 1: unchanged from the normalized case # Sentence 2: beams swap order generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, normalize_scores=False ) hypos = generator.forward(self.sample) eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 1.0], normalized=False) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], normalized=False) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], normalized=False) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos]) self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], normalized=False) def 
test_with_lenpen_favoring_short_hypos(self): lenpen = 0.6 generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen ) hypos = generator.forward(self.sample) eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 1.0], lenpen=lenpen) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w2, w1, w2, eos]) self.assertHypoScore(hypos[0][1], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6], lenpen=lenpen) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w2, w1, eos]) self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen) def test_with_lenpen_favoring_long_hypos(self): lenpen = 5.0 generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, len_penalty=lenpen ) hypos = generator.forward(self.sample) eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w2, w1, w2, eos]) self.assertHypoScore(hypos[0][0], [0.1, 0.9, 0.9, 1.0], lenpen=lenpen) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w1, eos]) self.assertHypoScore(hypos[0][1], [0.9, 1.0], lenpen=lenpen) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, w1, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.4, 1.0], lenpen=lenpen) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w2, eos]) self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.6], lenpen=lenpen) def test_maxlen(self): generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, max_len_b=2 ) hypos = generator.forward(self.sample) eos, w1, w2 = self.tgt_dict.eos(), self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 1.0]) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w2, w2, eos]) 
self.assertHypoScore(hypos[0][1], [0.1, 0.1, 0.6]) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.6]) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w2, w2, eos]) self.assertHypoScore(hypos[1][1], [0.3, 0.9, 0.01]) def test_encoder_with_different_output_len(self): args = self.model.encoder.args task = test_utils.TestTranslationTask.setup_task( args, self.tgt_dict, self.tgt_dict ) reshaping_model = test_utils.TestReshapingModel.build_model(args, task) generator = SequenceGenerator( [reshaping_model], self.tgt_dict, beam_size=2, max_len_b=2 ) hypos = generator.forward(self.sample) for sent in [0, 1]: for beam in [0, 1]: assert hypos[sent][beam]["attention"] is not None def test_generation_with_additional_input(self): args = self.model.encoder.args task = test_utils.TestTranslationTask.setup_task( args, self.tgt_dict, self.tgt_dict ) add_input_model = test_utils.TestAdditionalInputModel.build_model(args, task) generator = SequenceGenerator([add_input_model], self.tgt_dict, beam_size=2) sample = self.sample.copy() sample["net_input"]["fancy_other_input"] = sample["net_input"]["src_tokens"] hypos = generator.forward(self.sample) eos, w1 = self.tgt_dict.eos(), self.w1 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 1.0]) @unittest.skipUnless(torch.cuda.is_available(), "") class TestRepeatNgramBlocking(TestSequenceGeneratorBase): @classmethod def setUpClass(cls): ( cls.tgt_dict, cls.w1, cls.w2, src_tokens, src_lengths, cls.model, ) = test_utils.sequence_generator_setup() return cls def test_finds_repetitive_tokens(self): bsz, vocab_size, beam_size, step = 2, 4, 1, 3 generated_tok = torch.tensor( [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda" ) lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda") desired_result = lprobs.new_tensor( [[0.0, 0.0, -math.inf, 0.0], [0.0, 0.0, 0.0, -math.inf]] ) cuda_ext_result, 
baseline_result = self._compare_cuda_ext_to_default_implem( bsz, beam_size, generated_tok, lprobs, step, 2 ) self.assertTensorEqual(cuda_ext_result, desired_result) self.assertTensorEqual(baseline_result, desired_result) @unittest.skipIf(torch.__version__ < "1.6.0", JIT_MSG) def test_jit_no_extension(self): bsz, vocab_size, beam_size, step = 2, 4, 1, 3 generated_tok = torch.tensor( [[2, 2, 2, 2], [3, 3, 3, 3]], dtype=torch.long, device="cuda" ) lprobs = torch.zeros((beam_size * bsz, vocab_size), device="cuda") blocker = NGramRepeatBlock(2, use_extension=False) base_result = blocker(generated_tok, lprobs.clone(), bsz, beam_size, step) scripted_blocker = torch.jit.script(blocker) jit_result = scripted_blocker( generated_tok, lprobs.clone(), bsz, beam_size, step ) self.assertTensorEqual(base_result, jit_result) def test_ngram_blocking_same_as_default_implem(self): """Test that cuda extension returns same things as default impl in many settings.""" vocab_size = 4 step = 6 for _ in range(2): block_param = np.random.choice([1, 2, 3, 4]) batch_size = np.random.randint(1, 8) beam_size = np.random.choice([1, 2, 4, 8]) lprobs = torch.zeros((beam_size * batch_size, vocab_size), device="cuda") generated_tok = torch.tensor( np.random.randint( 0, vocab_size, size=(batch_size * beam_size, step + 1) ), device="cuda", dtype=torch.long, ) self._compare_cuda_ext_to_default_implem( batch_size, beam_size, generated_tok, lprobs, step, block_param, ) def _compare_cuda_ext_to_default_implem( self, bsz, beam_size, generated_tok, lprobs, step, block_param ): """Assert that cuda extension and default implem return the same thing.""" blocker = NGramRepeatBlock(block_param) assert blocker.use_extension, "Extension not compiled" cuda_ext_result = blocker( generated_tok, lprobs.clone(), bsz, beam_size, step, ) blocker.use_extension = False baseline_result = blocker( generated_tok, lprobs.clone(), bsz, beam_size, step, ) self.assertTensorEqual(cuda_ext_result, baseline_result) 
blocker.use_extension = True return cuda_ext_result, baseline_result class TestDiverseBeamSearch(TestSequenceGeneratorBase): def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = torch.LongTensor( [ [self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos], ] ) self.src_lengths = torch.LongTensor([2, 2]) args = argparse.Namespace() unk = 0.0 args.beam_probs = [ # step 0: torch.FloatTensor( [ # eos w1 w2 # sentence 1: [0.0, unk, 0.9, 0.1], # beam 1 [0.0, unk, 0.9, 0.1], # beam 2 # sentence 2: [0.0, unk, 0.7, 0.3], [0.0, unk, 0.7, 0.3], ] ), # step 1: torch.FloatTensor( [ # eos w1 w2 # sentence 1: [0.0, unk, 0.6, 0.4], [0.0, unk, 0.6, 0.4], # sentence 2: [0.25, unk, 0.35, 0.4], [0.25, unk, 0.35, 0.4], ] ), # step 2: torch.FloatTensor( [ # eos w1 w2 # sentence 1: [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], # sentence 2: [0.9, unk, 0.1, 0.0], [0.9, unk, 0.1, 0.0], ] ), ] task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary def test_diverse_beam_search(self): search_strategy = search.DiverseBeamSearch( self.tgt_dict, num_groups=2, diversity_strength=0.0 ) generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy, ) sample = { "net_input": { "src_tokens": self.src_tokens, "src_lengths": self.src_lengths, } } hypos = generator.forward(sample) eos, w1, w2 = self.eos, self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 0.6, 1.0]) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w1, w1, eos]) self.assertHypoScore(hypos[0][1], [0.9, 0.6, 1.0]) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 
0.9]) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w2, eos]) self.assertHypoScore(hypos[1][1], [0.7, 0.4, 0.9]) class TestDiverseSiblingsSearch(TestDiverseBeamSearch): def assertHypoScore( self, hypo, pos_probs, sibling_rank, diversity_rate, normalized=True, lenpen=1.0 ): pos_scores = torch.FloatTensor(pos_probs).log() pos_scores.sub_(torch.Tensor(sibling_rank) * diversity_rate) self.assertAlmostEqual(hypo["positional_scores"], pos_scores) self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) score = pos_scores.sum() if normalized: score /= pos_scores.numel() ** lenpen self.assertLess(abs(score - hypo["score"]), 1e-6) def test_diverse_beam_search(self): search_strategy = search.DiverseSiblingsSearch( self.tgt_dict, diversity_rate=0.5 ) generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy ) sample = { "net_input": { "src_tokens": self.src_tokens, "src_lengths": self.src_lengths, } } hypos = generator.forward(sample) eos, w1, w2 = self.eos, self.w1, self.w2 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) self.assertHypoScore(hypos[0][0], [0.9, 0.6, 1.0], [0, 1, 1], 0.5) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w1, w2, eos]) self.assertHypoScore(hypos[0][1], [0.9, 0.4, 1.0], [0, 2, 1], 0.5) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w2, eos]) self.assertHypoScore(hypos[1][0], [0.7, 0.4, 0.9], [0, 1, 1], 0.5) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w1, eos]) self.assertHypoScore(hypos[1][1], [0.7, 0.35, 0.9], [0, 2, 1], 0.5) class TestTopPSamplingSearch(TestSequenceGeneratorBase): def setUp(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) self.eos = d.eos() self.w1 = 4 self.w2 = 5 # construct source data self.src_tokens = torch.LongTensor( [ [self.w1, self.w2, self.eos], [self.w1, self.w2, self.eos], 
] ) self.src_lengths = torch.LongTensor([2, 2]) args = argparse.Namespace() unk = 0.0 # The minimal probability of top 2 tokens. self.min_top2_prob = 0.75 # The minimal probability of the top 1 token. self.min_top1_prob = 0.4 w1_prob = self.min_top1_prob w2_prob = self.min_top2_prob - self.min_top1_prob eos_prob = 1 - self.min_top2_prob args.beam_probs = [ # step 0: torch.FloatTensor( [ # eos w1 w2 [0.0, unk, 1.0, 0.0], [0.0, unk, 1.0, 0.0], [0.0, unk, 1.0, 0.0], [0.0, unk, 1.0, 0.0], ] ), # step 1: torch.FloatTensor( [ # eos w1 w2 [eos_prob, unk, w1_prob, w2_prob], [eos_prob, unk, w1_prob, w2_prob], [eos_prob, unk, w1_prob, w2_prob], [eos_prob, unk, w1_prob, w2_prob], ] ), # step 2: torch.FloatTensor( [ # eos w1 w2 [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], [1.0, unk, 0.0, 0.0], ] ), ] task = test_utils.TestTranslationTask.setup_task(args, d, d) self.model = task.build_model(args) self.tgt_dict = task.target_dictionary def test_topp_sampling_search_low_prob(self): # Given a prob low enough to top-P sampling, we expect only the top # 1 token to be sampled, which always results in the same output. 
low_sampling_topp = self.min_top1_prob / 2.0 search_strategy = search.Sampling( self.tgt_dict, sampling_topp=low_sampling_topp ) generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy ) sample = { "net_input": { "src_tokens": self.src_tokens, "src_lengths": self.src_lengths, } } hypos = generator.forward(sample) eos, w1 = self.eos, self.w1 # sentence 1, beam 1 self.assertHypoTokens(hypos[0][0], [w1, w1, eos]) self.assertHypoScore(hypos[0][0], [1.0, 0.4, 1.0]) # sentence 1, beam 2 self.assertHypoTokens(hypos[0][1], [w1, w1, eos]) self.assertHypoScore(hypos[0][1], [1.0, 0.4, 1.0]) # sentence 2, beam 1 self.assertHypoTokens(hypos[1][0], [w1, w1, eos]) self.assertHypoScore(hypos[1][0], [1.0, 0.4, 1.0]) # sentence 2, beam 2 self.assertHypoTokens(hypos[1][1], [w1, w1, eos]) self.assertHypoScore(hypos[1][1], [1.0, 0.4, 1.0]) def test_topp_sampling_search_high_prob(self): # Given a prob high enough to top-P sampling, any of the top 2 # tokens could be sampled. This can cause different outputs. 
high_sampling_topp = (self.min_top1_prob + self.min_top2_prob) / 2.0 search_strategy = search.Sampling( self.tgt_dict, sampling_topp=high_sampling_topp ) generator = SequenceGenerator( [self.model], self.tgt_dict, beam_size=2, search_strategy=search_strategy ) sample = { "net_input": { "src_tokens": self.src_tokens, "src_lengths": self.src_lengths, } } hypos = generator.forward(sample) eos, w1, w2 = self.eos, self.w1, self.w2 # sentence 1, beam 1 self.assertTrue( self.hypoTokens(hypos[0][0], [w1, w1, eos]) or self.hypoTokens(hypos[0][0], [w1, w2, eos]) ) self.assertTrue( self.hypoScore(hypos[0][0], [1.0, 0.4, 1.0]) or self.hypoScore(hypos[0][0], [1.0, 0.35, 1.0]) ) # sentence 1, beam 2 self.assertTrue( self.hypoTokens(hypos[0][1], [w1, w1, eos]) or self.hypoTokens(hypos[0][1], [w1, w2, eos]) ) self.assertTrue( self.hypoScore(hypos[0][1], [1.0, 0.4, 1.0]) or self.hypoScore(hypos[0][1], [1.0, 0.35, 1.0]) ) # sentence 2, beam 1 self.assertTrue( self.hypoTokens(hypos[1][0], [w1, w1, eos]) or self.hypoTokens(hypos[1][0], [w1, w2, eos]) ) self.assertTrue( self.hypoScore(hypos[1][0], [1.0, 0.4, 1.0]) or self.hypoScore(hypos[1][0], [1.0, 0.35, 1.0]) ) # sentence 2, beam 2 self.assertTrue( self.hypoTokens(hypos[1][1], [w1, w1, eos]) or self.hypoTokens(hypos[1][1], [w1, w2, eos]) ) self.assertTrue( self.hypoScore(hypos[1][1], [1.0, 0.4, 1.0]) or self.hypoScore(hypos[1][1], [1.0, 0.35, 1.0]) ) def hypoTokens(self, hypo, tokens): return self.tensorEqual(hypo["tokens"], torch.LongTensor(tokens)) def hypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): pos_scores = torch.FloatTensor(pos_probs).log() if not self.almostEqual(hypo["positional_scores"], pos_scores): return False if pos_scores.numel() != hypo["tokens"].numel(): return False score = pos_scores.sum() if normalized: score /= pos_scores.numel() ** lenpen return abs(score - hypo["score"]) < 1e-6 def almostEqual(self, t1, t2): return t1.size() == t2.size() and (t1 - t2).abs().max() < 1e-4 def tensorEqual(self, 
t1, t2): return t1.size() == t2.size() and t1.ne(t2).long().sum() == 0 if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_sequence_scorer.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. import argparse import unittest import tests.utils as test_utils import torch from fairseq.sequence_scorer import SequenceScorer class TestSequenceScorer(unittest.TestCase): def test_sequence_scorer(self): # construct dummy dictionary d = test_utils.dummy_dictionary(vocab_size=2) self.assertEqual(d.pad(), 1) self.assertEqual(d.eos(), 2) self.assertEqual(d.unk(), 3) eos = d.eos() w1 = 4 w2 = 5 # construct dataloader data = [ { "source": torch.LongTensor([w1, w2, eos]), "target": torch.LongTensor([w1, w2, w1, eos]), }, { "source": torch.LongTensor([w2, eos]), "target": torch.LongTensor([w2, w1, eos]), }, { "source": torch.LongTensor([w2, eos]), "target": torch.LongTensor([w2, eos]), }, ] data_itr = test_utils.dummy_dataloader(data) # specify expected output probabilities args = argparse.Namespace() unk = 0.0 args.beam_probs = [ # step 0: torch.FloatTensor( [ # eos w1 w2 [0.0, unk, 0.6, 0.4], # sentence 1 [0.0, unk, 0.4, 0.6], # sentence 2 [0.0, unk, 0.7, 0.3], # sentence 3 ] ), # step 1: torch.FloatTensor( [ # eos w1 w2 [0.0, unk, 0.2, 0.7], # sentence 1 [0.0, unk, 0.8, 0.2], # sentence 2 [0.7, unk, 0.1, 0.2], # sentence 3 ] ), # step 2: torch.FloatTensor( [ # eos w1 w2 [0.10, unk, 0.50, 0.4], # sentence 1 [0.15, unk, 0.15, 0.7], # sentence 2 [0.00, unk, 0.00, 0.0], # sentence 3 ] ), # step 3: torch.FloatTensor( [ # eos w1 w2 [0.9, unk, 0.05, 0.05], # sentence 1 [0.0, unk, 0.00, 0.0], # sentence 2 [0.0, unk, 0.00, 0.0], # sentence 3 ] ), ] expected_scores = [ [0.6, 0.7, 0.5, 0.9], # sentence 1 [0.6, 0.8, 0.15], # sentence 2 [0.3, 0.7], # sentence 3 ] 
task = test_utils.TestTranslationTask.setup_task(args, d, d) model = task.build_model(args) scorer = SequenceScorer(task.target_dictionary) for sample in data_itr: hypos = task.inference_step(scorer, [model], sample) for id, hypos_id in zip(sample["id"].tolist(), hypos): self.assertHypoTokens(hypos_id[0], data[id]["target"]) self.assertHypoScore(hypos_id[0], expected_scores[id]) def assertHypoTokens(self, hypo, tokens): self.assertTensorEqual(hypo["tokens"], torch.LongTensor(tokens)) def assertHypoScore(self, hypo, pos_probs, normalized=True, lenpen=1.0): pos_scores = torch.FloatTensor(pos_probs).log() self.assertAlmostEqual(hypo["positional_scores"], pos_scores) self.assertEqual(pos_scores.numel(), hypo["tokens"].numel()) score = pos_scores.sum() if normalized: score /= pos_scores.numel() ** lenpen self.assertLess(abs(score - hypo["score"]), 1e-6) def assertAlmostEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertLess((t1 - t2).abs().max(), 1e-4) def assertTensorEqual(self, t1, t2): self.assertEqual(t1.size(), t2.size(), "size mismatch") self.assertEqual(t1.ne(t2).long().sum(), 0) if __name__ == "__main__": unittest.main() ================================================ FILE: tests/test_sparse_multihead_attention.py ================================================ # Copyright (c) Facebook, Inc. and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
class TestSparseMultiheadAttention(unittest.TestCase):
    """Exercise SparseMultiheadAttention.buffered_sparse_mask for both modes."""

    def test_sparse_multihead_attention(self):
        """Build the sparse masks for an 8x8 attention map (stride 4).

        NOTE(review): the ``torch.all(torch.eq(...))`` results below are
        discarded, so this test only verifies that mask construction runs
        without raising. Turning them into assertions should be considered
        once the expected masks have been confirmed against the module.
        """
        ninf = float("-inf")
        attn_weights = torch.randn(1, 8, 8)

        # Bidirectional case: the first four rows share one pattern and the
        # last four rows share another.
        first_window = [0, 0, 0, 0, 0, ninf, ninf, 0]
        second_window = [ninf, ninf, ninf, 0, 0, 0, 0, 0]
        expected_bidirectional = torch.tensor(
            [first_window] * 4 + [second_window] * 4
        )

        bidirectional = SparseMultiheadAttention(
            16, 1, stride=4, expressivity=1, is_bidirectional=True
        )
        actual_bidirectional = bidirectional.buffered_sparse_mask(attn_weights, 8, 8)
        torch.all(torch.eq(actual_bidirectional, expected_bidirectional))

        # Unidirectional case: a position never sees later positions.
        expected_causal = torch.tensor(
            [
                [0, ninf, ninf, ninf, ninf, ninf, ninf, ninf],
                [0, 0, ninf, ninf, ninf, ninf, ninf, ninf],
                [0, 0, 0, ninf, ninf, ninf, ninf, ninf],
                [0, 0, 0, 0, ninf, ninf, ninf, ninf],
                [0, 0, 0, 0, 0, ninf, ninf, ninf],
                [ninf, ninf, ninf, 0, 0, 0, ninf, ninf],
                [ninf, ninf, ninf, 0, 0, 0, 0, ninf],
                [ninf, ninf, ninf, 0, 0, 0, 0, 0],
            ]
        )

        causal = SparseMultiheadAttention(
            16, 1, stride=4, expressivity=1, is_bidirectional=False
        )
        actual_causal = causal.buffered_sparse_mask(attn_weights, 8, 8)
        torch.all(torch.eq(actual_causal, expected_causal))
class TestTokenBlockDataset(unittest.TestCase):
    """Check how TokenBlockDataset slices sentences under each break mode."""

    def _build_dataset(self, data, **kwargs):
        """Wrap ``data`` (a list of 1-D tensors) in a TokenBlockDataset."""
        lengths = [len(item) for item in data]
        return TokenBlockDataset(test_utils.TestDataset(data), lengths, **kwargs)

    @staticmethod
    def _as_long_tensors(rows):
        """Convert lists of token ids into a list of long tensors."""
        return [torch.tensor(row, dtype=torch.long) for row in rows]

    def _assert_blocks(self, ds, expected_blocks):
        """Assert that the leading items of ``ds`` equal ``expected_blocks``."""
        for index, expected in enumerate(expected_blocks):
            self.assertEqual(ds[index].tolist(), expected)

    def test_eos_break_mode(self):
        """With break_mode="eos" every block is exactly one sentence."""
        sentences = [[5, 4, 3, 2, 1], [1], [8, 7, 6, 1]]
        ds = self._build_dataset(
            self._as_long_tensors(sentences),
            block_size=None,
            pad=0,
            eos=1,
            break_mode="eos",
        )
        self._assert_blocks(ds, sentences)

        # Same sentences with the lone-eos sentence placed last.
        sentences = [[5, 4, 3, 2, 1], [8, 7, 6, 1], [1]]
        ds = self._build_dataset(
            self._as_long_tensors(sentences),
            block_size=None,
            pad=0,
            eos=1,
            break_mode="eos",
        )
        self._assert_blocks(ds, sentences)

    def test_block_break_mode(self):
        """With break_mode="none" blocks have fixed size and ignore sentences."""
        data = self._as_long_tensors([[5, 4, 3, 2, 1], [8, 7, 6, 1], [9, 1]])
        ds = self._build_dataset(data, block_size=3, pad=0, eos=1, break_mode="none")
        self._assert_blocks(ds, [[5, 4, 3], [2, 1, 8], [7, 6, 1], [9, 1]])

    def test_complete_break_mode(self):
        """With break_mode="complete" blocks hold only whole sentences."""
        data = self._as_long_tensors([[5, 4, 3, 2, 1], [8, 7, 6, 1], [9, 1]])
        ds = self._build_dataset(
            data, block_size=6, pad=0, eos=1, break_mode="complete"
        )
        self._assert_blocks(ds, [[5, 4, 3, 2, 1], [8, 7, 6, 1, 9, 1]])

        data = self._as_long_tensors([[4, 3, 2, 1], [5, 1], [1], [6, 1]])
        ds = self._build_dataset(
            data, block_size=3, pad=0, eos=1, break_mode="complete"
        )
        self._assert_blocks(ds, [[4, 3, 2, 1], [5, 1, 1], [6, 1]])

    def test_4billion_tokens(self):
        """Regression test for numpy type promotion issue https://github.com/numpy/numpy/issues/5745"""
        # The same tensor object is repeated, so only the indices are large.
        one_sentence = torch.tensor(list(range(10000)), dtype=torch.long)
        data = [one_sentence] * 430000
        ds = self._build_dataset(
            data, block_size=6, pad=0, eos=1, break_mode="complete"
        )
        ds[-1]  # __getitem__ works
        start, end = ds.slice_indices[-1]
        # data must be sufficiently large to overflow uint32
        assert end > 4294967295
        # this would also raise, since np.uint64(1) + 1 => 2.0
        assert not isinstance(end + 1, float)
class TestLoadCheckpoint(unittest.TestCase):
    """Tests for ``checkpoint_utils.load_checkpoint`` using a mocked trainer.

    Filesystem helpers are patched in ``setUp`` so no real checkpoint files
    are needed; each mock is reachable through ``self.patches`` so individual
    tests can adjust return values or side effects.
    """

    def setUp(self):
        """Install the filesystem patches and silence logging."""
        self.cfg_mock = get_mock_cfg(None)
        self.patches = {
            "os.makedirs": MagicMock(),
            "os.path.join": MagicMock(),
            "os.path.isfile": MagicMock(return_value=True),
            "os.path.isabs": MagicMock(return_value=False),
            "fairseq.file_io.PathManager.exists": MagicMock(return_value=False),
        }
        self.applied_patches = [
            patch(target, mock) for target, mock in self.patches.items()
        ]
        # Plain loop: start() is called for its side effect only.
        for applied in self.applied_patches:
            applied.start()
        logging.disable(logging.CRITICAL)

    def tearDown(self):
        """Undo every patch started in ``setUp`` and re-enable logging."""
        patch.stopall()
        logging.disable(logging.NOTSET)

    def test_load_partial_checkpoint(self):
        """Resuming mid-epoch continues from the saved iteration."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 200, 50)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)

            _, epoch_itr = checkpoint_utils.load_checkpoint(
                self.cfg_mock.checkpoint, trainer
            )

            self.assertEqual(epoch_itr.epoch, 2)
            self.assertEqual(epoch_itr.iterations_in_epoch, 50)

            itr = epoch_itr.next_epoch_itr(shuffle=False)
            self.assertEqual(epoch_itr.epoch, 2)
            self.assertEqual(epoch_itr.iterations_in_epoch, 50)

            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 50)
            self.assertEqual(epoch_itr.iterations_in_epoch, 51)

            # Consume all but the final batch of the epoch.
            for _ in range(150 - 52):
                next(itr)
            self.assertEqual(epoch_itr.iterations_in_epoch, 149)
            self.assertTrue(itr.has_next())
            next(itr)
            self.assertFalse(itr.has_next())

            itr = epoch_itr.next_epoch_itr(shuffle=False)
            self.assertTrue(itr.has_next())
            self.assertEqual(epoch_itr.epoch, 3)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)

    def test_load_full_checkpoint(self):
        """A checkpoint saved at the end of an epoch starts the next epoch."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(2, 150, 300, 150)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)

            _, epoch_itr = checkpoint_utils.load_checkpoint(
                self.cfg_mock.checkpoint, trainer
            )
            itr = epoch_itr.next_epoch_itr(shuffle=False)

            self.assertEqual(epoch_itr.epoch, 3)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0)

    def test_load_no_checkpoint(self):
        """Without a checkpoint file, training starts at epoch 1, iteration 0."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
            self.patches["os.path.isfile"].return_value = False

            _, epoch_itr = checkpoint_utils.load_checkpoint(
                self.cfg_mock.checkpoint, trainer
            )
            itr = epoch_itr.next_epoch_itr(shuffle=False)

            self.assertEqual(epoch_itr.epoch, 1)
            self.assertEqual(epoch_itr.iterations_in_epoch, 0)
            self.assertEqual(next(itr)["net_input"]["src_tokens"][0].item(), 0)

    def test_finetune_from_model_args_conflict(self):
        """finetune_from_model combined with any reset_* flag must raise."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)

            for arg in [
                "reset_optimizer",
                "reset_lr_scheduler",
                "reset_meters",
                "reset_dataloader",
            ]:
                with self.subTest(arg=arg):
                    cfg_mock = get_mock_cfg("/temp/checkpoint_pretrained.pt")
                    cfg_mock["checkpoint"][arg] = True
                    with self.assertRaises(Exception) as context:
                        _, _ = checkpoint_utils.load_checkpoint(
                            cfg_mock.checkpoint, trainer
                        )

                    # assertIn reports both operands when the check fails.
                    self.assertIn(
                        "--finetune-from-model can not be set together with either --reset-optimizer"
                        " or reset_lr_scheduler or reset_meters or reset_dataloader",
                        str(context.exception),
                    )

    def test_finetune_from_model(self):
        """First launch with finetune_from_model resets optimizer state."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
            from_model_path = "/temp/checkpoint_pretrained.pt"

            def mock_finetune_exist(path):
                # Only the pretrained model exists on the first launch.
                return bool(path == from_model_path)

            self.patches[
                "fairseq.file_io.PathManager.exists"
            ].side_effect = mock_finetune_exist
            cfg_mock = get_mock_cfg(from_model_path)
            cfg_mock.checkpoint.restore_file = "checkpoint_last.pt"
            _, _ = checkpoint_utils.load_checkpoint(cfg_mock.checkpoint, trainer)
            (
                checkpoint_path,
                reset_optimizer,
                reset_lr_scheduler,
                optimizer_overrides,
            ) = trainer.load_checkpoint.call_args[0]
            reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"]
            self.assertTrue(reset_optimizer)
            self.assertTrue(reset_lr_scheduler)
            self.assertTrue(reset_meters)

    def test_finetune_from_model_resume(self):
        """Second launch resumes from checkpoint_last.pt without resetting."""
        with contextlib.redirect_stdout(StringIO()):
            trainer, epoch_itr = get_trainer_and_epoch_itr(1, 150, 0, 0)
            trainer.get_train_iterator = MagicMock(return_value=epoch_itr)
            from_model_path = "/temp/checkpoint_pretrained.pt"

            # launch second time
            # both restore_file=checkpoint_last.pt and finetune_from_model are set
            def mock_finetune_exist(path):
                # str.endswith is the real method name; the previous
                # ``endsWith`` spelling raised AttributeError for str paths and
                # only went unnoticed because os.path.join is mocked in setUp.
                return bool(
                    path == from_model_path or path.endswith("checkpoint_last.pt")
                )

            self.patches[
                "fairseq.file_io.PathManager.exists"
            ].side_effect = mock_finetune_exist
            cfg_mock = get_mock_cfg(from_model_path)
            cfg_mock.checkpoint.restore_file = "checkpoint_last.pt"
            _, _ = checkpoint_utils.load_checkpoint(cfg_mock.checkpoint, trainer)
            (
                checkpoint_path,
                reset_optimizer,
                reset_lr_scheduler,
                optimizer_overrides,
            ) = trainer.load_checkpoint.call_args[0]
            reset_meters = trainer.load_checkpoint.call_args[1]["reset_meters"]
            self.assertFalse(reset_optimizer)
            self.assertFalse(reset_lr_scheduler)
            self.assertFalse(reset_meters)
class TransformerTestCase(unittest.TestCase):
    """Smoke tests: a tiny transformer must run forward and backward."""

    def _forward_backward(self, **model_args):
        """Build a model with ``model_args`` and backpropagate a summed output."""
        net = mk_transformer(**model_args)
        batch = mk_sample()
        output, _ = net.forward(**batch["net_input"])
        output.sum().backward()

    def test_forward_backward(self):
        """Encoder and decoder share the same embedding size."""
        self._forward_backward(encoder_embed_dim=12, decoder_embed_dim=12)

    def test_different_encoder_decoder_embed_dim(self):
        """Encoder and decoder use different embedding sizes."""
        self._forward_backward(encoder_embed_dim=12, decoder_embed_dim=16)
def test_convert_padding_direction(self):
    """Padding moves between the left and right side of each row."""
    pad = 1
    left_padded = torch.LongTensor(
        [
            [2, 3, 4, 5, 6],
            [1, 7, 8, 9, 10],
            [1, 1, 1, 11, 12],
        ]
    )
    right_padded = torch.LongTensor(
        [
            [2, 3, 4, 5, 6],
            [7, 8, 9, 10, 1],
            [11, 12, 1, 1, 1],
        ]
    )

    to_right = utils.convert_padding_direction(left_padded, pad, left_to_right=True)
    self.assertAlmostEqual(right_padded, to_right)

    to_left = utils.convert_padding_direction(right_padded, pad, right_to_left=True)
    self.assertAlmostEqual(left_padded, to_left)


def test_make_positions(self):
    """Positions count up from pad + 1 and padding keeps the pad index."""
    pad = 1
    cases = [
        # (input with left padding, expected positions)
        (
            [[9, 9, 9, 9, 9], [1, 9, 9, 9, 9], [1, 1, 1, 9, 9]],
            [[2, 3, 4, 5, 6], [1, 2, 3, 4, 5], [1, 1, 1, 2, 3]],
        ),
        # (input with right padding, expected positions)
        (
            [[9, 9, 9, 9, 9], [9, 9, 9, 9, 1], [9, 9, 1, 1, 1]],
            [[2, 3, 4, 5, 6], [2, 3, 4, 5, 1], [2, 3, 1, 1, 1]],
        ),
    ]
    for tokens, positions in cases:
        expected = torch.LongTensor(positions)
        actual = utils.make_positions(torch.LongTensor(tokens), pad)
        self.assertAlmostEqual(expected, actual)


def test_clip_grad_norm_(self):
    """clip_grad_norm_ returns the pre-clip norm as a tensor and clips in place."""
    # A single parameter that does not require grad yields a zero norm.
    frozen = torch.nn.Parameter(torch.zeros(5)).requires_grad_(False)
    norm = utils.clip_grad_norm_(frozen, 1.0)
    self.assertTrue(torch.is_tensor(norm))
    self.assertEqual(norm, 0.0)

    # Three parameters whose gradients are all 2.0 (15 elements in total).
    weights = [torch.nn.Parameter(torch.zeros(5)) for _ in range(3)]
    for weight in weights:
        weight.grad = torch.full((5,), fill_value=2.0)
    norm = utils.clip_grad_norm_(weights, 1.0)
    expected_norm = torch.full((15,), fill_value=2.0).norm()
    self.assertTrue(torch.is_tensor(norm))
    self.assertEqual(norm, expected_norm)

    # The first call clipped the gradients, so the norm is now the max norm.
    norm = utils.clip_grad_norm_(weights, 1.0)
    self.assertAlmostEqual(norm, torch.tensor(1.0))


def test_resolve_max_positions_with_tuple(self):
    """A tuple limit combined with None and a larger int is returned unchanged."""
    limits = (2000, 100, 2000)
    self.assertEqual(utils.resolve_max_positions(None, limits, 12000), limits)
class TestValidSubsetsErrors(unittest.TestCase):
    """Test various filesystem, clarg combinations and ensure that error raising happens as expected"""

    def _test_case(self, paths, extra_flags):
        """Create empty ``<name>.bin`` files and run the valid-subset check.

        Args:
            paths: subset names to create as ``<name>.bin``; ``train`` is
                always added.
            extra_flags: extra command-line flags for the training parser.
        """
        with tempfile.TemporaryDirectory() as data_dir:
            # Plain loop: the files are created purely for their side effect.
            for subset in paths + ["train"]:
                write_empty_file(os.path.join(data_dir, f"{subset}.bin"))
            cfg = make_lm_config(data_dir, extra_flags=extra_flags)
            raise_if_valid_subsets_unintentionally_ignored(cfg)

    def test_default_raises(self):
        """Extra valid subsets on disk that are not requested must raise."""
        with self.assertRaises(ValueError):
            self._test_case(["valid", "valid1"], [])
        with self.assertRaises(ValueError):
            self._test_case(
                ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"]
            )

    def test_partially_specified_valid_subsets(self):
        """Requesting only some subsets raises unless the rest are ignored.

        The method previously lacked the ``test_`` prefix, so unittest never
        collected or ran it.
        """
        with self.assertRaises(ValueError):
            self._test_case(
                ["valid", "valid1", "valid2"], ["--valid-subset", "valid,valid1"]
            )
        # Fix with ignore unused
        self._test_case(
            ["valid", "valid1", "valid2"],
            ["--valid-subset", "valid,valid1", "--ignore-unused-valid-subsets"],
        )

    def test_legal_configs(self):
        """Configurations that must pass the check without raising."""
        self._test_case(["valid"], [])
        self._test_case(["valid", "valid1"], ["--ignore-unused-valid-subsets"])
        self._test_case(["valid", "valid1"], ["--combine-val"])
        self._test_case(["valid", "valid1"], ["--valid-subset", "valid,valid1"])
        self._test_case(["valid", "valid1"], ["--valid-subset", "valid1"])
        self._test_case(
            ["valid", "valid1"], ["--combine-val", "--ignore-unused-valid-subsets"]
        )
        # valid.bin doesn't need to be ignored.
        self._test_case(["valid1"], ["--valid-subset", "valid1"])

    def test_disable_validation(self):
        """With validation disabled the check never raises."""
        self._test_case([], ["--disable-validation"])
        self._test_case(["valid", "valid1"], ["--disable-validation"])

    def test_dummy_task(self):
        """The check must not raise for the dummy_lm task (no data dir given)."""
        cfg = make_lm_config(task="dummy_lm")
        raise_if_valid_subsets_unintentionally_ignored(cfg)

    def test_masked_dummy_task(self):
        """The check must not raise for the dummy_masked_lm task."""
        cfg = make_lm_config(task="dummy_masked_lm")
        raise_if_valid_subsets_unintentionally_ignored(cfg)
def dummy_dictionary(vocab_size, prefix="token_"):
    """Return a finalized Dictionary holding ``vocab_size`` symbols.

    Symbols are named ``prefix`` followed by their index.
    """
    vocab = Dictionary()
    for index in range(vocab_size):
        vocab.add_symbol(prefix + str(index))
    # padding_factor=1 means no extra padding symbols are added
    vocab.finalize(padding_factor=1)
    return vocab


def dummy_dataloader(
    samples,
    padding_idx=1,
    eos_idx=2,
    batch_size=None,
):
    """Return an iterator over collated batches built from ``samples``.

    Samples that lack an ``"id"`` key get their list position as id (the
    dicts are modified in place). When ``batch_size`` is None, all samples
    form a single batch.
    """
    if batch_size is None:
        batch_size = len(samples)

    # fill in any missing ids
    for position, sample in enumerate(samples):
        sample.setdefault("id", position)

    def _collate(batch):
        return collate(batch, padding_idx, eos_idx)

    loader = torch.utils.data.DataLoader(
        TestDataset(samples),
        batch_size=batch_size,
        collate_fn=_collate,
    )
    return iter(loader)
def create_dummy_data(
    data_dir, num_examples=100, maxlen=20, alignment=False, languages=None
):
    """Write random text files for the train/valid/test splits.

    Each of ``{train,valid,test}.{in,out}`` gets ``num_examples`` lines of
    1..``maxlen`` space-separated lowercase letters.

    Args:
        data_dir: existing directory to write into.
        num_examples: number of lines per file.
        maxlen: maximum number of tokens per line.
        alignment: if True, also write ``{train,valid,test}.align`` with
            random ``src-tgt`` index pairs, read from the files directly
            under ``data_dir``.
        languages: if given, the text files are written into one
            sub-directory of ``data_dir`` per language instead.
    """

    def _write_random_text(out_dir, filename):
        # Draw every character code up front: 97..122 are 'a'..'z'.
        codes = torch.rand(num_examples * maxlen)
        codes = 97 + torch.floor(26 * codes).int()
        cursor = 0
        with open(os.path.join(out_dir, filename), "w") as out:
            for _ in range(num_examples):
                length = random.randint(1, maxlen)
                tokens = [chr(code) for code in codes[cursor : cursor + length]]
                print(" ".join(tokens), file=out)
                cursor += length

    def _write_random_alignments(filename_src, filename_tgt, filename):
        src_path = os.path.join(data_dir, filename_src)
        tgt_path = os.path.join(data_dir, filename_tgt)
        out_path = os.path.join(data_dir, filename)
        with open(src_path, "r") as src_f, open(tgt_path, "r") as tgt_f, open(
            out_path, "w"
        ) as out:
            for src_line, tgt_line in zip(src_f, tgt_f):
                src_len = len(src_line.split())
                tgt_len = len(tgt_line.split())
                avg_len = (src_len + tgt_len) // 2
                count = random.randint(avg_len // 2, 2 * avg_len)
                src_idx = torch.floor(torch.rand(count) * src_len).int().tolist()
                tgt_idx = torch.floor(torch.rand(count) * tgt_len).int().tolist()
                pairs = ["{}-{}".format(s, t) for s, t in zip(src_idx, tgt_idx)]
                print(" ".join(pairs), file=out)

    splits = ("train", "valid", "test")
    text_files = [f"{split}.{side}" for split in splits for side in ("in", "out")]

    if languages is None:
        # En only dummy dataset
        for name in text_files:
            _write_random_text(data_dir, name)
    else:
        for lang in languages:
            lang_dir = os.path.join(data_dir, lang)
            os.makedirs(lang_dir, exist_ok=True)
            for name in text_files:
                _write_random_text(lang_dir, name)

    if alignment:
        for split in splits:
            _write_random_alignments(f"{split}.in", f"{split}.out", f"{split}.align")
def preprocess_translation_data(data_dir, extra_flags=None):
    """Run fairseq preprocessing on the ``in``/``out`` pair in ``data_dir``.

    Binarized output is written back into ``data_dir``; ``extra_flags`` are
    appended to the preprocessing command line.
    """
    base_flags = [
        "--source-lang",
        "in",
        "--target-lang",
        "out",
        "--trainpref",
        os.path.join(data_dir, "train"),
        "--validpref",
        os.path.join(data_dir, "valid"),
        "--testpref",
        os.path.join(data_dir, "test"),
        "--thresholdtgt",
        "0",
        "--thresholdsrc",
        "0",
        "--destdir",
        data_dir,
    ]
    parser = options.get_preprocessing_parser()
    parsed = parser.parse_args(base_flags + (extra_flags or []))
    preprocess.main(parsed)


def preprocess_summarization_data(data_dir, extra_flags=None):
    """Like ``preprocess_translation_data`` but with a joined dictionary."""
    base_flags = [
        "--source-lang",
        "in",
        "--target-lang",
        "out",
        "--trainpref",
        os.path.join(data_dir, "train"),
        "--validpref",
        os.path.join(data_dir, "valid"),
        "--testpref",
        os.path.join(data_dir, "test"),
        "--thresholdtgt",
        "0",
        "--thresholdsrc",
        "0",
        "--joined-dictionary",
        "--destdir",
        data_dir,
    ]
    parser = options.get_preprocessing_parser()
    parsed = parser.parse_args(base_flags + (extra_flags or []))
    preprocess.main(parsed)
def generate_main(data_dir, extra_flags=None, path=None):
    """Run batch generation and then interactive generation on ``data_dir``.

    Args:
        data_dir: directory holding the binarized data (and, by default, the
            checkpoint).
        extra_flags: extra generation flags; defaults to
            ``["--print-alignment"]`` when None.
        path: checkpoint path; defaults to ``<data_dir>/checkpoint_last.pt``.
    """
    if extra_flags is None:
        extra_flags = [
            "--print-alignment",
        ]
    if path is None:
        path = os.path.join(data_dir, "checkpoint_last.pt")
    generate_parser = options.get_generation_parser()
    generate_args = options.parse_args_and_arch(
        generate_parser,
        [
            data_dir,
            "--path",
            path,
            "--beam",
            "3",
            "--batch-size",
            "64",
            "--max-len-b",
            "5",
            "--gen-subset",
            "valid",
            "--no-progress-bar",
            "--num-workers",
            "0",
        ]
        + (extra_flags or []),
    )

    # evaluate model in batch mode
    generate.main(generate_args)

    # evaluate model interactively
    generate_args.buffer_size = 0
    generate_args.input = "-"
    generate_args.batch_size = None
    orig_stdin = sys.stdin
    sys.stdin = StringIO("h e l l o\n")
    try:
        interactive.main(generate_args)
    finally:
        # Always restore stdin so a failure here cannot leave it replaced
        # for whatever runs next in the same process.
        sys.stdin = orig_stdin
"max_decoder_positions", 100) self.args = args def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None): if incremental_state is not None: prev_output_tokens = prev_output_tokens[:, -1:] bbsz = prev_output_tokens.size(0) vocab = len(self.dictionary) src_len = encoder_out.encoder_out.size(1) tgt_len = prev_output_tokens.size(1) # determine number of steps if incremental_state is not None: # cache step number step = utils.get_incremental_state(self, incremental_state, "step") if step is None: step = 0 utils.set_incremental_state(self, incremental_state, "step", step + 1) steps = [step] else: steps = list(range(tgt_len)) # define output in terms of raw probs if hasattr(self.args, "probs"): assert ( self.args.probs.dim() == 3 ), "expected probs to have size bsz*steps*vocab" probs = self.args.probs.index_select(1, torch.LongTensor(steps)) else: probs = torch.FloatTensor(bbsz, len(steps), vocab).zero_() for i, step in enumerate(steps): # args.beam_probs gives the probability for every vocab element, # starting with eos, then unknown, and then the rest of the vocab if step < len(self.args.beam_probs): probs[:, i, self.dictionary.eos() :] = self.args.beam_probs[step] else: probs[:, i, self.dictionary.eos()] = 1.0 # random attention attn = torch.rand(bbsz, tgt_len, src_len) dev = prev_output_tokens.device return probs.to(dev), {"attn": [attn.to(dev)]} def get_normalized_probs(self, net_output, log_probs, _): # the decoder returns probabilities directly probs = net_output[0] if log_probs: return probs.log() else: return probs def max_positions(self): return self.args.max_decoder_positions class TestReshapingEncoder(FairseqEncoder): def __init__(self, args, dictionary): super().__init__(dictionary) self.args = args def forward(self, src_tokens, src_lengths=None, **kwargs): b_sz, t_sz = src_tokens.shape padding_needed = t_sz % 2 x = src_tokens if padding_needed > 0: padding_needed = 2 - padding_needed x = F.pad(x, (0, padding_needed)) return EncoderOut( 
encoder_out=x.view(b_sz, -1, 2), encoder_padding_mask=None, encoder_embedding=None, encoder_states=None, src_tokens=None, src_lengths=None, ) def reorder_encoder_out(self, encoder_out, new_order): return EncoderOut( encoder_out=encoder_out.encoder_out.index_select(0, new_order), encoder_padding_mask=None, encoder_embedding=None, encoder_states=None, src_tokens=None, src_lengths=None, ) class TestReshapingModel(FairseqEncoderDecoderModel): def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @classmethod def build_model(cls, args, task): encoder = TestReshapingEncoder(args, task.source_dictionary) decoder = TestIncrementalDecoder(args, task.target_dictionary) return cls(encoder, decoder) class TestAdditionalInputEncoder(FairseqEncoder): def __init__(self, args, dictionary): super().__init__(dictionary) self.args = args def forward(self, src_tokens, src_lengths=None, **kwargs): assert "fancy_other_input" in kwargs assert kwargs["fancy_other_input"] is not None return EncoderOut( encoder_out=src_tokens, encoder_padding_mask=None, encoder_embedding=None, encoder_states=None, src_tokens=None, src_lengths=None, ) def reorder_encoder_out(self, encoder_out, new_order): return EncoderOut( encoder_out=encoder_out.encoder_out.index_select(0, new_order), encoder_padding_mask=None, encoder_embedding=None, encoder_states=None, src_tokens=None, src_lengths=None, ) class TestAdditionalInputModel(FairseqEncoderDecoderModel): def __init__(self, encoder, decoder): super().__init__(encoder, decoder) @classmethod def build_model(cls, args, task): encoder = TestAdditionalInputEncoder(args, task.source_dictionary) decoder = TestIncrementalDecoder(args, task.target_dictionary) return cls(encoder, decoder) def forward(self, src_tokens, src_lengths, prev_output_tokens, **kwargs): encoder_out = self.encoder(src_tokens, src_lengths=src_lengths, **kwargs) decoder_out = self.decoder( prev_output_tokens, encoder_out=encoder_out, **kwargs ) return decoder_out def 
train_language_model( data_dir, arch, extra_flags=None, run_validation=False, extra_valid_flags=None, task="language_modeling", world_size=1, ): train_parser = options.get_training_parser() train_args = options.parse_args_and_arch( train_parser, [ "--task", task, data_dir, "--arch", arch, "--optimizer", "adam", "--lr", "0.0001", "--max-tokens", "500", "--tokens-per-sample", "500", "--save-dir", data_dir, "--max-epoch", "1", "--no-progress-bar", "--distributed-world-size", str(world_size), "--ddp-backend", "no_c10d", "--num-workers", "0", ] + (extra_flags or []), ) cfg = convert_namespace_to_omegaconf(train_args) distributed_utils.call_main(cfg, train.main) if run_validation: # test validation validate_parser = options.get_validation_parser() validate_args = options.parse_args_and_arch( validate_parser, [ "--task", task, data_dir, "--path", os.path.join(data_dir, "checkpoint_last.pt"), "--valid-subset", "valid", "--max-tokens", "500", "--no-progress-bar", "--num-workers", "0", ] + (extra_valid_flags or []), ) validate.main(validate_args) def sizes(data): return [len(sentence) for sentence in data] POPULATION = string.ascii_letters + string.digits def make_sentence() -> tp.List[str]: length = random.randint(10, 50) return random.choices( population=POPULATION, k=length, weights=range(1, len(POPULATION) + 1) ) def make_data(length=1000, out_file=None) -> tp.List[tp.List[str]]: data = ( [make_sentence() for _ in range(0, length)] # add all the symbols at least once + [list(string.ascii_letters), list(string.digits)] ) if out_file is not None: with open(out_file, "w", encoding="utf-8") as out: for s in data: print(" ".join(s), file=out) return data def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary: d = Dictionary() for s in data: for token in s: d.add_symbol(token) d.finalize() return d ================================================ FILE: train.py ================================================ #!/usr/bin/env python3 -u # Copyright (c) Facebook, Inc. 
and its affiliates. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. """ Legacy entry point. Use fairseq_cli/train.py or fairseq-train instead. """ from fairseq_cli.train import cli_main if __name__ == "__main__": cli_main()