Repository: jctian98/e2e_lfmmi Branch: master Commit: 34b805690663 Files: 1103 Total size: 7.8 MB Directory structure: gitextract_ni0k430x/ ├── .gitignore ├── .run.sh.swp ├── README.md ├── __init__.py ├── asr/ │ ├── __init__.py │ ├── asr_mix_utils.py │ ├── asr_utils.py │ ├── chainer_backend/ │ │ ├── __init__.py │ │ └── asr.py │ └── pytorch_backend/ │ ├── __init__.py │ ├── asr.py │ ├── asr_init.py │ ├── asr_mix.py │ └── recog.py ├── bin/ │ ├── __init__.py │ ├── asr_align.py │ ├── asr_enhance.py │ ├── asr_recog.py │ ├── asr_train.py │ ├── lm_train.py │ ├── mt_train.py │ ├── mt_trans.py │ ├── st_train.py │ ├── st_trans.py │ ├── tts_decode.py │ ├── tts_train.py │ ├── vc_decode.py │ └── vc_train.py ├── egs/ │ ├── .gitignore │ ├── aishell1/ │ │ ├── .gitignore │ │ ├── aed.sh │ │ ├── cmd.sh │ │ ├── conf/ │ │ │ ├── fbank.conf │ │ │ ├── gpu.conf │ │ │ ├── lm.yaml │ │ │ ├── lm_rnn.yaml │ │ │ ├── lm_transformer.yaml │ │ │ ├── pitch.conf │ │ │ ├── queue.conf │ │ │ ├── slurm.conf │ │ │ ├── specaug.yaml │ │ │ ├── specaug_test.yaml │ │ │ └── tuning/ │ │ │ ├── decode_pytorch_transformer.yaml │ │ │ ├── decode_rnn.yaml │ │ │ ├── train_pytorch_conformer_kernel15.yaml │ │ │ ├── train_pytorch_conformer_kernel31.yaml │ │ │ ├── train_pytorch_conformer_kernel31_large.yaml │ │ │ ├── train_pytorch_conformer_kernel31_small.yaml │ │ │ ├── train_pytorch_transformer.yaml │ │ │ ├── train_rnn.yaml │ │ │ └── transducer/ │ │ │ ├── decode_default.yaml │ │ │ ├── train_conformer-rnn_transducer.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4_att.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4_small.yaml │ │ │ ├── train_conformer-rnn_transducer_ngpu4.yaml │ │ │ ├── train_conformer-rnn_transducer_ngpu4_large.yaml │ │ │ ├── train_transducer.yaml │ │ │ └── train_transducer_aux.yaml │ │ ├── local/ │ │ │ ├── add_lex_disambig.pl │ │ │ ├── aishell_data_prep.sh │ │ │ ├── aishell_train_lms.sh │ │ │ ├── apply_map.pl │ │ │ ├── build_sp_text.py 
│ │ │ ├── build_word_mapping.py │ │ │ ├── compile_bigram.sh │ │ │ ├── download_and_untar.sh │ │ │ ├── fstaddselfloops.pl │ │ │ ├── k2_aishell_prepare_dict.sh │ │ │ ├── k2_aishell_prepare_dict_char.sh │ │ │ ├── k2_prepare_lang.sh │ │ │ ├── make_lexicon_fst.py │ │ │ ├── max_rescore.py │ │ │ ├── parse_options.sh │ │ │ ├── parse_text_jieba.py │ │ │ ├── prepare_word_lex.py │ │ │ └── sym2int.pl │ │ ├── nt.sh │ │ ├── path.sh │ │ └── prepare.sh │ ├── aishell2/ │ │ ├── .gitignore │ │ ├── aed.sh │ │ ├── conf/ │ │ │ ├── .fbank.conf.swp │ │ │ ├── fbank.conf │ │ │ ├── gpu.conf │ │ │ ├── lm.yaml │ │ │ ├── lm_rnn.yaml │ │ │ ├── lm_transformer.yaml │ │ │ ├── pitch.conf │ │ │ ├── queue.conf │ │ │ ├── slurm.conf │ │ │ ├── specaug.yaml │ │ │ ├── specaug_test.yaml │ │ │ └── tuning/ │ │ │ ├── decode_pytorch_transformer.yaml │ │ │ ├── decode_rnn.yaml │ │ │ ├── train_pytorch_conformer_kernel15.yaml │ │ │ ├── train_pytorch_conformer_kernel31.yaml │ │ │ ├── train_pytorch_transformer.yaml │ │ │ ├── train_rnn.yaml │ │ │ └── transducer/ │ │ │ ├── decode_default.yaml │ │ │ ├── train_conformer-rnn_transducer.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4.yaml │ │ │ ├── train_conformer-rnn_transducer_ngpu4.yaml │ │ │ ├── train_transducer.yaml │ │ │ └── train_transducer_aux.yaml │ │ ├── local/ │ │ │ ├── add_lex_disambig.pl │ │ │ ├── apply_map.pl │ │ │ ├── fstaddselfloops.pl │ │ │ ├── jieba_split_text.py │ │ │ ├── k2_prepare_lang.sh │ │ │ ├── make_lexicon_fst.py │ │ │ ├── max_rescore.py │ │ │ ├── mmi_rescore.sh │ │ │ ├── parse_options.sh │ │ │ ├── prepare_data.sh │ │ │ ├── prepare_dict.sh │ │ │ ├── rerank.py │ │ │ ├── sym2int.pl │ │ │ ├── train_lms.sh │ │ │ └── word_segmentation.py │ │ ├── nt.sh │ │ └── prepare.sh │ ├── asrucs/ │ │ ├── .gitignore │ │ ├── cmd.sh │ │ ├── conf/ │ │ │ ├── decode.yaml │ │ │ ├── fbank.conf │ │ │ ├── gpu.conf │ │ │ ├── lm.yaml │ │ │ ├── lm_rnn.yaml │ │ │ ├── lm_transformer.yaml │ │ │ ├── pitch.conf │ │ │ ├── pure_ctc.yaml │ │ │ ├── queue.conf │ │ │ ├── 
slurm.conf │ │ │ ├── specaug.yaml │ │ │ ├── specaug_test.yaml │ │ │ ├── train.yaml │ │ │ ├── train_conformer-rnn_transducer_cs.yaml │ │ │ └── tuning/ │ │ │ ├── decode_pytorch_transformer.yaml │ │ │ ├── decode_rnn.yaml │ │ │ ├── train_pytorch_conformer_kernel15.yaml │ │ │ ├── train_pytorch_conformer_kernel31.yaml │ │ │ ├── train_pytorch_conformer_kernel31_large.yaml │ │ │ ├── train_pytorch_conformer_kernel31_small.yaml │ │ │ ├── train_pytorch_transformer.yaml │ │ │ ├── train_rnn.yaml │ │ │ └── transducer/ │ │ │ ├── decode_default.yaml │ │ │ ├── train_conformer-rnn_transducer.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4_att.yaml │ │ │ ├── train_conformer-rnn_transducer_aux_ngpu4_small.yaml │ │ │ ├── train_conformer-rnn_transducer_ngpu4.yaml │ │ │ ├── train_conformer-rnn_transducer_ngpu4_large.yaml │ │ │ ├── train_transducer.yaml │ │ │ └── train_transducer_aux.yaml │ │ ├── espnet │ │ ├── espnet_utils │ │ ├── local/ │ │ │ ├── add_seperator.py │ │ │ ├── generate_fake_cs.py │ │ │ └── prepare_fake_cs.sh │ │ ├── nt.sh │ │ ├── path.sh │ │ ├── prepare.sh │ │ ├── steps │ │ ├── text │ │ └── utils │ ├── espnet_utils/ │ │ ├── add_uttcls_json.py │ │ ├── addjson.py │ │ ├── apply-cmvn.py │ │ ├── asr_align_wav.sh │ │ ├── average_checkpoints.py │ │ ├── build_fake_lexicon.py │ │ ├── build_sp_text.py │ │ ├── calculate_rtf.py │ │ ├── change_root.py │ │ ├── change_yaml.py │ │ ├── clean_corpus.sh │ │ ├── compute-cmvn-stats.py │ │ ├── compute-fbank-feats.py │ │ ├── compute-stft-feats.py │ │ ├── concat_json_multiref.py │ │ ├── concatjson.py │ │ ├── convert_fbank.sh │ │ ├── convert_fbank_to_wav.py │ │ ├── copy-feats.py │ │ ├── data2json.sh │ │ ├── divide_lang.sh │ │ ├── double_precious_cer.py │ │ ├── download_from_google_drive.sh │ │ ├── dump-pcm.py │ │ ├── dump.sh │ │ ├── dump_pcm.sh │ │ ├── eval-source-separation.py │ │ ├── eval_perm_free_error.py │ │ ├── eval_source_separation.sh │ │ ├── feat-to-shape.py │ │ ├── 
feat_to_shape.sh │ │ ├── feats2npy.py │ │ ├── filt.py │ │ ├── filter_all_eng_utts.py │ │ ├── filter_scp.py │ │ ├── filter_trn.py │ │ ├── free-gpu.sh │ │ ├── gdown.pl │ │ ├── generate_wav.sh │ │ ├── generate_wav_from_fbank.py │ │ ├── get_yaml.py │ │ ├── jieba_build_dict.py │ │ ├── json2sctm.py │ │ ├── json2text.py │ │ ├── json2trn.py │ │ ├── json2trn_mt.py │ │ ├── json2trn_wo_dict.py │ │ ├── k2/ │ │ │ ├── add_lex_disambig.pl │ │ │ ├── apply_map.pl │ │ │ ├── fstaddselfloops.pl │ │ │ ├── k2_prepare_lang.sh │ │ │ ├── parse_options.sh │ │ │ └── sym2int.pl │ │ ├── make_fbank.sh │ │ ├── make_pair_json.py │ │ ├── make_stft.sh │ │ ├── mbr_analysis.py │ │ ├── mcd_calculate.py │ │ ├── merge_scp2json.py │ │ ├── mergejson.py │ │ ├── mix-mono-wav-scp.py │ │ ├── mmi_rescore.sh │ │ ├── pack_model.sh │ │ ├── prepare_block_load.sh │ │ ├── prepare_mer.py │ │ ├── queue-freegpu.pl │ │ ├── recog_wav.sh │ │ ├── reduce_data_dir.sh │ │ ├── remove_longshortdata.sh │ │ ├── remove_punctuation.pl │ │ ├── rerank_mmi.py │ │ ├── result2json.py │ │ ├── score_bleu.sh │ │ ├── score_lang_id.py │ │ ├── score_sclite.sh │ │ ├── score_sclite_case.sh │ │ ├── score_sclite_wo_dict.sh │ │ ├── scp2json.py │ │ ├── show_result.sh │ │ ├── significant_test.sh │ │ ├── sort_scp_by_length.py │ │ ├── speed_perturb.sh │ │ ├── split_scp.py │ │ ├── split_scp_fix_length.py │ │ ├── splitjson.py │ │ ├── spm_decode │ │ ├── spm_encode │ │ ├── spm_train │ │ ├── stdout.pl │ │ ├── synth_wav.sh │ │ ├── text2token.py │ │ ├── text2vocabulary.py │ │ ├── text_norm.py │ │ ├── trace_rnnt.py │ │ ├── train_lms_srilm.sh │ │ ├── translate_wav.sh │ │ ├── trim_silence.py │ │ ├── trim_silence.sh │ │ ├── trn2ctm.py │ │ ├── trn2stm.py │ │ ├── update_json.sh │ │ ├── word_ngram_rescore.py │ │ └── word_ngram_rescore.sh │ ├── steps/ │ │ ├── align_basis_fmllr.sh │ │ ├── align_basis_fmllr_lats.sh │ │ ├── align_fmllr.sh │ │ ├── align_fmllr_lats.sh │ │ ├── align_lvtln.sh │ │ ├── align_raw_fmllr.sh │ │ ├── align_sgmm2.sh │ │ ├── align_si.sh │ │ ├── 
best_path_weights.sh │ │ ├── cleanup/ │ │ │ ├── clean_and_segment_data.sh │ │ │ ├── clean_and_segment_data_nnet3.sh │ │ │ ├── combine_short_segments.py │ │ │ ├── create_segments_from_ctm.pl │ │ │ ├── debug_lexicon.sh │ │ │ ├── decode_fmllr_segmentation.sh │ │ │ ├── decode_segmentation.sh │ │ │ ├── decode_segmentation_nnet3.sh │ │ │ ├── find_bad_utts.sh │ │ │ ├── find_bad_utts_nnet.sh │ │ │ ├── internal/ │ │ │ │ ├── align_ctm_ref.py │ │ │ │ ├── compute_tf_idf.py │ │ │ │ ├── ctm_to_text.pl │ │ │ │ ├── get_ctm_edits.py │ │ │ │ ├── get_non_scored_words.py │ │ │ │ ├── get_pron_stats.py │ │ │ │ ├── make_one_biased_lm.py │ │ │ │ ├── modify_ctm_edits.py │ │ │ │ ├── resolve_ctm_edits_overlaps.py │ │ │ │ ├── retrieve_similar_docs.py │ │ │ │ ├── segment_ctm_edits.py │ │ │ │ ├── segment_ctm_edits_mild.py │ │ │ │ ├── split_text_into_docs.pl │ │ │ │ ├── stitch_documents.py │ │ │ │ ├── taint_ctm_edits.py │ │ │ │ └── tf_idf.py │ │ │ ├── lattice_oracle_align.sh │ │ │ ├── make_biased_lm_graphs.sh │ │ │ ├── make_biased_lms.py │ │ │ ├── make_segmentation_data_dir.sh │ │ │ ├── make_segmentation_graph.sh │ │ │ ├── make_utterance_fsts.pl │ │ │ ├── make_utterance_graph.sh │ │ │ ├── segment_long_utterances.sh │ │ │ ├── segment_long_utterances_nnet3.sh │ │ │ └── split_long_utterance.sh │ │ ├── combine_ali_dirs.sh │ │ ├── combine_trans_dirs.sh │ │ ├── compare_alignments.sh │ │ ├── compute_cmvn_stats.sh │ │ ├── compute_vad_decision.sh │ │ ├── conf/ │ │ │ ├── append_eval_to_ctm.py │ │ │ ├── append_prf_to_ctm.py │ │ │ ├── apply_calibration.sh │ │ │ ├── convert_ctm_to_tra.py │ │ │ ├── get_ctm_conf.sh │ │ │ ├── lattice_depth_per_frame.sh │ │ │ ├── parse_arpa_unigrams.py │ │ │ ├── prepare_calibration_data.py │ │ │ ├── prepare_word_categories.py │ │ │ └── train_calibration.sh │ │ ├── copy_ali_dir.sh │ │ ├── copy_lat_dir.sh │ │ ├── copy_trans_dir.sh │ │ ├── data/ │ │ │ ├── augment_data_dir.py │ │ │ ├── data_dir_manipulation_lib.py │ │ │ ├── make_musan.py │ │ │ ├── make_musan.sh │ │ │ └── 
reverberate_data_dir.py │ │ ├── decode.sh │ │ ├── decode_basis_fmllr.sh │ │ ├── decode_biglm.sh │ │ ├── decode_combine.sh │ │ ├── decode_fmllr.sh │ │ ├── decode_fmllr_extra.sh │ │ ├── decode_fmmi.sh │ │ ├── decode_fromlats.sh │ │ ├── decode_lvtln.sh │ │ ├── decode_nolats.sh │ │ ├── decode_raw_fmllr.sh │ │ ├── decode_sgmm2.sh │ │ ├── decode_sgmm2_fromlats.sh │ │ ├── decode_sgmm2_rescore.sh │ │ ├── decode_sgmm2_rescore_project.sh │ │ ├── decode_with_map.sh │ │ ├── diagnostic/ │ │ │ ├── analyze_alignments.sh │ │ │ ├── analyze_lats.sh │ │ │ ├── analyze_lattice_depth_stats.py │ │ │ └── analyze_phone_length_stats.py │ │ ├── dict/ │ │ │ ├── apply_g2p.sh │ │ │ ├── apply_g2p_phonetisaurus.sh │ │ │ ├── apply_lexicon_edits.py │ │ │ ├── get_pron_stats.py │ │ │ ├── internal/ │ │ │ │ ├── get_subsegments.py │ │ │ │ ├── prune_pron_candidates.py │ │ │ │ └── sum_arc_info.py │ │ │ ├── learn_lexicon_bayesian.sh │ │ │ ├── learn_lexicon_greedy.sh │ │ │ ├── merge_learned_lexicons.py │ │ │ ├── prons_to_lexicon.py │ │ │ ├── prune_pron_candidates.py │ │ │ ├── select_prons_bayesian.py │ │ │ ├── select_prons_greedy.py │ │ │ ├── train_g2p.sh │ │ │ └── train_g2p_phonetisaurus.sh │ │ ├── get_ctm.sh │ │ ├── get_ctm_conf_fast.sh │ │ ├── get_ctm_fast.sh │ │ ├── get_fmllr_basis.sh │ │ ├── get_lexicon_probs.sh │ │ ├── get_prons.sh │ │ ├── get_train_ctm.sh │ │ ├── info/ │ │ │ ├── chain_dir_info.pl │ │ │ ├── gmm_dir_info.pl │ │ │ ├── nnet2_dir_info.pl │ │ │ ├── nnet3_dir_info.pl │ │ │ └── nnet3_disc_dir_info.pl │ │ ├── libs/ │ │ │ ├── __init__.py │ │ │ ├── common.py │ │ │ └── nnet3/ │ │ │ ├── __init__.py │ │ │ ├── report/ │ │ │ │ ├── __init__.py │ │ │ │ └── log_parse.py │ │ │ ├── train/ │ │ │ │ ├── __init__.py │ │ │ │ ├── chain_objf/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── acoustic_model.py │ │ │ │ ├── common.py │ │ │ │ ├── dropout_schedule.py │ │ │ │ └── frame_level_objf/ │ │ │ │ ├── __init__.py │ │ │ │ ├── acoustic_model.py │ │ │ │ ├── common.py │ │ │ │ └── raw_model.py │ │ │ └── xconfig/ │ │ │ ├── 
__init__.py │ │ │ ├── attention.py │ │ │ ├── basic_layers.py │ │ │ ├── composite_layers.py │ │ │ ├── convolution.py │ │ │ ├── gru.py │ │ │ ├── layers.py │ │ │ ├── lstm.py │ │ │ ├── parser.py │ │ │ ├── stats_layer.py │ │ │ ├── trivial_layers.py │ │ │ └── utils.py │ │ ├── lmrescore.sh │ │ ├── lmrescore_const_arpa.sh │ │ ├── lmrescore_const_arpa_undeterminized.sh │ │ ├── lmrescore_rnnlm_lat.sh │ │ ├── make_denlats.sh │ │ ├── make_denlats_sgmm2.sh │ │ ├── make_fbank.sh │ │ ├── make_fbank_pitch.sh │ │ ├── make_index.sh │ │ ├── make_mfcc.sh │ │ ├── make_mfcc_pitch.sh │ │ ├── make_mfcc_pitch_online.sh │ │ ├── make_phone_graph.sh │ │ ├── make_plp.sh │ │ ├── make_plp_pitch.sh │ │ ├── nnet/ │ │ │ ├── align.sh │ │ │ ├── decode.sh │ │ │ ├── ivector/ │ │ │ │ ├── extract_ivectors.sh │ │ │ │ ├── train_diag_ubm.sh │ │ │ │ └── train_ivector_extractor.sh │ │ │ ├── make_bn_feats.sh │ │ │ ├── make_denlats.sh │ │ │ ├── make_fmllr_feats.sh │ │ │ ├── make_fmmi_feats.sh │ │ │ ├── make_priors.sh │ │ │ ├── pretrain_dbn.sh │ │ │ ├── train.sh │ │ │ ├── train_mmi.sh │ │ │ ├── train_mpe.sh │ │ │ └── train_scheduler.sh │ │ ├── nnet2/ │ │ │ ├── adjust_priors.sh │ │ │ ├── align.sh │ │ │ ├── check_ivectors_compatible.sh │ │ │ ├── convert_lda_to_raw.sh │ │ │ ├── convert_nnet1_to_nnet2.sh │ │ │ ├── create_appended_model.sh │ │ │ ├── decode.sh │ │ │ ├── dump_bottleneck_features.sh │ │ │ ├── get_egs.sh │ │ │ ├── get_egs2.sh │ │ │ ├── get_egs_discriminative2.sh │ │ │ ├── get_ivector_id.sh │ │ │ ├── get_lda.sh │ │ │ ├── get_lda_block.sh │ │ │ ├── get_perturbed_feats.sh │ │ │ ├── make_denlats.sh │ │ │ ├── make_multisplice_configs.py │ │ │ ├── relabel_egs.sh │ │ │ ├── relabel_egs2.sh │ │ │ ├── remove_egs.sh │ │ │ ├── retrain_fast.sh │ │ │ ├── retrain_simple2.sh │ │ │ ├── retrain_tanh.sh │ │ │ ├── train_block.sh │ │ │ ├── train_convnet_accel2.sh │ │ │ ├── train_discriminative.sh │ │ │ ├── train_discriminative2.sh │ │ │ ├── train_discriminative_multilang2.sh │ │ │ ├── train_more.sh │ │ │ ├── train_more2.sh │ 
│ │ ├── train_multilang2.sh │ │ │ ├── train_multisplice_accel2.sh │ │ │ ├── train_multisplice_ensemble.sh │ │ │ ├── train_pnorm.sh │ │ │ ├── train_pnorm_accel2.sh │ │ │ ├── train_pnorm_bottleneck_fast.sh │ │ │ ├── train_pnorm_ensemble.sh │ │ │ ├── train_pnorm_fast.sh │ │ │ ├── train_pnorm_multisplice.sh │ │ │ ├── train_pnorm_multisplice2.sh │ │ │ ├── train_pnorm_simple.sh │ │ │ ├── train_pnorm_simple2.sh │ │ │ ├── train_tanh.sh │ │ │ ├── train_tanh_bottleneck.sh │ │ │ ├── train_tanh_fast.sh │ │ │ └── update_nnet.sh │ │ ├── nnet3/ │ │ │ ├── adjust_priors.sh │ │ │ ├── align.sh │ │ │ ├── align_lats.sh │ │ │ ├── chain/ │ │ │ │ ├── align_lats.sh │ │ │ │ ├── build_tree.sh │ │ │ │ ├── build_tree_multiple_sources.sh │ │ │ │ ├── e2e/ │ │ │ │ │ ├── README.txt │ │ │ │ │ ├── compute_biphone_stats.py │ │ │ │ │ ├── get_egs_e2e.sh │ │ │ │ │ ├── prepare_e2e.sh │ │ │ │ │ ├── text_to_phones.py │ │ │ │ │ └── train_e2e.py │ │ │ │ ├── gen_topo.pl │ │ │ │ ├── gen_topo.py │ │ │ │ ├── gen_topo2.py │ │ │ │ ├── gen_topo3.py │ │ │ │ ├── gen_topo4.py │ │ │ │ ├── gen_topo5.py │ │ │ │ ├── gen_topo_orig.py │ │ │ │ ├── get_egs.sh │ │ │ │ ├── get_model_context.sh │ │ │ │ ├── get_phone_post.sh │ │ │ │ ├── make_weighted_den_fst.sh │ │ │ │ ├── multilingual/ │ │ │ │ │ └── combine_egs.sh │ │ │ │ ├── train.py │ │ │ │ └── train_tdnn.sh │ │ │ ├── chain2/ │ │ │ │ ├── combine_egs.sh │ │ │ │ ├── compute_preconditioning_matrix.sh │ │ │ │ ├── get_raw_egs.sh │ │ │ │ ├── internal/ │ │ │ │ │ ├── get_best_model.sh │ │ │ │ │ └── get_train_schedule.py │ │ │ │ ├── process_egs.sh │ │ │ │ ├── randomize_egs.sh │ │ │ │ ├── train.sh │ │ │ │ ├── validate_processed_egs.sh │ │ │ │ ├── validate_randomized_egs.sh │ │ │ │ └── validate_raw_egs.sh │ │ │ ├── components.py │ │ │ ├── compute_output.sh │ │ │ ├── convert_nnet2_to_nnet3.py │ │ │ ├── decode.sh │ │ │ ├── decode_grammar.sh │ │ │ ├── decode_lookahead.sh │ │ │ ├── decode_looped.sh │ │ │ ├── decode_score_fusion.sh │ │ │ ├── decode_semisup.sh │ │ │ ├── dot/ │ │ │ │ ├── 
descriptor_parser.py │ │ │ │ └── nnet3_to_dot.py │ │ │ ├── get_degs.sh │ │ │ ├── get_egs.sh │ │ │ ├── get_egs_discriminative.sh │ │ │ ├── get_egs_targets.sh │ │ │ ├── get_saturation.pl │ │ │ ├── get_successful_models.py │ │ │ ├── lstm/ │ │ │ │ ├── make_configs.py │ │ │ │ └── train.sh │ │ │ ├── make_bottleneck_features.sh │ │ │ ├── make_denlats.sh │ │ │ ├── make_tdnn_configs.py │ │ │ ├── multilingual/ │ │ │ │ ├── allocate_multilingual_examples.py │ │ │ │ └── combine_egs.sh │ │ │ ├── nnet3_to_dot.sh │ │ │ ├── report/ │ │ │ │ ├── convert_model.py │ │ │ │ ├── generate_plots.py │ │ │ │ └── summarize_compute_debug_timing.py │ │ │ ├── tdnn/ │ │ │ │ ├── make_configs.py │ │ │ │ ├── train.sh │ │ │ │ └── train_raw_nnet.sh │ │ │ ├── train_discriminative.sh │ │ │ ├── train_dnn.py │ │ │ ├── train_raw_dnn.py │ │ │ ├── train_raw_rnn.py │ │ │ ├── train_rnn.py │ │ │ ├── train_tdnn.sh │ │ │ ├── xconfig_to_config.py │ │ │ └── xconfig_to_configs.py │ │ ├── online/ │ │ │ ├── decode.sh │ │ │ ├── nnet2/ │ │ │ │ ├── align.sh │ │ │ │ ├── copy_data_dir.sh │ │ │ │ ├── copy_ivector_dir.sh │ │ │ │ ├── decode.sh │ │ │ │ ├── dump_nnet_activations.sh │ │ │ │ ├── extract_ivectors.sh │ │ │ │ ├── extract_ivectors_online.sh │ │ │ │ ├── get_egs.sh │ │ │ │ ├── get_egs2.sh │ │ │ │ ├── get_egs_discriminative2.sh │ │ │ │ ├── get_pca_transform.sh │ │ │ │ ├── make_denlats.sh │ │ │ │ ├── prepare_online_decoding.sh │ │ │ │ ├── prepare_online_decoding_retrain.sh │ │ │ │ ├── prepare_online_decoding_transfer.sh │ │ │ │ ├── train_diag_ubm.sh │ │ │ │ └── train_ivector_extractor.sh │ │ │ ├── nnet3/ │ │ │ │ ├── decode.sh │ │ │ │ ├── decode_wake_word.sh │ │ │ │ └── prepare_online_decoding.sh │ │ │ └── prepare_online_decoding.sh │ │ ├── oracle_wer.sh │ │ ├── overlap/ │ │ │ ├── get_overlap_segments.py │ │ │ ├── get_overlap_targets.py │ │ │ ├── output_to_rttm.py │ │ │ ├── post_process_output.sh │ │ │ └── prepare_overlap_graph.py │ │ ├── paste_feats.sh │ │ ├── pytorchnn/ │ │ │ ├── check_py.py │ │ │ ├── 
compute_sentence_scores.py │ │ │ ├── data.py │ │ │ ├── lmrescore_nbest_pytorchnn.sh │ │ │ ├── model.py │ │ │ └── train.py │ │ ├── resegment_data.sh │ │ ├── resegment_text.sh │ │ ├── rnnlmrescore.sh │ │ ├── scoring/ │ │ │ ├── score_kaldi_cer.sh │ │ │ ├── score_kaldi_compare.sh │ │ │ └── score_kaldi_wer.sh │ │ ├── search_index.sh │ │ ├── segmentation/ │ │ │ ├── ali_to_targets.sh │ │ │ ├── combine_targets_dirs.sh │ │ │ ├── convert_targets_dir_to_whole_recording.sh │ │ │ ├── convert_utt2spk_and_segments_to_rttm.py │ │ │ ├── copy_targets_dir.sh │ │ │ ├── decode_sad.sh │ │ │ ├── detect_speech_activity.sh │ │ │ ├── evaluate_segmentation.pl │ │ │ ├── get_targets_for_out_of_segments.sh │ │ │ ├── internal/ │ │ │ │ ├── arc_info_to_targets.py │ │ │ │ ├── find_oov_phone.py │ │ │ │ ├── get_default_targets_for_out_of_segments.py │ │ │ │ ├── get_transform_probs_mat.py │ │ │ │ ├── merge_segment_targets_to_recording.py │ │ │ │ ├── merge_targets.py │ │ │ │ ├── prepare_sad_graph.py │ │ │ │ ├── resample_targets.py │ │ │ │ ├── sad_to_segments.py │ │ │ │ └── verify_phones_list.py │ │ │ ├── lats_to_targets.sh │ │ │ ├── merge_targets_dirs.sh │ │ │ ├── post_process_sad_to_segments.sh │ │ │ ├── prepare_targets_gmm.sh │ │ │ ├── resample_targets_dir.sh │ │ │ └── validate_targets_dir.sh │ │ ├── select_feats.sh │ │ ├── shift_feats.sh │ │ ├── subset_ali_dir.sh │ │ ├── tandem/ │ │ │ ├── align_fmllr.sh │ │ │ ├── align_sgmm2.sh │ │ │ ├── align_si.sh │ │ │ ├── decode.sh │ │ │ ├── decode_fmllr.sh │ │ │ ├── decode_sgmm2.sh │ │ │ ├── make_denlats.sh │ │ │ ├── make_denlats_sgmm2.sh │ │ │ ├── mk_aslf_lda_mllt.sh │ │ │ ├── mk_aslf_sgmm2.sh │ │ │ ├── train_deltas.sh │ │ │ ├── train_lda_mllt.sh │ │ │ ├── train_mllt.sh │ │ │ ├── train_mmi.sh │ │ │ ├── train_mmi_sgmm2.sh │ │ │ ├── train_mono.sh │ │ │ ├── train_sat.sh │ │ │ ├── train_sgmm2.sh │ │ │ └── train_ubm.sh │ │ ├── tfrnnlm/ │ │ │ ├── check_py.py │ │ │ ├── check_tensorflow_installed.sh │ │ │ ├── lmrescore_rnnlm_lat.sh │ │ │ ├── 
lmrescore_rnnlm_lat_pruned.sh │ │ │ ├── lstm.py │ │ │ ├── lstm_fast.py │ │ │ ├── reader.py │ │ │ └── vanilla_rnnlm.py │ │ ├── train_deltas.sh │ │ ├── train_diag_ubm.sh │ │ ├── train_lda_mllt.sh │ │ ├── train_lvtln.sh │ │ ├── train_map.sh │ │ ├── train_mmi.sh │ │ ├── train_mmi_fmmi.sh │ │ ├── train_mmi_fmmi_indirect.sh │ │ ├── train_mmi_sgmm2.sh │ │ ├── train_mono.sh │ │ ├── train_mpe.sh │ │ ├── train_quick.sh │ │ ├── train_raw_sat.sh │ │ ├── train_sat.sh │ │ ├── train_sat_basis.sh │ │ ├── train_segmenter.sh │ │ ├── train_sgmm2.sh │ │ ├── train_sgmm2_group.sh │ │ ├── train_smbr.sh │ │ ├── train_ubm.sh │ │ └── word_align_lattices.sh │ └── utils/ │ ├── add_disambig.pl │ ├── add_lex_disambig.pl │ ├── analyze_segments.pl │ ├── apply_map.pl │ ├── best_wer.sh │ ├── build_const_arpa_lm.sh │ ├── combine_data.sh │ ├── convert_slf.pl │ ├── convert_slf_parallel.sh │ ├── copy_data_dir.sh │ ├── create_data_link.pl │ ├── create_split_dir.pl │ ├── ctm/ │ │ ├── convert_ctm.pl │ │ ├── fix_ctm.sh │ │ └── resolve_ctm_overlaps.py │ ├── data/ │ │ ├── combine_short_segments.sh │ │ ├── convert_data_dir_to_whole.sh │ │ ├── extend_segment_times.py │ │ ├── extract_wav_segments_data_dir.sh │ │ ├── fix_subsegment_feats.pl │ │ ├── get_allowed_durations.py │ │ ├── get_frame_shift.sh │ │ ├── get_num_frames.sh │ │ ├── get_reco2dur.sh │ │ ├── get_reco2utt_for_data.sh │ │ ├── get_segments_for_data.sh │ │ ├── get_uniform_subsegments.py │ │ ├── get_utt2dur.sh │ │ ├── get_utt2num_frames.sh │ │ ├── internal/ │ │ │ ├── choose_utts_to_combine.py │ │ │ ├── combine_segments_to_recording.py │ │ │ ├── modify_speaker_info.py │ │ │ └── perturb_volume.py │ │ ├── limit_feature_dim.sh │ │ ├── modify_speaker_info.sh │ │ ├── modify_speaker_info_to_recording.sh │ │ ├── normalize_data_range.pl │ │ ├── perturb_data_dir_speed_3way.sh │ │ ├── perturb_data_dir_volume.sh │ │ ├── perturb_speed_to_allowed_lengths.py │ │ ├── remove_dup_utts.sh │ │ ├── resample_data_dir.sh │ │ ├── shift_and_combine_feats.sh │ │ ├── 
shift_feats.sh │ │ └── subsegment_data_dir.sh │ ├── dict_dir_add_pronprobs.sh │ ├── eps2disambig.pl │ ├── filt.py │ ├── filter_scp.pl │ ├── filter_scps.pl │ ├── find_arpa_oovs.pl │ ├── fix_data_dir.sh │ ├── format_lm.sh │ ├── format_lm_sri.sh │ ├── gen_topo.pl │ ├── int2sym.pl │ ├── kwslist_post_process.pl │ ├── lang/ │ │ ├── add_unigrams_arpa.pl │ │ ├── adjust_unk_arpa.pl │ │ ├── adjust_unk_graph.sh │ │ ├── bpe/ │ │ │ ├── add_final_optional_silence.sh │ │ │ ├── apply_bpe.py │ │ │ ├── bidi.py │ │ │ ├── learn_bpe.py │ │ │ ├── prepend_words.py │ │ │ └── reverse.py │ │ ├── check_g_properties.pl │ │ ├── check_phones_compatible.sh │ │ ├── compute_sentence_probs_arpa.py │ │ ├── extend_lang.sh │ │ ├── get_word_position_phone_map.pl │ │ ├── grammar/ │ │ │ ├── augment_phones_txt.py │ │ │ └── augment_words_txt.py │ │ ├── internal/ │ │ │ ├── apply_unk_lm.sh │ │ │ ├── arpa2fst_constrained.py │ │ │ └── modify_unk_pron.py │ │ ├── limit_arpa_unk_history.py │ │ ├── make_kn_lm.py │ │ ├── make_lexicon_fst.py │ │ ├── make_lexicon_fst_silprob.py │ │ ├── make_phone_bigram_lang.sh │ │ ├── make_phone_lm.py │ │ ├── make_position_dependent_subword_lexicon.py │ │ ├── make_subword_lexicon_fst.py │ │ ├── make_unk_lm.sh │ │ └── validate_disambig_sym_file.pl │ ├── ln.pl │ ├── make_absolute.sh │ ├── make_lexicon_fst.pl │ ├── make_lexicon_fst_silprob.pl │ ├── make_unigram_grammar.pl │ ├── map_arpa_lm.pl │ ├── mkgraph.sh │ ├── mkgraph_lookahead.sh │ ├── nnet/ │ │ ├── gen_dct_mat.py │ │ ├── gen_hamm_mat.py │ │ ├── gen_splice.py │ │ ├── make_blstm_proto.py │ │ ├── make_cnn_proto.py │ │ ├── make_lstm_proto.py │ │ ├── make_nnet_proto.py │ │ └── subset_data_tr_cv.sh │ ├── nnet-cpu/ │ │ ├── make_nnet_config.pl │ │ ├── make_nnet_config_block.pl │ │ ├── make_nnet_config_preconditioned.pl │ │ └── update_learning_rates.pl │ ├── nnet3/ │ │ └── convert_config_tdnn_to_affine.py │ ├── parallel/ │ │ ├── limit_num_gpus.sh │ │ ├── pbs.pl │ │ ├── queue.pl │ │ ├── retry.pl │ │ ├── run.pl │ │ └── slurm.pl │ ├── 
parse_options.sh │ ├── perturb_data_dir_speed.sh │ ├── pinyin_map.pl │ ├── prepare_extended_lang.sh │ ├── prepare_lang.sh │ ├── prepare_online_nnet_dist_build.sh │ ├── remove_data_links.sh │ ├── remove_oovs.pl │ ├── reverse_arpa.py │ ├── rnnlm_compute_scores.sh │ ├── s2eps.pl │ ├── scoring/ │ │ ├── wer_ops_details.pl │ │ ├── wer_per_spk_details.pl │ │ ├── wer_per_utt_details.pl │ │ └── wer_report.pl │ ├── segmentation.pl │ ├── show_lattice.sh │ ├── shuffle_list.pl │ ├── spk2utt_to_utt2spk.pl │ ├── split_data.sh │ ├── split_scp.pl │ ├── ssh.pl │ ├── subset_data_dir.sh │ ├── subset_scp.pl │ ├── subword/ │ │ ├── prepare_lang_subword.sh │ │ └── prepare_subword_text.sh │ ├── summarize_logs.pl │ ├── summarize_warnings.pl │ ├── sym2int.pl │ ├── utt2spk_to_spk2utt.pl │ ├── validate_data_dir.sh │ ├── validate_dict_dir.pl │ ├── validate_lang.pl │ ├── validate_text.pl │ └── write_kwslist.pl ├── env/ │ └── build_env.sh ├── kaldi ├── lm/ │ ├── __init__.py │ ├── chainer_backend/ │ │ ├── __init__.py │ │ ├── extlm.py │ │ └── lm.py │ ├── lm_utils.py │ └── pytorch_backend/ │ ├── __init__.py │ ├── extlm.py │ └── lm.py ├── mt/ │ ├── __init__.py │ ├── mt_utils.py │ └── pytorch_backend/ │ ├── __init__.py │ └── mt.py ├── nets/ │ ├── __init__.py │ ├── asr_interface.py │ ├── batch_beam_search.py │ ├── batch_beam_search_online_sim.py │ ├── beam_search.py │ ├── beam_search_transducer.py │ ├── chainer_backend/ │ │ ├── __init__.py │ │ ├── asr_interface.py │ │ ├── ctc.py │ │ ├── deterministic_embed_id.py │ │ ├── e2e_asr.py │ │ ├── e2e_asr_transformer.py │ │ ├── nets_utils.py │ │ ├── rnn/ │ │ │ ├── __init__.py │ │ │ ├── attentions.py │ │ │ ├── decoders.py │ │ │ ├── encoders.py │ │ │ └── training.py │ │ └── transformer/ │ │ ├── __init__.py │ │ ├── attention.py │ │ ├── ctc.py │ │ ├── decoder.py │ │ ├── decoder_layer.py │ │ ├── embedding.py │ │ ├── encoder.py │ │ ├── encoder_layer.py │ │ ├── label_smoothing_loss.py │ │ ├── layer_norm.py │ │ ├── mask.py │ │ ├── positionwise_feed_forward.py │ │ ├── 
subsampling.py │ │ └── training.py │ ├── ctc_prefix_score.py │ ├── e2e_asr_common.py │ ├── e2e_mt_common.py │ ├── lm_interface.py │ ├── mt_interface.py │ ├── pytorch_backend/ │ │ ├── __init__.py │ │ ├── conformer/ │ │ │ ├── __init__.py │ │ │ ├── argument.py │ │ │ ├── convolution.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ └── swish.py │ │ ├── ctc.py │ │ ├── e2e_asr.py │ │ ├── e2e_asr_conformer.py │ │ ├── e2e_asr_maskctc.py │ │ ├── e2e_asr_mix.py │ │ ├── e2e_asr_mix_transformer.py │ │ ├── e2e_asr_mulenc.py │ │ ├── e2e_asr_transducer.py │ │ ├── e2e_asr_transducer_cs.py │ │ ├── e2e_asr_transformer.py │ │ ├── e2e_mt.py │ │ ├── e2e_mt_transformer.py │ │ ├── e2e_st.py │ │ ├── e2e_st_conformer.py │ │ ├── e2e_st_transformer.py │ │ ├── e2e_tts_fastspeech.py │ │ ├── e2e_tts_tacotron2.py │ │ ├── e2e_tts_transformer.py │ │ ├── e2e_vc_tacotron2.py │ │ ├── e2e_vc_transformer.py │ │ ├── fastspeech/ │ │ │ ├── __init__.py │ │ │ ├── duration_calculator.py │ │ │ ├── duration_predictor.py │ │ │ └── length_regulator.py │ │ ├── frontends/ │ │ │ ├── __init__.py │ │ │ ├── beamformer.py │ │ │ ├── dnn_beamformer.py │ │ │ ├── dnn_wpe.py │ │ │ ├── feature_transform.py │ │ │ ├── frontend.py │ │ │ └── mask_estimator.py │ │ ├── gtn_ctc.py │ │ ├── initialization.py │ │ ├── lm/ │ │ │ ├── __init__.py │ │ │ ├── default.py │ │ │ ├── seq_rnn.py │ │ │ └── transformer.py │ │ ├── maskctc/ │ │ │ ├── __init__.py │ │ │ ├── add_mask_token.py │ │ │ └── mask.py │ │ ├── nets_utils.py │ │ ├── rnn/ │ │ │ ├── __init__.py │ │ │ ├── argument.py │ │ │ ├── attentions.py │ │ │ ├── decoders.py │ │ │ └── encoders.py │ │ ├── streaming/ │ │ │ ├── __init__.py │ │ │ ├── segment.py │ │ │ └── window.py │ │ ├── tacotron2/ │ │ │ ├── __init__.py │ │ │ ├── cbhg.py │ │ │ ├── decoder.py │ │ │ └── encoder.py │ │ ├── transducer/ │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── auxiliary_task.py │ │ │ ├── blocks.py │ │ │ ├── causal_conv1d.py │ │ │ ├── custom_decoder.py │ │ │ ├── custom_encoder.py │ │ │ ├── 
error_calculator.py │ │ │ ├── initializer.py │ │ │ ├── joint_network.py │ │ │ ├── loss.py │ │ │ ├── rnn_decoder.py │ │ │ ├── rnn_encoder.py │ │ │ ├── tdnn.py │ │ │ ├── transformer_decoder_layer.py │ │ │ ├── utils.py │ │ │ └── vgg2l.py │ │ ├── transformer/ │ │ │ ├── __init__.py │ │ │ ├── add_sos_eos.py │ │ │ ├── argument.py │ │ │ ├── attention.py │ │ │ ├── contextual_block_encoder_layer.py │ │ │ ├── decoder.py │ │ │ ├── decoder_layer.py │ │ │ ├── dynamic_conv.py │ │ │ ├── dynamic_conv2d.py │ │ │ ├── embedding.py │ │ │ ├── encoder.py │ │ │ ├── encoder_layer.py │ │ │ ├── encoder_mix.py │ │ │ ├── initializer.py │ │ │ ├── label_smoothing_loss.py │ │ │ ├── layer_norm.py │ │ │ ├── lightconv.py │ │ │ ├── lightconv2d.py │ │ │ ├── mask.py │ │ │ ├── multi_layer_conv.py │ │ │ ├── optimizer.py │ │ │ ├── plot.py │ │ │ ├── positionwise_feed_forward.py │ │ │ ├── repeat.py │ │ │ ├── sgd_optimizer.py │ │ │ ├── subsampling.py │ │ │ └── subsampling_without_posenc.py │ │ └── wavenet.py │ ├── scorer_interface.py │ ├── scorers/ │ │ ├── .mmi_rnnt_scorer.py.swp │ │ ├── __init__.py │ │ ├── _mmi_utils.py │ │ ├── ctc.py │ │ ├── ctc_rnnt_scorer.py │ │ ├── length_bonus.py │ │ ├── lookahead.py │ │ ├── mmi.py │ │ ├── mmi_alignment_score.py │ │ ├── mmi_frame_prefix_scorer.py │ │ ├── mmi_frame_scorer.py │ │ ├── mmi_frame_scorer_trace.py │ │ ├── mmi_lookahead.py │ │ ├── mmi_lookahead_bak.py │ │ ├── mmi_lookahead_split.py │ │ ├── mmi_prefix_score.py │ │ ├── mmi_rescorer.py │ │ ├── mmi_rnnt_lookahead_scorer.py │ │ ├── mmi_rnnt_scorer.py │ │ ├── mmi_utils.py │ │ ├── new_mmi_frame_scorer.py │ │ ├── ngram.py │ │ ├── sorted_matcher.py │ │ ├── test.py │ │ ├── tlg_scorer.py │ │ ├── trace_frame.py │ │ └── word_ngram.py │ ├── st_interface.py │ ├── transducer_decoder_interface.py │ └── tts_interface.py ├── optimizer/ │ ├── __init__.py │ ├── chainer.py │ ├── factory.py │ ├── parser.py │ └── pytorch.py ├── scheduler/ │ ├── __init__.py │ ├── chainer.py │ ├── pytorch.py │ └── scheduler.py ├── snowfall/ │ ├── 
__init__.py │ ├── common.py │ ├── data/ │ │ ├── __init__.py │ │ ├── aishell.py │ │ ├── asr_datamodule.py │ │ ├── datamodule.py │ │ └── librispeech.py │ ├── decoding/ │ │ ├── __init__.py │ │ ├── graph.py │ │ └── lm_rescore.py │ ├── dist.py │ ├── lexicon.py │ ├── models/ │ │ ├── __init__.py │ │ ├── conformer.py │ │ ├── contextnet.py │ │ ├── interface.py │ │ ├── tdnn.py │ │ ├── tdnn_lstm.py │ │ ├── tdnnf.py │ │ └── transformer.py │ ├── objectives/ │ │ ├── __init__.py │ │ ├── common.py │ │ ├── ctc.py │ │ └── mmi.py │ ├── training/ │ │ ├── __init__.py │ │ ├── ctc_graph.py │ │ ├── diagnostics.py │ │ ├── mmi_graph.py │ │ └── mmi_mbr_graph.py │ └── warpper/ │ ├── k2_decode.py │ ├── mmi_test.py │ ├── mmi_utils.py │ ├── prefix_scorer.py │ ├── warpper_ctc.py │ └── warpper_mmi.py ├── st/ │ ├── __init__.py │ └── pytorch_backend/ │ ├── __init__.py │ └── st.py ├── transform/ │ ├── __init__.py │ ├── add_deltas.py │ ├── channel_selector.py │ ├── cmvn.py │ ├── functional.py │ ├── perturb.py │ ├── spec_augment.py │ ├── spectrogram.py │ ├── transform_interface.py │ ├── transformation.py │ └── wpe.py ├── tts/ │ ├── __init__.py │ └── pytorch_backend/ │ ├── __init__.py │ └── tts.py ├── utils/ │ ├── __init__.py │ ├── bmuf.py │ ├── check_kwargs.py │ ├── cli_readers.py │ ├── cli_utils.py │ ├── cli_writers.py │ ├── dataset.py │ ├── deterministic_utils.py │ ├── draw_num_fst.py │ ├── dynamic_import.py │ ├── fill_missing_args.py │ ├── io_utils.py │ ├── parse_decoding_process.py │ ├── parse_npy.py │ ├── print.py │ ├── rtf_calculator.py │ ├── sampler.py │ ├── spec_augment.py │ └── training/ │ ├── __init__.py │ ├── batchfy.py │ ├── evaluator.py │ ├── iterators.py │ ├── tensorboard_logger.py │ └── train_utils.py ├── vc/ │ └── pytorch_backend/ │ └── vc.py └── version.txt ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.pyc 
interface ================================================ FILE: README.md ================================================ # End-to-end speech recognition toolkit This is an E2E ASR toolkit modified from Espnet1 (version 0.9.9). If this repository helps you, we would appreciate it if you could star it and cite our papers. This is the official implementation of the following papers: [**Consistent Training and Decoding For End-to-end Speech Recognition Using Lattice-free MMI**](https://ieeexplore.ieee.org/document/9746579/) (Accepted by ICASSP 2022) [**Improving Mandarin End-to-End Speech Recognition with Word N-gram Language Model**](https://ieeexplore.ieee.org/document/9721084) (Accepted by SPL) [**Integrate Lattice-Free MMI into End-to-End Speech Recognition**](https://arxiv.org/abs/2203.15614) (Submitted to TASLP) We achieve state-of-the-art results on two of the most popular Mandarin datasets, Aishell-1 and Aishell-2. Please feel free to change / modify the code as you like. :) ### Update - 2021/12/29: Release the first version, which contains all MMI-related features, including MMI training criteria, MMI Prefix Score (for attention-based encoder-decoder, AED) and MMI Alignment Score (for neural transducer, NT). - 2022/1/6: Release the word-level N-gram LM scorer. - 2022/1/12: We update the instructions to build the environment. We also release the trained NT model for Aishell-1 for quick performance check. We update the guideline to run our code. - 2022/3/29: We release a new CTC / RNN-T recipe for the code-switch problem based on the ASRU 2019 Mandarin-English code-switch dataset (see egs/asrucs); results on Aishell-1 and Aishell-2 are also updated. ### Environment: The main dependencies of this code can be divided into three parts: `kaldi`, `espnet` and `k2`. Please follow the instructions in [build_env.sh](https://github.com/jctian98/e2e_lfmmi/blob/master/env/build_env.sh) to build the environment. 
Note that the script cannot be run automatically; you need to run it line by line. ### Results Currently we have released examples on the Aishell-1 and Aishell-2 datasets. With the MMI training & decoding methods and the word-level N-gram LM, we achieve the results on Aishell-1 and Aishell-2 shown below. All results are in CER%. The model file of the Aishell-1 NT system is [here](https://drive.google.com/file/d/1VE2YtLb70UpQkeGWE8WhHJl7sSwNa_zG/view?usp=sharing) for quick performance check. | Test set | Aishell-1-dev | Aishell-1-test | Aishell-2-ios | Aishell-2-android | Aishell-2-mic | | :---- | :-: | :--: | :-: | :-----: | :-: | | AED | 4.60| 5.07 | 5.72| 6.60 | 6.58| | AED + MMI + Word Ngram | 4.08| 4.45 | 5.15| 5.92 | 5.77| | NT | 4.41| 4.82 | 5.81| 6.52 | 6.52| | NT + MMI + Word Ngram | 3.79| 4.10 | 5.02| 5.85 | 5.66| ### Get Start Take Aishell-1 as an example. The working process for the other examples is very similar. step 1: clone the code and link kaldi ``` conda activate lfmmi git clone https://github.com/jctian98/e2e_lfmmi E2E-ASR-Framework # clone and RENAME cd E2E-ASR-Framework ln -s <path-to-kaldi> kaldi # link kaldi ``` step 2: prepare data, lexicon and LMs. Before you run, please set the datadir in `prepare.sh` ``` cd egs/aishell1 bash prepare.sh ``` step 3: model training. You should split the data before starting the training. You can skip this step and download our trained model [here](https://drive.google.com/file/d/1VE2YtLb70UpQkeGWE8WhHJl7sSwNa_zG/view?usp=sharing) ``` python3 espnet_utils/splitjson.py -p <num_gpus> dump/train_sp/deltafalse/data.json bash nt.sh --stop_stage 1 ``` step 4: decode ``` bash nt.sh --stage 2 --mmi-weight 0.2 --word-ngram-weight 0.4 ``` Several hints: 1. Please change the paths in `path.sh` accordingly before you start 2. Please change `data` in `prepare.sh` to configure your data path 3. Our code runs in DDP style and requires some global variables. Before you start, you need to set them manually. We assume the Pytorch distributed API works well on your machine. 
``` export HOST_GPU_NUM=x # number of GPUs on each host export HOST_NUM=x # number of hosts export NODE_NUM=x # number of GPUs in total (on all hosts) export INDEX=x # index of this host export CHIEF_IP=xx.xx.xx.xx # IP of the master host ``` 4. You may encounter some problems with `k2`. Try to delete `data/lang_phone/Linv.pt` (in training) and `data/word_3gram/G.pt` (in decoding) and regenerate them. 5. Multiple choices are available during decoding (we take `nt.sh` as an example, but the usage of `aed.sh` is the same). To use the MMI-related scorers, you need to train the model with the MMI auxiliary criterion. To use MMI Prefix Score (in AED) or MMI Alignment score (in NT): ``` bash nt.sh --stage 2 --mmi-weight 0.2 ``` To use any external LM, you need to train it in advance (as implemented in `prepare.sh`). To use word-level N-gram LM: ``` bash nt.sh --stage 2 --word-ngram-weight 0.4 ``` To use character-level N-gram LM: ``` bash nt.sh --stage 2 --ngram-weight 1.0 ``` To use neural network LM: ``` bash nt.sh --stage 2 --lm-weight 1.0 ``` ### Reference kaldi: https://github.com/kaldi-asr/kaldi Espnet: https://github.com/espnet/espnet k2-fsa: https://github.com/k2-fsa/k2 ### Citations ``` @INPROCEEDINGS{9746579, author={Tian, Jinchuan and Yu, Jianwei and Weng, Chao and Zhang, Shi-Xiong and Su, Dan and Yu, Dong and Zou, Yuexian}, booktitle={ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, title={Consistent Training and Decoding for End-to-End Speech Recognition Using Lattice-Free MMI}, year={2022}, volume={}, number={}, pages={7782-7786}, doi={10.1109/ICASSP43922.2022.9746579}} @ARTICLE{9721084, author={Tian, Jinchuan and Yu, Jianwei and Weng, Chao and Zou, Yuexian and Yu, Dong}, journal={IEEE Signal Processing Letters}, title={Improving Mandarin End-to-End Speech Recognition with Word N-gram Language Model}, year={2022}, volume={}, number={}, pages={1-1}, doi={10.1109/LSP.2022.3154241}} 
class PlotAttentionReport(extension.Extension):
    """Trainer extension that draws attention weights for multi-speaker ASR.

    Args:
        att_vis_fn (espnet.nets.*_backend.e2e_asr.calculate_all_attentions):
            Function of attention visualization.
        data (list[tuple(str, dict[str, dict[str, Any]])]): List json utt key items.
        outdir (str): Directory to save figures.
        converter (espnet.asr.*_backend.asr.CustomConverter):
            CustomConverter object. Function to convert data.
        device (torch.device): The destination device to send tensor.
        reverse (bool): If True, input and output length are reversed.

    """

    def __init__(self, att_vis_fn, data, outdir, converter, device, reverse=False):
        """Keep the plotting configuration and create ``outdir`` if missing."""
        self.att_vis_fn = att_vis_fn
        # Work on a private copy so later changes by the caller are not seen.
        self.data = copy.deepcopy(data)
        self.outdir = outdir
        self.converter = converter
        self.device = device
        self.reverse = reverse
        if not os.path.exists(self.outdir):
            os.makedirs(self.outdir)

    def __call__(self, trainer):
        """Save one attention image per speaker and utterance."""
        for spkr_idx, spkr_att_ws in enumerate(self.get_attention_weights()):
            for utt_idx, weights in enumerate(spkr_att_ws):
                # The "{.updater.epoch}" field is filled from the trainer below.
                template = "%s/%s.ep.{.updater.epoch}.output%d.png" % (
                    self.outdir,
                    self.data[utt_idx][0],
                    spkr_idx + 1,
                )
                trimmed = self.get_attention_weight(utt_idx, weights, spkr_idx)
                self._plot_and_save_attention(trimmed, template.format(trainer))

    def log_attentions(self, logger, step):
        """Send the attention figures to ``logger.add_figure`` at ``step``."""
        for spkr_idx, spkr_att_ws in enumerate(self.get_attention_weights()):
            for utt_idx, weights in enumerate(spkr_att_ws):
                trimmed = self.get_attention_weight(utt_idx, weights, spkr_idx)
                canvas = self.draw_attention_plot(trimmed)
                logger.add_figure("%s" % (self.data[utt_idx][0]), canvas.gcf(), step)
                canvas.clf()

    def get_attention_weights(self):
        """Run ``att_vis_fn`` on the stored data and return its result.

        Returns:
            The value returned by ``att_vis_fn``; ``__call__`` iterates it as
            speakers first, then utterances.

        """
        converted = self.converter(
            [self.converter.transform(self.data)], self.device
        )
        return self.att_vis_fn(*converted)

    def get_attention_weight(self, idx, att_w, spkr_idx):
        """Cut ``att_w`` down to the real decoder/encoder lengths of utterance ``idx``."""
        utt_info = self.data[idx][1]
        if self.reverse:
            dec_len = int(utt_info["input"][0]["shape"][0])
            enc_len = int(utt_info["output"][spkr_idx]["shape"][0])
        else:
            dec_len = int(utt_info["output"][spkr_idx]["shape"][0])
            enc_len = int(utt_info["input"][0]["shape"][0])
        if len(att_w.shape) == 3:
            # Leading axis is kept untouched (one matrix per head).
            return att_w[:, :dec_len, :enc_len]
        return att_w[:dec_len, :enc_len]

    def draw_attention_plot(self, att_w):
        """Draw ``att_w`` on the current pyplot figure.

        Args:
            att_w (Tensor): Attention weight matrix.

        Returns:
            matplotlib.pyplot: pyplot object with attention matrix image.

        """
        import matplotlib.pyplot as plt

        if len(att_w.shape) == 3:
            n_panels = len(att_w)
            for panel, head_w in enumerate(att_w, 1):
                plt.subplot(1, n_panels, panel)
                plt.imshow(head_w, aspect="auto")
                plt.xlabel("Encoder Index")
                plt.ylabel("Decoder Index")
        else:
            plt.imshow(att_w, aspect="auto")
            plt.xlabel("Encoder Index")
            plt.ylabel("Decoder Index")
        plt.tight_layout()
        return plt

    def _plot_and_save_attention(self, att_w, filename):
        """Draw ``att_w``, write it to ``filename`` and close the figure."""
        canvas = self.draw_attention_plot(att_w)
        canvas.savefig(filename)
        canvas.close()
""" # copy old json info new_js = dict() new_js["utt2spk"] = js["utt2spk"] num_spkrs = len(nbest_hyps_sd) new_js["output"] = [] for ns in range(num_spkrs): tmp_js = [] nbest_hyps = nbest_hyps_sd[ns] for n, hyp in enumerate(nbest_hyps, 1): # parse hypothesis rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list) # copy ground-truth out_dic = dict(js["output"][ns].items()) # update name out_dic["name"] += "[%d]" % n # add recognition results out_dic["rec_text"] = rec_text out_dic["rec_token"] = rec_token out_dic["rec_tokenid"] = rec_tokenid out_dic["score"] = score # add to list of N-best result dicts tmp_js.append(out_dic) # show 1-best result if n == 1: logging.info("groundtruth: %s" % out_dic["text"]) logging.info("prediction : %s" % out_dic["rec_text"]) new_js["output"].append(tmp_js) return new_js ================================================ FILE: asr/asr_utils.py ================================================ # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) import argparse import copy import json import logging import os import shutil import tempfile import numpy as np import torch # * -------------------- training iterator related -------------------- * class CompareValueTrigger(object): """Trigger invoked when key value getting bigger or lower than before. Args: key (str) : Key of value. compare_fn ((float, float) -> bool) : Function to compare the values. trigger (tuple(int, str)) : Trigger that decide the comparison interval. 
""" def __init__(self, key, compare_fn, trigger=(1, "epoch")): from chainer import training self._key = key self._best_value = None self._interval_trigger = training.util.get_trigger(trigger) self._init_summary() self._compare_fn = compare_fn def __call__(self, trainer): """Get value related to the key and compare with current value.""" observation = trainer.observation summary = self._summary key = self._key if key in observation: summary.add({key: observation[key]}) if not self._interval_trigger(trainer): return False stats = summary.compute_mean() value = float(stats[key]) # copy to CPU self._init_summary() if self._best_value is None: # initialize best value self._best_value = value return False elif self._compare_fn(self._best_value, value): return True else: self._best_value = value return False def _init_summary(self): import chainer self._summary = chainer.reporter.DictSummary() try: from chainer.training import extension except ImportError: PlotAttentionReport = None else: class PlotAttentionReport(extension.Extension): """Plot attention reporter. Args: att_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_attentions): Function of attention visualization. data (list[tuple(str, dict[str, list[Any]])]): List json utt key items. outdir (str): Directory to save figures. converter (espnet.asr.*_backend.asr.CustomConverter): Function to convert data. device (int | torch.device): Device. reverse (bool): If True, input and output length are reversed. ikey (str): Key to access input (for ASR/ST ikey="input", for MT ikey="output".) iaxis (int): Dimension to access input (for ASR/ST iaxis=0, for MT iaxis=1.) okey (str): Key to access output (for ASR/ST okey="input", MT okay="output".) oaxis (int): Dimension to access output (for ASR/ST oaxis=0, for MT oaxis=0.) 
subsampling_factor (int): subsampling factor in encoder """ def __init__( self, att_vis_fn, data, outdir, converter, transform, device, reverse=False, ikey="input", iaxis=0, okey="output", oaxis=0, subsampling_factor=1, ): self.att_vis_fn = att_vis_fn self.data = copy.deepcopy(data) self.data_dict = {k: v for k, v in copy.deepcopy(data)} # key is utterance ID self.outdir = outdir self.converter = converter self.transform = transform self.device = device self.reverse = reverse self.ikey = ikey self.iaxis = iaxis self.okey = okey self.oaxis = oaxis self.factor = subsampling_factor if not os.path.exists(self.outdir): os.makedirs(self.outdir) def __call__(self, trainer): """Plot and save image file of att_ws matrix.""" att_ws, uttid_list = self.get_attention_weights() if isinstance(att_ws, list): # multi-encoder case num_encs = len(att_ws) - 1 # atts for i in range(num_encs): for idx, att_w in enumerate(att_ws[i]): filename = "%s/%s.ep.{.updater.epoch}.att%d.png" % ( self.outdir, uttid_list[idx], i + 1, ) att_w = self.trim_attention_weight(uttid_list[idx], att_w) np_filename = "%s/%s.ep.{.updater.epoch}.att%d.npy" % ( self.outdir, uttid_list[idx], i + 1, ) np.save(np_filename.format(trainer), att_w) self._plot_and_save_attention(att_w, filename.format(trainer)) # han for idx, att_w in enumerate(att_ws[num_encs]): filename = "%s/%s.ep.{.updater.epoch}.han.png" % ( self.outdir, uttid_list[idx], ) att_w = self.trim_attention_weight(uttid_list[idx], att_w) np_filename = "%s/%s.ep.{.updater.epoch}.han.npy" % ( self.outdir, uttid_list[idx], ) np.save(np_filename.format(trainer), att_w) self._plot_and_save_attention( att_w, filename.format(trainer), han_mode=True ) else: for idx, att_w in enumerate(att_ws): filename = "%s/%s.ep.{.updater.epoch}.png" % ( self.outdir, uttid_list[idx], ) att_w = self.trim_attention_weight(uttid_list[idx], att_w) np_filename = "%s/%s.ep.{.updater.epoch}.npy" % ( self.outdir, uttid_list[idx], ) np.save(np_filename.format(trainer), att_w) 
self._plot_and_save_attention(att_w, filename.format(trainer)) def log_attentions(self, logger, step): """Add image files of att_ws matrix to the tensorboard.""" att_ws, uttid_list = self.get_attention_weights() if isinstance(att_ws, list): # multi-encoder case num_encs = len(att_ws) - 1 # atts for i in range(num_encs): for idx, att_w in enumerate(att_ws[i]): att_w = self.trim_attention_weight(uttid_list[idx], att_w) plot = self.draw_attention_plot(att_w) logger.add_figure( "%s_att%d" % (uttid_list[idx], i + 1), plot.gcf(), step, ) # han for idx, att_w in enumerate(att_ws[num_encs]): att_w = self.trim_attention_weight(uttid_list[idx], att_w) plot = self.draw_han_plot(att_w) logger.add_figure( "%s_han" % (uttid_list[idx]), plot.gcf(), step, ) else: for idx, att_w in enumerate(att_ws): att_w = self.trim_attention_weight(uttid_list[idx], att_w) plot = self.draw_attention_plot(att_w) logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step) def get_attention_weights(self): """Return attention weights. Returns: numpy.ndarray: attention weights. float. Its shape would be differ from backend. * pytorch-> 1) multi-head case => (B, H, Lmax, Tmax), 2) other case => (B, Lmax, Tmax). 
* chainer-> (B, Lmax, Tmax) """ return_batch, uttid_list = self.transform(self.data, return_uttid=True) batch = self.converter([return_batch], self.device) if isinstance(batch, tuple): att_ws = self.att_vis_fn(*batch) else: att_ws = self.att_vis_fn(**batch) return att_ws, uttid_list def trim_attention_weight(self, uttid, att_w): """Transform attention matrix with regard to self.reverse.""" if self.reverse: enc_key, enc_axis = self.okey, self.oaxis dec_key, dec_axis = self.ikey, self.iaxis else: enc_key, enc_axis = self.ikey, self.iaxis dec_key, dec_axis = self.okey, self.oaxis dec_len = int(self.data_dict[uttid][dec_key][dec_axis]["shape"][0]) enc_len = int(self.data_dict[uttid][enc_key][enc_axis]["shape"][0]) if self.factor > 1: enc_len //= self.factor if len(att_w.shape) == 3: att_w = att_w[:, :dec_len, :enc_len] else: att_w = att_w[:dec_len, :enc_len] return att_w def draw_attention_plot(self, att_w): """Plot the att_w matrix. Returns: matplotlib.pyplot: pyplot object with attention matrix image. """ import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt plt.clf() att_w = att_w.astype(np.float32) if len(att_w.shape) == 3: for h, aw in enumerate(att_w, 1): plt.subplot(1, len(att_w), h) plt.imshow(aw, aspect="auto") plt.xlabel("Encoder Index") plt.ylabel("Decoder Index") else: plt.imshow(att_w, aspect="auto") plt.xlabel("Encoder Index") plt.ylabel("Decoder Index") plt.tight_layout() return plt def draw_han_plot(self, att_w): """Plot the att_w matrix for hierarchical attention. Returns: matplotlib.pyplot: pyplot object with attention matrix image. 
""" import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt plt.clf() if len(att_w.shape) == 3: for h, aw in enumerate(att_w, 1): legends = [] plt.subplot(1, len(att_w), h) for i in range(aw.shape[1]): plt.plot(aw[:, i]) legends.append("Att{}".format(i)) plt.ylim([0, 1.0]) plt.xlim([0, aw.shape[0]]) plt.grid(True) plt.ylabel("Attention Weight") plt.xlabel("Decoder Index") plt.legend(legends) else: legends = [] for i in range(att_w.shape[1]): plt.plot(att_w[:, i]) legends.append("Att{}".format(i)) plt.ylim([0, 1.0]) plt.xlim([0, att_w.shape[0]]) plt.grid(True) plt.ylabel("Attention Weight") plt.xlabel("Decoder Index") plt.legend(legends) plt.tight_layout() return plt def _plot_and_save_attention(self, att_w, filename, han_mode=False): if han_mode: plt = self.draw_han_plot(att_w) else: plt = self.draw_attention_plot(att_w) plt.savefig(filename) plt.close() try: from chainer.training import extension except ImportError: PlotCTCReport = None else: class PlotCTCReport(extension.Extension): """Plot CTC reporter. Args: ctc_vis_fn (espnet.nets.*_backend.e2e_asr.E2E.calculate_all_ctc_probs): Function of CTC visualization. data (list[tuple(str, dict[str, list[Any]])]): List json utt key items. outdir (str): Directory to save figures. converter (espnet.asr.*_backend.asr.CustomConverter): Function to convert data. device (int | torch.device): Device. reverse (bool): If True, input and output length are reversed. ikey (str): Key to access input (for ASR/ST ikey="input", for MT ikey="output".) iaxis (int): Dimension to access input (for ASR/ST iaxis=0, for MT iaxis=1.) okey (str): Key to access output (for ASR/ST okey="input", MT okay="output".) oaxis (int): Dimension to access output (for ASR/ST oaxis=0, for MT oaxis=0.) 
subsampling_factor (int): subsampling factor in encoder """ def __init__( self, ctc_vis_fn, data, outdir, converter, transform, device, reverse=False, ikey="input", iaxis=0, okey="output", oaxis=0, subsampling_factor=1, ): self.ctc_vis_fn = ctc_vis_fn self.data = copy.deepcopy(data) self.data_dict = {k: v for k, v in copy.deepcopy(data)} # key is utterance ID self.outdir = outdir self.converter = converter self.transform = transform self.device = device self.reverse = reverse self.ikey = ikey self.iaxis = iaxis self.okey = okey self.oaxis = oaxis self.factor = subsampling_factor if not os.path.exists(self.outdir): os.makedirs(self.outdir) def __call__(self, trainer): """Plot and save image file of ctc prob.""" ctc_probs, uttid_list = self.get_ctc_probs() if isinstance(ctc_probs, list): # multi-encoder case num_encs = len(ctc_probs) - 1 for i in range(num_encs): for idx, ctc_prob in enumerate(ctc_probs[i]): filename = "%s/%s.ep.{.updater.epoch}.ctc%d.png" % ( self.outdir, uttid_list[idx], i + 1, ) ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) np_filename = "%s/%s.ep.{.updater.epoch}.ctc%d.npy" % ( self.outdir, uttid_list[idx], i + 1, ) np.save(np_filename.format(trainer), ctc_prob) self._plot_and_save_ctc(ctc_prob, filename.format(trainer)) else: for idx, ctc_prob in enumerate(ctc_probs): filename = "%s/%s.ep.{.updater.epoch}.png" % ( self.outdir, uttid_list[idx], ) ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) np_filename = "%s/%s.ep.{.updater.epoch}.npy" % ( self.outdir, uttid_list[idx], ) np.save(np_filename.format(trainer), ctc_prob) self._plot_and_save_ctc(ctc_prob, filename.format(trainer)) def log_ctc_probs(self, logger, step): """Add image files of ctc probs to the tensorboard.""" ctc_probs, uttid_list = self.get_ctc_probs() if isinstance(ctc_probs, list): # multi-encoder case num_encs = len(ctc_probs) - 1 for i in range(num_encs): for idx, ctc_prob in enumerate(ctc_probs[i]): ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) 
plot = self.draw_ctc_plot(ctc_prob) logger.add_figure( "%s_ctc%d" % (uttid_list[idx], i + 1), plot.gcf(), step, ) else: for idx, ctc_prob in enumerate(ctc_probs): ctc_prob = self.trim_ctc_prob(uttid_list[idx], ctc_prob) plot = self.draw_ctc_plot(ctc_prob) logger.add_figure("%s" % (uttid_list[idx]), plot.gcf(), step) def get_ctc_probs(self): """Return CTC probs. Returns: numpy.ndarray: CTC probs. float. Its shape would be differ from backend. (B, Tmax, vocab). """ return_batch, uttid_list = self.transform(self.data, return_uttid=True) batch = self.converter([return_batch], self.device) if isinstance(batch, tuple): probs = self.ctc_vis_fn(*batch) else: probs = self.ctc_vis_fn(**batch) return probs, uttid_list def trim_ctc_prob(self, uttid, prob): """Trim CTC posteriors accoding to input lengths.""" enc_len = int(self.data_dict[uttid][self.ikey][self.iaxis]["shape"][0]) if self.factor > 1: enc_len //= self.factor prob = prob[:enc_len] return prob def draw_ctc_plot(self, ctc_prob): """Plot the ctc_prob matrix. Returns: matplotlib.pyplot: pyplot object with CTC prob matrix image. """ import matplotlib matplotlib.use("Agg") import matplotlib.pyplot as plt ctc_prob = ctc_prob.astype(np.float32) plt.clf() topk_ids = np.argsort(ctc_prob, axis=1) n_frames, vocab = ctc_prob.shape times_probs = np.arange(n_frames) plt.figure(figsize=(20, 8)) # NOTE: index 0 is reserved for blank for idx in set(topk_ids.reshape(-1).tolist()): if idx == 0: plt.plot( times_probs, ctc_prob[:, 0], ":", label="", color="grey" ) else: plt.plot(times_probs, ctc_prob[:, idx]) plt.xlabel(u"Input [frame]", fontsize=12) plt.ylabel("Posteriors", fontsize=12) plt.xticks(list(range(0, int(n_frames) + 1, 10))) plt.yticks(list(range(0, 2, 1))) plt.tight_layout() return plt def _plot_and_save_ctc(self, ctc_prob, filename): plt = self.draw_ctc_plot(ctc_prob) plt.savefig(filename) plt.close() def restore_snapshot(model, snapshot, load_fn=None): """Extension to restore snapshot. Returns: An extension function. 
""" import chainer from chainer import training if load_fn is None: load_fn = chainer.serializers.load_npz @training.make_extension(trigger=(1, "epoch")) def restore_snapshot(trainer): _restore_snapshot(model, snapshot, load_fn) return restore_snapshot def _restore_snapshot(model, snapshot, load_fn=None): if load_fn is None: import chainer load_fn = chainer.serializers.load_npz load_fn(snapshot, model) logging.info("restored from " + str(snapshot)) def adadelta_eps_decay(eps_decay): """Extension to perform adadelta eps decay. Args: eps_decay (float): Decay rate of eps. Returns: An extension function. """ from chainer import training @training.make_extension(trigger=(1, "epoch")) def adadelta_eps_decay(trainer): _adadelta_eps_decay(trainer, eps_decay) return adadelta_eps_decay def _adadelta_eps_decay(trainer, eps_decay): optimizer = trainer.updater.get_optimizer("main") # for chainer if hasattr(optimizer, "eps"): current_eps = optimizer.eps setattr(optimizer, "eps", current_eps * eps_decay) logging.info("adadelta eps decayed to " + str(optimizer.eps)) # pytorch else: for p in optimizer.param_groups: p["eps"] *= eps_decay logging.info("adadelta eps decayed to " + str(p["eps"])) def adam_lr_decay(eps_decay): """Extension to perform adam lr decay. Args: eps_decay (float): Decay rate of lr. Returns: An extension function. 
""" from chainer import training @training.make_extension(trigger=(1, "epoch")) def adam_lr_decay(trainer): _adam_lr_decay(trainer, eps_decay) return adam_lr_decay def _adam_lr_decay(trainer, eps_decay): optimizer = trainer.updater.get_optimizer("main") # for chainer if hasattr(optimizer, "lr"): current_lr = optimizer.lr setattr(optimizer, "lr", current_lr * eps_decay) logging.info("adam lr decayed to " + str(optimizer.lr)) # pytorch else: for p in optimizer.param_groups: p["lr"] *= eps_decay logging.info("adam lr decayed to " + str(p["lr"])) def torch_snapshot(savefun=torch.save, filename="snapshot.ep.{.updater.epoch}"): """Extension to take snapshot of the trainer for pytorch. Returns: An extension function. """ from chainer.training import extension @extension.make_extension(trigger=(1, "epoch"), priority=-100) def torch_snapshot(trainer): _torch_snapshot_object(trainer, trainer, filename.format(trainer), savefun) return torch_snapshot def _torch_snapshot_object(trainer, target, filename, savefun): from chainer.serializers import DictionarySerializer # make snapshot_dict dictionary s = DictionarySerializer() s.save(trainer) if hasattr(trainer.updater.model, "model"): # (for TTS) if hasattr(trainer.updater.model.model, "module"): model_state_dict = trainer.updater.model.model.module.state_dict() else: model_state_dict = trainer.updater.model.model.state_dict() else: # (for ASR) if hasattr(trainer.updater.model, "module"): model_state_dict = trainer.updater.model.module.state_dict() else: model_state_dict = trainer.updater.model.state_dict() snapshot_dict = { "trainer": s.target, "model": model_state_dict, } if hasattr(trainer.updater, "ddp_trainer"): # For ASR snapshot_dict["optimizer"] = trainer.updater.ddp_trainer.optimizer.state_dict() else: # Others like LM snapshot_dict["optimizer"] = trainer.updater.get_optimizer("main").state_dict() # save snapshot dictionary fn = filename.format(trainer) prefix = "tmp" + fn tmpdir = tempfile.mkdtemp(prefix=prefix, 
def get_model_conf(model_path, conf_path=None):
    """Get model config information by reading a model config file (model.json).

    Args:
        model_path (str): Model path. Used only when ``conf_path`` is None, in
            which case ``model.json`` is looked up in the same directory.
        conf_path (str): Optional model config path.

    Returns:
        argparse.Namespace: When the json file holds a dict (LM config).
        tuple(int, int, argparse.Namespace): ``(idim, odim, args)`` when the
        json file holds a 3-element sequence (ASR, TTS, MT config).

    """
    if conf_path is None:
        # os.path.join keeps a bare file name relative to the current
        # directory; plain string concatenation would yield "/model.json".
        model_conf = os.path.join(os.path.dirname(model_path), "model.json")
    else:
        model_conf = conf_path
    with open(model_conf, "rb") as f:
        logging.info("reading a config file from " + model_conf)
        confs = json.load(f)
    if isinstance(confs, dict):
        # for lm
        return argparse.Namespace(**confs)
    # for asr, tts, mt
    idim, odim, args = confs
    return idim, odim, argparse.Namespace(**args)
""" import chainer if "snapshot" in os.path.basename(path): chainer.serializers.load_npz(path, model, path="updater/model:main/") else: chainer.serializers.load_npz(path, model) def torch_save(path, model): """Save torch model states. Args: path (str): Model path to be saved. model (torch.nn.Module): Torch model. """ if hasattr(model, "module"): torch.save(model.module.state_dict(), path) else: torch.save(model.state_dict(), path) def snapshot_object(target, filename): """Returns a trainer extension to take snapshots of a given object. Args: target (model): Object to serialize. filename (str): Name of the file into which the object is serialized.It can be a format string, where the trainer object is passed to the :meth: `str.format` method. For example, ``'snapshot_{.updater.iteration}'`` is converted to ``'snapshot_10000'`` at the 10,000th iteration. Returns: An extension function. """ from chainer.training import extension @extension.make_extension(trigger=(1, "epoch"), priority=-100) def snapshot_object(trainer): torch_save(os.path.join(trainer.out, filename.format(trainer)), target) return snapshot_object def torch_load(path, model): """Load torch model states. Args: path (str): Model path or snapshot file path to be loaded. model (torch.nn.Module): Torch model. """ if "snapshot" in os.path.basename(path): model_state_dict = torch.load(path, map_location=lambda storage, loc: storage)[ "model" ] else: model_state_dict = torch.load(path, map_location=lambda storage, loc: storage) if hasattr(model, "module"): model.module.load_state_dict(model_state_dict) else: model.load_state_dict(model_state_dict) del model_state_dict def torch_resume(snapshot_path, trainer, load_trainer_and_opt=True): """Resume from snapshot for pytorch. Args: snapshot_path (str): Snapshot file path. trainer (chainer.training.Trainer): Chainer's trainer instance. 
""" from chainer.serializers import NpzDeserializer if not load_trainer_and_opt: print("Only model weights are resumed") print("trainer and optimizer is ignored") print("make sure this is the second-stage training") # load snapshot snapshot_dict = torch.load(snapshot_path, map_location=lambda storage, loc: storage) # restore trainer states if load_trainer_and_opt: d = NpzDeserializer(snapshot_dict["trainer"]) d.load(trainer) # restore model states if hasattr(trainer.updater.model, "model"): # (for TTS model) if hasattr(trainer.updater.model.model, "module"): trainer.updater.model.model.module.load_state_dict(snapshot_dict["model"]) else: trainer.updater.model.model.load_state_dict(snapshot_dict["model"]) else: # (for ASR model) if hasattr(trainer.updater.model, "module"): trainer.updater.model.module.load_state_dict(snapshot_dict["model"]) else: trainer.updater.model.load_state_dict(snapshot_dict["model"]) # restore optimizer states if load_trainer_and_opt and hasattr(trainer.updater.ddp_trainer, "optimizer"): trainer.updater.ddp_trainer.optimizer.load_state_dict(snapshot_dict["optimizer"]) # delete opened snapshot del snapshot_dict # * ------------------ recognition related ------------------ * def parse_hypothesis(hyp, char_list): """Parse hypothesis. Args: hyp (list[dict[str, Any]]): Recognition hypothesis. char_list (list[str]): List of characters. Returns: tuple(str, str, str, float) """ # remove sos and get results tokenid_as_list = list(map(int, hyp["yseq"][1:])) token_as_list = [char_list[idx] for idx in tokenid_as_list] score = float(hyp["score"]) # convert to string tokenid = " ".join([str(idx) for idx in tokenid_as_list]) token = " ".join(token_as_list) text = "".join(token_as_list).replace("", " ") return text, token, tokenid, score def add_results_to_json(js, nbest_hyps, char_list): """Add N-best results to json. Args: js (dict[str, Any]): Groundtruth utterance dict. 
nbest_hyps_sd (list[dict[str, Any]]): List of hypothesis for multi_speakers: nutts x nspkrs. char_list (list[str]): List of characters. Returns: dict[str, Any]: N-best results added utterance dict. """ # copy old json info new_js = dict() new_js["utt2spk"] = js["utt2spk"] new_js["output"] = [] for n, hyp in enumerate(nbest_hyps, 1): # parse hypothesis rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp, char_list) # copy ground-truth if len(js["output"]) > 0: out_dic = dict(js["output"][0].items()) else: # for no reference case (e.g., speech translation) out_dic = {"name": ""} # update name out_dic["name"] += "[%d]" % n # add recognition results out_dic["rec_text"] = rec_text out_dic["rec_token"] = rec_token out_dic["rec_tokenid"] = rec_tokenid out_dic["score"] = score # RNNT MMI if "mmi_tot_score" in hyp: out_dic["mmi_tot_score"] = hyp["mmi_tot_score"] # LASCTC MMI if "scores" in hyp: if "mmi_tot_score" in hyp["scores"]: out_dic["mmi_tot_score"] = hyp["scores"]["mmi_tot_score"] if "mmi" in hyp["scores"]: out_dic["mmi"] = hyp["scores"]["mmi"] # add to list of N-best result dicts new_js["output"].append(out_dic) # show 1-best result if n == 1: if "text" in out_dic.keys(): logging.info("groundtruth: %s" % out_dic["text"]) logging.info("prediction : %s" % out_dic["rec_text"]) return new_js def plot_spectrogram( plt, spec, mode="db", fs=None, frame_shift=None, bottom=True, left=True, right=True, top=False, labelbottom=True, labelleft=True, labelright=True, labeltop=False, cmap="inferno", ): """Plot spectrogram using matplotlib. Args: plt (matplotlib.pyplot): pyplot object. spec (numpy.ndarray): Input stft (Freq, Time) mode (str): db or linear. fs (int): Sample frequency. To convert y-axis to kHz unit. frame_shift (int): The frame shift of stft. To convert x-axis to second unit. bottom (bool):Whether to draw the respective ticks. left (bool): right (bool): top (bool): labelbottom (bool):Whether to draw the respective tick labels. 
def format_mulenc_args(args):
    """Normalize per-encoder options so each one is a list of length num_encs.

    For every option handled here (with args.num_encs=2 as an example):
        1. args.elayers = None -> args.elayers = [4, 4] (default duplicated);
        2. args.elayers = 4 -> args.elayers = [4, 4] (scalar duplicated);
        3. args.elayers = [4, 4, 4] -> args.elayers = [4, 4] (list truncated).

    Args:
        args (namespace): The program arguments; modified in place.

    Returns:
        namespace: The same ``args`` object.

    """
    # fallback values used when an option is empty / None.
    fallback = {
        "etype": "blstmp",
        "elayers": 4,
        "eunits": 300,
        "subsample": "1",
        "dropout_rate": 0.0,
        "atype": "dot",
        "adim": 320,
        "awin": 5,
        "aheads": 4,
        "aconv_chans": -1,
        "aconv_filts": 100,
    }
    options = vars(args)  # live view: writes land on ``args`` itself
    num_encs = args.num_encs

    for name, default in fallback.items():
        current = options[name]

        if isinstance(current, list):
            # list input: keep at most ``num_encs`` leading entries
            trimmed = current[:num_encs]
            if len(current) != num_encs:
                logging.warning(
                    "Length mismatch {}: Convert {} to {}.".format(
                        name, current, trimmed
                    )
                )
            options[name] = trimmed
            continue

        if not current:
            # empty scalar: fall back to the default first
            current = default
            options[name] = current
            logging.warning(
                "{} is not specified, use default value {}.".format(name, default)
            )

        # scalar input: repeat it once per encoder
        repeated = [current for _ in range(num_encs)]
        logging.warning(
            "Type mismatch {}: Convert {} to {}.".format(name, current, repeated)
        )
        options[name] = repeated

    return args
from espnet.utils.dynamic_import import dynamic_import from espnet.utils.io_utils import LoadInputsAndTargets from espnet.utils.training.batchfy import make_batchset from espnet.utils.training.evaluator import BaseEvaluator from espnet.utils.training.iterators import ShufflingEnabler from espnet.utils.training.iterators import ToggleableShufflingMultiprocessIterator from espnet.utils.training.iterators import ToggleableShufflingSerialIterator from espnet.utils.training.train_utils import check_early_stop from espnet.utils.training.train_utils import set_early_stop # rnnlm import espnet.lm.chainer_backend.extlm as extlm_chainer import espnet.lm.chainer_backend.lm as lm_chainer # numpy related import matplotlib from espnet.utils.training.tensorboard_logger import TensorboardLogger from tensorboardX import SummaryWriter matplotlib.use("Agg") def train(args): """Train with the given args. Args: args (namespace): The program arguments. """ # display chainer version logging.info("chainer version = " + chainer.__version__) set_deterministic_chainer(args) # check cuda and cudnn availability if not chainer.cuda.available: logging.warning("cuda is not available") if not chainer.cuda.cudnn_enabled: logging.warning("cudnn is not available") # get input and output dimension info with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] utts = list(valid_json.keys()) idim = int(valid_json[utts[0]]["input"][0]["shape"][1]) odim = int(valid_json[utts[0]]["output"][0]["shape"][1]) logging.info("#input dims : " + str(idim)) logging.info("#output dims: " + str(odim)) # specify attention, CTC, hybrid mode if args.mtlalpha == 1.0: mtl_mode = "ctc" logging.info("Pure CTC mode") elif args.mtlalpha == 0.0: mtl_mode = "att" logging.info("Pure attention mode") else: mtl_mode = "mtl" logging.info("Multitask learning mode") # specify model architecture logging.info("import model module: " + args.model_module) model_class = dynamic_import(args.model_module) model = 
model_class(idim, odim, args, flag_return=False) assert isinstance(model, ASRInterface) total_subsampling_factor = model.get_total_subsampling_factor() # write model config if not os.path.exists(args.outdir): os.makedirs(args.outdir) model_conf = args.outdir + "/model.json" with open(model_conf, "wb") as f: logging.info("writing a model config file to " + model_conf) f.write( json.dumps( (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True ).encode("utf_8") ) for key in sorted(vars(args).keys()): logging.info("ARGS: " + key + ": " + str(vars(args)[key])) # Set gpu ngpu = args.ngpu if ngpu == 1: gpu_id = 0 # Make a specified GPU current chainer.cuda.get_device_from_id(gpu_id).use() model.to_gpu() # Copy the model to the GPU logging.info("single gpu calculation.") elif ngpu > 1: gpu_id = 0 devices = {"main": gpu_id} for gid in six.moves.xrange(1, ngpu): devices["sub_%d" % gid] = gid logging.info("multi gpu calculation (#gpus = %d)." % ngpu) logging.warning( "batch size is automatically increased (%d -> %d)" % (args.batch_size, args.batch_size * args.ngpu) ) else: gpu_id = -1 logging.info("cpu calculation") # Setup an optimizer if args.opt == "adadelta": optimizer = chainer.optimizers.AdaDelta(eps=args.eps) elif args.opt == "adam": optimizer = chainer.optimizers.Adam() elif args.opt == "noam": optimizer = chainer.optimizers.Adam(alpha=0, beta1=0.9, beta2=0.98, eps=1e-9) else: raise NotImplementedError("args.opt={}".format(args.opt)) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.GradientClipping(args.grad_clip)) # Setup a converter converter = model.custom_converter(subsampling_factor=model.subsample[0]) # read json data with open(args.train_json, "rb") as f: train_json = json.load(f)["utts"] with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] # set up training iterator and updater load_tr = LoadInputsAndTargets( mode="asr", load_output=True, preprocess_conf=args.preprocess_conf, preprocess_args={"train": True}, # 
Switch the mode of preprocessing ) load_cv = LoadInputsAndTargets( mode="asr", load_output=True, preprocess_conf=args.preprocess_conf, preprocess_args={"train": False}, # Switch the mode of preprocessing ) use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 accum_grad = args.accum_grad if ngpu <= 1: # make minibatch list (variable length) train = make_batchset( train_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.ngpu if args.ngpu > 1 else 1, shortest_first=use_sortagrad, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, iaxis=0, oaxis=0, ) # hack to make batchsize argument as 1 # actual batchsize is included in a list if args.n_iter_processes > 0: train_iters = [ ToggleableShufflingMultiprocessIterator( TransformDataset(train, load_tr), batch_size=1, n_processes=args.n_iter_processes, n_prefetch=8, maxtasksperchild=20, shuffle=not use_sortagrad, ) ] else: train_iters = [ ToggleableShufflingSerialIterator( TransformDataset(train, load_tr), batch_size=1, shuffle=not use_sortagrad, ) ] # set up updater updater = model.custom_updater( train_iters[0], optimizer, converter=converter, device=gpu_id, accum_grad=accum_grad, ) else: if args.batch_count not in ("auto", "seq") and args.batch_size == 0: raise NotImplementedError( "--batch-count 'bin' and 'frame' are not implemented " "in chainer multi gpu" ) # set up minibatches train_subsets = [] for gid in six.moves.xrange(ngpu): # make subset train_json_subset = { k: v for i, (k, v) in enumerate(train_json.items()) if i % ngpu == gid } # make minibatch list (variable length) train_subsets += [ make_batchset( train_json_subset, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, ) ] # each subset must have same length for MultiprocessParallelUpdater maxlen = max([len(train_subset) for train_subset in train_subsets]) for train_subset 
in train_subsets: if maxlen != len(train_subset): for i in six.moves.xrange(maxlen - len(train_subset)): train_subset += [train_subset[i]] # hack to make batchsize argument as 1 # actual batchsize is included in a list if args.n_iter_processes > 0: train_iters = [ ToggleableShufflingMultiprocessIterator( TransformDataset(train_subsets[gid], load_tr), batch_size=1, n_processes=args.n_iter_processes, n_prefetch=8, maxtasksperchild=20, shuffle=not use_sortagrad, ) for gid in six.moves.xrange(ngpu) ] else: train_iters = [ ToggleableShufflingSerialIterator( TransformDataset(train_subsets[gid], load_tr), batch_size=1, shuffle=not use_sortagrad, ) for gid in six.moves.xrange(ngpu) ] # set up updater updater = model.custom_parallel_updater( train_iters, optimizer, converter=converter, devices=devices ) # Set up a trainer trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir) if use_sortagrad: trainer.extend( ShufflingEnabler(train_iters), trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"), ) if args.opt == "noam": from espnet.nets.chainer_backend.transformer.training import VaswaniRule trainer.extend( VaswaniRule( "alpha", d=args.adim, warmup_steps=args.transformer_warmup_steps, scale=args.transformer_lr, ), trigger=(1, "iteration"), ) # Resume from a snapshot if args.resume: chainer.serializers.load_npz(args.resume, trainer) # set up validation iterator valid = make_batchset( valid_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.ngpu if args.ngpu > 1 else 1, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, iaxis=0, oaxis=0, ) if args.n_iter_processes > 0: valid_iter = chainer.iterators.MultiprocessIterator( TransformDataset(valid, load_cv), batch_size=1, repeat=False, shuffle=False, n_processes=args.n_iter_processes, n_prefetch=8, maxtasksperchild=20, ) else: 
valid_iter = chainer.iterators.SerialIterator( TransformDataset(valid, load_cv), batch_size=1, repeat=False, shuffle=False ) # Evaluate the model with the test dataset for each epoch trainer.extend(BaseEvaluator(valid_iter, model, converter=converter, device=gpu_id)) # Save attention weight each epoch if args.num_save_attention > 0 and args.mtlalpha != 1.0: data = sorted( list(valid_json.items())[: args.num_save_attention], key=lambda x: int(x[1]["input"][0]["shape"][1]), reverse=True, ) if hasattr(model, "module"): att_vis_fn = model.module.calculate_all_attentions plot_class = model.module.attention_plot_class else: att_vis_fn = model.calculate_all_attentions plot_class = model.attention_plot_class logging.info("Using custom PlotAttentionReport") att_reporter = plot_class( att_vis_fn, data, args.outdir + "/att_ws", converter=converter, transform=load_cv, device=gpu_id, subsampling_factor=total_subsampling_factor, ) trainer.extend(att_reporter, trigger=(1, "epoch")) else: att_reporter = None # Take a snapshot for each specified epoch trainer.extend( extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"), trigger=(1, "epoch"), ) # Make a plot for training and validation values trainer.extend( extensions.PlotReport( [ "main/loss", "validation/main/loss", "main/loss_ctc", "validation/main/loss_ctc", "main/loss_att", "validation/main/loss_att", ], "epoch", file_name="loss.png", ) ) trainer.extend( extensions.PlotReport( ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png" ) ) # Save best models trainer.extend( extensions.snapshot_object(model, "model.loss.best"), trigger=training.triggers.MinValueTrigger("validation/main/loss"), ) if mtl_mode != "ctc": trainer.extend( extensions.snapshot_object(model, "model.acc.best"), trigger=training.triggers.MaxValueTrigger("validation/main/acc"), ) # epsilon decay in the optimizer if args.opt == "adadelta": if args.criterion == "acc" and mtl_mode != "ctc": trainer.extend( restore_snapshot(model, args.outdir + 
"/model.acc.best"), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) elif args.criterion == "loss": trainer.extend( restore_snapshot(model, args.outdir + "/model.loss.best"), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) # Write a log of evaluation statistics for each epoch trainer.extend( extensions.LogReport(trigger=(args.report_interval_iters, "iteration")) ) report_keys = [ "epoch", "iteration", "main/loss", "main/loss_ctc", "main/loss_att", "validation/main/loss", "validation/main/loss_ctc", "validation/main/loss_att", "main/acc", "validation/main/acc", "elapsed_time", ] if args.opt == "adadelta": trainer.extend( extensions.observe_value( "eps", lambda trainer: trainer.updater.get_optimizer("main").eps ), trigger=(args.report_interval_iters, "iteration"), ) report_keys.append("eps") trainer.extend( extensions.PrintReport(report_keys), trigger=(args.report_interval_iters, "iteration"), ) trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters)) set_early_stop(trainer, args) if args.tensorboard_dir is not None and args.tensorboard_dir != "": writer = SummaryWriter(args.tensorboard_dir) trainer.extend( TensorboardLogger(writer, att_reporter), trigger=(args.report_interval_iters, "iteration"), ) # Run the training trainer.run() check_early_stop(trainer, args.epochs) def recog(args): """Decode with the given args. Args: args (namespace): The program arguments. 
""" # display chainer version logging.info("chainer version = " + chainer.__version__) set_deterministic_chainer(args) # read training config idim, odim, train_args = get_model_conf(args.model, args.model_conf) for key in sorted(vars(args).keys()): logging.info("ARGS: " + key + ": " + str(vars(args)[key])) # specify model architecture logging.info("reading model parameters from " + args.model) # To be compatible with v.0.3.0 models if hasattr(train_args, "model_module"): model_module = train_args.model_module else: model_module = "espnet.nets.chainer_backend.e2e_asr:E2E" model_class = dynamic_import(model_module) model = model_class(idim, odim, train_args) assert isinstance(model, ASRInterface) chainer_load(args.model, model) # read rnnlm if args.rnnlm: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) rnnlm = lm_chainer.ClassifierWithState( lm_chainer.RNNLM( len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit ) ) chainer_load(args.rnnlm, rnnlm) else: rnnlm = None if args.word_rnnlm: rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf) word_dict = rnnlm_args.char_list_dict char_dict = {x: i for i, x in enumerate(train_args.char_list)} word_rnnlm = lm_chainer.ClassifierWithState( lm_chainer.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit) ) chainer_load(args.word_rnnlm, word_rnnlm) if rnnlm is not None: rnnlm = lm_chainer.ClassifierWithState( extlm_chainer.MultiLevelLM( word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict ) ) else: rnnlm = lm_chainer.ClassifierWithState( extlm_chainer.LookAheadWordLM( word_rnnlm.predictor, word_dict, char_dict ) ) # read json data with open(args.recog_json, "rb") as f: js = json.load(f)["utts"] load_inputs_and_targets = LoadInputsAndTargets( mode="asr", load_output=False, sort_in_input_length=False, preprocess_conf=train_args.preprocess_conf if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, # Switch the mode of preprocessing ) # decode each 
utterance new_js = {} with chainer.no_backprop_mode(): for idx, name in enumerate(js.keys(), 1): logging.info("(%d/%d) decoding " + name, idx, len(js.keys())) batch = [(name, js[name])] feat = load_inputs_and_targets(batch)[0][0] nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm) new_js[name] = add_results_to_json( js[name], nbest_hyps, train_args.char_list ) with open(args.result_label, "wb") as f: f.write( json.dumps( {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True ).encode("utf_8") ) ================================================ FILE: asr/pytorch_backend/__init__.py ================================================ """Initialize sub package.""" ================================================ FILE: asr/pytorch_backend/asr.py ================================================ # Copyright 2017 Johns Hopkins University (Shinji Watanabe) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """Training/decoding definition for the speech recognition task.""" import copy import json import logging import math import os import sys from chainer import reporter as reporter_module from chainer import training from chainer.training import extensions from chainer.training.updater import StandardUpdater import numpy as np import torch import torch.distributed as dist import time from espnet.asr.asr_utils import adadelta_eps_decay from espnet.asr.asr_utils import add_results_to_json from espnet.asr.asr_utils import CompareValueTrigger from espnet.asr.asr_utils import format_mulenc_args from espnet.asr.asr_utils import get_model_conf from espnet.asr.asr_utils import plot_spectrogram from espnet.asr.asr_utils import restore_snapshot from espnet.asr.asr_utils import snapshot_object from espnet.asr.asr_utils import torch_load from espnet.asr.asr_utils import torch_resume from espnet.asr.asr_utils import torch_snapshot from espnet.asr.pytorch_backend.asr_init import freeze_modules from espnet.asr.pytorch_backend.asr_init import 
load_trained_model from espnet.asr.pytorch_backend.asr_init import load_trained_modules import espnet.lm.pytorch_backend.extlm as extlm_pytorch from espnet.nets.asr_interface import ASRInterface from espnet.nets.beam_search_transducer import BeamSearchTransducer from espnet.nets.pytorch_backend.e2e_asr import pad_list import espnet.nets.pytorch_backend.lm.default as lm_pytorch from espnet.nets.pytorch_backend.streaming.segment import SegmentStreamingE2E from espnet.nets.pytorch_backend.streaming.window import WindowStreamingE2E from espnet.transform.spectrogram import IStft from espnet.transform.transformation import Transformation from espnet.utils.cli_writers import file_writer_helper from espnet.utils.dataset import ChainerDataLoader from espnet.utils.dataset import TransformDataset from espnet.utils.deterministic_utils import set_deterministic_pytorch from espnet.utils.dynamic_import import dynamic_import from espnet.utils.io_utils import LoadInputsAndTargets from espnet.utils.training.batchfy import make_batchset from espnet.utils.training.evaluator import BaseEvaluator from espnet.utils.training.iterators import ShufflingEnabler from espnet.utils.training.tensorboard_logger import TensorboardLogger from espnet.utils.training.train_utils import check_early_stop from espnet.utils.training.train_utils import set_early_stop from espnet.snowfall.warpper.k2_decode import k2_decode import matplotlib from espnet.utils.parse_decoding_process import plot_decoding_logs from espnet.utils.bmuf import BlockAdamTrainer matplotlib.use("Agg") if sys.version_info[0] == 2: from itertools import izip_longest as zip_longest else: from itertools import zip_longest as zip_longest from espnet.nets.scorers.mmi_rnnt_scorer import MMIRNNTScorer # from espnet.nets.scorers.mmi_alignment_score import MMIRNNTScorer from espnet.utils.print import step_print from espnet.utils.sampler import BufferSampler from espnet.utils.rtf_calculator import RTF_calculator from espnet.nets.lm_interface 
class CustomUpdater(StandardUpdater):
    """Custom Updater for Pytorch.

    Runs forward/backward on one minibatch per call, accumulates gradients
    over ``accum_grad`` calls, clips them and then delegates the parameter
    update to ``ddp_trainer``.

    Args:
        model (torch.nn.Module): The model to update.
        grad_clip_threshold (float): The gradient clipping value to use.
        train_iter (chainer.dataset.Iterator): The training iterator.
        optimizer (torch.optim.optimizer): The training optimizer.
        device (torch.device): The device to use.
        ngpu (int): The number of gpus to use.
        grad_noise (bool): Whether to add noise to the gradients after
            each backward pass.
        accum_grad (int): The number of forward/backward passes whose
            gradients are accumulated before one parameter update.
        use_apex (bool): The flag to use Apex in backprop.
        ddp_trainer: Object that owns the real update step. It must provide
            ``update_and_sync()`` and an ``optimizer`` attribute.
            NOTE(review): presumably a ``BlockAdamTrainer`` - confirm
            against the caller.

    """

    def __init__(
        self,
        model,
        grad_clip_threshold,
        train_iter,
        optimizer,
        device,
        ngpu,
        grad_noise=False,
        accum_grad=1,
        use_apex=False,
        ddp_trainer=None
    ):
        super(CustomUpdater, self).__init__(train_iter, optimizer)
        self.model = model
        self.grad_clip_threshold = grad_clip_threshold
        self.device = device
        self.ngpu = ngpu
        self.accum_grad = accum_grad
        # number of backward passes accumulated since the last update
        self.forward_count = 0
        self.grad_noise = grad_noise
        # number of completed accumulation cycles (see ``update``)
        self.iteration = 0
        self.use_apex = use_apex
        self.ddp_trainer = ddp_trainer
        self.optimizer = optimizer

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main update routine of the CustomUpdater."""
        # When we pass one iterator and optimizer to StandardUpdater.__init__,
        # they are automatically named 'main'.
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        epoch = train_iter.epoch

        batch = train_iter.next()
        x = _recursive_to(batch, self.device)
        # the iterator's epoch counter changed while fetching this batch
        is_new_epoch = train_iter.epoch != epoch

        # the loss is divided by accum_grad so the accumulated gradients
        # are scaled down by the number of accumulated passes
        if self.ngpu == 0:
            loss = self.model(*x).mean() / self.accum_grad
        else:
            # apex does not support torch.nn.DataParallel
            #loss = (
            #    data_parallel(self.model, x, range(self.ngpu)).mean() / self.accum_grad
            #)
            # NOTE(review): no .mean() here - presumably the model already
            # returns a scalar loss; confirm.
            loss = self.model(*x) / self.accum_grad
        if self.use_apex:
            from apex import amp

            # NOTE: for a compatibility with noam optimizer
            opt = optimizer.optimizer if hasattr(optimizer, "optimizer") else optimizer
            with amp.scale_loss(loss, opt) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        # step_print(f"| forward_count {self.forward_count} | finish backward")

        # gradient noise injection
        if self.grad_noise:
            from espnet.asr.asr_utils import add_gradient_noise

            add_gradient_noise(
                self.model, self.iteration, duration=100, eta=1.0, scale_factor=0.55
            )

        # update parameters
        self.forward_count += 1
        # keep accumulating until accum_grad passes are done; an epoch
        # boundary forces the update even when the count is not reached
        if not is_new_epoch and self.forward_count != self.accum_grad:
            return
        self.forward_count = 0
        # compute the gradient norm to check if it is normal or not
        grad_norm = torch.nn.utils.clip_grad_norm_(
            self.model.parameters(), self.grad_clip_threshold
        )
        logging.info("on device {} grad norm={}".format(self.device, grad_norm))
        if math.isnan(grad_norm):
            # skip this update and drop the accumulated gradients
            logging.warning("grad norm is nan. Do not update model.")
            self.ddp_trainer.optimizer.zero_grad()
        else:
            """
            Optimizer is never used for update.
            The real updating process and the DDP communication
            is in this `update_and_sync()`
            """
            # self.optimizer.step()
            self.ddp_trainer.update_and_sync()
            # `% 1` is always 0, so this message is printed on every update
            if self.iteration % 1 == 0:
                step_print(f"| iteration: {self.iteration} | gradient applied")

    def update(self):
        """Run one ``update_core`` call and advance the iteration counter.

        The counter only advances when ``update_core`` ended a gradient
        accumulation cycle, i.e. when it reset ``forward_count`` to 0.
        """
        self.update_core()
        # #iterations with accum_grad > 1
        # Ref.: https://github.com/espnet/espnet/issues/777
        if self.forward_count == 0:
            self.iteration += 1
class CustomConverterMulEnc(object):
    """Custom batch converter for Pytorch in multi-encoder case.

    Args:
        subsamping_factors (list): List of subsampling factors for each encoder.
            (The misspelled name is kept because it is the public keyword.)
        dtype (torch.dtype): Data type to convert.

    """

    def __init__(self, subsamping_factors=(1, 1), dtype=torch.float32):
        """Initialize the converter."""
        # Stored as a private list copy so the default is never shared
        # between instances.
        self.subsamping_factors = list(subsamping_factors)
        self.ignore_id = -1
        self.dtype = dtype
        self.num_encs = len(self.subsamping_factors)

    def _subsample(self, xs_list):
        """Keep every k-th frame of each stream, k being that stream's factor.

        Args:
            xs_list (list): One list of 2-D arrays (indexed ``[::k, :]``)
                per encoder.

        Returns:
            list: ``xs_list`` itself when all factors are 1, otherwise new
                lists holding the subsampled arrays.

        """
        # the sum exceeds the encoder count only if some factor is above 1
        if np.sum(self.subsamping_factors) <= self.num_encs:
            return xs_list
        return [
            [x[:: self.subsamping_factors[i], :] for x in xs_list[i]]
            for i in range(self.num_encs)
        ]

    def __call__(self, batch, device=torch.device("cpu")):
        """Transform a batch and send it to a device.

        Args:
            batch (list): The batch to transform.
            device (torch.device): The device to send to.

        Returns:
            tuple( list(torch.Tensor), list(torch.Tensor), torch.Tensor)

        """
        # batch should be located in list
        assert len(batch) == 1
        xs_list = batch[0][: self.num_encs]
        ys = batch[0][-1]

        # perform subsampling
        xs_list = self._subsample(xs_list)

        # get batch of lengths of input sequences
        ilens_list = [
            np.array([x.shape[0] for x in xs_list[i]]) for i in range(self.num_encs)
        ]

        # perform padding and convert to tensor
        # currently only support real number
        xs_list_pad = [
            pad_list([torch.from_numpy(x).float() for x in xs_list[i]], 0).to(
                device, dtype=self.dtype
            )
            for i in range(self.num_encs)
        ]

        ilens_list = [
            torch.from_numpy(ilens_list[i]).to(device) for i in range(self.num_encs)
        ]
        # NOTE: this is for multi-task learning (e.g., speech translation)
        ys_pad = pad_list(
            [
                torch.from_numpy(np.array(y[0]) if isinstance(y, tuple) else y).long()
                for y in ys
            ],
            self.ignore_id,
        ).to(device)

        return xs_list_pad, ilens_list, ys_pad
""" set_deterministic_pytorch(args) if args.num_encs > 1: args = format_mulenc_args(args) # check cuda availability if not torch.cuda.is_available(): logging.warning("cuda is not available") # get input and output dimension info with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] utts = list(valid_json.keys()) idim_list = [ int(valid_json[utts[0]]["input"][i]["shape"][-1]) for i in range(args.num_encs) ] odim = int(valid_json[utts[0]]["output"][0]["shape"][-1]) for i in range(args.num_encs): logging.info("stream{}: input dims : {}".format(i + 1, idim_list[i])) logging.info("#output dims: " + str(odim)) # specify attention, CTC, hybrid mode if "transducer" in args.model_module: if ( getattr(args, "etype", False) == "custom" or getattr(args, "dtype", False) == "custom" ): mtl_mode = "custom_transducer" else: mtl_mode = "transducer" logging.info("Pure transducer mode") elif args.mtlalpha == 1.0: mtl_mode = "ctc" logging.info("Pure CTC mode") elif args.mtlalpha == 0.0: mtl_mode = "att" logging.info("Pure attention mode") else: mtl_mode = "mtl" logging.info("Multitask learning mode") if (args.enc_init is not None or args.dec_init is not None) and args.num_encs == 1: model = load_trained_modules(idim_list[0], odim, args) else: model_class = dynamic_import(args.model_module) model = model_class( idim_list[0] if args.num_encs == 1 else idim_list, odim, args ) assert isinstance(model, ASRInterface) total_subsampling_factor = model.get_total_subsampling_factor() print(model) logging.info( " Total parameter of the model = " + str(sum(p.numel() for p in model.parameters())) ) if args.rnnlm is not None: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM(len(args.char_list), rnnlm_args.layer, rnnlm_args.unit) ) torch_load(args.rnnlm, rnnlm) model.rnnlm = rnnlm # write model config global_rank = args.node_rank * args.node_size + args.local_rank args.outdir = args.outdir.replace("RANK", 
str(global_rank)) if not os.path.exists(args.outdir): os.makedirs(args.outdir) model_conf = args.outdir + "/model.json" with open(model_conf, "wb") as f: logging.info("writing a model config file to " + model_conf) f.write( json.dumps( (idim_list[0] if args.num_encs == 1 else idim_list, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True, ).encode("utf_8") ) for key in sorted(vars(args).keys()): logging.info("ARGS: " + key + ": " + str(vars(args)[key])) reporter = model.reporter # check the use of multi-gpu if args.ngpu > 1: if args.batch_size != 0: logging.warning( "batch size is automatically increased (%d -> %d)" % (args.batch_size, args.batch_size * args.ngpu) ) args.batch_size *= args.ngpu if args.num_encs > 1: # TODO(ruizhili): implement data parallel for multi-encoder setup. raise NotImplementedError( "Data parallel is not supported for multi-encoder setup." ) # set torch device assert args.ngpu in [1, 0] # this is ddp version device = torch.device(f"cuda:{args.local_rank}" if args.ngpu > 0 else "cpu") if args.train_dtype in ("float16", "float32", "float64"): dtype = getattr(torch, args.train_dtype) else: dtype = torch.float32 model = model.to(device=device, dtype=dtype) if args.freeze_mods: model, model_params = freeze_modules(model, args.freeze_mods) else: model_params = model.parameters() logging.warning( "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format( sum(p.numel() for p in model.parameters()), sum(p.numel() for p in model.parameters() if p.requires_grad), sum(p.numel() for p in model.parameters() if p.requires_grad) * 100.0 / sum(p.numel() for p in model.parameters()), ) ) # We build the SGD optimizer but never use it. # Other code needs this # The real optimizer is in ddp_trainer optimizer = torch.optim.SGD(model_params, lr=1.0) # setup apex.amp if args.train_dtype in ("O0", "O1", "O2", "O3"): try: from apex import amp except ImportError as e: logging.error( f"You need to install apex for --train-dtype {args.train_dtype}. 
" "See https://github.com/NVIDIA/apex#linux" ) raise e if args.opt == "noam": model, optimizer.optimizer = amp.initialize( model, optimizer.optimizer, opt_level=args.train_dtype ) else: model, optimizer = amp.initialize( model, optimizer, opt_level=args.train_dtype ) use_apex = True from espnet.nets.pytorch_backend.ctc import CTC amp.register_float_function(CTC, "loss_fn") amp.init() logging.warning("register ctc as float function") else: use_apex = False # FIXME: TOO DIRTY HACK setattr(optimizer, "target", reporter) setattr(optimizer, "serialize", lambda s: reporter.serialize(s)) # Setup a converter if args.num_encs == 1: converter = CustomConverter(subsampling_factor=model.subsample[0], dtype=dtype) else: converter = CustomConverterMulEnc( [i[0] for i in model.subsample_list], dtype=dtype ) # read json data args.train_json = args.train_json.replace("RANK", str(global_rank + 1)) with open(args.train_json, "rb") as f: train_json = json.load(f)["utts"] with open(args.valid_json, "rb") as f: valid_json = json.load(f)["utts"] # if use block_load, the utterance must sorted from shortest to longest use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0 or args.block_load # make minibatch list (variable length) # disable the adaptive batch_size to sync DDP training # if use frame as the count, we do not set min_batch_size train = make_batchset( train_json, args.batch_size, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.batch_size if args.batch_size > 0 else 1, #args.ngpu if args.ngpu > 1 else 1, shortest_first=use_sortagrad, count=args.batch_count, batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, iaxis=0, oaxis=0, no_sort=args.block_load, ) valid = make_batchset( valid_json, args.batch_size * 2, args.maxlen_in, args.maxlen_out, args.minibatches, min_batch_size=args.batch_size, #args.ngpu if args.ngpu > 1 else 1, count=args.batch_count, 
batch_bins=args.batch_bins, batch_frames_in=args.batch_frames_in, batch_frames_out=args.batch_frames_out, batch_frames_inout=args.batch_frames_inout, iaxis=0, oaxis=0, ) if args.block_load: assert args.n_iter_processes <= 1, "never use more than one worker" sampler = BufferSampler( length=len(train), utts_per_ark=args.utts_per_ark, batch_size=args.batch_size, buf_size=args.block_buffer_size, seed=args.seed, ) prefetch_factor = sampler.get_prefetch_factor() shuffle = None else: sampler=None prefetch_factor = 20 shuffle = not use_sortagrad load_tr = LoadInputsAndTargets( mode="asr", load_output=True, preprocess_conf=args.preprocess_conf, preprocess_args={"train": True}, # Switch the mode of preprocessing block_load=args.block_load, ) load_cv = LoadInputsAndTargets( mode="asr", load_output=True, preprocess_conf=args.preprocess_conf, preprocess_args={"train": False}, # Switch the mode of preprocessing ) # hack to make batchsize argument as 1 # actual bathsize is included in a list # default collate function converts numpy array to pytorch tensor # we used an empty collate function instead which returns list train_dataset = TransformDataset(train, lambda data: converter([load_tr(data)])) valid_dataset = TransformDataset(valid, lambda data: converter([load_cv(data)])) train_iter = ChainerDataLoader( dataset=train_dataset, batch_size=1, num_workers=args.n_iter_processes, shuffle=shuffle, collate_fn=lambda x: x[0], prefetch_factor=prefetch_factor, sampler=sampler ) # prefetch_factor=5, valid_iter = ChainerDataLoader( dataset=valid_dataset, batch_size=1, shuffle=False, collate_fn=lambda x: x[0], num_workers=args.n_iter_processes, ) # Set up a trainer ddp_trainer = BlockAdamTrainer(args, master_node=args.master_node, rank=global_rank, world_size=args.world_size, model=model, ) updater = CustomUpdater( model, args.grad_clip, {"main": train_iter}, optimizer, device, args.ngpu, args.grad_noise, args.accum_grad, use_apex=use_apex, ddp_trainer=ddp_trainer ) trainer = 
training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir) if use_sortagrad and args.sortagrad != 0: trainer.extend( ShufflingEnabler([train_iter]), trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"), ) # Resume from a snapshot if args.resume: logging.info("resumed from %s" % args.resume) torch_resume(args.resume, trainer, args.load_trainer_and_opt) # Evaluate the model with the test dataset for each epoch if args.save_interval_iters > 0: trainer.extend( CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu), trigger=(args.save_interval_iters, "iteration"), ) else: trainer.extend( CustomEvaluator(model, {"main": valid_iter}, reporter, device, args.ngpu) ) # Save attention weight each epoch is_attn_plot = ( "transformer" in args.model_module or "conformer" in args.model_module or mtl_mode in ["att", "mtl", "custom_transducer"] ) if args.num_save_attention > 0 and is_attn_plot: data = sorted( list(valid_json.items())[: args.num_save_attention], key=lambda x: int(x[1]["input"][0]["shape"][1]), reverse=True, ) if hasattr(model, "module"): att_vis_fn = model.module.calculate_all_attentions plot_class = model.module.attention_plot_class else: att_vis_fn = model.calculate_all_attentions plot_class = model.attention_plot_class att_reporter = plot_class( att_vis_fn, data, args.outdir + "/att_ws", converter=converter, transform=load_cv, device=device, subsampling_factor=total_subsampling_factor, ) trainer.extend(att_reporter, trigger=(1, "epoch")) else: att_reporter = None # Save CTC prob at each epoch if mtl_mode in ["ctc", "mtl"] and args.num_save_ctc > 0: # NOTE: sort it by output lengths data = sorted( list(valid_json.items())[: args.num_save_ctc], key=lambda x: int(x[1]["output"][0]["shape"][0]), reverse=True, ) if hasattr(model, "module"): ctc_vis_fn = model.module.calculate_all_ctc_probs plot_class = model.module.ctc_plot_class else: ctc_vis_fn = model.calculate_all_ctc_probs plot_class = model.ctc_plot_class 
ctc_reporter = plot_class( ctc_vis_fn, data, args.outdir + "/ctc_prob", converter=converter, transform=load_cv, device=device, subsampling_factor=total_subsampling_factor, ) trainer.extend(ctc_reporter, trigger=(1, "epoch")) else: ctc_reporter = None # Make a plot for training and validation values if args.num_encs > 1: report_keys_loss_ctc = [ "main/loss_ctc{}".format(i + 1) for i in range(model.num_encs) ] + ["validation/main/loss_ctc{}".format(i + 1) for i in range(model.num_encs)] report_keys_cer_ctc = [ "main/cer_ctc{}".format(i + 1) for i in range(model.num_encs) ] + ["validation/main/cer_ctc{}".format(i + 1) for i in range(model.num_encs)] if hasattr(model, "is_rnnt"): trainer.extend( extensions.PlotReport( [ "main/loss", "validation/main/loss", "main/loss_trans", "validation/main/loss_trans", "main/loss_ctc", "validation/main/loss_ctc", "main/loss_lm", "validation/main/loss_lm", "main/loss_aux_trans", "validation/main/loss_aux_trans", "main/loss_aux_symm_kl", "validation/main/loss_aux_symm_kl", "main/loss_mbr", "validation/main/loss_mbr", "main/loss_mmi", "validation/main/loss_mmi", "main/loss_lang", "validation/main/loss_lang", "main/loss_att", "validation/main/loss_att", ], "epoch", file_name="loss.png", ) ) else: trainer.extend( extensions.PlotReport( [ "main/loss", "validation/main/loss", "main/loss_ctc", "validation/main/loss_ctc", "main/loss_att", "validation/main/loss_att", "main/loss_third", "validation/main/loss_third", "main/loss_mbr", "validation/main/loss_mbr", ] + ([] if args.num_encs == 1 else report_keys_loss_ctc), "epoch", file_name="loss.png", ) ) trainer.extend( extensions.PlotReport( ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png" ) ) trainer.extend( extensions.PlotReport( ["main/cer_ctc", "validation/main/cer_ctc"] + ([] if args.num_encs == 1 else report_keys_loss_ctc), "epoch", file_name="cer.png", ) ) # save the checkpoint only if this is the master GPU if global_rank == 0: # Save best models trainer.extend( 
snapshot_object(model, "model.loss.best"), trigger=training.triggers.MinValueTrigger("validation/main/loss"), ) if mtl_mode not in ["ctc", "transducer", "custom_transducer"]: trainer.extend( snapshot_object(model, "model.acc.best"), trigger=training.triggers.MaxValueTrigger("validation/main/acc"), ) # save snapshot which contains model and optimizer states if args.save_interval_iters > 0: trainer.extend( torch_snapshot(filename="snapshot.iter.{.updater.iteration}"), trigger=(args.save_interval_iters, "iteration"), ) # save snapshot at every epoch - for model averaging trainer.extend(torch_snapshot(), trigger=(1, "epoch")) # epsilon decay in the optimizer if args.opt == "adadelta": if args.criterion == "acc" and mtl_mode != "ctc": trainer.extend( restore_snapshot( model, args.outdir + "/model.acc.best", load_fn=torch_load ), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/acc", lambda best_value, current_value: best_value > current_value, ), ) elif args.criterion == "loss": trainer.extend( restore_snapshot( model, args.outdir + "/model.loss.best", load_fn=torch_load ), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) # NOTE: In some cases, it may take more than one epoch for the model's loss # to escape from a local minimum. # Thus, restore_snapshot extension is not used here. 
# see details in https://github.com/espnet/espnet/pull/2171 elif args.criterion == "loss_eps_decay_only": trainer.extend( adadelta_eps_decay(args.eps_decay), trigger=CompareValueTrigger( "validation/main/loss", lambda best_value, current_value: best_value < current_value, ), ) # Write a log of evaluation statistics for each epoch trainer.extend( extensions.LogReport(trigger=(args.report_interval_iters, "iteration")) ) if hasattr(model, "is_rnnt"): report_keys = [ "epoch", "iteration", "main/loss", "main/loss_trans", "main/loss_ctc", "main/loss_lm", "main/loss_aux_trans", "main/loss_aux_symm_kl", "main/loss_mbr", "main/loss_mmi", "main/loss_att", "main/loss_lang", "validation/main/loss", "validation/main/loss_trans", "validation/main/loss_ctc", "validation/main/loss_lm", "validation/main/loss_aux_trans", "validation/main/loss_aux_symm_kl", "validation/main/loss_mbr", "validation/main/loss_mmi", "validation/main/loss_att", "validation/main/loss_lang", "elapsed_time", ] else: report_keys = [ "epoch", "iteration", "main/loss", "main/loss_ctc", "main/loss_att", "main/loss_third", "main/loss_mbr", "validation/main/loss", "validation/main/loss_ctc", "validation/main/loss_att", "validation/main/loss_third", "validation/main/loss_mbr", "main/acc", "validation/main/acc", "main/cer_ctc", "validation/main/cer_ctc", "elapsed_time", ] + ([] if args.num_encs == 1 else report_keys_cer_ctc + report_keys_loss_ctc) if args.opt == "adadelta": trainer.extend( extensions.observe_value( "eps", lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][ "eps" ], ), trigger=(args.report_interval_iters, "iteration"), ) report_keys.append("eps") if args.report_cer: report_keys.append("validation/main/cer") if args.report_wer: report_keys.append("validation/main/wer") logwriter = open(args.outdir + f"/train.{global_rank}.log", 'w') trainer.extend( extensions.PrintReport(report_keys, out=logwriter), trigger=(args.report_interval_iters, "iteration"), ) # 
trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters)) set_early_stop(trainer, args) # Run the training trainer.run() check_early_stop(trainer, args.epochs) def recog(args): """Decode with the given args. Args: args (namespace): The program arguments. """ set_deterministic_pytorch(args) if args.ngpu == 1: gpu_id = args.local_rank - 1 logging.warning("gpu id: " + str(gpu_id)) device=torch.device("cuda:{}".format(gpu_id)) else: device=torch.device("cpu") os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # disable GPU model, train_args = load_trained_model(args.model, training=False) assert isinstance(model, ASRInterface) model.recog_args = args if args.streaming_mode and "transformer" in train_args.model_module: raise NotImplementedError("streaming mode for transformer is not implemented") logging.info( " Total parameter of the model = " + str(sum(p.numel() for p in model.parameters())) ) # read rnnlm if args.rnnlm and args.lm_weight > 0.0: rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf) if getattr(rnnlm_args, "model_module", "default") == "default": rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM( len(train_args.char_list), rnnlm_args.layer, rnnlm_args.unit, getattr(rnnlm_args, "embed_unit", None), # for backward compatibility ) ) elif getattr(rnnlm_args, "model_module", "default") == "transformer": lm_class = dynamic_import_lm("transformer", rnnlm_args.backend) rnnlm = lm_class(len(train_args.char_list), rnnlm_args) else: raise ValueError("Unsupported LM type") torch_load(args.rnnlm, rnnlm) rnnlm.eval() else: rnnlm = None if args.word_rnnlm: rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf) word_dict = rnnlm_args.char_list_dict char_dict = {x: i for i, x in enumerate(train_args.char_list)} word_rnnlm = lm_pytorch.ClassifierWithState( lm_pytorch.RNNLM( len(word_dict), rnnlm_args.layer, rnnlm_args.unit, getattr(rnnlm_args, "embed_unit", None), # for backward compatibility ) ) torch_load(args.word_rnnlm, 
word_rnnlm) word_rnnlm.eval() if rnnlm is not None: rnnlm = lm_pytorch.ClassifierWithState( extlm_pytorch.MultiLevelLM( word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict ) ) else: rnnlm = lm_pytorch.ClassifierWithState( extlm_pytorch.LookAheadWordLM( word_rnnlm.predictor, word_dict, char_dict ) ) model = model.to(device) if rnnlm: rnnlm = rnnlm.to(device) # read json data with open(args.recog_json, "rb") as f: js = json.load(f)["utts"] new_js = {} load_inputs_and_targets = LoadInputsAndTargets( mode="asr", load_output=True, sort_in_input_length=False, preprocess_conf=train_args.preprocess_conf if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) # load transducer beam search if hasattr(model, "is_rnnt"): if hasattr(model, "dec"): trans_decoder = model.dec else: trans_decoder = model.decoder joint_network = model.joint_network # We only use the MMIRNNTScorer now if train_args.aux_mmi and train_args.aux_mmi_type == "mmi": adim = train_args.enc_block_arch[0]['d_hidden'] weight_path = os.path.dirname(args.result_label) + "/dump" os.makedirs(weight_path, exist_ok=True) model.aux_mmi.dump_weight(args.local_rank, weight_path) mmi_scorer_module = MMIRNNTScorer mmi_scorer = mmi_scorer_module(lang=model.aux_mmi.lang, device=device, idim=adim, sos_id=model.sos, rank=args.local_rank, use_segment=args.use_segment, char_list=train_args.char_list, weight_path=weight_path, lookahead=args.mas_lookahead, ) else: mmi_scorer = None if args.ngram_model and args.ngram_weight > 0.0: print(f"Using ngram model: {args.ngram_model}", flush=True) from espnet.nets.scorers.ngram import NgramPartScorer ngram_scorer = NgramPartScorer(args.ngram_model, train_args.char_list) else: ngram_scorer = None if args.word_ngram is not None and args.word_ngram_weight > 0.0: from espnet.nets.scorers.word_ngram import WordNgramPartialScorer word_ngram_scorer = WordNgramPartialScorer word_ngram_scorer = word_ngram_scorer( args.word_ngram, device, 
train_args.char_list, log_semiring=args.word_ngram_log_semiring, lower_char=args.word_ngram_lower_char) else: word_ngram_scorer = None if args.tlg_scorer is not None and args.tlg_weight > 0.0: print(f"Using tlg scorer: {args.tlg_scorer}", flush=True) from espnet.nets.scorers.tlg_scorer import TlgPartialScorer tlg_scorer = TlgPartialScorer(lang=args.tlg_scorer, nonblk_reward=args.tlg_nonblk_reward) else: tlg_scorer = None # for code-switch data if args.cs_nt_decode_feature in ["chn", "eng"]: ctc_module = getattr(model, "aux_ctc", None) else: ctc_module = getattr(model, "decoder_ctc", None) if args.eng_vocab is not None and os.path.isfile(args.eng_vocab): eng_vocab = [s.strip() for s in open(args.eng_vocab, encoding="utf-8").readlines()] else: eng_vocab = None beam_search_transducer = BeamSearchTransducer( decoder=trans_decoder, joint_network=joint_network, beam_size=args.beam_size, nbest=args.nbest, lm=rnnlm, lm_weight=args.lm_weight, search_type=args.search_type, char_list=train_args.char_list, max_sym_exp=args.max_sym_exp, u_max=args.u_max, nstep=args.nstep, prefix_alpha=args.prefix_alpha, score_norm=args.score_norm, mmi_scorer=mmi_scorer, mmi_weight=args.mmi_weight, ngram_scorer=ngram_scorer, ngram_weight=args.ngram_weight, word_ngram_scorer=word_ngram_scorer, word_ngram_weight=args.word_ngram_weight, tlg_scorer=tlg_scorer, tlg_weight=args.tlg_weight, forbid_eng=args.forbid_eng, ctc_module=ctc_module, ctc_weight=args.ctc_weight, eng_vocab=eng_vocab ) if args.k2_decode: k2_decode(model, device, js, load_inputs_and_targets, args.batchsize, args.use_segment) print("Finish FST decoding. 
Abort!") return nbest_dict = {} rtf_calculator = RTF_calculator(js) rtf_calculator.tik() if args.batchsize == 0: with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info("(%d/%d) decoding " + name, idx, len(js.keys())) batch = [(name, js[name])] feats = load_inputs_and_targets(batch) feat = ( feats[0][0] if args.num_encs == 1 else [feats[idx][0] for idx in range(model.num_encs)] ) # For Oteam ASR Only: skip all transcriptions that have english chars text_trans = js[name]["output"][0]["text"] if any([is_alphabet(x) for x in text_trans]) and args.skip_eng: continue if args.streaming_mode == "window" and args.num_encs == 1: logging.info( "Using streaming recognizer with window size %d frames", args.streaming_window, ) se2e = WindowStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm) for i in range(0, feat.shape[0], args.streaming_window): logging.info( "Feeding frames %d - %d", i, i + args.streaming_window ) se2e.accept_input(feat[i : i + args.streaming_window]) logging.info("Running offline attention decoder") se2e.decode_with_attention_offline() logging.info("Offline attention decoder finished") nbest_hyps = se2e.retrieve_recognition() elif args.streaming_mode == "segment" and args.num_encs == 1: logging.info( "Using streaming recognizer with threshold value %d", args.streaming_min_blank_dur, ) nbest_hyps = [] for n in range(args.nbest): nbest_hyps.append({"yseq": [], "score": 0.0}) se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm) r = np.prod(model.subsample) for i in range(0, feat.shape[0], r): hyps = se2e.accept_input(feat[i : i + r]) if hyps is not None: text = "".join( [ train_args.char_list[int(x)] for x in hyps[0]["yseq"][1:-1] if int(x) != -1 ] ) text = text.replace( "\u2581", " " ).strip() # for SentencePiece text = text.replace(model.space, " ") text = text.replace(model.blank, "") logging.info(text) for n in range(args.nbest): nbest_hyps[n]["yseq"].extend(hyps[n]["yseq"]) nbest_hyps[n]["score"] += hyps[n]["score"] 
elif hasattr(model, "is_rnnt"): nbest_hyps = model.recognize(feat, beam_search_transducer, decode_feature=args.cs_nt_decode_feature) else: nbest_hyps = model.recognize( feat, args, train_args.char_list, rnnlm ) # visualization # decode_dir = os.path.dirname(args.result_label) # graph_dir = os.path.join(decode_dir, "graph") # os.makedirs(graph_dir, exist_ok=True) # plot_decoding_logs(graph_dir, train_args.char_list, # args, name, nbest_hyps) nbest_dict[name] = nbest_hyps new_js[name] = add_results_to_json( js[name], nbest_hyps, train_args.char_list ) else: def grouper(n, iterable, fillvalue=None): kargs = [iter(iterable)] * n return zip_longest(*kargs, fillvalue=fillvalue) # sort data if batchsize > 1 keys = list(js.keys()) if args.batchsize > 1: feat_lens = [js[key]["input"][0]["shape"][0] for key in keys] sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i]) keys = [keys[i] for i in sorted_index] with torch.no_grad(): for names in grouper(args.batchsize, keys, None): names = [name for name in names if name] batch = [(name, js[name]) for name in names] feats = ( load_inputs_and_targets(batch)[0] if args.num_encs == 1 else load_inputs_and_targets(batch) ) if args.streaming_mode == "window" and args.num_encs == 1: raise NotImplementedError elif args.streaming_mode == "segment" and args.num_encs == 1: if args.batchsize > 1: raise NotImplementedError feat = feats[0] nbest_hyps = [] for n in range(args.nbest): nbest_hyps.append({"yseq": [], "score": 0.0}) se2e = SegmentStreamingE2E(e2e=model, recog_args=args, rnnlm=rnnlm) r = np.prod(model.subsample) for i in range(0, feat.shape[0], r): hyps = se2e.accept_input(feat[i : i + r]) if hyps is not None: text = "".join( [ train_args.char_list[int(x)] for x in hyps[0]["yseq"][1:-1] if int(x) != -1 ] ) text = text.replace( "\u2581", " " ).strip() # for SentencePiece text = text.replace(model.space, " ") text = text.replace(model.blank, "") logging.info(text) for n in range(args.nbest): 
def enhance(args):
    """Dump enhanced speech and spectrogram images of the estimated masks.

    The model described by ``args.model`` / ``args.model_conf`` is loaded,
    the utterances of ``args.recog_json`` are processed longest-first in
    batches of ``args.batchsize`` (0 is treated as 1), and for each utterance

    * up to ``args.num_images`` figures are written to ``args.image_dir``
      (skipped when ``args.image_dir`` is None), and
    * the enhanced signal is written through the writer built from
      ``args.enh_wspecifier`` (skipped when it is None), optionally after an
      inverse STFT (``args.apply_istft``) and length matching
      (``args.keep_length``).

    Processing stops as soon as all requested images exist and there is no
    writer, because nothing else would be produced.

    Args:
        args (namespace): The program arguments.

    Raises:
        AssertionError: If ``args.num_encs`` is not 1, the built model is not
            an ``ASRInterface``, or the preprocess config has no "process" key.

    """
    set_deterministic_pytorch(args)

    # read training config
    idim, odim, train_args = get_model_conf(args.model, args.model_conf)

    # TODO(ruizhili): implement enhance for multi-encoder model
    assert args.num_encs == 1, "number of encoder should be 1 ({} is given)".format(
        args.num_encs
    )

    # load trained model parameters
    logging.info("reading model parameters from " + args.model)
    model_class = dynamic_import(train_args.model_module)
    model = model_class(idim, odim, train_args)
    assert isinstance(model, ASRInterface)
    torch_load(args.model, model)
    model.recog_args = args

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=None,  # Apply pre_process in outer func
    )
    if args.batchsize == 0:
        args.batchsize = 1

    # Creates writers for outputs from the network
    if args.enh_wspecifier is not None:
        enh_writer = file_writer_helper(args.enh_wspecifier, filetype=args.enh_filetype)
    else:
        enh_writer = None

    # Creates a Transformation instance
    preprocess_conf = (
        train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf
    )
    if preprocess_conf is not None:
        logging.info(f"Use preprocessing: {preprocess_conf}")
        transform = Transformation(preprocess_conf)
    else:
        transform = None

    # Creates a IStft instance
    istft = None
    frame_shift = args.istft_n_shift  # Used for plot the spectrogram
    if args.apply_istft:
        if preprocess_conf is not None:
            # Read the conffile and find stft setting
            with open(preprocess_conf) as f:
                # Json format: e.g.
                #    {"process": [{"type": "stft",
                #                  "win_length": 400,
                #                  "n_fft": 512, "n_shift": 160,
                #                  "window": "han"},
                #                 {"type": "foo", ...}, ...]}
                conf = json.load(f)
            assert "process" in conf, conf
            # Find stft setting
            for p in conf["process"]:
                if p["type"] == "stft":
                    istft = IStft(
                        win_length=p["win_length"],
                        n_shift=p["n_shift"],
                        window=p.get("window", "hann"),
                    )
                    logging.info(
                        "stft is found in {}. "
                        "Setting istft config from it\n{}".format(
                            preprocess_conf, istft
                        )
                    )
                    frame_shift = p["n_shift"]
                    break
        if istft is None:
            # Set from command line arguments
            istft = IStft(
                win_length=args.istft_win_length,
                n_shift=args.istft_n_shift,
                window=args.istft_window,
            )
            logging.info(
                "Setting istft config from the command line args\n{}".format(istft)
            )

    # sort data: longest input first
    keys = sorted(js.keys(), key=lambda key: -js[key]["input"][0]["shape"][0])

    num_images = 0
    # image_dir may be None (plotting disabled); only create it when given.
    if args.image_dir is not None:
        os.makedirs(args.image_dir, exist_ok=True)

    finished = False
    # Plain slicing yields a short final batch instead of one padded with
    # None placeholders, which would fail as a key into ``js``.
    for start in range(0, len(keys), args.batchsize):
        names = keys[start : start + args.batchsize]
        batch = [(name, js[name]) for name in names]

        # May be in time region: (Batch, [Time, Channel])
        org_feats = load_inputs_and_targets(batch)[0]
        if transform is not None:
            # May be in time-freq region: : (Batch, [Time, Channel, Freq])
            feats = transform(org_feats, train=False)
        else:
            feats = org_feats

        with torch.no_grad():
            enhanced, mask, ilens = model.enhance(feats)

        for idx, name in enumerate(names):
            # Assuming mask, feats : [Batch, Time, Channel. Freq]
            #          enhanced    : [Batch, Time, Freq]
            enh = enhanced[idx][: ilens[idx]]
            mas = mask[idx][: ilens[idx]]
            feat = feats[idx]

            # Plot spectrogram
            if args.image_dir is not None and num_images < args.num_images:
                num_images += 1
                _save_enhance_figure(
                    os.path.join(args.image_dir, name + ".png"),
                    mas,
                    feat,
                    enh,
                    fs=args.fs,
                    frame_shift=frame_shift,
                )

            # Write enhanced wave files
            if enh_writer is not None:
                if istft is not None:
                    enh = istft(enh)

                if args.keep_length:
                    enh = _match_length(enh, len(org_feats[idx]))

                if args.enh_filetype in ("sound", "sound.hdf5"):
                    enh_writer[name] = (args.fs, enh)
                else:
                    # Hint: To dump stft_signal, mask or etc,
                    # enh_filetype='hdf5' might be convenient.
                    enh_writer[name] = enh

            if num_images >= args.num_images and enh_writer is None:
                logging.info("Breaking the process.")
                finished = True
                break

        # Leave the batch loop too: running the model on the remaining
        # batches would produce neither images nor files.
        if finished:
            break


def _match_length(enh, target_len):
    """Truncate or zero-pad ``enh`` along axis 0 so it has ``target_len`` entries."""
    if target_len < len(enh):
        # Truncate the frames added by stft padding
        return enh[:target_len]
    if target_len > len(enh):
        padwidth = [(0, target_len - len(enh))] + [(0, 0)] * (enh.ndim - 1)
        return np.pad(enh, padwidth, mode="constant")
    return enh


def _save_enhance_figure(image_path, mas, feat, enh, fs, frame_shift, ref_ch=0):
    """Save a 4-panel figure: mask, noisy, masked and enhanced spectrograms."""
    # Imported lazily so matplotlib is only required when images are requested.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(20, 10))

    plt.subplot(4, 1, 1)
    plt.title("Mask [ref={}ch]".format(ref_ch))
    plot_spectrogram(
        plt,
        mas[:, ref_ch].T,
        fs=fs,
        mode="linear",
        frame_shift=frame_shift,
        bottom=False,
        labelbottom=False,
    )

    plt.subplot(4, 1, 2)
    plt.title("Noisy speech [ref={}ch]".format(ref_ch))
    plot_spectrogram(
        plt,
        feat[:, ref_ch].T,
        fs=fs,
        mode="db",
        frame_shift=frame_shift,
        bottom=False,
        labelbottom=False,
    )

    plt.subplot(4, 1, 3)
    plt.title("Masked speech [ref={}ch]".format(ref_ch))
    plot_spectrogram(
        plt,
        (feat[:, ref_ch] * mas[:, ref_ch]).T,
        frame_shift=frame_shift,
        fs=fs,
        mode="db",
        bottom=False,
        labelbottom=False,
    )

    plt.subplot(4, 1, 4)
    plt.title("Enhanced speech")
    plot_spectrogram(plt, enh.T, fs=fs, mode="db", frame_shift=frame_shift)

    plt.savefig(image_path)
    plt.clf()
Args: js (dict[str, Any]): Groundtruth utterance dict. alignment (list[int]): List of alignment. char_list (list[str]): List of characters. Returns: dict[str, Any]: N-best results added utterance dict. """ # copy old json info new_js = dict() new_js["ctc_alignment"] = [] alignment_tokens = [] for idx, a in enumerate(alignment): alignment_tokens.append(char_list[a]) alignment_tokens = " ".join(alignment_tokens) new_js["ctc_alignment"] = alignment_tokens return new_js set_deterministic_pytorch(args) model, train_args = load_trained_model(args.model) assert isinstance(model, ASRInterface) model.eval() load_inputs_and_targets = LoadInputsAndTargets( mode="asr", load_output=True, sort_in_input_length=False, preprocess_conf=train_args.preprocess_conf if args.preprocess_conf is None else args.preprocess_conf, preprocess_args={"train": False}, ) if args.ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") if args.ngpu == 1: device = "cuda" else: device = "cpu" dtype = getattr(torch, args.dtype) logging.info(f"Decoding device={device}, dtype={dtype}") model.to(device=device, dtype=dtype).eval() # read json data with open(args.align_json, "rb") as f: js = json.load(f)["utts"] new_js = {} if args.batchsize == 0: with torch.no_grad(): for idx, name in enumerate(js.keys(), 1): logging.info("(%d/%d) aligning " + name, idx, len(js.keys())) batch = [(name, js[name])] feat, label = load_inputs_and_targets(batch) feat = feat[0] label = label[0] enc = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0) alignment = model.ctc.forced_align(enc, label) new_js[name] = add_alignment_to_json( js[name], alignment, train_args.char_list ) else: raise NotImplementedError("Align_batch is not implemented.") with open(args.result_label, "wb") as f: f.write( json.dumps( {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True ).encode("utf_8") ) ================================================ FILE: asr/pytorch_backend/asr_init.py 
def freeze_modules(model, modules):
    """Disable gradient updates for parameters under the given name prefixes.

    Args:
        model (torch.nn.Module): main model to update
        modules (list): name prefixes; every parameter whose name starts with
            one of them is frozen

    Return:
        model (torch.nn.Module): the same model object, modified in place
        model_params (filter): lazy iterator over the parameters that still
            require gradients

    """
    prefixes = tuple(modules)

    for name, param in model.named_parameters():
        if not name.startswith(prefixes):
            continue
        logging.info(f"freezing {name}, it will not be updated.")
        param.requires_grad = False

    trainable = filter(lambda p: p.requires_grad, model.parameters())

    return model, trainable
def transfer_verification(model_state_dict, partial_state_dict, modules):
    """Check that both state dicts hold the same (key, shape) pairs for modules.

    Only entries whose key starts with one of the given module prefixes are
    compared.

    Args:
        model_state_dict (OrderedDict): the initial model state_dict
        partial_state_dict (OrderedDict): the trained model state_dict
        modules (list): specified module list for transfer

    Return:
        (boolean): allow transfer

    """

    def selected_entries(state_dict):
        # Sorted (key, shape) pairs of the entries covered by `modules`.
        return sorted(
            (
                (key, value.shape)
                for key, value in state_dict.items()
                if any(key.startswith(m) for m in modules)
            ),
            key=lambda entry: (entry[0], entry[1]),
        )

    provided = selected_entries(partial_state_dict)
    expected = selected_entries(model_state_dict)

    if len(expected) != len(provided):
        return False
    return expected == provided


def get_partial_state_dict(model_state_dict, modules):
    """Keep only the state_dict entries that belong to the given modules.

    Note that get_partial_lm_state_dict is used if a LM specified.

    Args:
        model_state_dict (OrderedDict): trained model state_dict
        modules (list): specified module list for transfer

    Return:
        new_state_dict (OrderedDict): entries whose key starts with one of
            the module prefixes, in their original order

    """
    return OrderedDict(
        (key, value)
        for key, value in model_state_dict.items()
        if any(key.startswith(m) for m in modules)
    )


def get_lm_state_dict(lm_state_dict):
    """Rename LM state dict keys to the ASR decoder naming scheme.

    ``predictor.embed.weight`` becomes ``dec.embed.weight`` and
    ``predictor.rnn.<a>.<b>`` becomes ``dec.decoder.<a>.<b>_l0``; every other
    key is dropped.

    Args:
        lm_state_dict (OrderedDict): pre-trained LM state_dict

    Return:
        new_state_dict (OrderedDict): LM state_dict with updated keys

    """
    converted = OrderedDict()

    for key, value in lm_state_dict.items():
        if key == "predictor.embed.weight":
            converted["dec.embed.weight"] = value
            continue
        if key.startswith("predictor.rnn."):
            parts = key.split(".")
            converted[f"dec.decoder.{parts[2]}.{parts[3]}_l0"] = value

    return converted
def filter_modules(model_state_dict, modules):
    """Drop the requested modules that match no key of the state dict.

    A module is kept when at least one state_dict key starts with it. The
    rejected ones are reported through ``logging.warning`` together with the
    list of available keys.

    Args:
        model_state_dict (OrderedDict): trained model state_dict
        modules (list): specified module list for transfer

    Return:
        new_mods (list): the update module list

    """
    available = list(model_state_dict.keys())
    matched, unmatched = [], []

    for mod in modules:
        found = any(key.startswith(mod) for key in available)
        (matched if found else unmatched).append(mod)

    if unmatched:
        logging.warning(
            "module(s) %s don't match or (partially match) "
            "available modules in model.",
            unmatched,
        )
        logging.warning("for information, the existing modules in model are:")
        logging.warning("%s", available)

    return matched


def load_trained_model(model_path, training=True):
    """Load the trained model for recognition.

    The model configuration is read from ``model.json`` located next to
    ``model_path``.

    Args:
        model_path (str): Path to model.***.best
        training (bool): Passed on to the model constructor and to
            ``custom_torch_load``; only used when the model module name
            contains ``"transducer"``.

    Return:
        tuple: ``(model, train_args)`` - the model with loaded parameters and
            the training arguments read from the configuration.

    """
    conf_path = os.path.join(os.path.dirname(model_path), "model.json")
    idim, odim, train_args = get_model_conf(model_path, conf_path)

    logging.warning("reading model parameters from " + model_path)

    # Configurations without a model_module entry fall back to the default E2E.
    model_module = getattr(
        train_args, "model_module", "espnet.nets.pytorch_backend.e2e_asr:E2E"
    )
    # CTC Loss is not needed, default to builtin to prevent import errors
    # if hasattr(train_args, "ctc_type"):
    #     train_args.ctc_type = "builtin"

    model_class = dynamic_import(model_module)

    if "transducer" not in model_module:
        model = model_class(idim, odim, train_args)
        torch_load(model_path, model)
        return model, train_args

    model = model_class(idim, odim, train_args, training=training)
    custom_torch_load(model_path, model, training=training)
    return model, train_args
# when start decoding jobs with very large nj, this function leads
# to reading error. Do this for many times
def _load_trained_model(model_path, training=True, patience=10):
    """Load a trained model, retrying when loading fails.

    Calls ``load_trained_model`` up to ``patience`` times and returns the
    first successful result.

    Args:
        model_path (str): Path to model.***.best
        training (bool): Forwarded to ``load_trained_model``.
        patience (int): Maximum number of attempts.

    Return:
        tuple: ``(model, train_args)`` as returned by ``load_trained_model``.

    Raises:
        RuntimeError: If every attempt failed (chained to the last error).

    """
    last_error = None
    for i in range(patience):
        try:
            # Delegate to the real loader; calling this wrapper here again
            # would recurse without bound.
            model, train_args = load_trained_model(model_path, training=training)
        except Exception as e:  # deliberately broad: any load failure is retried
            last_error = e
            print(f"Model Init: Fail in {i}-th trail. Try again!", flush=True)
        else:
            print(
                f"Model Init: Successful initialize model in {i}-th trail",
                flush=True,
            )
            return model, train_args

    # Fail loudly instead of returning None, which callers cannot unpack.
    raise RuntimeError(
        f"Model Init: could not load {model_path} after {patience} attempts"
    ) from last_error


def get_trained_model_state_dict(model_path):
    """Extract the trained model state dict for pre-initialization.

    When ``model_path`` contains ``"rnnlm"`` the file is read with
    ``torch.load`` and its keys are converted by ``get_lm_state_dict``.
    Otherwise the model is built from the ``model.json`` next to
    ``model_path`` and its parameters are loaded with ``torch_load``.

    Args:
        model_path (str): Path to model.***.best

    Return:
        (OrderedDict): the loaded model state_dict (for an LM, the state
            dict with keys renamed for the ASR decoder)

    """
    conf_path = os.path.join(os.path.dirname(model_path), "model.json")
    if "rnnlm" in model_path:
        logging.warning("reading model parameters from %s", model_path)

        return get_lm_state_dict(torch.load(model_path))

    idim, odim, args = get_model_conf(model_path, conf_path)

    logging.warning("reading model parameters from " + model_path)

    if hasattr(args, "model_module"):
        model_module = args.model_module
    else:
        model_module = "espnet.nets.pytorch_backend.e2e_asr:E2E"

    model_class = dynamic_import(model_module)
    model = model_class(idim, odim, args)
    torch_load(model_path, model)
    assert (
        isinstance(model, MTInterface)
        or isinstance(model, ASRInterface)
        or isinstance(model, TTSInterface)
    )

    return model.state_dict()


def load_trained_modules(idim, odim, args, interface=ASRInterface):
    """Load model encoder or/and decoder modules with ESPNET pre-trained model(s).

    A fresh model is built from ``args``; then, for ``args.enc_init`` and
    ``args.dec_init`` in turn, the parameters selected by
    ``args.enc_init_mods`` / ``args.dec_init_mods`` are copied from the
    pre-trained file into the new model's state dict when their keys and
    shapes match.

    Args:
        idim (int): initial input dimension.
        odim (int): initial output dimension.
        args (Namespace): The initial model arguments.
        interface (Interface): ASRInterface or STInterface or TTSInterface.

    Return:
        model (torch.nn.Module): The model with pretrained modules.

    """

    def print_new_keys(state_dict, modules, model_path):
        logging.warning("loading %s from model: %s", modules, model_path)

        for k in state_dict.keys():
            logging.warning("override %s" % k)

    enc_model_path = args.enc_init
    dec_model_path = args.dec_init
    enc_modules = args.enc_init_mods
    dec_modules = args.dec_init_mods

    model_class = dynamic_import(args.model_module)
    main_model = model_class(idim, odim, args)
    assert isinstance(main_model, interface)

    main_state_dict = main_model.state_dict()

    logging.warning("model(s) found for pre-initialization")
    for model_path, modules in [
        (enc_model_path, enc_modules),
        (dec_model_path, dec_modules),
    ]:
        if model_path is None:
            continue
        if not os.path.isfile(model_path):
            logging.warning("model was not found : %s", model_path)
            continue

        model_state_dict = get_trained_model_state_dict(model_path)

        # Keep only the requested prefixes that exist in the trained model.
        modules = filter_modules(model_state_dict, modules)

        partial_state_dict = get_partial_state_dict(model_state_dict, modules)
        if not partial_state_dict:
            continue

        if transfer_verification(main_state_dict, partial_state_dict, modules):
            print_new_keys(partial_state_dict, modules, model_path)
            main_state_dict.update(partial_state_dict)
        else:
            logging.warning(
                f"modules {modules} in model {model_path} "
                f"don't match your training config",
            )

    main_model.load_state_dict(main_state_dict)

    return main_model
class CustomConverter(object):
    """Custom batch converter for Pytorch (multi-speaker targets).

    Args:
        subsampling_factor (int): The subsampling factor.
        dtype (torch.dtype): Data type to convert.
        num_spkrs (int): Number of trailing batch entries taken as targets.

    """

    def __init__(self, subsampling_factor=1, dtype=torch.float32, num_spkrs=2):
        """Store the conversion settings."""
        self.num_spkrs = num_spkrs
        self.dtype = dtype
        self.ignore_id = -1  # padding value used for the targets
        self.subsampling_factor = subsampling_factor

    def __call__(self, batch, device=torch.device("cpu")):
        """Pad one loaded batch and move it to a device.

        Args:
            batch (list(tuple(str, dict[str, dict[str, Any]]))): The batch to
                transform; it must contain exactly one element.
            device (torch.device): The device to send to.

        Returns:
            tuple: ``(xs_pad, ilens, ys_pad)``. ``xs_pad`` is a tensor, or a
                dict with ``"real"`` and ``"imag"`` tensors for complex input.

        """
        # batch should be located in list
        assert len(batch) == 1
        sample = batch[0]
        xs = sample[0]
        ys = sample[-self.num_spkrs :]

        # perform subsampling
        step = self.subsampling_factor
        if step > 1:
            xs = [x[::step, :] for x in xs]

        # lengths of the input sequences, taken before padding
        lengths = np.array([x.shape[0] for x in xs])

        def padded_input(arrays):
            # Zero-pad the arrays and cast them to the configured dtype.
            tensors = [torch.from_numpy(a).float() for a in arrays]
            return pad_list(tensors, 0).to(device, dtype=self.dtype)

        if xs[0].dtype.kind == "c":
            # Note(kamo):
            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it to E2E here
            # because torch.nn.DataParallel can't handle it.
            xs_pad = {
                "real": padded_input([x.real for x in xs]),
                "imag": padded_input([x.imag for x in xs]),
            }
        else:
            xs_pad = padded_input(xs)

        ilens = torch.from_numpy(lengths).to(device)

        if isinstance(ys[0], np.ndarray):
            targets = [torch.from_numpy(y).long() for y in ys]
            ys_pad = pad_list(targets, self.ignore_id).to(device)
        else:
            # One list of targets per speaker: pad them all together, then
            # regroup as (B, num_spkrs, Tmax).
            targets = [
                torch.from_numpy(y).long() for spk_ys in ys for y in spk_ys
            ]
            flat = pad_list(targets, self.ignore_id)
            grouped = flat.view(self.num_spkrs, -1, flat.size(1))
            ys_pad = grouped.transpose(0, 1).to(device)

        return xs_pad, ilens, ys_pad
def train(args):
    """Train with the given args.

    Builds the model named by ``args.model_module``, writes ``model.json``
    into ``args.outdir``, creates optimizer, data iterators and a Chainer
    ``Trainer`` with its reporting/snapshot extensions, and runs training.

    Args:
        args (namespace): The program arguments.

    Raises:
        NotImplementedError: If ``args.opt`` is not one of ``"adadelta"``,
            ``"adam"`` or ``"noam"``.
        ImportError: If an apex opt level is requested via
            ``args.train_dtype`` but apex cannot be imported.

    """
    set_deterministic_pytorch(args)

    # check cuda availability
    if not torch.cuda.is_available():
        logging.warning("cuda is not available")

    # get input and output dimension info
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]
    utts = list(valid_json.keys())
    idim = int(valid_json[utts[0]]["input"][0]["shape"][-1])
    odim = int(valid_json[utts[0]]["output"][0]["shape"][-1])
    logging.info("#input dims : " + str(idim))
    logging.info("#output dims: " + str(odim))

    # specify attention, CTC, hybrid mode
    if args.mtlalpha == 1.0:
        mtl_mode = "ctc"
        logging.info("Pure CTC mode")
    elif args.mtlalpha == 0.0:
        mtl_mode = "att"
        logging.info("Pure attention mode")
    else:
        mtl_mode = "mtl"
        logging.info("Multitask learning mode")

    # specify model architecture
    model_class = dynamic_import(args.model_module)
    model = model_class(idim, odim, args)
    assert isinstance(model, ASRInterface)
    subsampling_factor = model.subsample[0]

    if args.rnnlm is not None:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        # Use the project's torch_load helper, as recog() in this file does.
        # torch.load(path, rnnlm) would treat rnnlm as map_location and drop
        # the loaded object, leaving the LM with its initial weights.
        torch_load(args.rnnlm, rnnlm)
        model.rnnlm = rnnlm

    # write model config
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(
                (idim, odim, vars(args)), indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
    for key in sorted(vars(args).keys()):
        logging.info("ARGS: " + key + ": " + str(vars(args)[key]))

    reporter = model.reporter

    # check the use of multi-gpu
    if args.ngpu > 1:
        if args.batch_size != 0:
            logging.warning(
                "batch size is automatically increased (%d -> %d)"
                % (args.batch_size, args.batch_size * args.ngpu)
            )
            args.batch_size *= args.ngpu

    # set torch device
    device = torch.device("cuda" if args.ngpu > 0 else "cpu")
    if args.train_dtype in ("float16", "float32", "float64"):
        dtype = getattr(torch, args.train_dtype)
    else:
        # apex opt levels ("O0".."O3") keep the model itself in float32
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    logging.warning(
        "num. model params: {:,} (num. trained: {:,} ({:.1f}%))".format(
            sum(p.numel() for p in model.parameters()),
            sum(p.numel() for p in model.parameters() if p.requires_grad),
            sum(p.numel() for p in model.parameters() if p.requires_grad)
            * 100.0
            / sum(p.numel() for p in model.parameters()),
        )
    )

    # Setup an optimizer
    if args.opt == "adadelta":
        optimizer = torch.optim.Adadelta(
            model.parameters(), rho=0.95, eps=args.eps, weight_decay=args.weight_decay
        )
    elif args.opt == "adam":
        optimizer = torch.optim.Adam(model.parameters(), weight_decay=args.weight_decay)
    elif args.opt == "noam":
        from espnet.nets.pytorch_backend.transformer.optimizer import get_std_opt

        optimizer = get_std_opt(
            model.parameters(),
            args.adim,
            args.transformer_warmup_steps,
            args.transformer_lr,
        )
    else:
        raise NotImplementedError("unknown optimizer: " + args.opt)

    # setup apex.amp
    if args.train_dtype in ("O0", "O1", "O2", "O3"):
        try:
            from apex import amp
        except ImportError as e:
            logging.error(
                f"You need to install apex for --train-dtype {args.train_dtype}. "
                "See https://github.com/NVIDIA/apex#linux"
            )
            raise e
        if args.opt == "noam":
            # the noam wrapper keeps the torch optimizer in `.optimizer`
            model, optimizer.optimizer = amp.initialize(
                model, optimizer.optimizer, opt_level=args.train_dtype
            )
        else:
            model, optimizer = amp.initialize(
                model, optimizer, opt_level=args.train_dtype
            )
        use_apex = True
    else:
        use_apex = False

    # FIXME: TOO DIRTY HACK
    setattr(optimizer, "target", reporter)
    setattr(optimizer, "serialize", lambda s: reporter.serialize(s))

    # Setup a converter
    converter = CustomConverter(
        subsampling_factor=subsampling_factor, dtype=dtype, num_spkrs=args.num_spkrs
    )

    # read json data
    with open(args.train_json, "rb") as f:
        train_json = json.load(f)["utts"]
    with open(args.valid_json, "rb") as f:
        valid_json = json.load(f)["utts"]

    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # make minibatch list (variable length)
    train = make_batchset(
        train_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        shortest_first=use_sortagrad,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=-1,
    )
    valid = make_batchset(
        valid_json,
        args.batch_size,
        args.maxlen_in,
        args.maxlen_out,
        args.minibatches,
        min_batch_size=args.ngpu if args.ngpu > 1 else 1,
        count=args.batch_count,
        batch_bins=args.batch_bins,
        batch_frames_in=args.batch_frames_in,
        batch_frames_out=args.batch_frames_out,
        batch_frames_inout=args.batch_frames_inout,
        iaxis=0,
        oaxis=-1,
    )

    load_tr = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": True},  # Switch the mode of preprocessing
    )
    load_cv = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        preprocess_conf=args.preprocess_conf,
        preprocess_args={"train": False},  # Switch the mode of preprocessing
    )
    # hack to make batchsize argument as 1
    # actual batchsize is included in a list
    # default collate function converts numpy array to pytorch tensor
    # we used an empty collate function instead which returns list
    train_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(train, lambda data: converter([load_tr(data)])),
            batch_size=1,
            num_workers=args.n_iter_processes,
            shuffle=True,
            collate_fn=lambda x: x[0],
        )
    }
    valid_iter = {
        "main": ChainerDataLoader(
            dataset=TransformDataset(valid, lambda data: converter([load_cv(data)])),
            batch_size=1,
            shuffle=False,
            collate_fn=lambda x: x[0],
            num_workers=args.n_iter_processes,
        )
    }

    # Set up a trainer
    updater = CustomUpdater(
        model,
        args.grad_clip,
        train_iter,
        optimizer,
        device,
        args.ngpu,
        args.grad_noise,
        args.accum_grad,
        use_apex=use_apex,
    )
    trainer = training.Trainer(updater, (args.epochs, "epoch"), out=args.outdir)

    if use_sortagrad:
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epochs, "epoch"),
        )

    # Resume from a snapshot
    if args.resume:
        logging.info("resumed from %s" % args.resume)
        torch_resume(args.resume, trainer)

    # Evaluate the model with the test dataset for each epoch
    trainer.extend(CustomEvaluator(model, valid_iter, reporter, device, args.ngpu))

    # Save attention weight each epoch
    if args.num_save_attention > 0 and args.mtlalpha != 1.0:
        data = sorted(
            list(valid_json.items())[: args.num_save_attention],
            key=lambda x: int(x[1]["input"][0]["shape"][1]),
            reverse=True,
        )
        if hasattr(model, "module"):
            att_vis_fn = model.module.calculate_all_attentions
            plot_class = model.module.attention_plot_class
        else:
            att_vis_fn = model.calculate_all_attentions
            plot_class = model.attention_plot_class
        att_reporter = plot_class(
            att_vis_fn,
            data,
            args.outdir + "/att_ws",
            converter=converter,
            transform=load_cv,
            device=device,
        )
        trainer.extend(att_reporter, trigger=(1, "epoch"))
    else:
        att_reporter = None

    # Make a plot for training and validation values
    trainer.extend(
        extensions.PlotReport(
            [
                "main/loss",
                "validation/main/loss",
                "main/loss_ctc",
                "validation/main/loss_ctc",
                "main/loss_att",
                "validation/main/loss_att",
            ],
            "epoch",
            file_name="loss.png",
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/acc", "validation/main/acc"], "epoch", file_name="acc.png"
        )
    )
    trainer.extend(
        extensions.PlotReport(
            ["main/cer_ctc", "validation/main/cer_ctc"], "epoch", file_name="cer.png"
        )
    )

    # Save best models
    trainer.extend(
        snapshot_object(model, "model.loss.best"),
        trigger=training.triggers.MinValueTrigger("validation/main/loss"),
    )
    if mtl_mode != "ctc":
        trainer.extend(
            snapshot_object(model, "model.acc.best"),
            trigger=training.triggers.MaxValueTrigger("validation/main/acc"),
        )

    # save snapshot which contains model and optimizer states
    trainer.extend(torch_snapshot(), trigger=(1, "epoch"))

    # epsilon decay in the optimizer
    if args.opt == "adadelta":
        if args.criterion == "acc" and mtl_mode != "ctc":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.acc.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/acc",
                    lambda best_value, current_value: best_value > current_value,
                ),
            )
        elif args.criterion == "loss":
            trainer.extend(
                restore_snapshot(
                    model, args.outdir + "/model.loss.best", load_fn=torch_load
                ),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )
            trainer.extend(
                adadelta_eps_decay(args.eps_decay),
                trigger=CompareValueTrigger(
                    "validation/main/loss",
                    lambda best_value, current_value: best_value < current_value,
                ),
            )

    # Write a log of evaluation statistics for each epoch
    trainer.extend(
        extensions.LogReport(trigger=(args.report_interval_iters, "iteration"))
    )
    report_keys = [
        "epoch",
        "iteration",
        "main/loss",
        "main/loss_ctc",
        "main/loss_att",
        "validation/main/loss",
        "validation/main/loss_ctc",
        "validation/main/loss_att",
        "main/acc",
        "validation/main/acc",
        "main/cer_ctc",
        "validation/main/cer_ctc",
        "elapsed_time",
    ]
    if args.opt == "adadelta":
        trainer.extend(
            extensions.observe_value(
                "eps",
                lambda trainer: trainer.updater.get_optimizer("main").param_groups[0][
                    "eps"
                ],
            ),
            trigger=(args.report_interval_iters, "iteration"),
        )
        report_keys.append("eps")
    if args.report_cer:
        report_keys.append("validation/main/cer")
    if args.report_wer:
        report_keys.append("validation/main/wer")
    trainer.extend(
        extensions.PrintReport(report_keys),
        trigger=(args.report_interval_iters, "iteration"),
    )

    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    set_early_stop(trainer, args)

    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        trainer.extend(
            TensorboardLogger(SummaryWriter(args.tensorboard_dir), att_reporter),
            trigger=(args.report_interval_iters, "iteration"),
        )
    # Run the training
    trainer.run()
    check_early_stop(trainer, args.epochs)
def recog(args):
    """Decode with the given args.

    Loads the trained model from ``args.model``, optionally builds a
    character-level and/or word-level RNNLM, decodes every utterance of
    ``args.recog_json`` and writes the results to ``args.result_label`` as
    UTF-8 encoded JSON.

    Args:
        args (namespace): The program arguments.

    Raises:
        ValueError: If the RNNLM configuration names a non-default
            ``model_module``.

    """
    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.recog_args = args

    # read rnnlm
    if args.rnnlm:
        rnnlm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        if getattr(rnnlm_args, "model_module", "default") != "default":
            raise ValueError(
                "use '--api v2' option to decode with non-default language model"
            )
        rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(
                len(train_args.char_list),
                rnnlm_args.layer,
                rnnlm_args.unit,
                getattr(rnnlm_args, "embed_unit", None),  # for backward compatibility
            )
        )
        torch_load(args.rnnlm, rnnlm)
        rnnlm.eval()
    else:
        rnnlm = None

    if args.word_rnnlm:
        rnnlm_args = get_model_conf(args.word_rnnlm, args.word_rnnlm_conf)
        word_dict = rnnlm_args.char_list_dict
        char_dict = {x: i for i, x in enumerate(train_args.char_list)}
        word_rnnlm = lm_pytorch.ClassifierWithState(
            lm_pytorch.RNNLM(len(word_dict), rnnlm_args.layer, rnnlm_args.unit)
        )
        torch_load(args.word_rnnlm, word_rnnlm)
        word_rnnlm.eval()

        # With both LMs available they are combined into a MultiLevelLM;
        # with only the word LM a LookAheadWordLM is used instead.
        if rnnlm is not None:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.MultiLevelLM(
                    word_rnnlm.predictor, rnnlm.predictor, word_dict, char_dict
                )
            )
        else:
            rnnlm = lm_pytorch.ClassifierWithState(
                extlm_pytorch.LookAheadWordLM(
                    word_rnnlm.predictor, word_dict, char_dict
                )
            )

    # gpu
    if args.ngpu == 1:
        gpu_id = list(range(args.ngpu))
        logging.info("gpu id: " + str(gpu_id))
        model.cuda()
        if rnnlm:
            rnnlm.cuda()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    # A preprocessing config given on the command line wins over the one
    # stored with the trained model.
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.batchsize == 0:
        # utterance-by-utterance decoding
        with torch.no_grad():
            for idx, name in enumerate(js.keys(), 1):
                logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
                batch = [(name, js[name])]
                feat = load_inputs_and_targets(batch)[0][0]
                nbest_hyps = model.recognize(feat, args, train_args.char_list, rnnlm)
                new_js[name] = add_results_to_json(
                    js[name], nbest_hyps, train_args.char_list
                )

    else:

        def grouper(n, iterable, fillvalue=None):
            # Yield tuples of n items; the last tuple is padded with fillvalue.
            kargs = [iter(iterable)] * n
            return zip_longest(*kargs, fillvalue=fillvalue)

        # sort data if batchsize > 1 (longest input first)
        keys = list(js.keys())
        if args.batchsize > 1:
            feat_lens = [js[key]["input"][0]["shape"][0] for key in keys]
            sorted_index = sorted(range(len(feat_lens)), key=lambda i: -feat_lens[i])
            keys = [keys[i] for i in sorted_index]

        with torch.no_grad():
            for names in grouper(args.batchsize, keys, None):
                # drop the None padding grouper adds to the last group
                names = [name for name in names if name]
                batch = [(name, js[name]) for name in names]
                feats = load_inputs_and_targets(batch)[0]
                nbest_hyps = model.recognize_batch(
                    feats, args, train_args.char_list, rnnlm=rnnlm
                )

                for i, name in enumerate(names):
                    # pick the i-th entry of every element of nbest_hyps
                    # (presumably one element per speaker - TODO confirm)
                    nbest_hyp = [hyp[i] for hyp in nbest_hyps]
                    new_js[name] = add_results_to_json(
                        js[name], nbest_hyp, train_args.char_list
                    )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
def recog_v2(args):
    """Decode with custom models that implements ScorerInterface.

    Builds the optional scorers (LM, n-gram, MMI, CTC prefix, word n-gram),
    runs beam search on every utterance of ``args.recog_json`` and writes
    the n-best results to ``args.result_label`` as UTF-8 encoded JSON.

    Notes:
        The previous backend espnet.asr.pytorch_backend.asr.recog
        only supports E2E and RNNLM

    Args:
        args (namespace): The program arguments.
        See py:func:`espnet.bin.asr_recog.get_parser` for details

    Raises:
        NotImplementedError: For ``args.batchsize > 1``, a streaming mode,
            a word RNNLM, or ``args.ngpu > 1``.

    """
    logging.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError("multi-utt batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    if args.ngpu == 1:
        device = torch.device("cuda")
    else:
        # So the cuda is not available now
        device = torch.device("cpu")
        # Hide all GPUs from this process and check that torch agrees.
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        assert torch.cuda.is_available() == False
    print(f"Rank: {args.local_rank} Using device: {device}, ngpu: {args.ngpu}")

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    model.eval()

    # A preprocessing config given on the command line wins over the one
    # stored with the trained model.
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(train_args.char_list), lm_args)
        torch_load(args.rnnlm, lm)
        lm.eval()
    else:
        lm = None

    if args.ngram_model and args.ngram_weight > 0.0:
        from espnet.nets.scorers.ngram import NgramFullScorer
        from espnet.nets.scorers.ngram import NgramPartScorer

        if args.ngram_scorer == "full":
            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
        else:
            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
    else:
        ngram = None

    # load mmi_scorer
    if args.mmi_weight > 0.0:
        # Also make sure it is K2MMI
        assert hasattr(model.ctc, "dump_weight")
        # Dump a pth for each rank to avoid conflicts when reading / writing
        weight_path = os.path.dirname(args.result_label) + "/dump"
        os.makedirs(weight_path, exist_ok=True)
        model.ctc.dump_weight(args.local_rank, weight_path)

        mmi_scorer = MMIFrameScorer
        mmi = mmi_scorer(
            lang=model.ctc.lang,
            device=device,
            idim=train_args.adim,
            sos_id=model.sos,
            rank=args.local_rank,
            use_segment=args.use_segment,
            char_list=train_args.char_list,
            weight_path=weight_path,
        )
    else:
        mmi = None

    if args.mmi_rescore:
        # Same per-rank weight dump as above; rescoring is only allowed
        # when the frame-level MMI scorer is disabled.
        weight_path = os.path.dirname(args.result_label) + "/dump"
        os.makedirs(weight_path, exist_ok=True)
        model.ctc.dump_weight(args.local_rank, weight_path)
        assert args.mmi_weight <= 0.0
        mmi_rescorer = MMIRescorer(
            lang=model.ctc.lang,
            device=device,
            idim=train_args.adim,
            sos_id=model.sos,
            rank=args.local_rank,
            use_segment=args.use_segment,
            char_list=train_args.char_list,
            weight_path=weight_path,
        )
    else:
        mmi_rescorer = None

    if args.ctc_weight > 0.0:
        # prefer model.third_loss as the CTC module when the model has one
        ctc_module = model.third_loss if hasattr(model, "third_loss") else model.ctc
        ctc = CTCPrefixScorer(ctc_module, model.eos)
    else:
        ctc = None

    if args.word_ngram_weight > 0.0:
        # NOTE(review): this first assignment is overwritten two lines below.
        word_ngram_scorer = WordNgramPartialScorer
        print(f"Using word ngram model: {args.word_ngram}", flush=True)
        word_ngram_scorer = WordNgramPartialScorer(
            args.word_ngram,
            device,
            train_args.char_list,
            log_semiring=args.word_ngram_log_semiring,
        )
    else:
        word_ngram_scorer = None

    # Start from the model's own scorers and add the optional ones; disabled
    # scorers are stored as None.
    scorers = model.scorers()
    scorers["ctc"] = ctc
    scorers["mmi"] = mmi
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
    scorers["word_ngram"] = word_ngram_scorer
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
        length_bonus=args.penalty,
        mmi=args.mmi_weight,
        word_ngram=args.word_ngram_weight,
    )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(train_args.char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=train_args.char_list,
        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
        mmi_rescorer=mmi_rescorer,
    )
    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
            k
            for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            # every full scorer supports batching: switch implementation
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(
                f"As non-batch scorers {non_batch} are found, "
                f"fall back to non-batch implementation."
            )

    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()
    # beam_search.to(device=device, dtype=dtype).eval()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]
    new_js = {}

    # tik()/tok() bracket the decoding loop (presumably timing it for a
    # real-time-factor report - TODO confirm against RTF_calculator)
    rtf_calculator = RTF_calculator(js)
    rtf_calculator.tik()
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype))
            nbest_hyps = beam_search(
                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
            )
            # keep at most args.nbest hypotheses, converted to plain dicts
            nbest_hyps = [
                h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
            ]
            new_js[name] = add_results_to_json(
                js[name], nbest_hyps, train_args.char_list
            )
    rtf_calculator.tok()

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command-line parser of the CTC alignment script.

    Returns:
        The configured ``configargparse.ArgumentParser``. All options of the
        "ctc-segmentation related" group default to ``None``, which
        ``ctc_align`` interprets as "keep the library default".
    """
    parser = configargparse.ArgumentParser(
        # The two literals are joined by implicit concatenation; the trailing
        # space keeps the rendered description readable.
        description="Align text to audio using CTC segmentation "
        "and a pre-trained speech recognition model.",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Decoding config file path.")
    parser.add_argument(
        "--ngpu", type=int, default=0, help="Number of GPUs (max. 1 is supported)"
    )
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="pytorch",
        choices=["pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--data-json", type=str, help="Json of recognition data for audio and text"
    )
    parser.add_argument("--utt-text", type=str, help="Text separated into utterances")
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # ctc-segmentation related
    parser.add_argument(
        "--subsampling-factor",
        type=int,
        default=None,
        help="Subsampling factor."
        " If the encoder sub-samples its input, the number of frames at the CTC layer"
        " is reduced by this factor. For example, a BLSTMP with subsampling 1_2_2_1_1"
        " has a subsampling factor of 4.",
    )
    parser.add_argument(
        "--frame-duration",
        type=int,
        default=None,
        help="Non-overlapping duration of a single frame in milliseconds.",
    )
    parser.add_argument(
        "--min-window-size",
        type=int,
        default=None,
        help="Minimum window size considered for utterance.",
    )
    parser.add_argument(
        "--max-window-size",
        type=int,
        default=None,
        help="Maximum window size considered for utterance.",
    )
    parser.add_argument(
        "--use-dict-blank",
        type=int,
        default=None,
        help="DEPRECATED.",
    )
    parser.add_argument(
        "--set-blank",
        type=int,
        default=None,
        help="Index of model dictionary for blank token (default: 0).",
    )
    parser.add_argument(
        "--gratis-blank",
        type=int,
        default=None,
        help="Set the transition cost of the blank token to zero. Audio sections"
        " labeled with blank tokens can then be skipped without penalty. Useful"
        " if there are unrelated audio segments between utterances.",
    )
    parser.add_argument(
        "--replace-spaces-with-blanks",
        type=int,
        default=None,
        help="Fill blanks in between words to better model pauses between words."
        " Segments can be misaligned if this option is combined with --gratis-blank."
        " May increase length of ground truth.",
    )
    parser.add_argument(
        "--scoring-length",
        type=int,
        default=None,
        help="Changes partitioning length L for calculation of the confidence score.",
    )
    parser.add_argument(
        "--output",
        type=configargparse.FileType("w"),
        required=True,
        help="Output segments file",
    )
    return parser


def main(args):
    """Run the main decoding function.

    Parses ``args`` (unknown options are tolerated), configures logging from
    ``--verbose``, selects the inference device from ``--ngpu`` and runs
    ``ctc_align``. The process always terminates through ``sys.exit``.

    Args:
        args: list of command-line argument strings (without the program name).
    """
    parser = get_parser()
    # Unknown options are accepted on purpose (parse_known_args); they are unused.
    args, _ = parser.parse_known_args(args)

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO, format=log_format)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    if args.ngpu == 0 and args.dtype == "float16":
        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")

    # check CUDA_VISIBLE_DEVICES
    device = "cpu"
    if args.ngpu == 1:
        device = "cuda"
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    elif args.ngpu > 1:
        logging.error("Decoding only supports ngpu=1.")
        sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # recog
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        ctc_align(args, device)
    else:
        raise ValueError("Only pytorch is supported.")
    sys.exit(0)


def _read_utterance_texts(utt_text_path, audio_names):
    """Group the lines of the utterance text file by audio name.

    Lines are consumed sequentially: for every audio name (in the given order)
    all directly following lines that start with that name are assigned to it.
    Each line is split at its first space into segment name and text.
    """
    with open(utt_text_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
    text = {}
    segment_names = {}
    cursor = 0
    for name in audio_names:
        text_per_audio = []
        segment_names_per_audio = []
        # NOTE(review): plain prefix matching; an audio name that is a prefix
        # of the following audio name would also absorb that one's lines.
        while cursor < len(lines) and lines[cursor].startswith(name):
            line = lines[cursor]
            sep = line.find(" ")
            text_per_audio.append(line[sep + 1 :])
            segment_names_per_audio.append(line[:sep])
            cursor += 1
        text[name] = text_per_audio
        segment_names[name] = segment_names_per_audio
    if cursor < len(lines):
        # Previously these lines were dropped without any notice.
        logging.warning(
            "%d line(s) of %s could not be assigned to an audio file and are"
            " ignored (first unmatched line: %d).",
            len(lines) - cursor,
            utt_text_path,
            cursor + 1,
        )
    return text, segment_names


def _build_segmentation_config(args, char_list):
    """Create the CtcSegmentationParameters, overriding only the given options."""
    config = CtcSegmentationParameters()
    if args.subsampling_factor is not None:
        config.subsampling_factor = args.subsampling_factor
    if args.frame_duration is not None:
        config.frame_duration_ms = args.frame_duration
    if args.min_window_size is not None:
        config.min_window_size = args.min_window_size
    if args.max_window_size is not None:
        config.max_window_size = args.max_window_size
    config.char_list = char_list
    if args.use_dict_blank is not None:
        logging.warning(
            "The option --use-dict-blank is deprecated. If needed,"
            " use --set-blank instead."
        )
    if args.set_blank is not None:
        config.blank = args.set_blank
    if args.replace_spaces_with_blanks is not None:
        config.replace_spaces_with_blanks = bool(args.replace_spaces_with_blanks)
    if args.gratis_blank:
        config.blank_transition_cost_zero = True
    if config.blank_transition_cost_zero and args.replace_spaces_with_blanks:
        logging.error(
            "Blanks are inserted between words, and also the transition cost of blank"
            " is zero. This configuration may lead to misalignments!"
        )
    if args.scoring_length is not None:
        config.score_min_mean_over_L = args.scoring_length
    logging.info(
        f"Frame timings: {config.frame_duration_ms}ms * {config.subsampling_factor}"
    )
    return config


def ctc_align(args, device):
    """ESPnet-specific interface for CTC segmentation.

    Parses configuration, infers the CTC posterior probabilities,
    and then aligns start and end of utterances using CTC segmentation.
    Results are written to the output file given in the args, one line per
    utterance: segment name, audio name, the first two boundary values with
    two decimals and the third (confidence score) with nine decimals.

    :param args: given configuration
    :param device: for inference; one of ['cuda', 'cpu']
    :return:  0 on success
    :raises ValueError: if ``--data-json`` or ``--utt-text`` was not given
    """
    # Fail fast: both inputs are optional for the parser but mandatory here,
    # and loading the model first would only delay an opaque open(None) error.
    if args.data_json is None or args.utt_text is None:
        raise ValueError("Both --data-json and --utt-text are required.")

    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )
    logging.info(f"Decoding device={device}")
    # Warn for nets with high memory consumption on long audio files
    if hasattr(model, "enc"):
        encoder_module = model.enc.__class__.__module__
    elif hasattr(model, "encoder"):
        encoder_module = model.encoder.__class__.__module__
    else:
        encoder_module = "Unknown"
    logging.info(f"Encoder module: {encoder_module}")
    logging.info(f"CTC module:     {model.ctc.__class__.__module__}")
    if "rnn" not in encoder_module:
        logging.warning("No BLSTM model detected; memory consumption may be high.")
    model.to(device=device).eval()

    # read audio and text json data
    with open(args.data_json, "rb") as f:
        js = json.load(f)["utts"]
    text, segment_names = _read_utterance_texts(args.utt_text, js.keys())

    # apply configuration
    config = _build_segmentation_config(args, train_args.char_list)

    # Iterate over audio files to decode and align
    num_audio = len(js)
    for idx, name in enumerate(js.keys(), 1):
        # The name is passed as a logging argument so that a '%' inside it
        # cannot corrupt the format string.
        logging.info("(%d/%d) Aligning %s", idx, num_audio, name)
        batch = [(name, js[name])]
        feat, label = load_inputs_and_targets(batch)
        feat = feat[0]
        with torch.no_grad():
            # Encode input frames
            enc_output = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
            # Apply ctc layer to obtain log character probabilities
            lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
        # Prepare the text for aligning
        ground_truth_mat, utt_begin_indices = prepare_text(config, text[name])
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat
        )
        logging.debug(f"state_list = {state_list}")
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(
            config, utt_begin_indices, char_probs, timings, text[name]
        )
        # Write to "segments" file
        for seg_idx, boundary in enumerate(segments):
            utt_segment = (
                f"{segment_names[name][seg_idx]} {name} {boundary[0]:.2f}"
                f" {boundary[1]:.2f} {boundary[2]:.9f}\n"
            )
            args.output.write(utt_segment)
    return 0


if __name__ == "__main__":
    main(sys.argv[1:])
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command-line parser of the speech enhancement script.

    Returns:
        The configured ``configargparse.ArgumentParser``. Up to three YAML
        config files can be given; later ones overwrite earlier ones.
    """
    parser = configargparse.ArgumentParser(
        description="Enhance noisy speech for speech recognition",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        default=1,
        type=int,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)"
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )

    # Outputs configuration
    parser.add_argument(
        "--enh-wspecifier",
        type=str,
        default=None,
        # Trailing space added: the two literals used to run together.
        help="Specify the output way for enhanced speech. "
        "e.g. ark,scp:outdir,wav.scp",
    )
    parser.add_argument(
        "--enh-filetype",
        type=str,
        default="sound",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for enhanced speech. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument("--fs", type=int, default=16000, help="The sample frequency")
    parser.add_argument(
        "--keep-length",
        type=strtobool,
        default=True,
        help="Adjust the output length to match " "with the input for enhanced speech",
    )
    parser.add_argument(
        "--image-dir", type=str, default=None, help="The directory saving the images."
    )
    parser.add_argument(
        "--num-images",
        type=int,
        default=20,
        help="The number of images files to be saved. "
        "If negative, all samples are to be saved.",
    )

    # IStft
    parser.add_argument(
        "--apply-istft",
        type=strtobool,
        default=True,
        help="Apply istft to the output from the network",
    )
    parser.add_argument(
        "--istft-win-length",
        type=int,
        default=512,
        help="The window length for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    parser.add_argument(
        "--istft-n-shift",
        # Was ``type=str`` with an int default: a value passed on the command
        # line arrived as a string while the default was an int.
        type=int,
        default=256,
        help="The shift length for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    parser.add_argument(
        "--istft-window",
        type=str,
        default="hann",
        help="The window type for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    return parser


def main(args):
    """Parse the arguments, set up logging and seeds, and run the enhancement.

    Args:
        args: list of command-line argument strings (without the program name).

    Raises:
        ValueError: if a backend other than "pytorch" is selected.
    """
    parser = get_parser()
    args = parser.parse_args(args)

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO, format=log_format)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif args.ngpu != len(cvd.split(",")):
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(kamo): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # recog
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        enhance(args)
    else:
        # "chainer" is an accepted choice (and the default) but not implemented.
        raise ValueError("Only pytorch is supported.")


if __name__ == "__main__":
    main(sys.argv[1:])
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get default arguments.

    Returns:
        The configured ``configargparse.ArgumentParser`` of the decoding
        script. Up to three YAML config files can be given; later ones
        overwrite earlier ones.
    """
    parser = configargparse.ArgumentParser(
        description="Transcribe text from speech using "
        "a speech recognition model on one CPU or GPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="Second config file path that overwrites the settings in `--config`",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="Third config file path that overwrites the settings "
        "in `--config` and `--config2`",
    )

    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="chainer",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        type=int,
        default=1,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--api",
        default="v1",
        choices=["v1", "v2"],
        help="Beam search APIs "
        "v1: Default API. It only supports the ASRInterface.recognize method "
        "and DefaultRNNLM. "
        "v2: Experimental API. It supports any models that implements ScorerInterface.",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)"
    )
    parser.add_argument(
        "--result-label",
        type=str,
        required=True,
        help="Filename of result label data (json)",
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    parser.add_argument(
        "--num-spkrs",
        type=int,
        default=1,
        choices=[1, 2],
        help="Number of speakers in the speech",
    )
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # search related
    parser.add_argument("--nbest", type=int, default=10, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
    parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        type=float,
        default=0.0,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
    )
    parser.add_argument(
        "--weights-ctc-dec",
        type=float,
        action="append",
        help="ctc weight assigned to each encoder during decoding."
        "[in multi-encoder mode only]",
    )
    parser.add_argument(
        "--ctc-window-margin",
        type=int,
        default=0,
        help="""Use CTC window with margin parameter to accelerate
                        CTC/attention decoding especially on GPU. Smaller magin
                        makes decoding faster, but may increase search errors.
                        If margin=0 (default), this function is disabled""",
    )
    # transducer related
    parser.add_argument(
        "--search-type",
        type=str,
        default="alsd",
        choices=["default", "nsc", "tsd", "alsd", "ctc_greedy", "ctc_beam"],
        help="""Type of beam search implementation to use during inference.
        Can be either: default beam search, n-step constrained beam search ("nsc"),
        time-synchronous decoding ("tsd") or alignment-length synchronous decoding
        ("alsd").
        Additional associated parameters: "nstep" + "prefix-alpha" (for nsc),
        "max-sym-exp" (for tsd) and "u-max" (for alsd)""",
    )
    parser.add_argument(
        "--nstep",
        type=int,
        default=1,
        help="Number of expansion steps allowed in NSC beam search.",
    )
    parser.add_argument(
        "--prefix-alpha",
        type=int,
        default=2,
        help="Length prefix difference allowed in NSC beam search.",
    )
    parser.add_argument(
        "--max-sym-exp",
        type=int,
        default=2,
        help="Number of symbol expansions allowed in TSD decoding.",
    )
    parser.add_argument(
        "--u-max",
        type=int,
        default=400,
        help="Length prefix difference allowed in ALSD beam search.",
    )
    parser.add_argument(
        "--score-norm",
        type=strtobool,
        nargs="?",
        default=True,
        help="Normalize transducer scores by length",
    )
    # rnnlm related
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument(
        "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
    )
    parser.add_argument(
        "--word-rnnlm-conf",
        type=str,
        default=None,
        help="Word RNNLM model config file to read",
    )
    parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
    parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
    # ngram related
    parser.add_argument(
        "--ngram-model", type=str, default=None, help="ngram model file to read"
    )
    parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
    parser.add_argument(
        "--ngram-scorer",
        type=str,
        default="part",
        choices=("full", "part"),
        help="""if the ngram is set as a part scorer, similar with CTC scorer,
                ngram scorer only scores topK hypethesis.
                if the ngram is set as full scorer, ngram scorer scores all hypthesis
                the decoding speed of part scorer is musch faster than full one""",
    )
    # streaming related
    parser.add_argument(
        "--streaming-mode",
        type=str,
        default=None,
        choices=["window", "segment"],
        help="""Use streaming recognizer for inference.
                        `--batchsize` must be set to 0 to enable this mode""",
    )
    parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
    parser.add_argument(
        "--streaming-min-blank-dur",
        type=int,
        default=10,
        help="Minimum blank duration threshold",
    )
    parser.add_argument(
        "--streaming-onset-margin", type=int, default=1, help="Onset margin"
    )
    parser.add_argument(
        "--streaming-offset-margin", type=int, default=1, help="Offset margin"
    )
    # non-autoregressive related
    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
    parser.add_argument(
        "--maskctc-n-iterations",
        type=int,
        default=10,
        help="Number of decoding iterations."
        "For Mask CTC, set 0 to predict 1 mask/iter.",
    )
    parser.add_argument(
        "--maskctc-probability-threshold",
        type=float,
        default=0.999,
        help="Threshold probability for CTC output",
    )
    parser.add_argument(
        "--k2-decode",
        # ``type=bool`` turns every non-empty string (including "False") into
        # True; use the same converter as the other boolean flags here.
        type=strtobool,
        default=False,
        help="Using K2 decoding",
    )
    parser.add_argument(
        "--local-rank",
        type=int,
        default=-1,
        help="To choose GPU",
    )
    parser.add_argument(
        "--mmi-weight",
        type=float,
        default=0.0,
        help="MMI scorer weight",
    )
    parser.add_argument(
        "--mas-lookahead",
        type=int,
        default=0,
        help="Number of frames to look-ahead in MMI alignment scores",
    )
    parser.add_argument(
        "--use-segment",
        type=strtobool,
        default=False,
        help="If true, the MMI score is parsed by jieba. (Chinese only)",
    )
    parser.add_argument(
        "--mmi-rescore",
        type=strtobool,
        default=False,
        help="Do mmi rescoring after decoding, only for lasctc framework"
    )
    parser.add_argument(
        "--word-ngram",
        type=str,
        default="",
        help="Path to word-level N-gram model lang directory"
    )
    parser.add_argument(
        "--word-ngram-weight",
        type=float,
        default=0.0,
        help="weight of the N-gram model"
    )
    parser.add_argument(
        "--word-ngram-log-semiring",
        type=strtobool,
        default=True,
        help="If true, score the lattice with log-semiring, else tropical semiring"
    )
    parser.add_argument(
        "--word-ngram-lower-char",
        type=strtobool,
        default=True,
        help="If true, all english characters will be converted into lower case. otherwise upper case"
    )
    parser.add_argument(
        "--tlg-scorer",
        type=str,
        default="",
        help="lang directory of lang that save the LG.fst. Only useful for RNNT ALSD decoding"
    )
    parser.add_argument(
        "--tlg-nonblk-reward",
        type=float,
        default=1.5,
        help="Reward whenaver a non-blank token is generated. Used in TLG scorer",
    )
    parser.add_argument(
        "--tlg-weight",
        type=float,
        default=0.0,
        help="weight for TLG scorer in decoding",
    )
    parser.add_argument(
        "--skip-eng",
        type=strtobool,
        default=False,
        help="If true, skip the utterance whose transcription has english alphabet (rnnt only)",
    )
    parser.add_argument(
        "--forbid-eng",
        type=strtobool,
        default=False,
        help="If true, forbid the rnnt model to predict English characters (rnnt only)",
    )
    parser.add_argument(
        "--cs-nt-decode-feature",
        type=str,
        default="combine",
        choices=["combine", "chn", "eng"],
        help="feature used for decoding",
    )
    parser.add_argument(
        "--cs-lang-weight",
        type=float,
        # Was the string "0.0"; a float default matches the declared type.
        default=0.0,
        help="weight of language classification loss",
    )
    parser.add_argument(
        "--eng-vocab",
        type=str,
        default=None,
        help="if apply, the hypothesis is valid only if all english words are in this vocab",
    )
    return parser


def main(args):
    """Run the main decoding function.

    Parses ``args``, configures logging and random seeds, validates the
    GPU / language-model options and dispatches to the ``recog`` function of
    the selected backend (imported lazily, only the one that is needed).

    Args:
        args: list of command-line argument strings (without the program name).

    Raises:
        ValueError: for ``float16`` on CPU or an unsupported backend.
        NotImplementedError: for option combinations that the selected
            ``--api`` does not support.
    """
    parser = get_parser()
    args = parser.parse_args(args)

    if args.ngpu == 0 and args.dtype == "float16":
        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose == 1:
        logging.basicConfig(level=logging.INFO, format=log_format)
    elif args.verbose == 2:
        logging.basicConfig(level=logging.DEBUG, format=log_format)
    else:
        logging.basicConfig(level=logging.WARN, format=log_format)
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif args.ngpu != len(cvd.split(",")):
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # validate rnn options
    if args.rnnlm is not None and args.word_rnnlm is not None:
        logging.error(
            "It seems that both --rnnlm and --word-rnnlm are specified. "
            "Please use either option."
        )
        sys.exit(1)

    # recog
    logging.info("backend = " + args.backend)
    if args.num_spkrs == 1:
        if args.backend == "chainer":
            from espnet.asr.chainer_backend.asr import recog

            recog(args)
        elif args.backend == "pytorch":
            if args.num_encs == 1:
                # Experimental API that supports custom LMs
                if args.api == "v2":
                    from espnet.asr.pytorch_backend.recog import recog_v2

                    recog_v2(args)
                else:
                    from espnet.asr.pytorch_backend.asr import recog

                    if args.dtype != "float32":
                        raise NotImplementedError(
                            f"`--dtype {args.dtype}` is only available with `--api v2`"
                        )
                    recog(args)
            else:
                if args.api == "v2":
                    raise NotImplementedError(
                        f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
                    )
                else:
                    from espnet.asr.pytorch_backend.asr import recog

                    recog(args)
        else:
            raise ValueError("Only chainer and pytorch are supported.")
    elif args.num_spkrs == 2:
        if args.backend == "pytorch":
            from espnet.asr.pytorch_backend.asr_mix import recog

            recog(args)
        else:
            raise ValueError("Only pytorch is supported.")


if __name__ == "__main__":
    # tracemalloc.start(10000)
    main(sys.argv[1:])
    # size, peak = tracemalloc.get_traced_memory()
    # peak /= (1024 ** 2)
    # print(f"Maximum Memory consumed: {peak}MB")
arguments.""" if parser is None: parser = configargparse.ArgumentParser( description="Train an automatic speech recognition (ASR) model on one CPU, " "one or multiple GPUs", config_file_parser_class=configargparse.YAMLConfigFileParser, formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) # general configuration parser.add("--config", is_config_file=True, help="config file path") parser.add( "--config2", is_config_file=True, help="second config file path that overwrites the settings in `--config`.", ) parser.add( "--config3", is_config_file=True, help="third config file path that overwrites the settings in " "`--config` and `--config2`.", ) parser.add_argument( "--ngpu", default=None, type=int, help="Number of GPUs. If not given, use all visible devices", ) parser.add_argument( "--train-dtype", default="float32", choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"], help="Data type for training (only pytorch backend). " "O0,O1,.. flags require apex. " "See https://nvidia.github.io/apex/amp.html#opt-levels", ) parser.add_argument( "--backend", default="chainer", type=str, choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument( "--outdir", type=str, required=required, help="Output directory" ) parser.add_argument("--debugmode", default=1, type=int, help="Debugmode") parser.add_argument("--dict", required=required, help="Dictionary") parser.add_argument("--seed", default=1, type=int, help="Random seed") parser.add_argument("--debugdir", type=str, help="Output directory for debugging") parser.add_argument( "--resume", "-r", default="", nargs="?", help="Resume the training from snapshot", ) parser.add_argument( "--minibatches", "-N", type=int, default="-1", help="Process only N minibatches (for debug)", ) parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--tensorboard-dir", default=None, type=str, nargs="?", help="Tensorboard log dir path", ) parser.add_argument( 
"--report-interval-iters", default=300, type=int, help="Report interval iterations", ) parser.add_argument( "--save-interval-iters", default=0, type=int, help="Save snapshot interval iterations", ) # task related parser.add_argument( "--train-json", type=str, default=None, help="Filename of train label data (json)", ) parser.add_argument( "--valid-json", type=str, default=None, help="Filename of validation label data (json)", ) # network architecture parser.add_argument( "--model-module", type=str, default=None, help="model defined module (default: espnet.nets.xxx_backend.e2e_asr:E2E)", ) # encoder parser.add_argument( "--num-encs", default=1, type=int, help="Number of encoders in the model." ) # loss related parser.add_argument( "--ctc_type", default="warpctc", type=str, choices=["builtin", "warpctc", "gtnctc", "cudnnctc", "k2mmi", 'k2ctc'], help="Type of CTC implementation to calculate loss.", ) parser.add_argument( "--mtlalpha", default=0.5, type=float, help="Multitask learning coefficient, " "alpha: alpha*ctc_loss + (1-alpha)*att_loss ", ) parser.add_argument( "--lsm-weight", default=0.0, type=float, help="Label smoothing weight" ) # recognition options to compute CER/WER parser.add_argument( "--report-cer", default=False, action="store_true", help="Compute CER on development set", ) parser.add_argument( "--report-wer", default=False, action="store_true", help="Compute WER on development set", ) parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") parser.add_argument("--beam-size", type=int, default=4, help="Beam size") parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty") parser.add_argument( "--maxlenratio", default=0.0, type=float, help="""Input length ratio to obtain max output length. 
If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengths""", ) parser.add_argument( "--minlenratio", default=0.0, type=float, help="Input length ratio to obtain min output length", ) parser.add_argument( "--ctc-weight", default=0.3, type=float, help="CTC weight in joint decoding" ) parser.add_argument( "--rnnlm", type=str, default=None, help="RNNLM model file to read" ) parser.add_argument( "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read" ) parser.add_argument("--lm-weight", default=0.1, type=float, help="RNNLM weight.") parser.add_argument("--sym-space", default="", type=str, help="Space symbol") parser.add_argument("--sym-blank", default="", type=str, help="Blank symbol") # minibatch related parser.add_argument( "--sortagrad", default=0, type=int, nargs="?", help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs", ) parser.add_argument( "--batch-count", default="auto", choices=BATCH_COUNT_CHOICES, help="How to count batch_size. 
" "The default (auto) will find how to count by args.", ) parser.add_argument( "--batch-size", "--batch-seqs", "-b", default=0, type=int, help="Maximum seqs in a minibatch (0 to disable)", ) parser.add_argument( "--batch-bins", default=0, type=int, help="Maximum bins in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-in", default=0, type=int, help="Maximum input frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-out", default=0, type=int, help="Maximum output frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-inout", default=0, type=int, help="Maximum input+output frames in a minibatch (0 to disable)", ) parser.add_argument( "--maxlen-in", "--batch-seq-maxlen-in", default=800, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the input sequence length > ML.", ) parser.add_argument( "--maxlen-out", "--batch-seq-maxlen-out", default=150, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the output sequence length > ML", ) parser.add_argument( "--n-iter-processes", default=0, type=int, help="Number of processes of iterator", ) parser.add_argument( "--preprocess-conf", type=str, default=None, nargs="?", help="The configuration file for the pre-processing", ) # optimization related parser.add_argument( "--opt", default="noam_sgd", type=str, choices=["adadelta", "adam", "noam", "noam_sgd"], help="Optimizer", ) parser.add_argument( "--accum-grad", default=1, type=int, help="Number of gradient accumuration" ) parser.add_argument( "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer" ) parser.add_argument( "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon" ) parser.add_argument( "--weight-decay", default=0.0, type=float, help="Weight decay ratio" ) parser.add_argument( "--criterion", default="acc", type=str, choices=["loss", "loss_eps_decay_only", "acc"], help="Criterion to perform epsilon decay", 
) parser.add_argument( "--threshold", default=1e-4, type=float, help="Threshold to stop iteration" ) parser.add_argument( "--epochs", "-e", default=30, type=int, help="Maximum number of epochs" ) parser.add_argument( "--early-stop-criterion", default="validation/main/acc", type=str, nargs="?", help="Value to monitor to trigger an early stopping of the training", ) parser.add_argument( "--patience", default=3, type=int, nargs="?", help="Number of epochs to wait without improvement " "before stopping the training", ) parser.add_argument( "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip" ) parser.add_argument( "--num-save-attention", default=0, type=int, help="Number of samples of attention to be saved", ) parser.add_argument( "--num-save-ctc", default=0, type=int, help="Number of samples of CTC probability to be saved", ) parser.add_argument( "--grad-noise", type=strtobool, default=False, help="The flag to switch to use noise injection to gradients during training", ) # asr_mix related parser.add_argument( "--num-spkrs", default=1, type=int, choices=[1, 2], help="Number of speakers in the speech.", ) # decoder related parser.add_argument( "--context-residual", default=False, type=strtobool, nargs="?", help="The flag to switch to use context vector residual in the decoder network", ) # finetuning related parser.add_argument( "--enc-init", default=None, type=str, help="Pre-trained ASR model to initialize encoder.", ) parser.add_argument( "--enc-init-mods", default="enc.enc.", type=lambda s: [str(mod) for mod in s.split(",") if s != ""], help="List of encoder modules to initialize, separated by a comma.", ) parser.add_argument( "--dec-init", default=None, type=str, help="Pre-trained ASR, MT or LM model to initialize decoder.", ) parser.add_argument( "--dec-init-mods", default="att.,dec.", type=lambda s: [str(mod) for mod in s.split(",") if s != ""], help="List of decoder modules to initialize, separated by a comma.", ) parser.add_argument( 
"--freeze-mods", default=None, type=lambda s: [str(mod) for mod in s.split(",") if s != ""], help="List of modules to freeze, separated by a comma.", ) # front end related parser.add_argument( "--use-frontend", type=strtobool, default=False, help="The flag to switch to use frontend system.", ) # WPE related parser.add_argument( "--use-wpe", type=strtobool, default=False, help="Apply Weighted Prediction Error", ) parser.add_argument( "--wtype", default="blstmp", type=str, choices=[ "lstm", "blstm", "lstmp", "blstmp", "vgglstmp", "vggblstmp", "vgglstm", "vggblstm", "gru", "bgru", "grup", "bgrup", "vgggrup", "vggbgrup", "vgggru", "vggbgru", ], help="Type of encoder network architecture " "of the mask estimator for WPE. " "", ) parser.add_argument("--wlayers", type=int, default=2, help="") parser.add_argument("--wunits", type=int, default=300, help="") parser.add_argument("--wprojs", type=int, default=300, help="") parser.add_argument("--wdropout-rate", type=float, default=0.0, help="") parser.add_argument("--wpe-taps", type=int, default=5, help="") parser.add_argument("--wpe-delay", type=int, default=3, help="") parser.add_argument( "--use-dnn-mask-for-wpe", type=strtobool, default=False, help="Use DNN to estimate the power spectrogram. 
" "This option is experimental.", ) # Beamformer related parser.add_argument("--use-beamformer", type=strtobool, default=True, help="") parser.add_argument( "--btype", default="blstmp", type=str, choices=[ "lstm", "blstm", "lstmp", "blstmp", "vgglstmp", "vggblstmp", "vgglstm", "vggblstm", "gru", "bgru", "grup", "bgrup", "vgggrup", "vggbgrup", "vgggru", "vggbgru", ], help="Type of encoder network architecture " "of the mask estimator for Beamformer.", ) parser.add_argument("--blayers", type=int, default=2, help="") parser.add_argument("--bunits", type=int, default=300, help="") parser.add_argument("--bprojs", type=int, default=300, help="") parser.add_argument("--badim", type=int, default=320, help="") parser.add_argument( "--bnmask", type=int, default=2, help="Number of beamforming masks, " "default is 2 for [speech, noise].", ) parser.add_argument( "--ref-channel", type=int, default=-1, help="The reference channel used for beamformer. " "By default, the channel is estimated by DNN.", ) parser.add_argument("--bdropout-rate", type=float, default=0.0, help="") # Feature transform: Normalization parser.add_argument( "--stats-file", type=str, default=None, help="The stats file for the feature normalization", ) parser.add_argument( "--apply-uttmvn", type=strtobool, default=True, help="Apply utterance level mean " "variance normalization.", ) parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="") parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="") # Feature transform: Fbank parser.add_argument( "--fbank-fs", type=int, default=16000, help="The sample frequency used for " "the mel-fbank creation.", ) parser.add_argument( "--n-mels", type=int, default=80, help="The number of mel-frequency bins." 
) parser.add_argument("--fbank-fmin", type=float, default=0.0, help="") parser.add_argument("--fbank-fmax", type=float, default=None, help="") # K2 parser.add_argument("--lang", type=str, help="k2 lang dir") parser.add_argument("--den-scale", type=float, default=1.0, help="denumerator scale: loss = num + den_scale * den") parser.add_argument("--third-weight", type=float, default=0.0, help="we still need ctc loss if encoder is supervised by MMI. This is ctc_weight") parser.add_argument("--use-segment", type=strtobool, default=False, help="If true, MMI supervision is from text_org. If false, it is from ys_pad") # DDP parser.add_argument("--master-node", type=int, default=0, help="master node rank") parser.add_argument("--local_rank", type=int, default=-1, help="local GPU rank") parser.add_argument("--world-size", type=int, default=-1, help="BMUF world size") parser.add_argument("--node-rank", type=int, default=-1, help="DDP node rank") parser.add_argument("--node-size", type=int, default=8, help="number of GPU on each node") # MBR parser.add_argument("--load-trainer-and-opt", type=strtobool, default=True, help="If false, only the model weight would be loaded in snapshot") parser.add_argument("--block-load", type=strtobool, default=False, help="block loading for training. make sure all batches are in the same ark") parser.add_argument("--utts-per-ark", type=int, default=256, help="number of utterance in each ark") """ Due to the slow ceph, we cannot load data completely in random paradigm Thus, the randomness is implemented in hierarchical style. (1) We sure that each minibatch is from the same ark file. Also, make sure the utterances in json file is sorted from shortest to longest. You should do this before training starts. (2) the whole dataset is divided into many groups, each groups contains "block-buffer-size" arks. The randomness is implemented on both intra- and inter- group styles. The larger the 'block-buffer-size', the better the randomness is implemented. 
def _count_visible_gpus():
    """Count the GPUs available to this process.

    If ``CUDA_VISIBLE_DEVICES`` is set, its non-empty comma-separated entries
    are counted (an empty value therefore means zero GPUs). Otherwise the
    devices listed by ``nvidia-smi -L`` are counted; a missing or failing
    ``nvidia-smi`` yields zero.

    Returns:
        int: Number of usable GPUs.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    if p.returncode != 0:
        return 0
    # "nvidia-smi -L" prints one "GPU <index>: ..." line per device on stdout.
    listing = p.stdout.decode(errors="replace")
    return sum(1 for line in listing.splitlines() if line.startswith("GPU "))


def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a half-precision / apex dtype is requested on CPU, or
            if the selected backend is not supported.

    """
    parser = get_parser()
    # First pass: only the generic options are known; model-specific options
    # are registered below once the model class has been imported.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        if args.num_spkrs == 1:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr:E2E"
        else:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr_mix:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The module path decides the backend, overriding --backend if they differ.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        ngpu = _count_visible_gpus()
        # Store the detected value so that code reading args.ngpu never sees None.
        args.ngpu = ngpu
    else:
        if is_torch_1_2_plus and args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # Each dictionary line starts with the token, followed by a space.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        # NOTE(review): the empty strings added below look like special-token
        # placeholders whose text was lost; confirm the intended symbols.
        char_list.insert(0, "")
        char_list.append("")
        # for non-autoregressive maskctc model
        if "maskctc" in args.model_module:
            char_list.append("")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)

    if args.num_spkrs == 1:
        if args.backend == "chainer":
            from espnet.asr.chainer_backend.asr import train

            train(args)
        elif args.backend == "pytorch":
            from espnet.asr.pytorch_backend.asr import train

            train(args)
        else:
            raise ValueError("Only chainer and pytorch are supported.")
    else:
        # FIXME(kamo): Support --model-module
        if args.backend == "pytorch":
            from espnet.asr.pytorch_backend.asr_mix import train

            train(args)
        else:
            raise ValueError("Only pytorch is supported.")
# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Build the argument parser of the language-model training script.

    Args:
        parser: Existing parser to extend. When ``None`` a new
            ``configargparse.ArgumentParser`` reading YAML config files is
            created.
        required (bool): Whether ``--outdir``, ``--dict``, ``--train-label``
            and ``--valid-label`` are mandatory.

    Returns:
        The parser with all LM training options registered.

    """
    if parser is None:
        parser = configargparse.ArgumentParser(
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            description="Train a new language model on one CPU or one GPU",
        )
    option = parser.add_argument

    # ---- general configuration ----
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
        is_config_file=True,
    )

    option(
        "--ngpu",
        type=int,
        default=None,
        help="Number of GPUs. If not given, use all visible devices",
    )
    option(
        "--train-dtype",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        default="float32",
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    option(
        "--backend",
        type=str,
        choices=["chainer", "pytorch"],
        default="chainer",
        help="Backend library",
    )
    option("--outdir", type=str, required=required, help="Output directory")
    option("--debugmode", type=int, default=1, help="Debugmode")
    option("--dict", type=str, required=required, help="Dictionary")
    option("--seed", type=int, default=1, help="Random seed")
    option(
        "--resume",
        "-r",
        nargs="?",
        default="",
        help="Resume the training from snapshot",
    )
    option("--verbose", "-V", type=int, default=0, help="Verbose option")
    option(
        "--tensorboard-dir",
        type=str,
        nargs="?",
        default=None,
        help="Tensorboard log dir path",
    )
    option(
        "--report-interval-iters",
        type=int,
        default=100,
        help="Report interval iterations",
    )

    # ---- task related ----
    option(
        "--train-label",
        type=str,
        required=required,
        help="Filename of train label data",
    )
    option(
        "--valid-label",
        type=str,
        required=required,
        help="Filename of validation label data",
    )
    option("--test-label", type=str, help="Filename of test label data")
    option(
        "--dump-hdf5-path",
        type=str,
        default=None,
        help="Path to dump a preprocessed dataset as hdf5",
    )

    # ---- training configuration ----
    option("--opt", type=str, default="sgd", help="Optimizer")
    option(
        "--sortagrad",
        type=int,
        nargs="?",
        default=0,
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    option(
        "--batchsize",
        "-b",
        default=300,
        type=int,
        help="Number of examples in each mini-batch",
    )
    option(
        "--accum-grad",
        default=1,
        type=int,
        help="Number of gradient accumueration",
    )
    option(
        "--epoch",
        "-e",
        default=20,
        type=int,
        help="Number of sweeps over the dataset to train",
    )
    option(
        "--early-stop-criterion",
        type=str,
        nargs="?",
        default="validation/main/loss",
        help="Value to monitor to trigger an early stopping of the training",
    )
    option(
        "--patience",
        type=int,
        nargs="?",
        default=3,
        help="Number of epochs "
        "to wait without improvement before stopping the training",
    )
    # Every occurrence appends one "key=value" item, stored split on "=".
    option(
        "--schedulers",
        action="append",
        type=lambda spec: spec.split("="),
        default=None,
        help="optimizer schedulers, you can configure params like:"
        " --"
        ' e.g., "--schedulers lr=noam --lr-noam-warmup 1000".',
    )
    option(
        "--gradclip",
        "-c",
        default=5,
        type=float,
        help="Gradient norm threshold to clip",
    )
    option(
        "--maxlen",
        default=40,
        type=int,
        help="Batch size is reduced if the input sequence > ML",
    )
    option(
        "--model-module",
        default="default",
        type=str,
        help="model defined module "
        "(default: espnet.nets.xxx_backend.lm.default:DefaultRNNLM)",
    )
    return parser
def _count_visible_gpus():
    """Count the GPUs available to this process.

    If ``CUDA_VISIBLE_DEVICES`` is set, its non-empty comma-separated entries
    are counted (an empty value therefore means zero GPUs). Otherwise the
    devices listed by ``nvidia-smi -L`` are counted; a missing or failing
    ``nvidia-smi`` yields zero.

    Returns:
        int: Number of usable GPUs.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    if p.returncode != 0:
        return 0
    # "nvidia-smi -L" prints one "GPU <index>: ..." line per device on stdout.
    listing = p.stdout.decode(errors="replace")
    return sum(1 for line in listing.splitlines() if line.startswith("GPU "))


def main(cmd_args):
    """Train LM.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a half-precision / apex dtype is requested on CPU, or
            if the selected backend is not supported.

    """
    parser = get_parser()
    # First pass: only the generic options are known; model, scheduler and
    # optimizer options are registered below once their classes are imported.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    # parse arguments dynamically
    model_class = dynamic_import_lm(args.model_module, args.backend)
    model_class.add_arguments(parser)
    if args.schedulers is not None:
        # Each item is a [key, scheduler-name] pair produced by --schedulers.
        for k, v in args.schedulers:
            scheduler_class = dynamic_import_scheduler(v)
            scheduler_class.add_arguments(k, parser)

    opt_class = dynamic_import_optimizer(args.opt, args.backend)
    opt_class.add_arguments(parser)

    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        ngpu = _count_visible_gpus()
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    nseed = args.seed
    random.seed(nseed)
    np.random.seed(nseed)

    # load dictionary
    with open(args.dict, "rb") as f:
        dictionary = f.readlines()
    # Each dictionary line starts with the token, followed by a space.
    char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
    # NOTE(review): the two empty strings added below look like special-token
    # placeholders whose text was lost. As written they share one key in
    # char_list_dict while n_vocab still counts both; confirm the symbols.
    char_list.insert(0, "")
    char_list.append("")
    args.char_list_dict = {x: i for i, x in enumerate(char_list)}
    args.n_vocab = len(char_list)

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "chainer":
        from espnet.lm.chainer_backend.lm import train

        train(args)
    elif args.backend == "pytorch":
        from espnet.lm.pytorch_backend.lm import train

        train(args)
    else:
        raise ValueError("Only chainer and pytorch are supported.")
def _parse_module_list(value):
    """Split a comma-separated list of module prefixes.

    Surrounding whitespace is stripped and empty items are dropped, so that
    ``"att., dec."`` yields ``["att.", "dec."]`` rather than a second entry
    with a leading space.

    Args:
        value (str): Raw command-line / config value.

    Returns:
        list[str]: The non-empty module prefixes, in their original order.

    """
    return [mod.strip() for mod in value.split(",") if mod.strip()]


# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Build the argument parser of the NMT training script.

    Args:
        parser: Existing parser to extend. When ``None`` a new
            ``configargparse.ArgumentParser`` reading YAML config files is
            created.
        required (bool): Whether ``--outdir`` and ``--dict`` are mandatory.

    Returns:
        The parser with all training options registered.

    """
    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train a neural machine translation (NMT) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument(
        "--dict", required=required, help="Dictionary for source/target languages"
    )
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_mt:E2E)",
    )
    # loss related
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # translations options to compute BLEU
    # NOTE: with default=True and action="store_true" this flag cannot be
    # switched off from the command line; kept as is for compatibility.
    parser.add_argument(
        "--report-bleu",
        default=True,
        action="store_true",
        help="Compute BLEU on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
        If maxlenratio=0.0 (default), it uses a end-detect function
        to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
    # NOTE(review): the empty-string defaults of the two symbol options look
    # like placeholder tokens that were lost somewhere; confirm the intended
    # values before relying on them.
    parser.add_argument("--sym-space", default="", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adadelta",
        type=str,
        choices=["adadelta", "adam", "noam"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument(
        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait "
        "without improvement before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=3,
        type=int,
        help="Number of samples of attention to be saved",
    )
    # decoder related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    parser.add_argument(
        "--tie-src-tgt-embedding",
        default=False,
        type=strtobool,
        nargs="?",
        help="Tie parameters of source embedding and target embedding.",
    )
    parser.add_argument(
        "--tie-classifier",
        default=False,
        type=strtobool,
        nargs="?",
        help="Tie parameters of target embedding and output projection layer.",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR model to initialize encoder.",
    )
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=_parse_module_list,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    # The default contains a space after the comma; _parse_module_list strips it.
    parser.add_argument(
        "--dec-init-mods",
        default="att., dec.",
        type=_parse_module_list,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    # multilingual related
    parser.add_argument(
        "--multilingual",
        default=False,
        type=strtobool,
        help="Prepend target language ID to the source sentence. "
        "Both source/target language IDs must be prepend in the pre-processing stage.",
    )
    parser.add_argument(
        "--replace-sos",
        default=False,
        type=strtobool,
        help="Replace  in the decoder with a target language ID "
        "(the first token in the target sequence)",
    )

    return parser
def _count_visible_gpus():
    """Guess how many GPUs to use when ``--ngpu`` is not given.

    Returns:
        int: Number of non-empty entries in ``CUDA_VISIBLE_DEVICES`` when that
            variable is set; otherwise the number of devices listed by
            ``nvidia-smi -L``, or 0 when ``nvidia-smi`` is missing or fails.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        # An empty value hides every device; "".split(",") would otherwise
        # be counted as one GPU.
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    if p.returncode != 0:
        # nvidia-smi ran but reported a failure: do not trust its output
        return 0
    # The device list is written to stdout (not stderr),
    # one "GPU <index>: ..." line per device.
    return sum(line.startswith("GPU ") for line in p.stdout.decode().splitlines())


def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args (list[str]): Command line arguments (without the program name).

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a GPU-only ``--train-dtype`` is requested with
            ``--ngpu 0``, or if the resolved backend is not pytorch.

    """
    parser = get_parser()
    # First pass: only the generic options are known at this point, the
    # model-specific ones are registered below by the model class.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        model_module = "espnet.nets." + args.backend + "_backend.e2e_mt:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    # Second pass: strict parsing with the model-specific options included.
    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The module path overrides whatever --backend said.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        ngpu = _count_visible_gpus()
        args.ngpu = ngpu
    else:
        if is_torch_1_2_plus and args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # keep only the first space-separated field of every dictionary line
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        # NOTE(review): empty strings are added at index 0 and at the end;
        # they look like placeholders for special symbols - confirm.
        char_list.insert(0, "")
        char_list.append("")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.mt.pytorch_backend.mt import train

        train(args)
    else:
        raise ValueError("Only pytorch are supported.")
speech translation model on one CPU or GPU", config_file_parser_class=configargparse.YAMLConfigFileParser, formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) # general configuration parser.add("--config", is_config_file=True, help="Config file path") parser.add( "--config2", is_config_file=True, help="Second config file path that overwrites the settings in `--config`", ) parser.add( "--config3", is_config_file=True, help="Third config file path " "that overwrites the settings in `--config` and `--config2`", ) parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs") parser.add_argument( "--dtype", choices=("float16", "float32", "float64"), default="float32", help="Float precision (only available in --api v2)", ) parser.add_argument( "--backend", type=str, default="chainer", choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument("--debugmode", type=int, default=1, help="Debugmode") parser.add_argument("--seed", type=int, default=1, help="Random seed") parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option") parser.add_argument( "--batchsize", type=int, default=1, help="Batch size for beam search (0: means no batch processing)", ) parser.add_argument( "--preprocess-conf", type=str, default=None, help="The configuration file for the pre-processing", ) parser.add_argument( "--api", default="v1", choices=["v1", "v2"], help="Beam search APIs " "v1: Default API. It only supports " "the ASRInterface.recognize method and DefaultRNNLM. " "v2: Experimental API. 
def main(args):
    """Parse the command line and run translation decoding.

    Args:
        args (list[str]): Command line arguments (without the program name).

    Raises:
        NotImplementedError: If ``--dtype`` is not float32.
        ValueError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info: -V 1 -> INFO, -V 2 -> DEBUG, any other value -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = {1: logging.INFO, 2: logging.DEBUG}.get(args.verbose, logging.WARN)
    logging.basicConfig(level=log_level, format=log_format)
    if log_level == logging.WARN:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    python_path = os.environ.get("PYTHONPATH", "(None)")
    logging.info("python path = " + python_path)

    # seed setting (both the stdlib and the numpy generators)
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # trans
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch are supported.")

    # Experimental API that supports custom LMs
    from espnet.mt.pytorch_backend.mt import trans

    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    trans(args)
is_config_file=True, help="config file path") parser.add( "--config2", is_config_file=True, help="second config file path that overwrites the settings in `--config`.", ) parser.add( "--config3", is_config_file=True, help="third config file path that overwrites the settings " "in `--config` and `--config2`.", ) parser.add_argument( "--ngpu", default=None, type=int, help="Number of GPUs. If not given, use all visible devices", ) parser.add_argument( "--train-dtype", default="float32", choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"], help="Data type for training (only pytorch backend). " "O0,O1,.. flags require apex. " "See https://nvidia.github.io/apex/amp.html#opt-levels", ) parser.add_argument( "--backend", default="chainer", type=str, choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument( "--outdir", type=str, required=required, help="Output directory" ) parser.add_argument("--debugmode", default=1, type=int, help="Debugmode") parser.add_argument("--dict", required=required, help="Dictionary") parser.add_argument("--seed", default=1, type=int, help="Random seed") parser.add_argument("--debugdir", type=str, help="Output directory for debugging") parser.add_argument( "--resume", "-r", default="", nargs="?", help="Resume the training from snapshot", ) parser.add_argument( "--minibatches", "-N", type=int, default="-1", help="Process only N minibatches (for debug)", ) parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--tensorboard-dir", default=None, type=str, nargs="?", help="Tensorboard log dir path", ) parser.add_argument( "--report-interval-iters", default=100, type=int, help="Report interval iterations", ) parser.add_argument( "--save-interval-iters", default=0, type=int, help="Save snapshot interval iterations", ) # task related parser.add_argument( "--train-json", type=str, default=None, help="Filename of train label data (json)", ) parser.add_argument( 
"--valid-json", type=str, default=None, help="Filename of validation label data (json)", ) # network architecture parser.add_argument( "--model-module", type=str, default=None, help="model defined module (default: espnet.nets.xxx_backend.e2e_st:E2E)", ) # loss related parser.add_argument( "--ctc_type", default="warpctc", type=str, choices=["builtin", "warpctc", "gtnctc", "cudnnctc"], help="Type of CTC implementation to calculate loss.", ) parser.add_argument( "--mtlalpha", default=0.0, type=float, help="Multitask learning coefficient, alpha: \ alpha*ctc_loss + (1-alpha)*att_loss", ) parser.add_argument( "--asr-weight", default=0.0, type=float, help="Multitask learning coefficient for ASR task, weight: " " asr_weight*(alpha*ctc_loss + (1-alpha)*att_loss)" " + (1-asr_weight-mt_weight)*st_loss", ) parser.add_argument( "--mt-weight", default=0.0, type=float, help="Multitask learning coefficient for MT task, weight: \ mt_weight*mt_loss + (1-mt_weight-asr_weight)*st_loss", ) parser.add_argument( "--lsm-weight", default=0.0, type=float, help="Label smoothing weight" ) # recognition options to compute CER/WER parser.add_argument( "--report-cer", default=False, action="store_true", help="Compute CER on development set", ) parser.add_argument( "--report-wer", default=False, action="store_true", help="Compute WER on development set", ) # translations options to compute BLEU parser.add_argument( "--report-bleu", default=True, action="store_true", help="Compute BLEU on development set", ) parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") parser.add_argument("--beam-size", type=int, default=4, help="Beam size") parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty") parser.add_argument( "--maxlenratio", default=0.0, type=float, help="""Input length ratio to obtain max output length. 
If maxlenratio=0.0 (default), it uses a end-detect function to automatically find maximum hypothesis lengths""", ) parser.add_argument( "--minlenratio", default=0.0, type=float, help="Input length ratio to obtain min output length", ) parser.add_argument( "--rnnlm", type=str, default=None, help="RNNLM model file to read" ) parser.add_argument( "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read" ) parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.") parser.add_argument("--sym-space", default="", type=str, help="Space symbol") parser.add_argument("--sym-blank", default="", type=str, help="Blank symbol") # minibatch related parser.add_argument( "--sortagrad", default=0, type=int, nargs="?", help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs", ) parser.add_argument( "--batch-count", default="auto", choices=BATCH_COUNT_CHOICES, help="How to count batch_size. " "The default (auto) will find how to count by args.", ) parser.add_argument( "--batch-size", "--batch-seqs", "-b", default=0, type=int, help="Maximum seqs in a minibatch (0 to disable)", ) parser.add_argument( "--batch-bins", default=0, type=int, help="Maximum bins in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-in", default=0, type=int, help="Maximum input frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-out", default=0, type=int, help="Maximum output frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-inout", default=0, type=int, help="Maximum input+output frames in a minibatch (0 to disable)", ) parser.add_argument( "--maxlen-in", "--batch-seq-maxlen-in", default=800, type=int, metavar="ML", help="When --batch-count=seq, batch size is reduced " "if the input sequence length > ML.", ) parser.add_argument( "--maxlen-out", "--batch-seq-maxlen-out", default=150, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the output 
sequence length > ML", ) parser.add_argument( "--n-iter-processes", default=0, type=int, help="Number of processes of iterator", ) parser.add_argument( "--preprocess-conf", type=str, default=None, nargs="?", help="The configuration file for the pre-processing", ) # optimization related parser.add_argument( "--opt", default="adadelta", type=str, choices=["adadelta", "adam", "noam"], help="Optimizer", ) parser.add_argument( "--accum-grad", default=1, type=int, help="Number of gradient accumuration" ) parser.add_argument( "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer" ) parser.add_argument( "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon" ) parser.add_argument( "--lr", default=1e-3, type=float, help="Learning rate for optimizer" ) parser.add_argument( "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate" ) parser.add_argument( "--weight-decay", default=0.0, type=float, help="Weight decay ratio" ) parser.add_argument( "--criterion", default="acc", type=str, choices=["loss", "acc"], help="Criterion to perform epsilon decay", ) parser.add_argument( "--threshold", default=1e-4, type=float, help="Threshold to stop iteration" ) parser.add_argument( "--epochs", "-e", default=30, type=int, help="Maximum number of epochs" ) parser.add_argument( "--early-stop-criterion", default="validation/main/acc", type=str, nargs="?", help="Value to monitor to trigger an early stopping of the training", ) parser.add_argument( "--patience", default=3, type=int, nargs="?", help="Number of epochs to wait " "without improvement before stopping the training", ) parser.add_argument( "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip" ) parser.add_argument( "--num-save-attention", default=3, type=int, help="Number of samples of attention to be saved", ) parser.add_argument( "--num-save-ctc", default=3, type=int, help="Number of samples of CTC probability to be saved", ) parser.add_argument( 
"--grad-noise", type=strtobool, default=False, help="The flag to switch to use noise injection to gradients during training", ) # speech translation related parser.add_argument( "--context-residual", default=False, type=strtobool, nargs="?", help="The flag to switch to use context vector residual in the decoder network", ) # finetuning related parser.add_argument( "--enc-init", default=None, type=str, nargs="?", help="Pre-trained ASR model to initialize encoder.", ) parser.add_argument( "--enc-init-mods", default="enc.enc.", type=lambda s: [str(mod) for mod in s.split(",") if s != ""], help="List of encoder modules to initialize, separated by a comma.", ) parser.add_argument( "--dec-init", default=None, type=str, nargs="?", help="Pre-trained ASR, MT or LM model to initialize decoder.", ) parser.add_argument( "--dec-init-mods", default="att., dec.", type=lambda s: [str(mod) for mod in s.split(",") if s != ""], help="List of decoder modules to initialize, separated by a comma.", ) # multilingual related parser.add_argument( "--multilingual", default=False, type=strtobool, help="Prepend target language ID to the source sentence. 
def _count_visible_gpus():
    """Guess how many GPUs to use when ``--ngpu`` is not given.

    Returns:
        int: Number of non-empty entries in ``CUDA_VISIBLE_DEVICES`` when that
            variable is set; otherwise the number of devices listed by
            ``nvidia-smi -L``, or 0 when ``nvidia-smi`` is missing or fails.

    """
    cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
    if cvd is not None:
        # An empty value hides every device; "".split(",") would otherwise
        # be counted as one GPU.
        return len([dev for dev in cvd.split(",") if dev.strip()])

    logging.warning("CUDA_VISIBLE_DEVICES is not set.")
    try:
        p = subprocess.run(
            ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        return 0
    if p.returncode != 0:
        # nvidia-smi ran but reported a failure: do not trust its output
        return 0
    # The device list is written to stdout (not stderr),
    # one "GPU <index>: ..." line per device.
    return sum(line.startswith("GPU ") for line in p.stdout.decode().splitlines())


def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args (list[str]): Command line arguments (without the program name).

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a GPU-only ``--train-dtype`` is requested with
            ``--ngpu 0``, or if the resolved backend is not pytorch.

    """
    parser = get_parser()
    # First pass: only the generic options are known at this point, the
    # model-specific ones are registered below by the model class.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        model_module = "espnet.nets." + args.backend + "_backend.e2e_st:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    # Second pass: strict parsing with the model-specific options included.
    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The module path overrides whatever --backend said.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        ngpu = _count_visible_gpus()
        args.ngpu = ngpu
    else:
        if is_torch_1_2_plus and args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # keep only the first space-separated field of every dictionary line
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        # NOTE(review): empty strings are added at index 0 and at the end;
        # they look like placeholders for special symbols - confirm.
        char_list.insert(0, "")
        char_list.append("")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.st.pytorch_backend.st import train

        train(args)
    else:
        raise ValueError("Only pytorch are supported.")
"--batchsize", type=int, default=1, help="Batch size for beam search (0: means no batch processing)", ) parser.add_argument( "--preprocess-conf", type=str, default=None, help="The configuration file for the pre-processing", ) parser.add_argument( "--api", default="v1", choices=["v1", "v2"], help="Beam search APIs " "v1: Default API. " "It only supports the ASRInterface.recognize method and DefaultRNNLM. " "v2: Experimental API. " "It supports any models that implements ScorerInterface.", ) # task related parser.add_argument( "--trans-json", type=str, help="Filename of translation data (json)" ) parser.add_argument( "--result-label", type=str, required=True, help="Filename of result label data (json)", ) # model (parameter) related parser.add_argument( "--model", type=str, required=True, help="Model file parameters to read" ) # search related parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") parser.add_argument("--beam-size", type=int, default=1, help="Beam size") parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty") parser.add_argument( "--maxlenratio", type=float, default=0.0, help="""Input length ratio to obtain max output length. 
def main(args):
    """Parse the command line and run speech translation decoding.

    Args:
        args (list[str]): Command line arguments (without the program name).

    Raises:
        NotImplementedError: If ``--dtype`` is not float32.
        ValueError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info: -V 1 -> INFO, -V 2 -> DEBUG, any other value -> WARN
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = {1: logging.INFO, 2: logging.DEBUG}.get(args.verbose, logging.WARN)
    logging.basicConfig(level=log_level, format=log_format)
    if log_level == logging.WARN:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    python_path = os.environ.get("PYTHONPATH", "(None)")
    logging.info("python path = " + python_path)

    # seed setting (both the stdlib and the numpy generators)
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # trans
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch are supported.")

    # Experimental API that supports custom LMs
    from espnet.st.pytorch_backend.st import trans

    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    trans(args)
FILE: bin/tts_decode.py ================================================ #!/usr/bin/env python3 # Copyright 2018 Nagoya University (Tomoki Hayashi) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """TTS decoding script.""" import configargparse import logging import os import platform import subprocess import sys from espnet.utils.cli_utils import strtobool # NOTE: you need this func to generate our sphinx doc def get_parser(): """Get parser of decoding arguments.""" parser = configargparse.ArgumentParser( description="Synthesize speech from text using a TTS model on one CPU", config_file_parser_class=configargparse.YAMLConfigFileParser, formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) # general configuration parser.add("--config", is_config_file=True, help="config file path") parser.add( "--config2", is_config_file=True, help="second config file path that overwrites the settings in `--config`.", ) parser.add( "--config3", is_config_file=True, help="third config file path that overwrites " "the settings in `--config` and `--config2`.", ) parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs") parser.add_argument( "--backend", default="pytorch", type=str, choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument("--debugmode", default=1, type=int, help="Debugmode") parser.add_argument("--seed", default=1, type=int, help="Random seed") parser.add_argument("--out", type=str, required=True, help="Output filename") parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--preprocess-conf", type=str, default=None, help="The configuration file for the pre-processing", ) # task related parser.add_argument( "--json", type=str, required=True, help="Filename of train label data (json)" ) parser.add_argument( "--model", type=str, required=True, help="Model file parameters to read" ) parser.add_argument( "--model-conf", type=str, default=None, help="Model config file" 
def _check_output_text(cmd):
    """Run ``cmd`` and return its standard output as text."""
    out = subprocess.check_output(cmd)
    # check_output returns bytes on Python 3 and str on Python 2
    return out.decode() if isinstance(out, bytes) else out


def main(args):
    """Run decoding.

    Args:
        args (list[str]): Command line arguments (without the program name).

    Raises:
        NotImplementedError: If the selected backend is not pytorch.

    """
    parser = get_parser()
    args = parser.parse_args(args)

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        # The host name is only needed to detect clsp.jhu.edu machines, so a
        # missing or failing `hostname -f` must not abort decoding elsewhere.
        try:
            hostname = _check_output_text(["hostname", "-f"])
        except (subprocess.CalledProcessError, OSError):
            hostname = ""

        if "clsp.jhu.edu" in hostname:
            # ask the site helper for free devices and expose them to CUDA
            cvd = _check_output_text(
                ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
            ).strip()
            logging.info("CLSP: use gpu" + cvd)
            os.environ["CUDA_VISIBLE_DEVICES"] = cvd

        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif args.ngpu != len(cvd.split(",")):
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.tts.pytorch_backend.tts import decode

        decode(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")
"the settings in `--config` and `--config2`.", ) parser.add_argument( "--ngpu", default=None, type=int, help="Number of GPUs. If not given, use all visible devices", ) parser.add_argument( "--backend", default="pytorch", type=str, choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument("--outdir", type=str, required=True, help="Output directory") parser.add_argument("--debugmode", default=1, type=int, help="Debugmode") parser.add_argument("--seed", default=1, type=int, help="Random seed") parser.add_argument( "--resume", "-r", default="", type=str, nargs="?", help="Resume the training from snapshot", ) parser.add_argument( "--minibatches", "-N", type=int, default="-1", help="Process only N minibatches (for debug)", ) parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--tensorboard-dir", default=None, type=str, nargs="?", help="Tensorboard log directory path", ) parser.add_argument( "--eval-interval-epochs", default=1, type=int, help="Evaluation interval epochs" ) parser.add_argument( "--save-interval-epochs", default=1, type=int, help="Save interval epochs" ) parser.add_argument( "--report-interval-iters", default=100, type=int, help="Report interval iterations", ) # task related parser.add_argument( "--train-json", type=str, required=True, help="Filename of training json" ) parser.add_argument( "--valid-json", type=str, required=True, help="Filename of validation json" ) # network architecture parser.add_argument( "--model-module", type=str, default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2", help="model defined module", ) # minibatch related parser.add_argument( "--sortagrad", default=0, type=int, nargs="?", help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs", ) parser.add_argument( "--batch-sort-key", default="shuffle", type=str, choices=["shuffle", "output", "input"], nargs="?", help='Batch sorting key. 
"shuffle" only work with --batch-count "seq".', ) parser.add_argument( "--batch-count", default="auto", choices=BATCH_COUNT_CHOICES, help="How to count batch_size. " "The default (auto) will find how to count by args.", ) parser.add_argument( "--batch-size", "--batch-seqs", "-b", default=0, type=int, help="Maximum seqs in a minibatch (0 to disable)", ) parser.add_argument( "--batch-bins", default=0, type=int, help="Maximum bins in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-in", default=0, type=int, help="Maximum input frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-out", default=0, type=int, help="Maximum output frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-inout", default=0, type=int, help="Maximum input+output frames in a minibatch (0 to disable)", ) parser.add_argument( "--maxlen-in", "--batch-seq-maxlen-in", default=100, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the input sequence length > ML.", ) parser.add_argument( "--maxlen-out", "--batch-seq-maxlen-out", default=200, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the output sequence length > ML", ) parser.add_argument( "--num-iter-processes", default=0, type=int, help="Number of processes of iterator", ) parser.add_argument( "--preprocess-conf", type=str, default=None, help="The configuration file for the pre-processing", ) parser.add_argument( "--use-speaker-embedding", default=False, type=strtobool, help="Whether to use speaker embedding", ) parser.add_argument( "--use-second-target", default=False, type=strtobool, help="Whether to use second target", ) # optimization related parser.add_argument( "--opt", default="adam", type=str, choices=["adam", "noam"], help="Optimizer" ) parser.add_argument( "--accum-grad", default=1, type=int, help="Number of gradient accumuration" ) parser.add_argument( "--lr", default=1e-3, type=float, help="Learning rate 
def main(cmd_args):
    """Run text-to-speech model training.

    The command line is parsed twice: a first tolerant pass only serves to
    find ``--model-module`` so that the model class can be imported and can
    register its own options on the parser; the second strict pass then parses
    everything. Afterwards logging is configured, the number of GPUs is
    resolved, the Python/NumPy random seeds are set and training is started.

    Args:
        cmd_args (List[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If ``--backend`` is anything other than
            ``pytorch``.

    """
    parser = get_parser()
    # First pass: unknown (model specific) options are tolerated here.
    args, _ = parser.parse_known_args(cmd_args)

    from espnet.utils.dynamic_import import dynamic_import

    model_class = dynamic_import(args.model_module)
    assert issubclass(model_class, TTSInterface)
    model_class.add_arguments(parser)
    # Second pass: the model specific options are now known to the parser.
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty value hides every device, so only count real entries.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.CalledProcessError, OSError):
                # nvidia-smi is missing or cannot be executed: CPU only.
                ngpu = 0
            else:
                if p.returncode != 0:
                    # nvidia-smi exists but could not talk to the driver.
                    ngpu = 0
                else:
                    # The device list ("GPU <idx>: <name> (UUID: ...)") is
                    # written to stdout, one line per GPU.
                    ngpu = sum(
                        1
                        for line in p.stdout.decode().splitlines()
                        if line.startswith("GPU ")
                    )
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.backend == "pytorch":
        from espnet.tts.pytorch_backend.tts import train

        train(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")
configargparse.ArgumentParser( description="Converting speech using a VC model on one CPU", config_file_parser_class=configargparse.YAMLConfigFileParser, formatter_class=configargparse.ArgumentDefaultsHelpFormatter, ) # general configuration parser.add("--config", is_config_file=True, help="config file path") parser.add( "--config2", is_config_file=True, help="second config file path that overwrites the settings in `--config`.", ) parser.add( "--config3", is_config_file=True, help="third config file path that overwrites the settings " "in `--config` and `--config2`.", ) parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs") parser.add_argument( "--backend", default="pytorch", type=str, choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument("--debugmode", default=1, type=int, help="Debugmode") parser.add_argument("--seed", default=1, type=int, help="Random seed") parser.add_argument("--out", type=str, required=True, help="Output filename") parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--preprocess-conf", type=str, default=None, help="The configuration file for the pre-processing", ) # task related parser.add_argument( "--json", type=str, required=True, help="Filename of train label data (json)" ) parser.add_argument( "--model", type=str, required=True, help="Model file parameters to read" ) parser.add_argument( "--model-conf", type=str, default=None, help="Model config file" ) # decoding related parser.add_argument( "--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding" ) parser.add_argument( "--minlenratio", type=float, default=0, help="Minimum length ratio in decoding" ) parser.add_argument( "--threshold", type=float, default=0.5, help="Threshold value in decoding" ) parser.add_argument( "--use-att-constraint", type=strtobool, default=False, help="Whether to use the attention constraint", ) parser.add_argument( "--backward-window", 
def main(args):
    """Run decoding.

    Parse the command line, configure logging, validate the GPU environment
    when ``--ngpu`` is positive and hand the parsed options to the VC decoder
    of the selected backend.

    Args:
        args (List[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If ``--backend`` is anything other than
            ``pytorch``.

    """
    opts = get_parser().parse_args(args)

    # logging info
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbose = opts.verbose > 0
    logging.basicConfig(
        level=logging.INFO if verbose else logging.WARN, format=log_format
    )
    if not verbose:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if opts.ngpu > 0:
        running_py2 = platform.python_version_tuple()[0] == "2"

        def as_text(raw):
            # check_output already yields text on python 2, bytes on python 3
            return raw if running_py2 else raw.decode()

        fqdn = as_text(subprocess.check_output(["hostname", "-f"]))
        if "clsp.jhu.edu" in fqdn:
            free_gpus = as_text(
                subprocess.check_output(
                    ["/usr/local/bin/free-gpu", "-n", str(opts.ngpu)]
                )
            ).strip()
            logging.info("CLSP: use gpu" + free_gpus)
            os.environ["CUDA_VISIBLE_DEVICES"] = free_gpus

        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible.split(",")) != opts.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + opts.backend)
    if opts.backend != "pytorch":
        raise NotImplementedError("Only pytorch is supported.")

    from espnet.vc.pytorch_backend.vc import decode

    decode(opts)
If not given, use all visible devices", ) parser.add_argument( "--backend", default="pytorch", type=str, choices=["chainer", "pytorch"], help="Backend library", ) parser.add_argument("--outdir", type=str, required=True, help="Output directory") parser.add_argument("--debugmode", default=1, type=int, help="Debugmode") parser.add_argument("--seed", default=1, type=int, help="Random seed") parser.add_argument( "--resume", "-r", default="", type=str, nargs="?", help="Resume the training from snapshot", ) parser.add_argument( "--minibatches", "-N", type=int, default="-1", help="Process only N minibatches (for debug)", ) parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option") parser.add_argument( "--tensorboard-dir", default=None, type=str, nargs="?", help="Tensorboard log directory path", ) parser.add_argument( "--eval-interval-epochs", default=100, type=int, help="Evaluation interval epochs", ) parser.add_argument( "--save-interval-epochs", default=1, type=int, help="Save interval epochs" ) parser.add_argument( "--report-interval-iters", default=10, type=int, help="Report interval iterations", ) # task related parser.add_argument("--srcspk", type=str, help="Source speaker") parser.add_argument("--trgspk", type=str, help="Target speaker") parser.add_argument( "--train-json", type=str, required=True, help="Filename of training json" ) parser.add_argument( "--valid-json", type=str, required=True, help="Filename of validation json" ) # network architecture parser.add_argument( "--model-module", type=str, default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2", help="model defined module", ) # minibatch related parser.add_argument( "--sortagrad", default=0, type=int, nargs="?", help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs", ) parser.add_argument( "--batch-sort-key", default="shuffle", type=str, choices=["shuffle", "output", "input"], nargs="?", help='Batch sorting key. 
"shuffle" only work with --batch-count "seq".', ) parser.add_argument( "--batch-count", default="auto", choices=BATCH_COUNT_CHOICES, help="How to count batch_size. " "The default (auto) will find how to count by args.", ) parser.add_argument( "--batch-size", "--batch-seqs", "-b", default=0, type=int, help="Maximum seqs in a minibatch (0 to disable)", ) parser.add_argument( "--batch-bins", default=0, type=int, help="Maximum bins in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-in", default=0, type=int, help="Maximum input frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-out", default=0, type=int, help="Maximum output frames in a minibatch (0 to disable)", ) parser.add_argument( "--batch-frames-inout", default=0, type=int, help="Maximum input+output frames in a minibatch (0 to disable)", ) parser.add_argument( "--maxlen-in", "--batch-seq-maxlen-in", default=100, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the input sequence length > ML.", ) parser.add_argument( "--maxlen-out", "--batch-seq-maxlen-out", default=200, type=int, metavar="ML", help="When --batch-count=seq, " "batch size is reduced if the output sequence length > ML", ) parser.add_argument( "--num-iter-processes", default=0, type=int, help="Number of processes of iterator", ) parser.add_argument( "--preprocess-conf", type=str, default=None, help="The configuration file for the pre-processing", ) parser.add_argument( "--use-speaker-embedding", default=False, type=strtobool, help="Whether to use speaker embedding", ) parser.add_argument( "--use-second-target", default=False, type=strtobool, help="Whether to use second target", ) # optimization related parser.add_argument( "--opt", default="adam", type=str, choices=["adam", "noam", "lamb"], help="Optimizer", ) parser.add_argument( "--accum-grad", default=1, type=int, help="Number of gradient accumuration" ) parser.add_argument( "--lr", default=1e-3, type=float, 
def main(cmd_args):
    """Run voice conversion model training.

    The command line is parsed twice: a first tolerant pass only serves to
    find ``--model-module`` so that the model class can be imported and can
    register its own options on the parser; the second strict pass then parses
    everything. Afterwards logging is configured, the number of GPUs is
    resolved, the Python/NumPy random seeds are set and training is started.

    Args:
        cmd_args (List[str]): Command line arguments without the program name.

    Raises:
        NotImplementedError: If ``--backend`` is anything other than
            ``pytorch``.

    """
    parser = get_parser()
    # First pass: unknown (model specific) options are tolerated here.
    args, _ = parser.parse_known_args(cmd_args)

    from espnet.utils.dynamic_import import dynamic_import

    model_class = dynamic_import(args.model_module)
    assert issubclass(model_class, TTSInterface)
    model_class.add_arguments(parser)
    # Second pass: the model specific options are now known to the parser.
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty value hides every device, so only count real entries.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.CalledProcessError, OSError):
                # nvidia-smi is missing or cannot be executed: CPU only.
                ngpu = 0
            else:
                if p.returncode != 0:
                    # nvidia-smi exists but could not talk to the driver.
                    ngpu = 0
                else:
                    # The device list ("GPU <idx>: <name> (UUID: ...)") is
                    # written to stdout, one line per GPU.
                    ngpu = sum(
                        1
                        for line in p.stdout.decode().splitlines()
                        if line.startswith("GPU ")
                    )
        # Hand the resolved count to the trainer instead of leaving it None.
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.backend == "pytorch":
        from espnet.vc.pytorch_backend.vc import train

        train(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")
./cmd.sh || exit 1; # general configuration backend=pytorch stage=0 # start from 0 if you need to start from data preparation stop_stage=100 debugmode=1 dumpdir=dump # directory to dump full features N=0 # number of minibatches to be used (mainly for debugging). "0" uses all minibatches. verbose=0 # verbose option resume= # Resume the training from snapshot debug=false # feature configuration do_delta=false preprocess_config=conf/specaug.yaml train_config=conf/tuning/train_pytorch_conformer_kernel31.yaml lm_config=conf/lm.yaml decode_config=conf/decode.yaml # rnnlm related lm_resume= # specify a snapshot file to resume LM training lmtag= # tag for managing LMs # ngram ngramtag= n_gram=4 # decoding parameter recog_model=model.acc.best # set a model to be used for decoding: 'model.acc.best' or 'model.loss.best' # data data=/data/asr_data/aishell/ data_url=www.openslr.org/resources/33 dict=data/lang_1char/train_sp_units.txt lang=data/lang_phone ### Configurable parameters ### tag="8v100_lasmmictc_alpha03_ctc03" ngpu=8 # Train config seed=888 batch_size=8 accum_grad=1 epochs=100 use_segment=true # if true, use word-level transcription in MMI criterion ctc_type="k2mmi" # k2mmi | k2ctc | default mtlalpha=0.3 third_weight=0.3 # MBR training config aux_mbr=false aux_mbr_weight=1.0 aux_mbr_beam=4 mbr_epochs=100 mbr_lr=0.1 mbr_warmup=2500 mbr_resume= # Decode config idx_average=41_50 mmi_weight=0.0 # MMI / phonectc joint decoding ctc_weight=0.5 # char ctc joint decoding ngram_weight=0.0 ngram_order=4 word_ngram_tag=word_3gram word_ngram_weight=0.0 word_ngram_log_semiring=true lm_weight=0.0 beam_size=10 mmi_rescore=false recog_set="test dev" . 
utils/parse_options.sh || exit 1; if [ $debug == true ]; then export HOST_GPU_NUM=1 export HOST_NUM=1 export NODE_NUM=1 export INDEX=0 export CHIEF_IP="9.135.217.29" fi train_opts=\ "\ --seed $seed \ --batch-size $batch_size \ --accum-grad $accum_grad \ --epochs $epochs \ --use-segment $use_segment \ --ctc_type $ctc_type \ --mtlalpha $mtlalpha \ --third-weight $third_weight \ " if [ $aux_mbr == true ]; then train_opts="$train_opts \ --aux-mbr $aux_mbr \ --aux-mbr-weight $aux_mbr_weight \ --aux-mbr-beam $aux_mbr_beam \ --transformer-lr $mbr_lr \ --epochs $mbr_epochs \ --transformer-warmup-steps $mbr_warmup \ --resume $mbr_resume \ --load-trainer-and-opt false \ --save-interval-iters 1000 \ " export OMP_NUM_THREADS=6 # for on-the-fly decoding fi decode_opts=\ "\ --ctc-weight $ctc_weight \ --mmi-weight $mmi_weight \ --ngram-weight $ngram_weight \ --mmi-rescore $mmi_rescore \ --beam-size $beam_size \ --word-ngram data/${word_ngram_tag} \ --word-ngram-weight $word_ngram_weight \ --word-ngram-log-semiring $word_ngram_log_semiring \ --lm-weight $lm_weight \ " # Set bash to 'debug' mode, it will exit on : # -e 'error', -u 'undefined variable', -o ... 
'error in pipeline', -x 'print commands', set -e set -u set -o pipefail train_set=train_sp train_dev=dev expname=${train_set}_${backend}_${tag} expdir=exp/${expname} mkdir -p ${expdir} feat_tr_dir=${dumpdir}/${train_set}/delta${do_delta}; mkdir -p ${feat_tr_dir} feat_dt_dir=${dumpdir}/${train_dev}/delta${do_delta}; mkdir -p ${feat_dt_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then echo "stage 1: Network Training" # make sure in jizhi config file: "exec_start_in_all_mpi_pods": true, MASTER_PORT=22277 NCCL_DEBUG=TRACE python3 -m torch.distributed.launch \ --nproc_per_node ${HOST_GPU_NUM} --master_port $MASTER_PORT \ --nnodes=${HOST_NUM} --node_rank=${INDEX} --master_addr=${CHIEF_IP} \ ${MAIN_ROOT}/bin/asr_train.py \ --config ${train_config} \ --preprocess-conf ${preprocess_config} \ --ngpu 1 \ --backend ${backend} \ --outdir ${expdir}/results_RANK \ --debugmode ${debugmode} \ --dict ${dict} \ --debugdir ${expdir} \ --minibatches ${N} \ --verbose ${verbose} \ --resume ${resume} \ --train-json ${feat_tr_dir}/split${ngpu}utt/data_tiny.RANK.json \ --valid-json ${feat_dt_dir}/data.json \ --lang $lang \ --opt "noam_sgd" \ --n-iter-processes 8 \ --world-size $ngpu \ --node-rank ${INDEX} \ --node-size ${HOST_GPU_NUM} \ $train_opts > ${expdir}/global_record.${INDEX}.txt 2>&1 fi if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then echo "stage 2: Decoding" nj=500 if [[ $(get_yaml.py ${train_config} model-module) = *transformer* ]] || \ [[ $(get_yaml.py ${train_config} model-module) = *conformer* ]] || \ [[ $(get_yaml.py ${train_config} etype) = custom ]] || \ [[ $(get_yaml.py ${train_config} dtype) = custom ]]; then recog_model=model.last${idx_average}.avg.best echo ${expdir}/results_0/${recog_model} average_checkpoints.py --backend ${backend} \ --snapshots ${expdir}/results_0/snapshot.ep.* \ --out ${expdir}/results_0/${recog_model} \ --num ${idx_average} fi 
decode_parent_dir=decode_mmi${mmi_weight}_${word_ngram_tag}${word_ngram_weight}_ctc${ctc_weight}_beam${beam_size}_${idx_average} for rtask in ${recog_set}; do decode_dir=$decode_parent_dir/$rtask feat_recog_dir=${dumpdir}/${rtask}/delta${do_delta} # split data splitjson.py --parts ${nj} ${feat_recog_dir}/data.json #### use CPU for decoding ngpu=0 ${decode_cmd} JOB=1:$nj ${expdir}/${decode_dir}/log/decode.JOB.log \ asr_recog.py \ --config ${decode_config} \ --ngpu ${ngpu} \ --backend ${backend} \ --batchsize 0 \ --recog-json ${feat_recog_dir}/split${nj}utt/data.JOB.json \ --result-label ${expdir}/${decode_dir}/data.JOB.json \ --model ${expdir}/results_0/${recog_model} \ --ngram-model exp/train_ngram/${ngram_order}gram.bin \ --rnnlm exp/train_rnnlm_pytorch_lm_transformer/rnnlm.model.best \ --rnnlm-conf exp/train_rnnlm_pytorch_lm_transformer/model.json \ --local-rank JOB --api v2 \ $decode_opts score_sclite.sh ${expdir}/${decode_dir} ${dict} \ > ${expdir}/${decode_dir}/decode_result.txt done echo "Finished" fi ================================================ FILE: egs/aishell1/cmd.sh ================================================ # ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== # Usage: .pl [options] JOB=1: # e.g. # run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB # # Options: # --time